12345678910111213141516171819202122232425262728293031323334353637383940414243444546 |
- import argparse
- import logging
- import subprocess
- from toolchain.extractors.extraction_error import ExtractionError
- from toolchain.extractors.text_extractor import TextExtractor
# Module-level logger, named after this module via __name__.
- logger = logging.getLogger(__name__)
class PdfTextExtractor(TextExtractor):
    """Extract text from a PDF by running an external command-line tool.

    The tool is invoked as ``<extraction_tool> <input_path> <output_path>``.
    ``self.extraction_tool`` is not assigned in this class; presumably the
    ``TextExtractor`` base class stores it from its constructor argument
    (TODO: confirm against the base class).
    """

    # Timeout, in seconds, used when the config does not supply one.
    DEFAULT_SUBPROCESS_TIMEOUT = 120

    def extract(self, input_path, output_path, config=None):
        """Run the extraction tool on ``input_path``, writing to ``output_path``.

        Args:
            input_path: Path of the input file, passed to the tool as its
                first argument.
            output_path: Path of the output file, passed to the tool as its
                second argument.
            config: Optional mapping of settings. The only key read here is
                ``"pdftotext_subprocess_timeout"`` (seconds, converted with
                ``int``); when absent, ``DEFAULT_SUBPROCESS_TIMEOUT`` is used.
                ``None`` is treated as an empty mapping.

        Raises:
            ExtractionError: If the subprocess exits with a non-zero return
                code, does not finish within the timeout, or cannot be run
                at all (for example, the executable is missing).
        """
        # Avoid a shared mutable default argument; None means "no settings".
        if config is None:
            config = {}

        logger.info("Extracting text from {0} to {1}.".format(input_path, output_path))
        subprocess_timeout = int(config.get("pdftotext_subprocess_timeout", self.DEFAULT_SUBPROCESS_TIMEOUT))

        # Only the subprocess call itself is guarded. The return-code check
        # lives outside the try block so that the ExtractionError it raises is
        # not caught by the generic handler, logged a second time and wrapped
        # inside another ExtractionError.
        try:
            result = subprocess.run(
                [self.extraction_tool, input_path, output_path],
                timeout=subprocess_timeout,
                # The tool's stdout and stderr are both discarded.
                stdout=subprocess.DEVNULL,
                stderr=subprocess.STDOUT,
            )
        except subprocess.TimeoutExpired as exc:
            logger.error("Subprocess did not complete within required {0} seconds; output file may be absent or incomplete.".format(subprocess_timeout))
            raise ExtractionError("Subprocess did not complete within required {0} seconds.".format(subprocess_timeout)) from exc
        except Exception as exc:
            logger.error("Error extracting file: {0}.".format(exc))
            raise ExtractionError(exc) from exc

        rc = result.returncode
        if rc != 0:
            logger.error("Non-zero return code {0} from extraction subprocess; output file may be absent or incomplete.".format(rc))
            raise ExtractionError("Non-zero return code {0} from extraction subprocess.".format(rc))
if __name__ == "__main__":
    # Command-line entry point: parse the tool path, the input/output paths
    # and an optional timeout, then run a single extraction.
    argparser = argparse.ArgumentParser()
    argparser.add_argument("pdftotext_path", help="path to pdftotext")
    argparser.add_argument("input_path", help="path to input file")
    argparser.add_argument("output_path", help="path to output file")
    argparser.add_argument(
        "--subprocess_timeout",
        type=int,
        # Reuse the class constant so the CLI default cannot drift from the
        # default applied inside PdfTextExtractor.extract.
        default=PdfTextExtractor.DEFAULT_SUBPROCESS_TIMEOUT,
        help="timeout limit in seconds for running extraction subprocess",
    )
    args = argparser.parse_args()

    config = {
        "pdftotext_subprocess_timeout": args.subprocess_timeout,
    }
    PdfTextExtractor(args.pdftotext_path).extract(args.input_path, args.output_path, config)
|