pdf_text_extractor.py 2.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. import argparse
  2. import logging
  3. import subprocess
  4. from toolchain.extractors.extraction_error import ExtractionError
  5. from toolchain.extractors.text_extractor import TextExtractor
  6. logger = logging.getLogger(__name__)
  7. class PdfTextExtractor(TextExtractor):
  8. DEFAULT_SUBPROCESS_TIMEOUT = 120
  9. def extract(self, input_path, output_path, config={}):
  10. logger.info("Extracting text from {0} to {1}.".format(input_path, output_path))
  11. subprocess_timeout = int(config.get("pdftotext_subprocess_timeout", self.DEFAULT_SUBPROCESS_TIMEOUT))
  12. try:
  13. result = subprocess.run([
  14. self.extraction_tool,
  15. input_path, output_path,
  16. ], timeout=subprocess_timeout, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
  17. rc = result.returncode
  18. if rc != 0:
  19. logger.error("Non-zero return code {0} from extraction subprocess; output file may be absent or incomplete.".format(rc))
  20. raise ExtractionError("Non-zero return code {0} from extraction subprocess.".format(rc))
  21. except subprocess.TimeoutExpired:
  22. logger.error("Subprocess did not complete within required {0} seconds; output file may be absent or incomplete.".format(subprocess_timeout))
  23. raise ExtractionError("Subprocess did not complete within required {0} seconds.".format(subprocess_timeout))
  24. except Exception as e:
  25. logger.error("Error extracting file: {0}.".format(e))
  26. raise ExtractionError(e)
  27. if __name__ == "__main__":
  28. argparser = argparse.ArgumentParser()
  29. argparser.add_argument("pdftotext_path", help="path to pdftotext")
  30. argparser.add_argument("input_path", help="path to input file")
  31. argparser.add_argument("output_path", help="path to output file")
  32. argparser.add_argument("--subprocess_timeout", type=int, default=120, help="timeout limit in seconds for running extraction subprocess")
  33. args = argparser.parse_args()
  34. config = {
  35. "pdftotext_subprocess_timeout" : args.subprocess_timeout,
  36. }
  37. PdfTextExtractor(args.pdftotext_path).extract(args.input_path, args.output_path, config)