# Oniloinsigh/stor-toolchain
							import argparse
import logging
import subprocess

from toolchain.extractors.extraction_error import ExtractionError
from toolchain.extractors.text_extractor import TextExtractor

logger = logging.getLogger(__name__)

class PdfTextExtractor(TextExtractor):
    """Extract text from a PDF by running an external command-line tool.

    The executable is read from ``self.extraction_tool``; this class does not
    set that attribute itself (presumably the ``TextExtractor`` initializer
    does -- confirm against the base class).
    """

    # Seconds to wait for the subprocess when the config supplies no override.
    DEFAULT_SUBPROCESS_TIMEOUT = 120

    def extract(self, input_path, output_path, config=None):
        """Run the extraction tool on ``input_path``, writing to ``output_path``.

        The tool is invoked as ``[extraction_tool, input_path, output_path]``
        with its stdout and stderr discarded.

        Args:
            input_path: Path passed to the tool as its first argument.
            output_path: Path passed to the tool as its second argument.
            config: Optional mapping. The key ``"pdftotext_subprocess_timeout"``
                overrides ``DEFAULT_SUBPROCESS_TIMEOUT`` (seconds); the value
                is converted with ``int()``.

        Raises:
            ExtractionError: If the subprocess times out, cannot be run, or
                exits with a non-zero return code.
        """
        # A None sentinel avoids sharing one dict object across all calls.
        if config is None:
            config = {}

        logger.info("Extracting text from {0} to {1}.".format(input_path, output_path))
        # Conversion happens outside the try block, so a malformed timeout
        # value surfaces as the original ValueError/TypeError.
        subprocess_timeout = int(config.get("pdftotext_subprocess_timeout", self.DEFAULT_SUBPROCESS_TIMEOUT))

        try:
            result = subprocess.run(
                [self.extraction_tool, input_path, output_path],
                timeout=subprocess_timeout,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.STDOUT,
            )
        except subprocess.TimeoutExpired as e:
            logger.error("Subprocess did not complete within required {0} seconds; output file may be absent or incomplete.".format(subprocess_timeout))
            raise ExtractionError("Subprocess did not complete within required {0} seconds.".format(subprocess_timeout)) from e
        except Exception as e:
            logger.error("Error extracting file: {0}.".format(e))
            raise ExtractionError(e) from e

        # Checked after the try block: raising inside it would be caught by
        # the generic handler above, logging the failure twice and wrapping
        # the ExtractionError in a second ExtractionError.
        rc = result.returncode
        if rc != 0:
            logger.error("Non-zero return code {0} from extraction subprocess; output file may be absent or incomplete.".format(rc))
            raise ExtractionError("Non-zero return code {0} from extraction subprocess.".format(rc))


if __name__ == "__main__":
    # Command-line entry point: run one extraction with the given tool and paths.
    argparser = argparse.ArgumentParser()
    argparser.add_argument("pdftotext_path", help="path to pdftotext")
    argparser.add_argument("input_path", help="path to input file")
    argparser.add_argument("output_path", help="path to output file")
    argparser.add_argument(
        "--subprocess_timeout",
        type=int,
        # Reuse the class constant so the CLI default cannot drift from the
        # default applied by extract() when no timeout is configured.
        default=PdfTextExtractor.DEFAULT_SUBPROCESS_TIMEOUT,
        help="timeout limit in seconds for running extraction subprocess",
    )
    args = argparser.parse_args()

    # extract() looks the timeout up under this key.
    config = {
        "pdftotext_subprocess_timeout": args.subprocess_timeout,
    }

    PdfTextExtractor(args.pdftotext_path).extract(args.input_path, args.output_path, config)