import argparse
import logging
import os
import pathlib
import shutil
from collections import namedtuple

from toolchain.cleaners.monolingual_cleaner import MonolingualCleaner
from toolchain.cleaners.post_alignment_cleaner import PostAlignmentCleaner
from toolchain.common.file_size_counter import FileSizeCounter
from toolchain.common.language_detector import LanguageDetector
from toolchain.common.raw_file_indexer import RawFileIndexer
from toolchain.common.templates import OutputPathTemplate
from toolchain.common.toolchain_error import ToolchainError
from toolchain.docalign.document_aligner import DocumentAligner
from toolchain.extractors.editable_text_extractor import EditableTextExtractor
from toolchain.extractors.pdf_text_extractor import PdfTextExtractor
from toolchain.extractors.plain_text_extractor import PlainTextExtractor
from toolchain.normalizer.unicode_normalizer import UnicodeNormalizer
from toolchain.sentalign.sentence_aligner import SentenceAligner
from toolchain.splitters.editable_sentence_splitter import EditableSentenceSplitter
from toolchain.splitters.pdf_sentence_splitter import PdfSentenceSplitter
from toolchain.toolchain_processor import ToolchainProcessor
from toolchain.writers.tmx_creator import TmxCreator
from toolchain.writers.file_concatenator import FileConcatenator

logger = logging.getLogger(__name__)

ParserTypes = namedtuple("ParserTypes", ["extractor", "extraction_tool", "splitter"])


class DocToTmxProcessor(ToolchainProcessor):

    HUNALIGN = pathlib.Path(os.environ["HUNALIGNPATH"]).resolve()
    PDFTOTEXT = pathlib.Path(os.environ["PDFTOTEXTPATH"]).resolve()
    LIBREOFFICE = pathlib.Path(os.environ["LIBREOFFICEPATH"]).resolve()

    BASE_OUTPUT_FILESTEM = "doc_{0}"

    PARSERS = {
        ".doc": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".docx": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".odt": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".pdf": ParserTypes(extractor=PdfTextExtractor, extraction_tool=PDFTOTEXT, splitter=PdfSentenceSplitter),
        ".rtf": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".txt": ParserTypes(extractor=PlainTextExtractor, extraction_tool=None, splitter=EditableSentenceSplitter),
    }

    EXTENSIONS = PARSERS.keys()

    def process(self, id, input_dir, artefact_dir, output_dir):
        logger.info("Starting Doc to TMX toolchain for input directory {0}.".format(input_dir))
        try:
            input_dirpath = pathlib.Path(input_dir).resolve()
            artefact_dirpath = pathlib.Path(artefact_dir).resolve()
            output_dirpath = pathlib.Path(output_dir).resolve()
            artefact_dirpath.mkdir(parents=True, exist_ok=True)
            output_dirpath.mkdir(parents=True, exist_ok=True)
            return self.process_directory(id, input_dirpath, artefact_dirpath, output_dirpath)
        except Exception as e:
            logger.exception(e)
            logger.error("Error processing documents: {0}.".format(e))
            raise

    def process_directory(self, id, input_basedir, artefact_basedir, output_basedir):
        langident_dir = self.create_directory(artefact_basedir, "langident")
        rejected_dir = self.create_directory(artefact_basedir, "rejected")
        docalign_dir = self.create_directory(artefact_basedir, "docalign")
        sentalign_dir = self.create_directory(artefact_basedir, "sentalign")
        clean_dir = self.create_directory(artefact_basedir, "clean")
        concatenate_dir = self.create_directory(artefact_basedir, "concatenate")
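        # Intermediate artefacts live under the artefact tree above; the two
        # directories below hold the final bilingual and monolingual outputs.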
        bilingual_dir = self.create_directory(output_basedir, "bilingual")
        monolingual_dir = self.create_directory(output_basedir, "monolingual")

        index_file = artefact_basedir.joinpath(self.INDEX_FILENAME)
        RawFileIndexer().index_files(input_basedir, self.EXTENSIONS, index_file)
        logger.info("Indexed {0} to {1}.".format(input_basedir, index_file))

        additional_tmx_args = dict(self.global_tmx_args)
        additional_tmx_args["o_tmf"] = "Corpus"

        rejected = 0
        files_by_language = {}
        with open(index_file) as index:
            for line in index.read().splitlines():
                file_rejected = self.preprocess_file(line, rejected_dir, artefact_basedir, files_by_language)
                if file_rejected:
                    rejected += 1

        unaligned_file_list_src = files_by_language.get(self.lang_src, [])
        unaligned_file_list_tgt = files_by_language.get(self.lang_tgt, [])
        aligned_document_pairs, unmatched_list_src, unmatched_list_tgt = DocumentAligner(self.docalign_config).align(
            unaligned_file_list_src, unaligned_file_list_tgt, docalign_dir)

        monolingual_file_info_src = self.resolve_unmatched_files(
            id, unmatched_list_src, self.lang_src, clean_dir, rejected_dir, monolingual_dir,
            self.keep_unmatched_src, self.monolingual_filename_template_source)
        monolingual_file_info_tgt = self.resolve_unmatched_files(
            id, unmatched_list_tgt, self.lang_tgt, clean_dir, rejected_dir, monolingual_dir,
            self.keep_unmatched_tgt, self.monolingual_filename_template_target)

        cleaned_file_list_src = []
        cleaned_file_list_tgt = []
        for pair_index, document_pair in enumerate(aligned_document_pairs):
            pair_label = "sa_{0}".format(pair_index + 1)
            path_docaligned_src, path_docaligned_tgt = document_pair

            sentalign_pair_dir = self.create_directory(sentalign_dir, pair_label)
            sentalign_pair_base_path = sentalign_pair_dir.joinpath(pair_label)
            path_sentaligned_src = pathlib.Path(OutputPathTemplate.ALIGNED.format(sentalign_pair_base_path, self.lang_src))
            path_sentaligned_tgt = pathlib.Path(OutputPathTemplate.ALIGNED.format(sentalign_pair_base_path, self.lang_tgt))
            SentenceAligner(self.HUNALIGN).align(
                path_docaligned_src, path_docaligned_tgt, path_sentaligned_src, path_sentaligned_tgt,
                sentalign_pair_dir, self.sentalign_config)
            logger.info("Sentence aligned to {0} and {1}.".format(path_sentaligned_src, path_sentaligned_tgt))

            clean_base_path = clean_dir.joinpath(pair_label)
            path_cleaned_src = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, self.lang_src))
            path_cleaned_tgt = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, self.lang_tgt))
            path_cleaned_rejected = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, "rejected"))
            PostAlignmentCleaner(self.lang_src, self.lang_tgt, config=self.cleaner_config).clean(
                path_sentaligned_src, path_sentaligned_tgt, path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected)
            shutil.copy(path_cleaned_rejected, rejected_dir)
            cleaned_file_list_src.append(path_cleaned_src)
            cleaned_file_list_tgt.append(path_cleaned_tgt)
            logger.info("Cleaned to {0} and {1} with rejections in {2}.".format(
                path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected))

        path_concatenated_src = concatenate_dir.joinpath(OutputPathTemplate.CONCATENATED.format(self.lang_src))
        path_concatenated_tgt = concatenate_dir.joinpath(OutputPathTemplate.CONCATENATED.format(self.lang_tgt))
        FileConcatenator().concatenate(cleaned_file_list_src, path_concatenated_src)
        FileConcatenator().concatenate(cleaned_file_list_tgt, path_concatenated_tgt)
        logger.info("Concatenated to {0} and {1}.".format(path_concatenated_src, path_concatenated_tgt))
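        # The cleaned files are line-aligned pair by pair, so the line count of
        # the concatenated source side equals the number of translation units.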
        parallel_file_sizes = FileSizeCounter().count(path_concatenated_src)
        logger.info("Counted {0} lines in file {1}.".format(parallel_file_sizes.lines, path_concatenated_src))

        parallel_file_info = None
        if parallel_file_sizes.lines > 0:
            parallel_file_info = self.create_file_info(
                "bilingual", [self.lang_src, self.lang_tgt], parallel_file_sizes.lines, "translation_units")
            output_filestem = self.BASE_OUTPUT_FILESTEM.format(id)
            path_output_tmx = self.tmx_filename_template.format(bilingual_dir.joinpath(output_filestem))
            TmxCreator().create(self.tmx_template, path_concatenated_src, path_concatenated_tgt,
                                path_output_tmx, additional_tmx_args)
            logger.info("Created TMX file at {0}.".format(path_output_tmx))
        else:
            logger.info("No parallel data found for resource {0}, skipping TMX creation.".format(id))

        return rejected, list(filter(None, [monolingual_file_info_src, monolingual_file_info_tgt, parallel_file_info]))

    def preprocess_file(self, index_line, rejected_dir, artefact_basedir, files_by_language):
        try:
            index_tokens = index_line.split("\t")
            index_no = index_tokens[0]
            input_path = index_tokens[1]
            lower_extension = pathlib.Path(input_path).suffix.lower()
            parser_types = self.PARSERS.get(lower_extension, None)
            if not parser_types:
                shutil.copy(input_path, rejected_dir)
                logger.error("No extractor found for type {0}, skipping.".format(lower_extension))
                return True

            logger.info("Preprocessing index {0} file {1}.".format(index_no, input_path))
            artefact_dir = self.create_directory(artefact_basedir, index_no)
            artefact_basepath = artefact_dir.joinpath(index_no)
            path_extracted = OutputPathTemplate.EXTRACTED.format(artefact_basepath)
            path_normalized = OutputPathTemplate.NORMALIZED_UNKNOWN.format(artefact_basepath)

            logger.info("Selecting parsers for {0}: extractor: {1}, extraction tool: {2}, sentence splitter: {3}.".format(
                input_path, parser_types.extractor, parser_types.extraction_tool, parser_types.splitter))
            parser_types.extractor(parser_types.extraction_tool).extract(input_path, path_extracted, config=self.extractor_config)
            logger.info("Extracted to {0}.".format(path_extracted))

            UnicodeNormalizer().normalize(path_extracted, path_normalized, self.CUSTOM_CHARACTER_SUBSTITUTIONS)
            logger.info("Normalized to {0}.".format(path_normalized))

            language = LanguageDetector().detect_in_file(path_normalized, self.langdetect_config)
            logger.info("Detected {0} as language [{1}].".format(path_normalized, language))
            if language not in [self.lang_src, self.lang_tgt]:
                shutil.copy(path_normalized, rejected_dir)
                logger.info("Rejected {0} with invalid language [{1}].".format(path_normalized, language))
            else:
                path_split = OutputPathTemplate.SPLIT.format(artefact_basepath, language)
                if language not in files_by_language:
                    files_by_language[language] = []
                files_by_language[language].append(path_split)
                path_abbreviations = self.abbreviations_paths[language]
                parser_types.splitter().split(path_abbreviations, path_normalized, path_split)
                logger.info("Sentence-split to {0}.".format(path_split))
            return False
        except ToolchainError as te:
            shutil.copy(input_path, rejected_dir)
            logger.error("Error preprocessing file {0}, skipping: {1}.".format(input_path, te))
            return True
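    # Unmatched documents from document alignment are either cleaned and kept
    # for the monolingual output (when keep_unmatched is set) or copied to the
    # rejected artefact directory. Returns file info for the retained data, or
    # None when nothing was kept.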
    def resolve_unmatched_files(self, id, file_list, lang, clean_dir, rejected_dir, monolingual_dir,
                                keep_unmatched, monolingual_filename_template):
        monolingual_paths = []
        for filename in file_list:
            if keep_unmatched:
                stem = pathlib.Path(filename).stem
                clean_base_path = clean_dir.joinpath(stem)
                path_cleaned_retained = OutputPathTemplate.CLEANED.format(clean_base_path, lang)
                path_cleaned_rejected = OutputPathTemplate.CLEANED.format(clean_base_path, "rejected")
                MonolingualCleaner(lang, config=self.cleaner_config).clean(filename, path_cleaned_retained, path_cleaned_rejected)
                logger.info("Cleaned monolingual file {0} to {1}.".format(filename, path_cleaned_retained))
                monolingual_paths.append(path_cleaned_retained)
                shutil.copy(path_cleaned_rejected, rejected_dir)
            else:
                shutil.copy(filename, rejected_dir)
                logger.info("Rejected unmatched file {0}.".format(filename))

        if monolingual_paths:
            concatenated_monolingual_path = monolingual_filename_template.format(monolingual_dir.joinpath(str(id)))
            FileConcatenator().concatenate(monolingual_paths, concatenated_monolingual_path)
            logger.info("Concatenated file(s) {0} to combined monolingual file at {1}.".format(
                str(monolingual_paths), concatenated_monolingual_path))
            file_sizes = FileSizeCounter().count(concatenated_monolingual_path)
            logger.info("Counted {0} lines and {1} words in file {2}.".format(
                file_sizes.lines, file_sizes.words, concatenated_monolingual_path))
            return self.create_file_info("monolingual", [lang], file_sizes.words, "words")
        return None


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("id", help="LR identifier")
    argparser.add_argument("input_dir", help="path to input directory")
    argparser.add_argument("artefact_dir", help="path to artefact directory")
    argparser.add_argument("output_dir", help="path to output directory")
    argparser.add_argument("config_path", help="path to config")
    args = argparser.parse_args()
    DocToTmxProcessor(args.config_path).process(args.id, args.input_dir, args.artefact_dir, args.output_dir)
    print("Output written to {0}".format(args.output_dir))
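# Example invocation. The script filename and tool locations below are
# illustrative assumptions, not part of this module; the three environment
# variables are required because they are read at class-definition time.
#
#   export HUNALIGNPATH=/usr/local/bin/hunalign
#   export PDFTOTEXTPATH=/usr/bin/pdftotext
#   export LIBREOFFICEPATH=/usr/bin/soffice
#   python doc_to_tmx_processor.py lr-42 ./input ./artefacts ./output ./config.json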