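"""Doc to TMX toolchain processor.

Indexes the documents in an input directory (.doc, .docx, .odt, .pdf, .rtf
and .txt are supported), extracts and Unicode-normalizes their text, detects
each file's language, sentence-splits, aligns documents and then sentences,
cleans the aligned output, and writes a TMX file plus optional monolingual
corpora.

Usage (argument order matches the argparse definition at the bottom):
    python <this module> id input_dir artefact_dir output_dir config_path
"""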
import argparse
import logging
import os
import pathlib
import shutil
from collections import namedtuple

from toolchain.cleaners.monolingual_cleaner import MonolingualCleaner
from toolchain.cleaners.post_alignment_cleaner import PostAlignmentCleaner
from toolchain.common.file_size_counter import FileSizeCounter
from toolchain.common.language_detector import LanguageDetector
from toolchain.common.raw_file_indexer import RawFileIndexer
from toolchain.common.templates import OutputPathTemplate
from toolchain.common.toolchain_error import ToolchainError
from toolchain.docalign.document_aligner import DocumentAligner
from toolchain.extractors.editable_text_extractor import EditableTextExtractor
from toolchain.extractors.pdf_text_extractor import PdfTextExtractor
from toolchain.extractors.plain_text_extractor import PlainTextExtractor
from toolchain.normalizer.unicode_normalizer import UnicodeNormalizer
from toolchain.sentalign.sentence_aligner import SentenceAligner
from toolchain.splitters.editable_sentence_splitter import EditableSentenceSplitter
from toolchain.splitters.pdf_sentence_splitter import PdfSentenceSplitter
from toolchain.toolchain_processor import ToolchainProcessor
from toolchain.writers.file_concatenator import FileConcatenator
from toolchain.writers.tmx_creator import TmxCreator

logger = logging.getLogger(__name__)

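# A ParserTypes entry bundles the extractor class, the external tool it
# invokes (None for plain text), and the sentence splitter for one file type.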
ParserTypes = namedtuple("ParserTypes", ["extractor", "extraction_tool", "splitter"])


class DocToTmxProcessor(ToolchainProcessor):
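    """Convert a directory of mixed-format documents into a TMX file plus
    optional monolingual corpora, rejecting files that cannot be processed."""
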
    # External tool locations come from required environment variables; a
    # missing variable raises KeyError as soon as this module is imported.
    HUNALIGN = pathlib.Path(os.environ["HUNALIGNPATH"]).resolve()
    PDFTOTEXT = pathlib.Path(os.environ["PDFTOTEXTPATH"]).resolve()
    LIBREOFFICE = pathlib.Path(os.environ["LIBREOFFICEPATH"]).resolve()
    BASE_OUTPUT_FILESTEM = "doc_{0}"
    PARSERS = {
        ".doc": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".docx": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".odt": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".pdf": ParserTypes(extractor=PdfTextExtractor, extraction_tool=PDFTOTEXT, splitter=PdfSentenceSplitter),
        ".rtf": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".txt": ParserTypes(extractor=PlainTextExtractor, extraction_tool=None, splitter=EditableSentenceSplitter),
    }
    EXTENSIONS = PARSERS.keys()

    def process(self, id, input_dir, artefact_dir, output_dir):
        """Run the full toolchain over input_dir, writing intermediate
        artefacts to artefact_dir and final outputs to output_dir."""
        logger.info("Starting Doc to TMX toolchain for input directory {0}.".format(input_dir))
        try:
            input_dirpath = pathlib.Path(input_dir).resolve()
            artefact_dirpath = pathlib.Path(artefact_dir).resolve()
            output_dirpath = pathlib.Path(output_dir).resolve()
            artefact_dirpath.mkdir(parents=True, exist_ok=True)
            output_dirpath.mkdir(parents=True, exist_ok=True)
            return self.process_directory(id, input_dirpath, artefact_dirpath, output_dirpath)
        except Exception as e:
            # logger.exception records the message together with the stack trace.
            logger.exception("Error processing documents: {0}.".format(e))
            raise

    def process_directory(self, id, input_basedir, artefact_basedir, output_basedir):
        """Index, preprocess, align, clean and concatenate every supported
        document under input_basedir; returns the number of rejected files
        and a list of file-info records for the produced outputs."""
        langident_dir = self.create_directory(artefact_basedir, "langident")
        rejected_dir = self.create_directory(artefact_basedir, "rejected")
        docalign_dir = self.create_directory(artefact_basedir, "docalign")
        sentalign_dir = self.create_directory(artefact_basedir, "sentalign")
        clean_dir = self.create_directory(artefact_basedir, "clean")
        concatenate_dir = self.create_directory(artefact_basedir, "concatenate")
        bilingual_dir = self.create_directory(output_basedir, "bilingual")
        monolingual_dir = self.create_directory(output_basedir, "monolingual")
        index_file = artefact_basedir.joinpath(self.INDEX_FILENAME)
        RawFileIndexer().index_files(input_basedir, self.EXTENSIONS, index_file)
        logger.info("Indexed {0} to {1}.".format(input_basedir, index_file))
        additional_tmx_args = dict(self.global_tmx_args)
        additional_tmx_args["o_tmf"] = "Corpus"
        # Preprocess every indexed file, grouping the survivors by detected
        # language; rejected files are copied to rejected_dir as they occur.
        rejected = 0
        files_by_language = {}
        with open(index_file) as index:
            for line in index.read().splitlines():
                file_rejected = self.preprocess_file(line, rejected_dir, artefact_basedir, files_by_language)
                if file_rejected:
                    rejected += 1
        unaligned_file_list_src = files_by_language.get(self.lang_src, [])
        unaligned_file_list_tgt = files_by_language.get(self.lang_tgt, [])
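        # Pair up source- and target-language files at document level; files
        # that find no counterpart are handled below as monolingual data.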
        aligned_document_pairs, unmatched_list_src, unmatched_list_tgt = DocumentAligner(self.docalign_config).align(
            unaligned_file_list_src, unaligned_file_list_tgt, docalign_dir)
        monolingual_file_info_src = self.resolve_unmatched_files(id, unmatched_list_src, self.lang_src, clean_dir,
            rejected_dir, monolingual_dir, self.keep_unmatched_src, self.monolingual_filename_template_source)
        monolingual_file_info_tgt = self.resolve_unmatched_files(id, unmatched_list_tgt, self.lang_tgt, clean_dir,
            rejected_dir, monolingual_dir, self.keep_unmatched_tgt, self.monolingual_filename_template_target)
        cleaned_file_list_src = []
        cleaned_file_list_tgt = []
        # Sentence-align each document pair with hunalign, then filter the
        # alignments through the post-alignment cleaner.
        for pair_index, document_pair in enumerate(aligned_document_pairs):
            pair_label = "sa_{0}".format(pair_index + 1)
            path_docaligned_src, path_docaligned_tgt = document_pair
            sentalign_pair_dir = self.create_directory(sentalign_dir, pair_label)
            sentalign_pair_base_path = sentalign_pair_dir.joinpath(pair_label)
            path_sentaligned_src = pathlib.Path(OutputPathTemplate.ALIGNED.format(sentalign_pair_base_path, self.lang_src))
            path_sentaligned_tgt = pathlib.Path(OutputPathTemplate.ALIGNED.format(sentalign_pair_base_path, self.lang_tgt))
            SentenceAligner(self.HUNALIGN).align(path_docaligned_src, path_docaligned_tgt,
                path_sentaligned_src, path_sentaligned_tgt, sentalign_pair_dir, self.sentalign_config)
            logger.info("Sentence aligned to {0} and {1}.".format(path_sentaligned_src, path_sentaligned_tgt))
            clean_base_path = clean_dir.joinpath(pair_label)
            path_cleaned_src = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, self.lang_src))
            path_cleaned_tgt = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, self.lang_tgt))
            path_cleaned_rejected = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, "rejected"))
            PostAlignmentCleaner(self.lang_src, self.lang_tgt, config=self.cleaner_config).clean(
                path_sentaligned_src, path_sentaligned_tgt, path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected)
            shutil.copy(path_cleaned_rejected, rejected_dir)
            cleaned_file_list_src.append(path_cleaned_src)
            cleaned_file_list_tgt.append(path_cleaned_tgt)
            logger.info("Cleaned to {0} and {1} with rejections in {2}.".format(path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected))
        path_concatenated_src = concatenate_dir.joinpath(OutputPathTemplate.CONCATENATED.format(self.lang_src))
        path_concatenated_tgt = concatenate_dir.joinpath(OutputPathTemplate.CONCATENATED.format(self.lang_tgt))
        FileConcatenator().concatenate(cleaned_file_list_src, path_concatenated_src)
        FileConcatenator().concatenate(cleaned_file_list_tgt, path_concatenated_tgt)
        logger.info("Concatenated to {0} and {1}.".format(path_concatenated_src, path_concatenated_tgt))
        parallel_file_sizes = FileSizeCounter().count(path_concatenated_src)
        logger.info("Counted {0} lines in file {1}.".format(parallel_file_sizes.lines, path_concatenated_src))
        parallel_file_info = None
        if parallel_file_sizes.lines > 0:
            parallel_file_info = self.create_file_info("bilingual", [self.lang_src, self.lang_tgt],
                parallel_file_sizes.lines, "translation_units")
            output_filestem = self.BASE_OUTPUT_FILESTEM.format(id)
            path_output_tmx = self.tmx_filename_template.format(bilingual_dir.joinpath(output_filestem))
            TmxCreator().create(self.tmx_template, path_concatenated_src, path_concatenated_tgt, path_output_tmx, additional_tmx_args)
            logger.info("Created TMX file at {0}.".format(path_output_tmx))
        else:
            logger.info("No parallel data found for resource {0}, skipping TMX creation.".format(id))
        return rejected, list(filter(None, [monolingual_file_info_src, monolingual_file_info_tgt, parallel_file_info]))

    def preprocess_file(self, index_line, rejected_dir, artefact_basedir, files_by_language):
        """Extract, normalize, language-detect and sentence-split one indexed
        file; returns True if the file was rejected, False otherwise."""
        try:
            index_tokens = index_line.split("\t")
            index_no = index_tokens[0]
            input_path = index_tokens[1]
            lower_extension = pathlib.Path(input_path).suffix.lower()
            parser_types = self.PARSERS.get(lower_extension)
            if parser_types is None:
                shutil.copy(input_path, rejected_dir)
                logger.error("No extractor found for type {0}, skipping.".format(lower_extension))
                return True
            logger.info("Preprocessing index {0} file {1}.".format(index_no, input_path))
            artefact_dir = self.create_directory(artefact_basedir, index_no)
            artefact_basepath = artefact_dir.joinpath(index_no)
            path_extracted = OutputPathTemplate.EXTRACTED.format(artefact_basepath)
            path_normalized = OutputPathTemplate.NORMALIZED_UNKNOWN.format(artefact_basepath)
            logger.info("Selecting parsers for {0}: extractor: {1}, extraction tool: {2}, sentence splitter: {3}.".format(
                input_path, parser_types.extractor, parser_types.extraction_tool, parser_types.splitter))
            parser_types.extractor(parser_types.extraction_tool).extract(input_path, path_extracted, config=self.extractor_config)
            logger.info("Extracted to {0}.".format(path_extracted))
            UnicodeNormalizer().normalize(path_extracted, path_normalized, self.CUSTOM_CHARACTER_SUBSTITUTIONS)
            logger.info("Normalized to {0}.".format(path_normalized))
            language = LanguageDetector().detect_in_file(path_normalized, self.langdetect_config)
            logger.info("Detected {0} as language [{1}].".format(path_normalized, language))
            if language not in [self.lang_src, self.lang_tgt]:
                shutil.copy(path_normalized, rejected_dir)
                logger.info("Rejected {0} with invalid language [{1}].".format(path_normalized, language))
            else:
                path_split = OutputPathTemplate.SPLIT.format(artefact_basepath, language)
                files_by_language.setdefault(language, []).append(path_split)
                path_abbreviations = self.abbreviations_paths[language]
                parser_types.splitter().split(path_abbreviations, path_normalized, path_split)
                logger.info("Sentence-split to {0}.".format(path_split))
            return False
        except ToolchainError as te:
            shutil.copy(input_path, rejected_dir)
            logger.error("Error preprocessing file {0}, skipping: {1}.".format(input_path, te))
            return True

    def resolve_unmatched_files(self, id, file_list, lang, clean_dir, rejected_dir, monolingual_dir, keep_unmatched, monolingual_filename_template):
        """Either clean and keep unmatched files as a monolingual corpus or
        reject them, depending on keep_unmatched; returns file info for the
        concatenated monolingual output, or None if nothing was kept."""
        monolingual_paths = []
        for filename in file_list:
            if keep_unmatched:
                stem = pathlib.Path(filename).stem
                clean_base_path = clean_dir.joinpath(stem)
                path_cleaned_retained = OutputPathTemplate.CLEANED.format(clean_base_path, lang)
                path_cleaned_rejected = OutputPathTemplate.CLEANED.format(clean_base_path, "rejected")
                MonolingualCleaner(lang, config=self.cleaner_config).clean(filename, path_cleaned_retained, path_cleaned_rejected)
                logger.info("Cleaned monolingual file {0} to {1}.".format(filename, path_cleaned_retained))
                monolingual_paths.append(path_cleaned_retained)
                shutil.copy(path_cleaned_rejected, rejected_dir)
            else:
                shutil.copy(filename, rejected_dir)
                logger.info("Rejected unmatched file {0}.".format(filename))
        if monolingual_paths:
            concatenated_monolingual_path = monolingual_filename_template.format(monolingual_dir.joinpath(str(id)))
            FileConcatenator().concatenate(monolingual_paths, concatenated_monolingual_path)
            logger.info("Concatenated file(s) {0} to combined monolingual file at {1}.".format(str(monolingual_paths), concatenated_monolingual_path))
            file_sizes = FileSizeCounter().count(concatenated_monolingual_path)
            logger.info("Counted {0} lines and {1} words in file {2}.".format(file_sizes.lines, file_sizes.words, concatenated_monolingual_path))
            return self.create_file_info("monolingual", [lang], file_sizes.words, "words")
        return None
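

# Command-line entry point; see the module docstring for the argument order.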
if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("id", help="LR identifier")
    argparser.add_argument("input_dir", help="path to input directory")
    argparser.add_argument("artefact_dir", help="path to artefact directory")
    argparser.add_argument("output_dir", help="path to output directory")
    argparser.add_argument("config_path", help="path to config")
    args = argparser.parse_args()
    DocToTmxProcessor(args.config_path).process(args.id, args.input_dir, args.artefact_dir, args.output_dir)
    print("Output written to {0}".format(args.output_dir))