doc_to_tmx_processor.py

import argparse
import logging
import os
import pathlib
import shutil
from collections import namedtuple

from toolchain.cleaners.monolingual_cleaner import MonolingualCleaner
from toolchain.cleaners.post_alignment_cleaner import PostAlignmentCleaner
from toolchain.common.file_size_counter import FileSizeCounter
from toolchain.common.language_detector import LanguageDetector
from toolchain.common.raw_file_indexer import RawFileIndexer
from toolchain.common.templates import OutputPathTemplate
from toolchain.common.toolchain_error import ToolchainError
from toolchain.docalign.document_aligner import DocumentAligner
from toolchain.extractors.editable_text_extractor import EditableTextExtractor
from toolchain.extractors.pdf_text_extractor import PdfTextExtractor
from toolchain.extractors.plain_text_extractor import PlainTextExtractor
from toolchain.normalizer.unicode_normalizer import UnicodeNormalizer
from toolchain.sentalign.sentence_aligner import SentenceAligner
from toolchain.splitters.editable_sentence_splitter import EditableSentenceSplitter
from toolchain.splitters.pdf_sentence_splitter import PdfSentenceSplitter
from toolchain.toolchain_processor import ToolchainProcessor
from toolchain.writers.file_concatenator import FileConcatenator
from toolchain.writers.tmx_creator import TmxCreator

logger = logging.getLogger(__name__)
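
# The components used to parse one file type: the text extractor class, the
# external tool it invokes (None for plain text), and the sentence splitter class.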
ParserTypes = namedtuple("ParserTypes", ["extractor", "extraction_tool", "splitter"])


class DocToTmxProcessor(ToolchainProcessor):
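    """Converts a directory of mixed-format documents into a bilingual TMX file.

    Each input file is text-extracted, Unicode-normalized, language-detected,
    and sentence-split. Source/target documents are then document-aligned,
    sentence-aligned with hunalign, cleaned, concatenated, and written out as
    TMX; unmatched files can optionally be kept as monolingual corpora.
    """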

    HUNALIGN = pathlib.Path(os.environ["HUNALIGNPATH"]).resolve()
    PDFTOTEXT = pathlib.Path(os.environ["PDFTOTEXTPATH"]).resolve()
    LIBREOFFICE = pathlib.Path(os.environ["LIBREOFFICEPATH"]).resolve()

    BASE_OUTPUT_FILESTEM = "doc_{0}"

    PARSERS = {
        ".doc": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".docx": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".odt": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".pdf": ParserTypes(extractor=PdfTextExtractor, extraction_tool=PDFTOTEXT, splitter=PdfSentenceSplitter),
        ".rtf": ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
        ".txt": ParserTypes(extractor=PlainTextExtractor, extraction_tool=None, splitter=EditableSentenceSplitter),
    }
    EXTENSIONS = PARSERS.keys()

    def process(self, id, input_dir, artefact_dir, output_dir):
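        """Run the doc-to-TMX toolchain for resource `id` over `input_dir`.

        Creates the artefact and output directories if necessary, then delegates
        to process_directory(). Returns the number of rejected files and a list
        of file-info records for the produced outputs.
        """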
        logger.info("Starting Doc to TMX toolchain for input directory {0}.".format(input_dir))
        try:
            input_dirpath = pathlib.Path(input_dir).resolve()
            artefact_dirpath = pathlib.Path(artefact_dir).resolve()
            output_dirpath = pathlib.Path(output_dir).resolve()
            artefact_dirpath.mkdir(parents=True, exist_ok=True)
            output_dirpath.mkdir(parents=True, exist_ok=True)
            return self.process_directory(id, input_dirpath, artefact_dirpath, output_dirpath)
        except Exception as e:
            logger.exception(e)
            logger.error("Error processing documents: {0}.".format(e))
            raise

    def process_directory(self, id, input_basedir, artefact_basedir, output_basedir):
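        """Index, preprocess, align, clean, and write outputs for one directory.

        Returns a tuple of (number of rejected files, list of file-info records
        for the bilingual TMX and any monolingual outputs).
        """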
        langident_dir = self.create_directory(artefact_basedir, "langident")
        rejected_dir = self.create_directory(artefact_basedir, "rejected")
        docalign_dir = self.create_directory(artefact_basedir, "docalign")
        sentalign_dir = self.create_directory(artefact_basedir, "sentalign")
        clean_dir = self.create_directory(artefact_basedir, "clean")
        concatenate_dir = self.create_directory(artefact_basedir, "concatenate")
        bilingual_dir = self.create_directory(output_basedir, "bilingual")
        monolingual_dir = self.create_directory(output_basedir, "monolingual")

        index_file = artefact_basedir.joinpath(self.INDEX_FILENAME)
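        # Index every input file with a supported extension; preprocess_file()
        # reads the index back as tab-separated "index_no<TAB>path" lines.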
        RawFileIndexer().index_files(input_basedir, self.EXTENSIONS, index_file)
        logger.info("Indexed {0} to {1}.".format(input_basedir, index_file))

        additional_tmx_args = dict(self.global_tmx_args)
        additional_tmx_args["o_tmf"] = "Corpus"

        rejected = 0
        files_by_language = {}
        with open(index_file) as index:
            for line in index.read().splitlines():
                file_rejected = self.preprocess_file(line, rejected_dir, artefact_basedir, files_by_language)
                if file_rejected:
                    rejected += 1
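
        # Pair up source- and target-language documents; files that cannot be
        # paired are handled below as monolingual data (kept or rejected).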
        unaligned_file_list_src = files_by_language.get(self.lang_src, [])
        unaligned_file_list_tgt = files_by_language.get(self.lang_tgt, [])
        aligned_document_pairs, unmatched_list_src, unmatched_list_tgt = DocumentAligner(self.docalign_config).align(
            unaligned_file_list_src, unaligned_file_list_tgt, docalign_dir)
        monolingual_file_info_src = self.resolve_unmatched_files(id, unmatched_list_src, self.lang_src, clean_dir,
            rejected_dir, monolingual_dir, self.keep_unmatched_src, self.monolingual_filename_template_source)
        monolingual_file_info_tgt = self.resolve_unmatched_files(id, unmatched_list_tgt, self.lang_tgt, clean_dir,
            rejected_dir, monolingual_dir, self.keep_unmatched_tgt, self.monolingual_filename_template_target)

        cleaned_file_list_src = []
        cleaned_file_list_tgt = []
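        # Sentence-align each document pair with hunalign, then apply
        # post-alignment cleaning, copying rejected segment pairs aside.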
        for pair_index, document_pair in enumerate(aligned_document_pairs):
            pair_label = "sa_{0}".format(pair_index + 1)
            path_docaligned_src = document_pair[0]
            path_docaligned_tgt = document_pair[1]
            sentalign_pair_dir = self.create_directory(sentalign_dir, pair_label)
            sentalign_pair_base_path = sentalign_pair_dir.joinpath(pair_label)
            path_sentaligned_src = pathlib.Path(OutputPathTemplate.ALIGNED.format(sentalign_pair_base_path, self.lang_src))
            path_sentaligned_tgt = pathlib.Path(OutputPathTemplate.ALIGNED.format(sentalign_pair_base_path, self.lang_tgt))
            SentenceAligner(self.HUNALIGN).align(path_docaligned_src, path_docaligned_tgt,
                path_sentaligned_src, path_sentaligned_tgt, sentalign_pair_dir, self.sentalign_config)
            logger.info("Sentence aligned to {0} and {1}.".format(path_sentaligned_src, path_sentaligned_tgt))
            clean_base_path = clean_dir.joinpath(pair_label)
            path_cleaned_src = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, self.lang_src))
            path_cleaned_tgt = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, self.lang_tgt))
            path_cleaned_rejected = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, "rejected"))
            PostAlignmentCleaner(self.lang_src, self.lang_tgt, config=self.cleaner_config).clean(
                path_sentaligned_src, path_sentaligned_tgt, path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected)
            shutil.copy(path_cleaned_rejected, rejected_dir)
            cleaned_file_list_src.append(path_cleaned_src)
            cleaned_file_list_tgt.append(path_cleaned_tgt)
            logger.info("Cleaned to {0} and {1} with rejections in {2}.".format(path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected))
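
        # Concatenate the cleaned per-pair files into one parallel corpus per
        # language; a TMX is only created if at least one translation unit survived.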
        path_concatenated_src = concatenate_dir.joinpath(OutputPathTemplate.CONCATENATED.format(self.lang_src))
        path_concatenated_tgt = concatenate_dir.joinpath(OutputPathTemplate.CONCATENATED.format(self.lang_tgt))
        FileConcatenator().concatenate(cleaned_file_list_src, path_concatenated_src)
        FileConcatenator().concatenate(cleaned_file_list_tgt, path_concatenated_tgt)
        logger.info("Concatenated to {0} and {1}.".format(path_concatenated_src, path_concatenated_tgt))

        parallel_file_sizes = FileSizeCounter().count(path_concatenated_src)
        logger.info("Counted {0} lines in file {1}.".format(parallel_file_sizes.lines, path_concatenated_src))
        parallel_file_info = None
        if parallel_file_sizes.lines > 0:
            parallel_file_info = self.create_file_info("bilingual", [self.lang_src, self.lang_tgt],
                parallel_file_sizes.lines, "translation_units")
            output_filestem = self.BASE_OUTPUT_FILESTEM.format(id)
            path_output_tmx = self.tmx_filename_template.format(bilingual_dir.joinpath(output_filestem))
            TmxCreator().create(self.tmx_template, path_concatenated_src, path_concatenated_tgt, path_output_tmx, additional_tmx_args)
            logger.info("Created TMX file at {0}.".format(path_output_tmx))
        else:
            logger.info("No parallel data found for resource {0}, skipping TMX creation.".format(id))

        return rejected, list(filter(None, [monolingual_file_info_src, monolingual_file_info_tgt, parallel_file_info]))

    def preprocess_file(self, index_line, rejected_dir, artefact_basedir, files_by_language):
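        """Extract, normalize, language-detect, and sentence-split one indexed file.

        `index_line` is a tab-separated "index_no<TAB>path" record. Files with an
        unsupported extension or an unexpected language are copied to
        `rejected_dir`; accepted files are recorded in `files_by_language`.
        Returns True if the file was rejected, False otherwise.
        """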
        try:
            index_tokens = index_line.split("\t")
            index_no = index_tokens[0]
            input_path = index_tokens[1]
            lower_extension = pathlib.Path(input_path).suffix.lower()
            parser_types = self.PARSERS.get(lower_extension)
            if not parser_types:
                shutil.copy(input_path, rejected_dir)
                logger.error("No extractor found for type {0}, skipping.".format(lower_extension))
                return True
            logger.info("Preprocessing index {0} file {1}.".format(index_no, input_path))
            artefact_dir = self.create_directory(artefact_basedir, index_no)
            artefact_basepath = artefact_dir.joinpath(index_no)
            path_extracted = OutputPathTemplate.EXTRACTED.format(artefact_basepath)
            path_normalized = OutputPathTemplate.NORMALIZED_UNKNOWN.format(artefact_basepath)
            logger.info("Selecting parsers for {0}: extractor: {1}, extraction tool: {2}, sentence splitter: {3}.".format(
                input_path, parser_types.extractor, parser_types.extraction_tool, parser_types.splitter))
            parser_types.extractor(parser_types.extraction_tool).extract(input_path, path_extracted, config=self.extractor_config)
            logger.info("Extracted to {0}.".format(path_extracted))
            UnicodeNormalizer().normalize(path_extracted, path_normalized, self.CUSTOM_CHARACTER_SUBSTITUTIONS)
            logger.info("Normalized to {0}.".format(path_normalized))
            language = LanguageDetector().detect_in_file(path_normalized, self.langdetect_config)
            logger.info("Detected {0} as language [{1}].".format(path_normalized, language))
            if language not in [self.lang_src, self.lang_tgt]:
                shutil.copy(path_normalized, rejected_dir)
                logger.info("Rejected {0} with invalid language [{1}].".format(path_normalized, language))
            else:
                path_split = OutputPathTemplate.SPLIT.format(artefact_basepath, language)
                files_by_language.setdefault(language, []).append(path_split)
                path_abbreviations = self.abbreviations_paths[language]
                parser_types.splitter().split(path_abbreviations, path_normalized, path_split)
                logger.info("Sentence-split to {0}.".format(path_split))
            return False
        except ToolchainError as te:
            shutil.copy(input_path, rejected_dir)
            logger.error("Error preprocessing file {0}, skipping: {1}.".format(input_path, te))
            return True

    def resolve_unmatched_files(self, id, file_list, lang, clean_dir, rejected_dir, monolingual_dir, keep_unmatched,
                                monolingual_filename_template):
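        """Clean and concatenate unmatched (monolingual) files, or reject them.

        When `keep_unmatched` is set, each file is cleaned and the survivors are
        concatenated into a single monolingual output file; otherwise the files
        are copied to `rejected_dir`. Returns a file-info record for the
        monolingual output, or None if nothing was kept.
        """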
        monolingual_paths = []
        for filename in file_list:
            if keep_unmatched:
                stem = pathlib.Path(filename).stem
                clean_base_path = clean_dir.joinpath(stem)
                path_cleaned_retained = OutputPathTemplate.CLEANED.format(clean_base_path, lang)
                path_cleaned_rejected = OutputPathTemplate.CLEANED.format(clean_base_path, "rejected")
                MonolingualCleaner(lang, config=self.cleaner_config).clean(filename, path_cleaned_retained, path_cleaned_rejected)
                logger.info("Cleaned monolingual file {0} to {1}.".format(filename, path_cleaned_retained))
                monolingual_paths.append(path_cleaned_retained)
                shutil.copy(path_cleaned_rejected, rejected_dir)
            else:
                shutil.copy(filename, rejected_dir)
                logger.info("Rejected unmatched file {0}.".format(filename))
        if monolingual_paths:
            concatenated_monolingual_path = monolingual_filename_template.format(monolingual_dir.joinpath(str(id)))
            FileConcatenator().concatenate(monolingual_paths, concatenated_monolingual_path)
            logger.info("Concatenated file(s) {0} to combined monolingual file at {1}.".format(str(monolingual_paths), concatenated_monolingual_path))
            file_sizes = FileSizeCounter().count(concatenated_monolingual_path)
            logger.info("Counted {0} lines and {1} words in file {2}.".format(file_sizes.lines, file_sizes.words, concatenated_monolingual_path))
            return self.create_file_info("monolingual", [lang], file_sizes.words, "words")
        return None
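

# CLI entry point: run the toolchain once over a single input directory.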
if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("id", help="LR identifier")
    argparser.add_argument("input_dir", help="path to input directory")
    argparser.add_argument("artefact_dir", help="path to artefact directory")
    argparser.add_argument("output_dir", help="path to output directory")
    argparser.add_argument("config_path", help="path to config")
    args = argparser.parse_args()
    DocToTmxProcessor(args.config_path).process(args.id, args.input_dir, args.artefact_dir, args.output_dir)
    print("Output written to {0}".format(args.output_dir))