tm_to_tmx_processor.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. import argparse
  2. import logging
  3. import pathlib
  4. import shutil
  5. from collections import namedtuple
  6. from toolchain.cleaners.post_alignment_cleaner import PostAlignmentCleaner
  7. from toolchain.common.file_size_counter import FileSizeCounter
  8. from toolchain.common.raw_file_indexer import RawFileIndexer
  9. from toolchain.common.templates import OutputPathTemplate
  10. from toolchain.common.toolchain_error import ToolchainError
  11. from toolchain.normalizer.unicode_normalizer import UnicodeNormalizer
  12. from toolchain.parsers.sdltm_parser import SdltmParser
  13. from toolchain.parsers.tmx_parser import TmxParser
  14. from toolchain.parsers.xliff_parser import XliffParser
  15. from toolchain.toolchain_processor import ToolchainProcessor
  16. from toolchain.writers.tmx_creator import TmxCreator
  17. logger = logging.getLogger(__name__)
  18. FiletypeInfo = namedtuple("FiletypeInfo", ["parser", "o_tmf"])
  19. class TmToTmxProcessor(ToolchainProcessor):
  20. BASE_OUTPUT_FILESTEM = "tm_{0}_{1}"
  21. FILETYPE_INFOS = {
  22. ".sdltm" : FiletypeInfo(parser=SdltmParser, o_tmf="SDLTM"),
  23. ".sdlxliff" : FiletypeInfo(parser=XliffParser, o_tmf="SDLXLIFF"),
  24. ".tmx" : FiletypeInfo(parser=TmxParser, o_tmf="TMX"),
  25. ".xlf" : FiletypeInfo(parser=XliffParser, o_tmf="XLIFF"),
  26. ".xliff" : FiletypeInfo(parser=XliffParser, o_tmf="XLIFF"),
  27. }
  28. EXTENSIONS = FILETYPE_INFOS.keys()
  29. def process(self, id, input_dir, artefact_dir, output_dir):
  30. logger.info("Starting TM to TMX toolchain for input directory {0}.".format(input_dir))
  31. input_basedir = pathlib.Path(input_dir).resolve()
  32. artefact_basedir = pathlib.Path(artefact_dir).resolve()
  33. output_basedir = pathlib.Path(output_dir).resolve()
  34. artefact_basedir.mkdir(parents=True, exist_ok=True)
  35. output_basedir.mkdir(parents=True, exist_ok=True)
  36. bilingual_dir = self.create_directory(output_basedir, "bilingual")
  37. rejected_dir = self.create_directory(artefact_basedir, "rejected")
  38. index_file = artefact_basedir.joinpath(self.INDEX_FILENAME)
  39. RawFileIndexer().index_files(input_basedir, self.EXTENSIONS, index_file)
  40. logger.info("Indexed {0} to {1}.".format(input_basedir, index_file))
  41. rejected = 0
  42. file_infos = []
  43. with open(index_file) as index:
  44. for line in index.read().splitlines():
  45. file_rejected, file_info = self.process_file(id, line, bilingual_dir, rejected_dir, artefact_basedir)
  46. if file_rejected:
  47. rejected += 1
  48. else:
  49. file_infos.append(file_info)
  50. return rejected, file_infos
  51. def process_file(self, id, index_line, output_dir, rejected_dir, artefact_basedir):
  52. index_tokens = index_line.split("\t")
  53. index_no = index_tokens[0]
  54. input_path = pathlib.Path(index_tokens[1])
  55. lower_extension = pathlib.Path(input_path).suffix.lower()
  56. filetype_info = self.FILETYPE_INFOS.get(lower_extension, None)
  57. if not filetype_info:
  58. shutil.copy(input_path, rejected_dir)
  59. logger.error("No parser found for file {0}, skipping.".format(input_path))
  60. return True, None
  61. logger.info("Processing index {0} file {1}.".format(index_no, input_path))
  62. artefact_dir = self.create_directory(artefact_basedir, index_no)
  63. output_filestem = self.BASE_OUTPUT_FILESTEM.format(id, index_no)
  64. output_basepath = output_dir.joinpath(output_filestem)
  65. rejected_basepath = rejected_dir.joinpath(index_no)
  66. artefact_basepath = artefact_dir.joinpath(index_no)
  67. path_parsed_src = pathlib.Path(OutputPathTemplate.PARSED.format(artefact_basepath, self.lang_src))
  68. path_parsed_tgt = pathlib.Path(OutputPathTemplate.PARSED.format(artefact_basepath, self.lang_tgt))
  69. path_normalized_src = pathlib.Path(OutputPathTemplate.NORMALIZED_KNOWN.format(artefact_basepath, self.lang_src))
  70. path_normalized_tgt = pathlib.Path(OutputPathTemplate.NORMALIZED_KNOWN.format(artefact_basepath, self.lang_tgt))
  71. path_cleaned_src = pathlib.Path(OutputPathTemplate.CLEANED.format(artefact_basepath, self.lang_src))
  72. path_cleaned_tgt = pathlib.Path(OutputPathTemplate.CLEANED.format(artefact_basepath, self.lang_tgt))
  73. path_cleaned_rejected = pathlib.Path(OutputPathTemplate.CLEANED.format(rejected_basepath, "rejected"))
  74. path_output_tmx = pathlib.Path(self.tmx_filename_template.format(output_basepath))
  75. parser = filetype_info.parser
  76. logger.info("Selecting parser for {0}: {1}.".format(input_path, parser))
  77. additional_tmx_args = dict(self.global_tmx_args)
  78. additional_tmx_args["o_tmf"] = filetype_info.o_tmf
  79. try:
  80. parser(self.lang_src, self.lang_tgt).parse(str(input_path), path_parsed_src, path_parsed_tgt)
  81. logger.info("Parsed to {0} and {1}.".format(path_parsed_src, path_parsed_tgt))
  82. if path_parsed_src.stat().st_size == 0 or path_parsed_tgt.stat().st_size == 0:
  83. shutil.copy(input_path, rejected_dir)
  84. logger.info("Rejected {0} and {1} with no segments found in source or target.".format(path_parsed_src, path_parsed_tgt))
  85. else:
  86. UnicodeNormalizer().normalize(path_parsed_src, path_normalized_src, self.CUSTOM_CHARACTER_SUBSTITUTIONS)
  87. logger.info("Normalized source to {0}.".format(path_normalized_src))
  88. UnicodeNormalizer().normalize(path_parsed_tgt, path_normalized_tgt, self.CUSTOM_CHARACTER_SUBSTITUTIONS)
  89. logger.info("Normalized target to {0}.".format(path_normalized_tgt))
  90. PostAlignmentCleaner(self.lang_src, self.lang_tgt, config=self.cleaner_config).clean(
  91. path_normalized_src, path_normalized_tgt, path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected)
  92. logger.info("Cleaned to {0} and {1} with rejections in {2}.".format(path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected))
  93. TmxCreator().create(self.tmx_template, path_cleaned_src, path_cleaned_tgt, path_output_tmx, additional_tmx_args)
  94. logger.info("Created TMX file at {0}.".format(path_output_tmx))
  95. file_sizes = FileSizeCounter().count(path_cleaned_src)
  96. logger.info("Counted {0} lines in file {1}.".format(file_sizes.lines, path_cleaned_src))
  97. return False, self.create_file_info("bilingual", [self.lang_src, self.lang_tgt], file_sizes.lines, "translation_units")
  98. except ToolchainError as te:
  99. shutil.copy(input_path, rejected_dir)
  100. logger.error("Error processing file {0}, skipping.".format(input_path))
  101. return True, None
  102. if __name__ == "__main__":
  103. argparser = argparse.ArgumentParser()
  104. argparser.add_argument("id", help="LR identifier")
  105. argparser.add_argument("input_dir", help="path to input directory")
  106. argparser.add_argument("artefact_dir", help="path to artefact directory")
  107. argparser.add_argument("output_dir", help="path to output directory")
  108. argparser.add_argument("config_path", help="path to config")
  109. args = argparser.parse_args()
  110. TmToTmxProcessor(args.config_path).process(args.id, args.input_dir, args.artefact_dir, args.output_dir)
  111. print("Output written to {0}".format(args.output_dir))