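"""Convert translation memory files (SDLTM, SDLXLIFF, TMX, XLIFF) into cleaned bilingual TMX.

Each indexed input file is parsed into source and target segments, Unicode-normalized,
cleaned, and written out as a TMX file; rejected files and intermediate artefacts are
kept in the artefact directory.
"""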
import argparse
import logging
import pathlib
import shutil
from collections import namedtuple

from toolchain.cleaners.post_alignment_cleaner import PostAlignmentCleaner
from toolchain.common.file_size_counter import FileSizeCounter
from toolchain.common.raw_file_indexer import RawFileIndexer
from toolchain.common.templates import OutputPathTemplate
from toolchain.common.toolchain_error import ToolchainError
from toolchain.normalizer.unicode_normalizer import UnicodeNormalizer
from toolchain.parsers.sdltm_parser import SdltmParser
from toolchain.parsers.tmx_parser import TmxParser
from toolchain.parsers.xliff_parser import XliffParser
from toolchain.toolchain_processor import ToolchainProcessor
from toolchain.writers.tmx_creator import TmxCreator

logger = logging.getLogger(__name__)

FiletypeInfo = namedtuple("FiletypeInfo", ["parser", "o_tmf"])

class TmToTmxProcessor(ToolchainProcessor):
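    """Runs the parse, normalize, clean and TMX-creation steps for each indexed TM file."""
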
    BASE_OUTPUT_FILESTEM = "tm_{0}_{1}"

    FILETYPE_INFOS = {
        ".sdltm": FiletypeInfo(parser=SdltmParser, o_tmf="SDLTM"),
        ".sdlxliff": FiletypeInfo(parser=XliffParser, o_tmf="SDLXLIFF"),
        ".tmx": FiletypeInfo(parser=TmxParser, o_tmf="TMX"),
        ".xlf": FiletypeInfo(parser=XliffParser, o_tmf="XLIFF"),
        ".xliff": FiletypeInfo(parser=XliffParser, o_tmf="XLIFF"),
    }

    EXTENSIONS = FILETYPE_INFOS.keys()

    def process(self, id, input_dir, artefact_dir, output_dir):
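        """Index all supported TM files in input_dir and convert each one.

        Returns a tuple of (number of rejected files, list of file info records
        for the successfully converted files).
        """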
- logger.info("Starting TM to TMX toolchain for input directory {0}.".format(input_dir))
- input_basedir = pathlib.Path(input_dir).resolve()
- artefact_basedir = pathlib.Path(artefact_dir).resolve()
- output_basedir = pathlib.Path(output_dir).resolve()
- artefact_basedir.mkdir(parents=True, exist_ok=True)
- output_basedir.mkdir(parents=True, exist_ok=True)
- bilingual_dir = self.create_directory(output_basedir, "bilingual")
- rejected_dir = self.create_directory(artefact_basedir, "rejected")
- index_file = artefact_basedir.joinpath(self.INDEX_FILENAME)
- RawFileIndexer().index_files(input_basedir, self.EXTENSIONS, index_file)
- logger.info("Indexed {0} to {1}.".format(input_basedir, index_file))
- rejected = 0
- file_infos = []
- with open(index_file) as index:
- for line in index.read().splitlines():
- file_rejected, file_info = self.process_file(id, line, bilingual_dir, rejected_dir, artefact_basedir)
- if file_rejected:
- rejected += 1
- else:
- file_infos.append(file_info)
- return rejected, file_infos

    def process_file(self, id, index_line, output_dir, rejected_dir, artefact_basedir):
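        """Parse, normalize, clean and convert a single indexed TM file to TMX.

        Returns (True, None) if the file was rejected, otherwise (False, file_info).
        """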
        index_tokens = index_line.split("\t")
        index_no = index_tokens[0]
        input_path = pathlib.Path(index_tokens[1])

        lower_extension = input_path.suffix.lower()
        filetype_info = self.FILETYPE_INFOS.get(lower_extension, None)
        if not filetype_info:
            shutil.copy(input_path, rejected_dir)
            logger.error("No parser found for file {0}, skipping.".format(input_path))
            return True, None

        logger.info("Processing index {0} file {1}.".format(index_no, input_path))
        artefact_dir = self.create_directory(artefact_basedir, index_no)
        output_filestem = self.BASE_OUTPUT_FILESTEM.format(id, index_no)
        output_basepath = output_dir.joinpath(output_filestem)
        rejected_basepath = rejected_dir.joinpath(index_no)
        artefact_basepath = artefact_dir.joinpath(index_no)

        path_parsed_src = pathlib.Path(OutputPathTemplate.PARSED.format(artefact_basepath, self.lang_src))
        path_parsed_tgt = pathlib.Path(OutputPathTemplate.PARSED.format(artefact_basepath, self.lang_tgt))
        path_normalized_src = pathlib.Path(OutputPathTemplate.NORMALIZED_KNOWN.format(artefact_basepath, self.lang_src))
        path_normalized_tgt = pathlib.Path(OutputPathTemplate.NORMALIZED_KNOWN.format(artefact_basepath, self.lang_tgt))
        path_cleaned_src = pathlib.Path(OutputPathTemplate.CLEANED.format(artefact_basepath, self.lang_src))
        path_cleaned_tgt = pathlib.Path(OutputPathTemplate.CLEANED.format(artefact_basepath, self.lang_tgt))
        path_cleaned_rejected = pathlib.Path(OutputPathTemplate.CLEANED.format(rejected_basepath, "rejected"))
        path_output_tmx = pathlib.Path(self.tmx_filename_template.format(output_basepath))

        parser = filetype_info.parser
        logger.info("Selecting parser for {0}: {1}.".format(input_path, parser))
        additional_tmx_args = dict(self.global_tmx_args)
        additional_tmx_args["o_tmf"] = filetype_info.o_tmf

        try:
            parser(self.lang_src, self.lang_tgt).parse(str(input_path), path_parsed_src, path_parsed_tgt)
            logger.info("Parsed to {0} and {1}.".format(path_parsed_src, path_parsed_tgt))
            if path_parsed_src.stat().st_size == 0 or path_parsed_tgt.stat().st_size == 0:
                # Reject files that yield no segments on either side.
                shutil.copy(input_path, rejected_dir)
                logger.info("Rejected {0} and {1}: no segments found in source or target.".format(path_parsed_src, path_parsed_tgt))
                return True, None
            UnicodeNormalizer().normalize(path_parsed_src, path_normalized_src, self.CUSTOM_CHARACTER_SUBSTITUTIONS)
            logger.info("Normalized source to {0}.".format(path_normalized_src))
            UnicodeNormalizer().normalize(path_parsed_tgt, path_normalized_tgt, self.CUSTOM_CHARACTER_SUBSTITUTIONS)
            logger.info("Normalized target to {0}.".format(path_normalized_tgt))
            PostAlignmentCleaner(self.lang_src, self.lang_tgt, config=self.cleaner_config).clean(
                path_normalized_src, path_normalized_tgt, path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected)
            logger.info("Cleaned to {0} and {1} with rejections in {2}.".format(path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected))
            TmxCreator().create(self.tmx_template, path_cleaned_src, path_cleaned_tgt, path_output_tmx, additional_tmx_args)
            logger.info("Created TMX file at {0}.".format(path_output_tmx))
            file_sizes = FileSizeCounter().count(path_cleaned_src)
            logger.info("Counted {0} lines in file {1}.".format(file_sizes.lines, path_cleaned_src))
            return False, self.create_file_info("bilingual", [self.lang_src, self.lang_tgt], file_sizes.lines, "translation_units")
        except ToolchainError as te:
            shutil.copy(input_path, rejected_dir)
            logger.error("Error processing file {0}: {1}. Skipping.".format(input_path, te))
            return True, None

if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("id", help="LR identifier")
    argparser.add_argument("input_dir", help="path to input directory")
    argparser.add_argument("artefact_dir", help="path to artefact directory")
    argparser.add_argument("output_dir", help="path to output directory")
    argparser.add_argument("config_path", help="path to config")
    args = argparser.parse_args()

    TmToTmxProcessor(args.config_path).process(args.id, args.input_dir, args.artefact_dir, args.output_dir)
    print("Output written to {0}".format(args.output_dir))