from abc import ABC, abstractmethod
import configparser
import pathlib


class ToolchainProcessor(ABC):
    """Abstract base for toolchain processors configured from an INI file.

    The constructor reads language, output, TMX and per-stage settings from a
    ``configparser`` file and exposes them as attributes. Subclasses implement
    :meth:`process`; this base class only supplies configuration loading and
    two small helpers.
    """

    INDEX_FILENAME = "index.txt"

    # Pairs of (C1 control code point, replacement character). The replacements
    # are the characters Windows-1252 assigns to the same byte values.
    # NOTE(review): presumably used to repair text that was decoded with the
    # wrong 8-bit codec; the code applying these pairs is not in this class.
    CUSTOM_CHARACTER_SUBSTITUTIONS = [
        ("\u0080", "€"),
        ("\u0091", "‘"),
        ("\u0092", "’"),
        ("\u0093", "“"),
        ("\u0094", "”"),
        ("\u0095", "•"),
        ("\u0096", "–"),
        ("\u0097", "—"),
    ]

    def __init__(self, config_path, additional_tmx_args=None):
        """Load the toolchain configuration.

        Args:
            config_path: Path of the INI file to read. ``ConfigParser.read``
                silently skips files it cannot open, so a wrong path surfaces
                as a ``KeyError`` on the first section lookup below.
            additional_tmx_args: Optional mapping merged into
                ``global_tmx_args`` after the values taken from the config
                file, so its entries win on key clashes. ``None`` (the
                default) adds nothing.

        Raises:
            KeyError: A required section or option is missing.
            configparser.NoSectionError, configparser.NoOptionError: A
                required ``[Output]`` entry is missing.
        """
        config = configparser.ConfigParser(
            interpolation=configparser.ExtendedInterpolation()
        )
        config.read(config_path)

        languages = config["Languages"]
        tmx = config["TMX"]

        self.lang_src = languages["srclang"]
        self.lang_tgt = languages["tgtlang"]
        self.keep_unmatched_src = config.getboolean(
            "Languages", "keep_unmatched_src", fallback=False
        )
        self.keep_unmatched_tgt = config.getboolean(
            "Languages", "keep_unmatched_tgt", fallback=False
        )

        self.monolingual_filename_template_source = config.get(
            "Output", "monolingual_filename_template_source"
        )
        self.monolingual_filename_template_target = config.get(
            "Output", "monolingual_filename_template_target"
        )
        self.tmx_filename_template = config.get(
            "Output", "parallel_filename_template"
        )

        # Resolved against the current working directory when relative.
        self.tmx_template = pathlib.Path(tmx["template"]).resolve()

        self.global_tmx_args = {
            "srclang": self.lang_src,
            "tgtlang": self.lang_tgt,
            "adminlang": languages["adminlang"],
            "datatype": tmx["datatype"],
            "disclaimer": tmx["disclaimer"],
            "distributor": config["Common"]["project_name"],
            "segtype": tmx["segtype"],
        }
        # A None sentinel replaces the former mutable ``{}`` default.
        if additional_tmx_args is not None:
            self.global_tmx_args.update(additional_tmx_args)

        # Option name -> configured value; configparser lower-cases option
        # names by default.
        self.abbreviations_paths = dict(config["Abbreviations"])

        # Per-stage sections are kept as live section proxies.
        self.langdetect_config = config["Langdetect"]
        self.docalign_config = config["Docalign"]
        self.sentalign_config = config["Sentalign"]
        self.cleaner_config = config["TextCleaning"]
        self.extractor_config = config["TextExtraction"]

    @abstractmethod
    def process(self, id, input_dir, artefact_dir, output_dir):
        """Run the processor; must be implemented by subclasses.

        The parameter name ``id`` shadows the builtin but is kept because it
        is part of the interface subclasses and callers already use.
        """
        pass

    def create_directory(self, basedir, name):
        """Create ``basedir/name`` (with parents) if needed and return it.

        Args:
            basedir: Path object providing ``joinpath``.
            name: Relative name of the directory to create below ``basedir``.

        Returns:
            The path of the directory, which exists after the call.
        """
        target = basedir.joinpath(name)
        target.mkdir(parents=True, exist_ok=True)
        return target

    def create_file_info(self, linguality_type, languages, size, size_unit):
        """Build a metadata dictionary describing an output file.

        Args:
            linguality_type: ``"bilingual"`` or ``"monolingual"``; any other
                value yields a dictionary without the
                ``multilinguality_type`` and ``format`` keys.
            languages: Stored unchanged under ``"languages"``.
            size: Stored unchanged under ``"size"``.
            size_unit: Stored unchanged under ``"size_unit"``.

        Returns:
            A new dict with the given values, ``"encoding": "utf8"`` and, for
            the two known linguality types, the matching multilinguality type
            and MIME format.
        """
        file_info = {
            "linguality_type": linguality_type,
            "languages": languages,
            "size": size,
            "size_unit": size_unit,
            "encoding": "utf8",
        }
        if linguality_type == "bilingual":
            file_info["multilinguality_type"] = "parallel"
            file_info["format"] = "application/x-tmx+xml"
        elif linguality_type == "monolingual":
            file_info["multilinguality_type"] = "not_applicable"
            file_info["format"] = "text/plain"
        return file_info