123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081 |
- from abc import ABC, abstractmethod
- import configparser
- import pathlib
class ToolchainProcessor(ABC):
    """Abstract base class for toolchain processors driven by an INI file.

    The constructor reads a configuration file and exposes the values as
    instance attributes; concrete subclasses implement :meth:`process`.
    The class also offers two small helpers: one that creates a directory
    and one that builds a file-description dictionary.
    """

    # Name of an index file. It is not referenced inside this class, so it
    # is presumably consumed by subclasses or callers (verify against them).
    INDEX_FILENAME = "index.txt"

    # Pairs of (C1 control code point, replacement character). Each
    # replacement is the character Windows-1252 assigns to the same byte
    # value. The table is not applied inside this class; presumably a text
    # cleaning step elsewhere uses it to repair mis-decoded text.
    CUSTOM_CHARACTER_SUBSTITUTIONS = [
        ("\u0080", "€"),
        ("\u0091", "‘"),
        ("\u0092", "’"),
        ("\u0093", "“"),
        ("\u0094", "”"),
        ("\u0095", "•"),
        ("\u0096", "–"),
        ("\u0097", "—"),
    ]

    def __init__(self, config_path, additional_tmx_args=None):
        """Load the configuration and populate the instance attributes.

        Args:
            config_path: Path of the INI file to read. It is parsed with
                ``configparser.ExtendedInterpolation``.
            additional_tmx_args: Optional mapping merged into
                ``self.global_tmx_args`` after the values taken from the
                configuration, so its entries win on key collisions.
                ``None`` (the default) adds nothing.

        Raises:
            KeyError: If a section or option accessed with ``config[...]``
                is missing. ``ConfigParser.read`` silently skips files it
                cannot open, so a wrong ``config_path`` also surfaces here.
            configparser.NoSectionError, configparser.NoOptionError: If one
                of the ``[Output]`` options is missing.
        """
        config = configparser.ConfigParser(
            interpolation=configparser.ExtendedInterpolation()
        )
        config.read(config_path)

        languages = config["Languages"]
        tmx = config["TMX"]

        self.lang_src = languages["srclang"]
        self.lang_tgt = languages["tgtlang"]
        self.keep_unmatched_src = config.getboolean(
            "Languages", "keep_unmatched_src", fallback=False
        )
        self.keep_unmatched_tgt = config.getboolean(
            "Languages", "keep_unmatched_tgt", fallback=False
        )

        self.monolingual_filename_template_source = config.get(
            "Output", "monolingual_filename_template_source"
        )
        self.monolingual_filename_template_target = config.get(
            "Output", "monolingual_filename_template_target"
        )
        self.tmx_filename_template = config.get(
            "Output", "parallel_filename_template"
        )

        # resolve() makes the path absolute; a relative value in the
        # configuration is interpreted against the current working directory.
        self.tmx_template = pathlib.Path(tmx["template"]).resolve()

        self.global_tmx_args = {
            "srclang": self.lang_src,
            "tgtlang": self.lang_tgt,
            "adminlang": languages["adminlang"],
            "datatype": tmx["datatype"],
            "disclaimer": tmx["disclaimer"],
            "distributor": config["Common"]["project_name"],
            "segtype": tmx["segtype"],
        }
        # A None sentinel replaces the former ``{}`` default so that no
        # dictionary object is shared between calls.
        if additional_tmx_args is not None:
            self.global_tmx_args.update(additional_tmx_args)

        # Plain-dict snapshot of the [Abbreviations] section: option name
        # (used as the language key) -> configured path string.
        self.abbreviations_paths = dict(config["Abbreviations"].items())

        # The remaining sections are kept as section proxies and are not
        # interpreted here.
        self.langdetect_config = config["Langdetect"]
        self.docalign_config = config["Docalign"]
        self.sentalign_config = config["Sentalign"]
        self.cleaner_config = config["TextCleaning"]
        self.extractor_config = config["TextExtraction"]

    @abstractmethod
    def process(self, id, input_dir, artefact_dir, output_dir):
        """Run the processor; must be implemented by concrete subclasses.

        The base class attaches no behaviour or meaning to the parameters;
        refer to the implementing subclass for their contract.
        """
        pass

    def create_directory(self, basedir, name):
        """Create ``basedir / name`` if needed and return it.

        Args:
            basedir: Parent directory; must provide ``joinpath`` (for
                example a ``pathlib.Path``).
            name: Name or relative path of the directory to create.

        Returns:
            The path of the directory. Missing parents are created and an
            already existing directory is not an error.
        """
        target = basedir.joinpath(name)
        target.mkdir(parents=True, exist_ok=True)
        return target

    def create_file_info(self, linguality_type, languages, size, size_unit):
        """Build a dictionary describing an output file.

        Args:
            linguality_type: ``"bilingual"`` or ``"monolingual"``; selects
                the values of ``multilinguality_type`` and ``format``.
            languages: Stored unchanged under ``"languages"``.
            size: Stored unchanged under ``"size"``.
            size_unit: Stored unchanged under ``"size_unit"``.

        Returns:
            A dict with the keys ``linguality_type``, ``languages``,
            ``size``, ``size_unit`` and ``encoding`` (always ``"utf8"``).
            For the two known linguality types it also contains
            ``multilinguality_type`` and ``format``; for any other value
            those two keys are absent.
        """
        file_info = {
            "linguality_type": linguality_type,
            "languages": languages,
            "size": size,
            "size_unit": size_unit,
            "encoding": "utf8",
        }
        if linguality_type == "bilingual":
            file_info["multilinguality_type"] = "parallel"
            file_info["format"] = "application/x-tmx+xml"
        elif linguality_type == "monolingual":
            file_info["multilinguality_type"] = "not_applicable"
            file_info["format"] = "text/plain"
        return file_info
|