toolchain_processor.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081
  1. from abc import ABC, abstractmethod
  2. import configparser
  3. import pathlib
  4. class ToolchainProcessor(ABC):
  5. INDEX_FILENAME = "index.txt"
  6. CUSTOM_CHARACTER_SUBSTITUTIONS = [
  7. ("\u0080", "€"),
  8. ("\u0091", "‘"),
  9. ("\u0092", "’"),
  10. ("\u0093", "“"),
  11. ("\u0094", "”"),
  12. ("\u0095", "•"),
  13. ("\u0096", "–"),
  14. ("\u0097", "—"),
  15. ]
  16. def __init__(self, config_path, additional_tmx_args={}):
  17. config = configparser.ConfigParser(interpolation=configparser.ExtendedInterpolation())
  18. config.read(config_path)
  19. self.lang_src = config["Languages"]["srclang"]
  20. self.lang_tgt = config["Languages"]["tgtlang"]
  21. self.keep_unmatched_src = config.getboolean("Languages", "keep_unmatched_src", fallback=False)
  22. self.keep_unmatched_tgt = config.getboolean("Languages", "keep_unmatched_tgt", fallback=False)
  23. self.monolingual_filename_template_source = config.get("Output", "monolingual_filename_template_source")
  24. self.monolingual_filename_template_target = config.get("Output", "monolingual_filename_template_target")
  25. self.tmx_filename_template = config.get("Output", "parallel_filename_template")
  26. self.tmx_template = pathlib.Path(config["TMX"]["template"]).resolve()
  27. self.global_tmx_args = {
  28. "srclang" : self.lang_src,
  29. "tgtlang" : self.lang_tgt,
  30. "adminlang" : config["Languages"]["adminlang"],
  31. "datatype" : config["TMX"]["datatype"],
  32. "disclaimer" : config["TMX"]["disclaimer"],
  33. "distributor" : config["Common"]["project_name"],
  34. "segtype" : config["TMX"]["segtype"],
  35. }
  36. self.global_tmx_args.update(additional_tmx_args)
  37. self.abbreviations_paths = {}
  38. for lang, abbreviations_path in config["Abbreviations"].items():
  39. self.abbreviations_paths[lang] = abbreviations_path
  40. self.langdetect_config = config["Langdetect"]
  41. self.docalign_config = config["Docalign"]
  42. self.sentalign_config = config["Sentalign"]
  43. self.cleaner_config = config["TextCleaning"]
  44. self.extractor_config = config["TextExtraction"]
  45. @abstractmethod
  46. def process(self, id, input_dir, artefact_dir, output_dir):
  47. pass
  48. def create_directory(self, basedir, name):
  49. dir = basedir.joinpath(name)
  50. dir.mkdir(parents=True, exist_ok=True)
  51. return dir
  52. def create_file_info(self, linguality_type, languages, size, size_unit):
  53. file_info = {
  54. "linguality_type" : linguality_type,
  55. "languages" : languages,
  56. "size" : size,
  57. "size_unit" : size_unit,
  58. "encoding" : "utf8",
  59. }
  60. if linguality_type == "bilingual":
  61. file_info["multilinguality_type"] = "parallel"
  62. file_info["format"] = "application/x-tmx+xml"
  63. elif linguality_type == "monolingual":
  64. file_info["multilinguality_type"] = "not_applicable"
  65. file_info["format"] = "text/plain"
  66. return file_info