# monolingual_cleaner.py
  1. import argparse
  2. import logging
  3. from toolchain.common.language_detector import LanguageDetector
  4. logger = logging.getLogger(__name__)
  5. class MonolingualCleaner:
  6. DEFAULT_LANGUAGE_DETECTION_THRESHOLD = 40
  7. DEFAULT_REJECTED_LINE_DELIMITER = "@@@"
  8. REJECTION_EMPTY = "empty_segment"
  9. REJECTION_UNEXPECTED_LANGUAGE = "unexpected_language_[{0}]"
  10. def __init__(self, lang, config={}, language_detector=LanguageDetector()):
  11. self.lang = lang
  12. self.language_detector = language_detector
  13. self.language_detection_threshold = int(config.get("language_detection_threshold", self.DEFAULT_LANGUAGE_DETECTION_THRESHOLD))
  14. rejected_line_delimiter = config.get("rejected_line_delimiter", self.DEFAULT_REJECTED_LINE_DELIMITER)
  15. self.rejected_line_template = rejected_line_delimiter.join(["{0}", "{1}"])
  16. def clean(self, input_path, output_path_retained, output_path_rejected):
  17. logger.info("Cleaning {0} to {1} with rejections to {2}.".format(input_path, output_path_retained, output_path_rejected))
  18. with open(input_path) as input, open(output_path_retained, "w") as output_retained, open(output_path_rejected, "w") as output_rejected:
  19. self.clean_text(input, output_retained, output_rejected)
  20. def clean_text(self, input, output_retained, output_rejected):
  21. for line in input:
  22. term = line.rstrip("\n")
  23. should_include, message = self.should_include(term.strip())
  24. if should_include:
  25. self.write_file_line(output_retained, term)
  26. else:
  27. self.write_file_line(output_rejected, message)
  28. def should_include(self, term):
  29. if not term:
  30. message = self.rejected_line_template.format(self.REJECTION_EMPTY, term)
  31. return False, message
  32. if len(term) >= self.language_detection_threshold:
  33. detected_lang = self.language_detector.detect(term)
  34. if detected_lang != self.lang:
  35. reason = self.REJECTION_UNEXPECTED_LANGUAGE.format(detected_lang)
  36. message = self.rejected_line_template.format(reason, term)
  37. return False, message
  38. return True, ""
  39. def write_file_line(self, file, text):
  40. file.write(text + "\n")
  41. if __name__ == "__main__":
  42. argparser = argparse.ArgumentParser()
  43. argparser.add_argument("lang", help="language code")
  44. argparser.add_argument("input_path", help="path to input file")
  45. argparser.add_argument("output_path_retained", help="path to output file")
  46. argparser.add_argument("output_path_rejected", help="path to output rejection file")
  47. argparser.add_argument("--langdetect_threshold", type=int, default=40, help="check language of only lines of this number of characters or more")
  48. argparser.add_argument("--rejected_line_delimiter", type=str, default="@@@", help="string to use to delimit fields of rejection lines")
  49. args = argparser.parse_args()
  50. config = {
  51. "language_detection_threshold" : args.langdetect_threshold,
  52. "rejected_line_delimiter" : args.rejected_line_delimiter,
  53. }
  54. MonolingualCleaner(args.lang, config=config).clean(args.input_path, args.output_path_retained, args.output_path_rejected)