12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273 |
- import argparse
- import logging
- from toolchain.common.language_detector import LanguageDetector
- logger = logging.getLogger(__name__)
class MonolingualCleaner:
    """Split monolingual text into retained lines and rejected lines.

    A line is rejected when it is empty after stripping surrounding
    whitespace, or when its stripped form is at least
    ``language_detection_threshold`` characters long and the language detector
    reports a language other than ``lang``. Lines shorter than the threshold
    are retained without a language check.

    Retained lines are written with only their trailing newline normalised;
    rejected lines are written as ``<reason><delimiter><stripped text>``.
    """

    DEFAULT_LANGUAGE_DETECTION_THRESHOLD = 40
    DEFAULT_REJECTED_LINE_DELIMITER = "@@@"
    REJECTION_EMPTY = "empty_segment"
    REJECTION_UNEXPECTED_LANGUAGE = "unexpected_language_[{0}]"

    def __init__(self, lang, config=None, language_detector=None):
        """Configure the cleaner.

        Args:
            lang: Value the detector's result is compared against; lines whose
                detected language differs are rejected.
            config: Optional mapping. Recognised keys are
                ``"language_detection_threshold"`` (converted with ``int``)
                and ``"rejected_line_delimiter"``. Missing keys fall back to
                the class defaults.
            language_detector: Object exposing ``detect(text)``. When omitted,
                a new ``LanguageDetector`` is created for this instance.
        """
        # Defaults are resolved per call: building them in the signature would
        # run LanguageDetector() at import time and share one detector (and one
        # dict) across every instance.
        if config is None:
            config = {}
        if language_detector is None:
            language_detector = LanguageDetector()

        self.lang = lang
        self.language_detector = language_detector
        self.language_detection_threshold = int(
            config.get("language_detection_threshold", self.DEFAULT_LANGUAGE_DETECTION_THRESHOLD)
        )

        rejected_line_delimiter = config.get(
            "rejected_line_delimiter", self.DEFAULT_REJECTED_LINE_DELIMITER
        )
        # The delimiter becomes part of a str.format template, so literal
        # braces in it must be doubled or formatting a rejection would fail.
        escaped_delimiter = rejected_line_delimiter.replace("{", "{{").replace("}", "}}")
        self.rejected_line_template = escaped_delimiter.join(["{0}", "{1}"])

    def clean(self, input_path, output_path_retained, output_path_rejected):
        """Clean the file at ``input_path`` into two output files.

        Args:
            input_path: Path of the text file to read.
            output_path_retained: Path that receives the retained lines
                (overwritten).
            output_path_rejected: Path that receives the rejection records
                (overwritten).
        """
        logger.info(
            "Cleaning %s to %s with rejections to %s.",
            input_path,
            output_path_retained,
            output_path_rejected,
        )
        # Explicit UTF-8 keeps results independent of the platform locale.
        with open(input_path, encoding="utf-8") as input_file, \
                open(output_path_retained, "w", encoding="utf-8") as retained_file, \
                open(output_path_rejected, "w", encoding="utf-8") as rejected_file:
            self.clean_text(input_file, retained_file, rejected_file)

    def clean_text(self, input, output_retained, output_rejected):
        """Route each line of ``input`` to the retained or rejected stream.

        Args:
            input: Iterable of text lines.
            output_retained: Writable text stream for retained lines.
            output_rejected: Writable text stream for rejection records.
        """
        for line in input:
            term = line.rstrip("\n")
            # The decision is made on the stripped text, but a retained line
            # keeps its original surrounding whitespace.
            should_include, message = self.should_include(term.strip())
            if should_include:
                self.write_file_line(output_retained, term)
            else:
                self.write_file_line(output_rejected, message)

    def should_include(self, term):
        """Decide whether ``term`` is retained.

        Args:
            term: Line text; ``clean_text`` passes it already stripped.

        Returns:
            ``(True, "")`` when the term is retained, otherwise
            ``(False, message)`` where ``message`` is the rejection record.
        """
        if not term:
            return False, self.rejected_line_template.format(self.REJECTION_EMPTY, term)

        # Terms below the threshold skip language detection entirely.
        if len(term) >= self.language_detection_threshold:
            detected_lang = self.language_detector.detect(term)
            if detected_lang != self.lang:
                reason = self.REJECTION_UNEXPECTED_LANGUAGE.format(detected_lang)
                return False, self.rejected_line_template.format(reason, term)

        return True, ""

    def write_file_line(self, file, text):
        """Write ``text`` followed by a newline to ``file``."""
        file.write(text + "\n")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, build a cleaner from
    # them, and run it over the given files.
    parser = argparse.ArgumentParser()

    # Required positional arguments, in the order they appear on the command line.
    positional_arguments = (
        ("lang", "language code"),
        ("input_path", "path to input file"),
        ("output_path_retained", "path to output file"),
        ("output_path_rejected", "path to output rejection file"),
    )
    for argument_name, argument_help in positional_arguments:
        parser.add_argument(argument_name, help=argument_help)

    parser.add_argument(
        "--langdetect_threshold",
        type=int,
        default=40,
        help="check language of only lines of this number of characters or more",
    )
    parser.add_argument(
        "--rejected_line_delimiter",
        type=str,
        default="@@@",
        help="string to use to delimit fields of rejection lines",
    )
    options = parser.parse_args()

    cleaner = MonolingualCleaner(
        options.lang,
        config={
            "language_detection_threshold": options.langdetect_threshold,
            "rejected_line_delimiter": options.rejected_line_delimiter,
        },
    )
    cleaner.clean(
        options.input_path,
        options.output_path_retained,
        options.output_path_rejected,
    )
|