import argparse
import logging

from toolchain.common.language_detector import LanguageDetector

logger = logging.getLogger(__name__)


class MonolingualCleaner:
    """Split a line-oriented text file into retained and rejected lines.

    A line is rejected when it is empty after stripping whitespace, or when
    it is at least ``language_detection_threshold`` characters long and the
    language detector reports something other than ``lang``.
    """

    DEFAULT_LANGUAGE_DETECTION_THRESHOLD = 40
    DEFAULT_REJECTED_LINE_DELIMITER = "@@@"
    REJECTION_EMPTY = "empty_segment"
    REJECTION_UNEXPECTED_LANGUAGE = "unexpected_language_[{0}]"

    def __init__(self, lang, config=None, language_detector=None):
        """Configure the cleaner.

        Args:
            lang: Value the detector's result is compared against.
            config: Optional mapping. Recognised keys are
                ``"language_detection_threshold"`` (converted with ``int``)
                and ``"rejected_line_delimiter"``. Missing keys fall back to
                the ``DEFAULT_*`` class constants.
            language_detector: Object exposing ``detect(text)``. When omitted,
                a new ``LanguageDetector()`` is created for this instance.
        """
        # Defaults are resolved here rather than in the signature so that no
        # dict or detector instance is created at import time and shared
        # between cleaners.
        if config is None:
            config = {}
        if language_detector is None:
            language_detector = LanguageDetector()

        self.lang = lang
        self.language_detector = language_detector
        self.language_detection_threshold = int(
            config.get(
                "language_detection_threshold",
                self.DEFAULT_LANGUAGE_DETECTION_THRESHOLD,
            )
        )
        rejected_line_delimiter = config.get(
            "rejected_line_delimiter", self.DEFAULT_REJECTED_LINE_DELIMITER
        )
        # The delimiter becomes part of a str.format template, so literal
        # braces in it must be doubled or .format() would treat them as
        # replacement fields.
        escaped_delimiter = rejected_line_delimiter.replace("{", "{{").replace("}", "}}")
        self.rejected_line_template = escaped_delimiter.join(["{0}", "{1}"])

    def clean(self, input_path, output_path_retained, output_path_rejected):
        """Clean the file at ``input_path`` into two output files.

        Args:
            input_path: Path of the text file to read.
            output_path_retained: Path written with the retained lines.
            output_path_rejected: Path written with the rejection records.
        """
        logger.info("Cleaning {0} to {1} with rejections to {2}.".format(input_path, output_path_retained, output_path_rejected))
        # Explicit UTF-8 keeps results independent of the platform locale.
        with open(input_path, encoding="utf-8") as input, \
                open(output_path_retained, "w", encoding="utf-8") as output_retained, \
                open(output_path_rejected, "w", encoding="utf-8") as output_rejected:
            self.clean_text(input, output_retained, output_rejected)

    def clean_text(self, input, output_retained, output_rejected):
        """Route every line of ``input`` to one of the two outputs.

        Args:
            input: Iterable of text lines.
            output_retained: Writable object receiving retained lines.
            output_rejected: Writable object receiving rejection records.
        """
        for line in input:
            term = line.rstrip("\n")
            # The decision is made on the whitespace-stripped text, but a
            # retained line is written with its original surrounding
            # whitespace (only the trailing newline removed).
            should_include, message = self.should_include(term.strip())
            if should_include:
                self.write_file_line(output_retained, term)
            else:
                self.write_file_line(output_rejected, message)

    def should_include(self, term):
        """Decide whether ``term`` is kept.

        Args:
            term: Text to evaluate.

        Returns:
            A ``(keep, message)`` tuple. ``message`` is ``""`` when ``keep``
            is true; otherwise it is the rejection reason and ``term`` joined
            by the configured delimiter.
        """
        if not term:
            message = self.rejected_line_template.format(self.REJECTION_EMPTY, term)
            return False, message
        # Terms shorter than the threshold skip language detection entirely.
        if len(term) >= self.language_detection_threshold:
            detected_lang = self.language_detector.detect(term)
            if detected_lang != self.lang:
                reason = self.REJECTION_UNEXPECTED_LANGUAGE.format(detected_lang)
                message = self.rejected_line_template.format(reason, term)
                return False, message
        return True, ""

    def write_file_line(self, file, text):
        """Write ``text`` followed by a newline to ``file``."""
        file.write(text + "\n")


if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument("lang", help="language code")
    argparser.add_argument("input_path", help="path to input file")
    argparser.add_argument("output_path_retained", help="path to output file")
    argparser.add_argument("output_path_rejected", help="path to output rejection file")
    # CLI defaults come from the class constants so the two cannot drift.
    argparser.add_argument(
        "--langdetect_threshold",
        type=int,
        default=MonolingualCleaner.DEFAULT_LANGUAGE_DETECTION_THRESHOLD,
        help="check language of only lines of this number of characters or more",
    )
    argparser.add_argument(
        "--rejected_line_delimiter",
        type=str,
        default=MonolingualCleaner.DEFAULT_REJECTED_LINE_DELIMITER,
        help="string to use to delimit fields of rejection lines",
    )
    args = argparser.parse_args()

    config = {
        "language_detection_threshold": args.langdetect_threshold,
        "rejected_line_delimiter": args.rejected_line_delimiter,
    }
    MonolingualCleaner(args.lang, config=config).clean(
        args.input_path, args.output_path_retained, args.output_path_rejected
    )