123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102 |
- import io
- import unittest
- from unittest.mock import call, Mock
- from toolchain.cleaners.monolingual_cleaner import MonolingualCleaner
class TestMonolingualCleaner(unittest.TestCase):
    """Tests for ``MonolingualCleaner.clean_text`` using a mocked language detector.

    The cleaner is built for target language ``"ga"``. Retained and rejected
    lines are captured in in-memory text buffers, and every test checks both
    buffers plus the exact calls made to the detector's ``detect`` method.
    """

    def setUp(self):
        """Create the mocked detector, the output buffers and the cleaner under test."""
        self.language_detector = Mock()
        self.output_retained = io.StringIO()
        self.output_rejected = io.StringIO()
        config = {
            "language_detection_threshold": "5",
            "rejected_line_delimiter": "@@@",
        }
        self.cleaner = MonolingualCleaner("ga", config=config, language_detector=self.language_detector)

    def tearDown(self):
        """Release the in-memory output buffers."""
        self.output_retained.close()
        self.output_rejected.close()

    def test_empty(self):
        """No input lines: nothing is written and the detector is never consulted."""
        self.cleaner.clean_text([], self.output_retained, self.output_rejected)
        self.assertEqual(self.output_retained.getvalue(), "")
        self.assertEqual(self.output_rejected.getvalue(), "")
        # assert_has_calls([]) would pass for any call history; assert_not_called
        # actually verifies that detection was skipped.
        self.language_detector.detect.assert_not_called()

    def test_single_valid(self):
        """A line detected as the target language is retained."""
        self.language_detector.detect.side_effect = ["ga"]
        self.cleaner.clean_text(["capall"], self.output_retained, self.output_rejected)
        self.assertEqual(self.output_retained.getvalue(), "capall\n")
        self.assertEqual(self.output_rejected.getvalue(), "")
        self.language_detector.detect.assert_called_once_with("capall")

    def test_newline_termination(self):
        """A trailing newline on the input is not passed to the detector nor duplicated in the output."""
        self.language_detector.detect.side_effect = ["ga"]
        self.cleaner.clean_text(["capall\n"], self.output_retained, self.output_rejected)
        self.assertEqual(self.output_retained.getvalue(), "capall\n")
        self.assertEqual(self.output_rejected.getvalue(), "")
        self.language_detector.detect.assert_called_once_with("capall")

    def test_single_language_mismatched_short(self):
        """A short line is retained without running language detection.

        NOTE(review): presumably "dó" is skipped because it is shorter than the
        configured ``language_detection_threshold`` -- confirm against the cleaner.
        """
        self.cleaner.clean_text(["dó"], self.output_retained, self.output_rejected)
        self.assertEqual(self.output_retained.getvalue(), "dó\n")
        self.assertEqual(self.output_rejected.getvalue(), "")
        self.language_detector.detect.assert_not_called()

    def test_single_target_language_mismatched(self):
        """A line detected as another language is rejected with the detected code in the reason."""
        self.language_detector.detect.side_effect = ["de"]
        self.cleaner.clean_text(["Pferd"], self.output_retained, self.output_rejected)
        self.assertEqual(self.output_retained.getvalue(), "")
        self.assertEqual(self.output_rejected.getvalue(), "unexpected_language_[de]@@@Pferd\n")
        self.language_detector.detect.assert_called_once_with("Pferd")

    def test_single_empty(self):
        """An empty line is rejected as an empty segment without language detection."""
        self.cleaner.clean_text([""], self.output_retained, self.output_rejected)
        self.assertEqual(self.output_retained.getvalue(), "")
        self.assertEqual(self.output_rejected.getvalue(), "empty_segment@@@\n")
        self.language_detector.detect.assert_not_called()

    def test_single_blank(self):
        """A whitespace-only line is rejected as an empty segment without language detection."""
        self.cleaner.clean_text([" "], self.output_retained, self.output_rejected)
        self.assertEqual(self.output_retained.getvalue(), "")
        self.assertEqual(self.output_rejected.getvalue(), "empty_segment@@@\n")
        self.language_detector.detect.assert_not_called()

    def test_multiple_mixed(self):
        """Mixed input: each line is routed independently and only two lines reach the detector."""
        self.language_detector.detect.side_effect = ["ga", "de"]
        self.cleaner.clean_text(["capall", "dó", "Pferd", "", " "], self.output_retained, self.output_rejected)
        self.assertEqual(self.output_retained.getvalue(), "capall\ndó\n")
        self.assertEqual(self.output_rejected.getvalue(), "\n".join([
            "unexpected_language_[de]@@@Pferd",
            "empty_segment@@@",
            "empty_segment@@@\n"
        ]))
        # Compare the full call history so unexpected extra detector calls fail the test.
        self.assertEqual(
            self.language_detector.detect.call_args_list,
            [call("capall"), call("Pferd")],
        )
# Run the test suite when this file is executed directly rather than imported.
if __name__ == "__main__":
    unittest.main()
|