test_monolingual_cleaner.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102
  1. import io
  2. import unittest
  3. from unittest.mock import call, Mock
  4. from toolchain.cleaners.monolingual_cleaner import MonolingualCleaner
  class TestMonolingualCleaner(unittest.TestCase):
      """Tests for ``MonolingualCleaner.clean_text`` using a mocked language detector.

      Every test passes a list of lines to a cleaner constructed with the
      language code "ga" and then checks three things: the text written to the
      retained stream, the text written to the rejected stream, and the exact
      calls made to the detector's ``detect`` method.
      """

      def setUp(self):
          """Build a fresh mock detector, in-memory output streams and the cleaner."""
          self.language_detector = Mock()
          self.output_retained = io.StringIO()
          self.output_rejected = io.StringIO()
          config = {
              "language_detection_threshold": "5",
              "rejected_line_delimiter": "@@@",
          }
          self.cleaner = MonolingualCleaner(
              "ga", config=config, language_detector=self.language_detector
          )

      def tearDown(self):
          """Release the in-memory streams created in ``setUp``."""
          self.output_retained.close()
          self.output_rejected.close()

      def test_empty(self):
          """No input lines: nothing is written and the detector is never called."""
          self.cleaner.clean_text([], self.output_retained, self.output_rejected)
          self.assertEqual(self.output_retained.getvalue(), "")
          self.assertEqual(self.output_rejected.getvalue(), "")
          # assert_has_calls([]) would pass no matter how often detect() ran,
          # so the "never called" expectation is asserted explicitly.
          self.language_detector.detect.assert_not_called()

      def test_single_valid(self):
          """A line detected as "ga" is retained with a trailing newline."""
          self.language_detector.detect.side_effect = ["ga"]
          self.cleaner.clean_text(["capall"], self.output_retained, self.output_rejected)
          self.assertEqual(self.output_retained.getvalue(), "capall\n")
          self.assertEqual(self.output_rejected.getvalue(), "")
          self.language_detector.detect.assert_called_once_with("capall")

      def test_newline_termination(self):
          """An input line that already ends in a newline is not double-terminated.

          The detector is expected to receive the text without the newline.
          """
          self.language_detector.detect.side_effect = ["ga"]
          self.cleaner.clean_text(["capall\n"], self.output_retained, self.output_rejected)
          self.assertEqual(self.output_retained.getvalue(), "capall\n")
          self.assertEqual(self.output_rejected.getvalue(), "")
          self.language_detector.detect.assert_called_once_with("capall")

      def test_single_language_mismatched_short(self):
          """A very short line is retained without consulting the detector.

          Presumably this is because "dó" is shorter than the configured
          ``language_detection_threshold``; confirm against the cleaner.
          """
          self.cleaner.clean_text(["dó"], self.output_retained, self.output_rejected)
          self.assertEqual(self.output_retained.getvalue(), "dó\n")
          self.assertEqual(self.output_rejected.getvalue(), "")
          self.language_detector.detect.assert_not_called()

      def test_single_target_language_mismatched(self):
          """A line detected as "de" is rejected with an ``unexpected_language`` tag."""
          self.language_detector.detect.side_effect = ["de"]
          self.cleaner.clean_text(["Pferd"], self.output_retained, self.output_rejected)
          self.assertEqual(self.output_retained.getvalue(), "")
          self.assertEqual(
              self.output_rejected.getvalue(), "unexpected_language_[de]@@@Pferd\n"
          )
          self.language_detector.detect.assert_called_once_with("Pferd")

      def test_single_empty(self):
          """An empty line is rejected as ``empty_segment`` without detection."""
          self.cleaner.clean_text([""], self.output_retained, self.output_rejected)
          self.assertEqual(self.output_retained.getvalue(), "")
          self.assertEqual(self.output_rejected.getvalue(), "empty_segment@@@\n")
          self.language_detector.detect.assert_not_called()

      def test_single_blank(self):
          """A whitespace-only line is rejected as ``empty_segment`` without detection."""
          self.cleaner.clean_text([" "], self.output_retained, self.output_rejected)
          self.assertEqual(self.output_retained.getvalue(), "")
          self.assertEqual(self.output_rejected.getvalue(), "empty_segment@@@\n")
          self.language_detector.detect.assert_not_called()

      def test_multiple_mixed(self):
          """A mixed batch is split correctly and only two lines reach the detector."""
          self.language_detector.detect.side_effect = ["ga", "de"]
          self.cleaner.clean_text(
              ["capall", "dó", "Pferd", "", " "],
              self.output_retained,
              self.output_rejected,
          )
          self.assertEqual(self.output_retained.getvalue(), "capall\ndó\n")
          expected_rejected = (
              "unexpected_language_[de]@@@Pferd\n"
              "empty_segment@@@\n"
              "empty_segment@@@\n"
          )
          self.assertEqual(self.output_rejected.getvalue(), expected_rejected)
          # Compare the full call list so that extra or reordered detector
          # calls fail the test; assert_has_calls tolerates surrounding calls.
          self.assertEqual(
              self.language_detector.detect.call_args_list,
              [call("capall"), call("Pferd")],
          )
  if __name__ == "__main__":
      # Executed directly as a script: discover and run the tests in this module.
      unittest.main()