test_language_detector.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. import io
  2. import logging
  3. import unittest
  4. from toolchain.common.language_detector import LanguageDetector
  5. class TestLanguageDetector(unittest.TestCase):
  6. def setUp(self):
  7. logging.disable(level=logging.CRITICAL)
  8. self.detector = LanguageDetector()
  9. def tearDown(self):
  10. pass
  11. def test_empty_string(self):
  12. result = self.detector.detect_language("")
  13. self.assertEqual(result.language, "")
  14. self.assertEqual(result.probability, -1.0)
  15. self.assertEqual(result.total, 1)
  16. self.assertEqual(result.tested, 0)
  17. def test_nonempty_string(self):
  18. result = self.detector.detect_language("you cannot burn a candle at both ends")
  19. self.assertEqual(result.language, "en")
  20. self.assertEqual(result.total, 1)
  21. self.assertEqual(result.tested, 1)
  22. def test_upper_string(self):
  23. result = self.detector.detect_language("IS GLAS IAD NA CNOIC I bhFAD UAINN")
  24. self.assertEqual(result.language, "ga")
  25. self.assertEqual(result.total, 1)
  26. self.assertEqual(result.tested, 1)
  27. def test_file_empty_input(self):
  28. result = self.detector.detect_language_in_file([])
  29. self.assertEqual(result.language, "")
  30. self.assertEqual(result.probability, -1.0)
  31. self.assertEqual(result.total, 0)
  32. self.assertEqual(result.tested, 0)
  33. def test_file_single_line_invalid(self):
  34. result = self.detector.detect_language_in_file([""])
  35. self.assertEqual(result.language, "")
  36. self.assertEqual(result.total, 1)
  37. self.assertEqual(result.tested, 0)
  38. def test_file_single_line_valid(self):
  39. input = ["you cannot burn a candle at both ends"]
  40. result = self.detector.detect_language_in_file(input)
  41. self.assertEqual(result.language, "en")
  42. self.assertEqual(result.total, 1)
  43. self.assertEqual(result.tested, 1)
  44. def test_file_single_line_upper(self):
  45. input = ["IS GLAS IAD NA CNOIC I bhFAD UAINN"]
  46. result = self.detector.detect_language_in_file(input)
  47. self.assertEqual(result.language, "ga")
  48. self.assertEqual(result.total, 1)
  49. self.assertEqual(result.tested, 1)
  50. def test_file_multiple_line_invalid(self):
  51. input = [
  52. "",
  53. "",
  54. "",
  55. "",
  56. "",
  57. ]
  58. result = self.detector.detect_language_in_file(input)
  59. self.assertEqual(result.language, "")
  60. self.assertEqual(result.total, 5)
  61. self.assertEqual(result.tested, 0)
  62. def test_file_multiple_line_mixed(self):
  63. input = [
  64. "is glas iad na cnoic i bhfad uainn",
  65. "tús maith leath na hoibre",
  66. "is binn béal ina thost",
  67. "",
  68. "is leor nod don eolach",
  69. ]
  70. result = self.detector.detect_language_in_file(input)
  71. self.assertEqual(result.language, "ga")
  72. self.assertEqual(result.total, 5)
  73. self.assertEqual(result.tested, 4)
  74. def test_file_multiple_line_min_line_length(self):
  75. input = [
  76. "is glas iad na cnoic i bhfad uainn",
  77. "tús maith leath na hoibre",
  78. "is binn béal ina thost",
  79. "",
  80. "is leor nod don eolach",
  81. ]
  82. config = {
  83. "min_file_line_length" : "25",
  84. }
  85. result = self.detector.detect_language_in_file(input, config)
  86. self.assertEqual(result.language, "ga")
  87. self.assertEqual(result.total, 5)
  88. self.assertEqual(result.tested, 2)
  89. def test_file_limited_initial_lines(self):
  90. input = [
  91. "is glas iad na cnoic i bhfad uainn",
  92. "tús maith leath na hoibre",
  93. "is binn béal ina thost",
  94. "",
  95. "is leor nod don eolach",
  96. ]
  97. config = {
  98. "min_initial_lines" : "2",
  99. }
  100. result = self.detector.detect_language_in_file(input, config)
  101. self.assertEqual(result.language, "ga")
  102. self.assertEqual(result.total, 5)
  103. self.assertEqual(result.tested, 2)
  104. def test_file_limited_sampling_all_valid(self):
  105. input = [
  106. "is glas iad na cnoic i bhfad uainn",
  107. "tús maith leath na hoibre",
  108. "is binn béal ina thost",
  109. "bíonn an fhírinne searbh",
  110. "is leor nod don eolach",
  111. "fillean an feall ar an bhfeallaire",
  112. ]
  113. config = {
  114. "min_initial_lines" : "2",
  115. "sampling_interval" : "2",
  116. }
  117. result = self.detector.detect_language_in_file(input, config)
  118. self.assertEqual(result.language, "ga")
  119. self.assertEqual(result.total, 6)
  120. self.assertEqual(result.tested, 4)
  121. def test_file_limited_sampling_including_invalid(self):
  122. input = [
  123. "is glas iad na cnoic i bhfad uainn",
  124. "tús maith leath na hoibre",
  125. "is binn béal ina thost",
  126. "",
  127. "is leor nod don eolach",
  128. ]
  129. config = {
  130. "min_initial_lines" : "1",
  131. "sampling_interval" : "2",
  132. }
  133. result = self.detector.detect_language_in_file(input, config)
  134. self.assertEqual(result.language, "ga")
  135. self.assertEqual(result.total, 5)
  136. self.assertEqual(result.tested, 2)
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()