123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- import io
- import logging
- import unittest
- from toolchain.common.language_detector import LanguageDetector
class TestLanguageDetector(unittest.TestCase):
    """Tests for ``LanguageDetector.detect_language`` and ``detect_language_in_file``.

    Each test checks the detected language code together with the ``total``
    and ``tested`` counters on the returned result (and ``probability`` for
    the cases where nothing could be tested).
    """

    def setUp(self):
        """Silence log output and build a fresh detector for every test."""
        logging.disable(logging.CRITICAL)
        self.detector = LanguageDetector()

    def tearDown(self):
        """Undo the logging suppression installed by ``setUp``.

        ``logging.disable`` is process-wide; without this reset the
        suppression would leak into any test that runs afterwards.
        """
        logging.disable(logging.NOTSET)

    # ------------------------------------------------------------------
    # detect_language (single string)
    # ------------------------------------------------------------------

    def test_empty_string(self):
        """An empty string yields no language, probability -1.0, nothing tested."""
        result = self.detector.detect_language("")
        self.assertEqual(result.language, "")
        self.assertEqual(result.probability, -1.0)
        self.assertEqual(result.total, 1)
        self.assertEqual(result.tested, 0)

    def test_nonempty_string(self):
        """An English sentence is detected as ``en``."""
        result = self.detector.detect_language("you cannot burn a candle at both ends")
        self.assertEqual(result.language, "en")
        self.assertEqual(result.total, 1)
        self.assertEqual(result.tested, 1)

    def test_upper_string(self):
        """A mostly upper-case Irish sentence is still detected as ``ga``."""
        result = self.detector.detect_language("IS GLAS IAD NA CNOIC I bhFAD UAINN")
        self.assertEqual(result.language, "ga")
        self.assertEqual(result.total, 1)
        self.assertEqual(result.tested, 1)

    # ------------------------------------------------------------------
    # detect_language_in_file (sequence of lines)
    # ------------------------------------------------------------------

    def test_file_empty_input(self):
        """No lines at all: no language, probability -1.0, zero counters."""
        result = self.detector.detect_language_in_file([])
        self.assertEqual(result.language, "")
        self.assertEqual(result.probability, -1.0)
        self.assertEqual(result.total, 0)
        self.assertEqual(result.tested, 0)

    def test_file_single_line_invalid(self):
        """A single empty line is counted in ``total`` but not in ``tested``."""
        result = self.detector.detect_language_in_file([""])
        self.assertEqual(result.language, "")
        self.assertEqual(result.total, 1)
        self.assertEqual(result.tested, 0)

    def test_file_single_line_valid(self):
        """A single English line is detected as ``en``."""
        lines = ["you cannot burn a candle at both ends"]
        result = self.detector.detect_language_in_file(lines)
        self.assertEqual(result.language, "en")
        self.assertEqual(result.total, 1)
        self.assertEqual(result.tested, 1)

    def test_file_single_line_upper(self):
        """A single mostly upper-case Irish line is detected as ``ga``."""
        lines = ["IS GLAS IAD NA CNOIC I bhFAD UAINN"]
        result = self.detector.detect_language_in_file(lines)
        self.assertEqual(result.language, "ga")
        self.assertEqual(result.total, 1)
        self.assertEqual(result.tested, 1)

    def test_file_multiple_line_invalid(self):
        """Five empty lines: all counted in ``total``, none tested."""
        lines = [
            "",
            "",
            "",
            "",
            "",
        ]
        result = self.detector.detect_language_in_file(lines)
        self.assertEqual(result.language, "")
        self.assertEqual(result.total, 5)
        self.assertEqual(result.tested, 0)

    def test_file_multiple_line_mixed(self):
        """Four Irish lines plus one empty line: 5 total, 4 tested."""
        lines = [
            "is glas iad na cnoic i bhfad uainn",
            "tús maith leath na hoibre",
            "is binn béal ina thost",
            "",
            "is leor nod don eolach",
        ]
        result = self.detector.detect_language_in_file(lines)
        self.assertEqual(result.language, "ga")
        self.assertEqual(result.total, 5)
        self.assertEqual(result.tested, 4)

    def test_file_multiple_line_min_line_length(self):
        """With ``min_file_line_length`` set to "25" only two lines are tested.

        NOTE(review): presumably the two lines of 25+ characters are the ones
        that qualify -- confirm against the detector implementation.
        """
        lines = [
            "is glas iad na cnoic i bhfad uainn",
            "tús maith leath na hoibre",
            "is binn béal ina thost",
            "",
            "is leor nod don eolach",
        ]
        config = {
            "min_file_line_length": "25",
        }
        result = self.detector.detect_language_in_file(lines, config)
        self.assertEqual(result.language, "ga")
        self.assertEqual(result.total, 5)
        self.assertEqual(result.tested, 2)

    def test_file_limited_initial_lines(self):
        """With ``min_initial_lines`` set to "2" only two of five lines are tested."""
        lines = [
            "is glas iad na cnoic i bhfad uainn",
            "tús maith leath na hoibre",
            "is binn béal ina thost",
            "",
            "is leor nod don eolach",
        ]
        config = {
            "min_initial_lines": "2",
        }
        result = self.detector.detect_language_in_file(lines, config)
        self.assertEqual(result.language, "ga")
        self.assertEqual(result.total, 5)
        self.assertEqual(result.tested, 2)

    def test_file_limited_sampling_all_valid(self):
        """Initial lines "2" plus sampling interval "2" tests 4 of 6 valid lines."""
        lines = [
            "is glas iad na cnoic i bhfad uainn",
            "tús maith leath na hoibre",
            "is binn béal ina thost",
            "bíonn an fhírinne searbh",
            "is leor nod don eolach",
            "fillean an feall ar an bhfeallaire",
        ]
        config = {
            "min_initial_lines": "2",
            "sampling_interval": "2",
        }
        result = self.detector.detect_language_in_file(lines, config)
        self.assertEqual(result.language, "ga")
        self.assertEqual(result.total, 6)
        self.assertEqual(result.tested, 4)

    def test_file_limited_sampling_including_invalid(self):
        """Initial lines "1" plus sampling interval "2" with one empty line: 2 tested."""
        lines = [
            "is glas iad na cnoic i bhfad uainn",
            "tús maith leath na hoibre",
            "is binn béal ina thost",
            "",
            "is leor nod don eolach",
        ]
        config = {
            "min_initial_lines": "1",
            "sampling_interval": "2",
        }
        result = self.detector.detect_language_in_file(lines, config)
        self.assertEqual(result.language, "ga")
        self.assertEqual(result.total, 5)
        self.assertEqual(result.tested, 2)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|