1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- import io
- import unittest
- from toolchain.normalizer.unicode_normalizer import UnicodeNormalizer
class TestUnicodeNormalizer(unittest.TestCase):
    """Unit tests for ``UnicodeNormalizer.normalize_text``.

    Every test hands a list of input lines to the normalizer, lets it write
    into an in-memory text buffer, and compares the buffer contents with
    the expected text.
    """

    def setUp(self):
        """Give each test its own normalizer and its own output buffer."""
        self.normalizer = UnicodeNormalizer()
        self.output = io.StringIO()

    def tearDown(self):
        """Release the output buffer created in ``setUp``."""
        self.output.close()

    def _run(self, lines, *extra_args):
        """Normalize ``lines`` into the buffer and return what was written.

        ``extra_args`` are forwarded positionally after the output stream,
        so a test that passes none calls ``normalize_text`` with exactly
        two arguments.
        """
        self.normalizer.normalize_text(lines, self.output, *extra_args)
        return self.output.getvalue()

    def test_empty(self):
        """An empty list of lines writes nothing."""
        self.assertEqual(self._run([]), "")

    def test_ascii(self):
        """Plain ASCII comes out unchanged, followed by a newline."""
        text = "1234567890_[](){}<>.,;:?!|ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
        self.assertEqual(self._run([text]), text + "\n")

    def test_nfc(self):
        """Check the output for the accented-vowel input labelled NFC."""
        source = ["ÁÉÍÓÚáéíóú"]
        expected = "ÁÉÍÓÚáéíóú\n"
        self.assertEqual(self._run(source), expected)

    def test_nfd(self):
        """Check the output for the accented-vowel input labelled NFD.

        NOTE(review): the exact code points of the input literal cannot be
        confirmed by eye; its "i" appears to be a dotless i followed by a
        combining acute accent -- confirm before editing the literal.
        """
        source = ["ÁÉÍÓÚáéı́óú"]
        expected = "ÁÉÍÓÚáéíóú\n"
        self.assertEqual(self._run(source), expected)

    def test_bom(self):
        """A leading U+FEFF does not appear in the output."""
        self.assertEqual(self._run(["\ufeffabcde"]), "abcde\n")

    def test_multiple_global_sub(self):
        """Several occurrences of the same sequence in one line are all replaced."""
        source = ["ı́éı́óúı́"]
        expected = "íéíóúí\n"
        self.assertEqual(self._run(source), expected)

    def test_custom_sub(self):
        """Caller-supplied (old, new) pairs are applied to every occurrence."""
        written = self._run(["\u008045 \u0080\u0080"], [("\u0080", "€")])
        self.assertEqual(written, "€45 €€\n")

    def test_newline_termination(self):
        """Input lines that already end in a newline are not given a second one."""
        source = [
            "\ufeffabcde\n",
            "ÁÉÍÓÚáéíóú\n",
            "ÁÉÍÓÚáéı́óú\n",
        ]
        expected = (
            "abcde\n"
            "ÁÉÍÓÚáéíóú\n"
            "ÁÉÍÓÚáéíóú\n"
        )
        self.assertEqual(self._run(source), expected)

    def test_mixed(self):
        """ASCII, accented text and a custom substitution combined in one call."""
        source = [
            "123+_ABcdEf",
            "ÁÉÍÓÚáéíóú",
            "ÁÉÍÓÚáéı́óú",
            "ı́éı́óúı́\u0080",
            "123ÚáÚ áı́óúı́",
        ]
        expected = (
            "123+_ABcdEf\n"
            "ÁÉÍÓÚáéíóú\n"
            "ÁÉÍÓÚáéíóú\n"
            "íéíóúí€\n"
            "123ÚáÚ áíóúí\n"
        )
        self.assertEqual(self._run(source, [("\u0080", "€")]), expected)
# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|