# test_unicode_normalizer.py (2.6 KB)
  1. import io
  2. import unittest
  3. from toolchain.normalizer.unicode_normalizer import UnicodeNormalizer
  4. class TestUnicodeNormalizer(unittest.TestCase):
  5. def setUp(self):
  6. self.output = io.StringIO()
  7. self.normalizer = UnicodeNormalizer()
  8. def tearDown(self):
  9. self.output.close()
  10. def test_empty(self):
  11. self.normalizer.normalize_text([], self.output)
  12. self.assertEqual(self.output.getvalue(), "")
  13. def test_ascii(self):
  14. lines = ["1234567890_[](){}<>.,;:?!|ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"]
  15. self.normalizer.normalize_text(lines, self.output)
  16. self.assertEqual(self.output.getvalue(), "1234567890_[](){}<>.,;:?!|ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\n")
  17. def test_nfc(self):
  18. lines = ["ÁÉÍÓÚáéíóú"]
  19. self.normalizer.normalize_text(lines, self.output)
  20. self.assertEqual(self.output.getvalue(), "ÁÉÍÓÚáéíóú\n")
  21. def test_nfd(self):
  22. lines = ["ÁÉÍÓÚáéı́óú"]
  23. self.normalizer.normalize_text(lines, self.output)
  24. self.assertEqual(self.output.getvalue(), "ÁÉÍÓÚáéíóú\n")
  25. def test_bom(self):
  26. lines = ["\ufeffabcde"]
  27. self.normalizer.normalize_text(lines, self.output)
  28. self.assertEqual(self.output.getvalue(), "abcde\n")
  29. def test_multiple_global_sub(self):
  30. lines = ["ı́éı́óúı́"]
  31. self.normalizer.normalize_text(lines, self.output)
  32. self.assertEqual(self.output.getvalue(), "íéíóúí\n")
  33. def test_custom_sub(self):
  34. lines = ["\u008045 \u0080\u0080"]
  35. self.normalizer.normalize_text(lines, self.output, [("\u0080", "€")])
  36. self.assertEqual(self.output.getvalue(), "€45 €€\n")
  37. def test_newline_termination(self):
  38. lines = [
  39. "\ufeffabcde\n",
  40. "ÁÉÍÓÚáéíóú\n",
  41. "ÁÉÍÓÚáéı́óú\n",
  42. ]
  43. self.normalizer.normalize_text(lines, self.output)
  44. self.assertEqual(self.output.getvalue(), "abcde\nÁÉÍÓÚáéíóú\nÁÉÍÓÚáéíóú\n")
  45. def test_mixed(self):
  46. lines = [
  47. "123+_ABcdEf",
  48. "ÁÉÍÓÚáéíóú",
  49. "ÁÉÍÓÚáéı́óú",
  50. "ı́éı́óúı́\u0080",
  51. "123ÚáÚ áı́óúı́"
  52. ]
  53. self.normalizer.normalize_text(lines, self.output, [("\u0080", "€")])
  54. self.assertEqual(self.output.getvalue(), "123+_ABcdEf\nÁÉÍÓÚáéíóú\nÁÉÍÓÚáéíóú\níéíóúí€\n123ÚáÚ áíóúí\n")
  55. if __name__ == "__main__":
  56. unittest.main()