# test_sentence_aligner.py
  1. import io
  2. import os
  3. import pathlib
  4. import subprocess
  5. import tempfile
  6. import unittest
  7. from toolchain.sentalign.sentalign_error import SentalignError
  8. from toolchain.sentalign.sentence_aligner import SentenceAligner
  9. class TestSentenceAligner(unittest.TestCase):
  10. INPUT_BASE_PATH = pathlib.Path("test-res/sentalign").resolve()
  11. def setUp(self):
  12. self.output_src = io.StringIO()
  13. self.output_tgt = io.StringIO()
  14. self.aligner = SentenceAligner(os.environ["HUNALIGNPATH"])
  15. self.config = {
  16. "dictionary" : self.INPUT_BASE_PATH.joinpath("dictionary.txt"),
  17. "subprocess_timeout" : "20",
  18. }
  19. def tearDown(self):
  20. self.output_src.close()
  21. self.output_tgt.close()
  22. def run_test(self, input_file_src, input_file_tgt, output_expected_src, output_expected_tgt):
  23. input_path_src = self.INPUT_BASE_PATH.joinpath(input_file_src)
  24. input_path_tgt = self.INPUT_BASE_PATH.joinpath(input_file_tgt)
  25. with tempfile.TemporaryDirectory() as output_artefact_dir:
  26. self.aligner.align_files(input_path_src, input_path_tgt,
  27. self.output_src, self.output_tgt, output_artefact_dir, self.config)
  28. self.assertEqual(self.output_src.getvalue(), output_expected_src)
  29. self.assertEqual(self.output_tgt.getvalue(), output_expected_tgt)
  30. def test_empty(self):
  31. self.run_test("empty.src.txt", "empty.tgt.txt", "\n", "\n")
  32. def test_already_aligned(self):
  33. self.run_test("aligned.src.txt", "aligned.tgt.txt",
  34. "aniseed\nbasil\ncinnamon dill\nelderflower fennel ginger\nhorseradish\n\n",
  35. "ánísééd\nbásíl\ncínnámón díll\néldérflówér fénnél gíngér\nhórsérádísh\n\n")
  36. def test_unaligned(self):
  37. self.run_test("unaligned.src.txt", "unaligned.tgt.txt",
  38. "aniseed\nbasil cinnamon dill elderflower\nfennel\nginger horseradish\n\n",
  39. "ánísééd\nbásíl cínnámón díll éldérflówér\nfénnél\ngíngér hórsérádísh\n\n")
  40. def test_timeout(self):
  41. self.config["subprocess_timeout"] = 0
  42. with self.assertRaises(SentalignError):
  43. self.run_test("unaligned.src.txt", "unaligned.tgt.txt",
  44. "aniseed\nbasil cinnamon dill elderflower\nfennel\nginger horseradish\n\n",
  45. "ánísééd\nbásíl cínnámón díll éldérflówér\nfénnél\ngíngér hórsérádísh\n\n")
  46. self.assertEqual(self.output_src.getvalue(), "")
  47. self.assertEqual(self.output_tgt.getvalue(), "")
  48. def test_subprocess_error(self):
  49. self.config["dictionary"] = self.INPUT_BASE_PATH.joinpath("nonexistent-dictionary.txt")
  50. with self.assertRaises(SentalignError):
  51. self.run_test("unaligned.src.txt", "unaligned.tgt.txt",
  52. "aniseed\nbasil cinnamon dill elderflower\nfennel\nginger horseradish\n\n",
  53. "ánísééd\nbásíl cínnámón díll éldérflówér\nfénnél\ngíngér hórsérádísh\n\n")
  54. self.assertEqual(self.output_src.getvalue(), "")
  55. self.assertEqual(self.output_tgt.getvalue(), "")
  56. if __name__ == "__main__":
  57. unittest.main()