# test_document_aligner.py

  1. import pathlib
  2. import tempfile
  3. import unittest
  4. from toolchain.docalign.document_aligner import DocumentAligner
  5. class TestDocumentAligner(unittest.TestCase):
  6. INPUT_BASE_DIR = pathlib.Path("test-res/docalign").resolve()
  7. def setUp(self):
  8. self.config = {}
  9. self.aligner = DocumentAligner(self.config)
  10. def tearDown(self):
  11. pass
  12. def run_test(self, test_dir):
  13. file_list_src = list(test_dir.glob("*.src.txt"))
  14. file_list_tgt = list(test_dir.glob("*.tgt.txt"))
  15. expected_matches = []
  16. expected_unmatched_src = []
  17. expected_unmatched_tgt = []
  18. for file_src in file_list_src:
  19. file_tgt = pathlib.Path(str(file_src).replace("src.txt", "tgt.txt"))
  20. if file_tgt in file_list_tgt:
  21. expected_matches.append((file_src, file_tgt))
  22. else:
  23. expected_unmatched_src.append(file_src)
  24. for file_tgt in file_list_tgt:
  25. if not any(match[1] == file_tgt for match in expected_matches):
  26. expected_unmatched_tgt.append(file_tgt)
  27. with tempfile.TemporaryDirectory() as docalign_artefact_dir:
  28. docalign_artefact_dir = pathlib.Path(docalign_artefact_dir)
  29. matches, unmatched_src, unmatched_tgt = self.aligner.align(file_list_src, file_list_tgt, docalign_artefact_dir)
  30. self.assertEqual(set(matches), set(expected_matches))
  31. self.assertEqual(set(unmatched_src), set(expected_unmatched_src))
  32. self.assertEqual(set(unmatched_tgt), set(expected_unmatched_tgt))
  33. self.check_length_matches(docalign_artefact_dir.joinpath("alignments.txt"), matches)
  34. self.check_length_matches(docalign_artefact_dir.joinpath("unmatched_src.txt"), unmatched_src)
  35. self.check_length_matches(docalign_artefact_dir.joinpath("unmatched_tgt.txt"), unmatched_tgt)
  36. def check_length_matches(self, filepath, document_list):
  37. with open(filepath) as f:
  38. linecount = sum(1 for _ in f)
  39. assert linecount == len(document_list)
  40. def test_document_aligner(self):
  41. test_count = 0
  42. for test_dir in pathlib.Path(self.INPUT_BASE_DIR).iterdir():
  43. if test_dir.is_dir():
  44. test_count += 1
  45. with self.subTest(msg=test_dir.name):
  46. self.run_test(test_dir)
  47. print("\nTests run for document aligner: {0}".format(test_count))
  48. if __name__ == "__main__":
  49. unittest.main()