"""Unit tests for ``EditableSentenceSplitter.split_sentences``."""

import io
import unittest

from toolchain.splitters.editable_sentence_splitter import EditableSentenceSplitter


class TestEditableSentenceSplitter(unittest.TestCase):
    """Exercise sentence splitting over Irish-language sample text.

    Every test passes the same abbreviation list, a list of input lines and
    an in-memory text buffer to ``split_sentences`` and then compares what
    was written to the buffer with the expected one-sentence-per-line text.
    """

    def setUp(self):
        # Fresh buffer per test so output never leaks between tests.
        self.output = io.StringIO()
        # Tab-separated abbreviation entries: abbreviation, expansion and,
        # for "gCo" only, a third "true" field.
        # NOTE(review): the third field presumably marks an abbreviation
        # that expects a following word (see the
        # "abbreviation_expecting_additional" tests) -- confirm against the
        # splitter implementation.
        self.abbreviations = [
            "gCo\tgContae\ttrue",
            "IR\tIonstraim Reachtúil",
            "srl\tagus araile",
            "Uimh\tUimhir",
        ]
        self.splitter = EditableSentenceSplitter()

    def tearDown(self):
        self.output.close()

    def _assert_split(self, lines, expected):
        """Split *lines* with the shared abbreviations and check the output.

        Args:
            lines: Input lines handed to ``split_sentences``.
            expected: Exact text the splitter must have written to
                ``self.output``.
        """
        self.splitter.split_sentences(self.abbreviations, lines, self.output)
        self.assertEqual(self.output.getvalue(), expected)

    def test_empty(self):
        """No input lines produce no output."""
        self._assert_split([], "")

    def test_line_empty(self):
        """A single empty line produces no output."""
        self._assert_split([""], "")

    def test_single_sentence_with_terminator(self):
        """A terminated sentence is written followed by a newline."""
        self._assert_split(
            ["Is glas iad na cnoic i bhfad uainn."],
            "Is glas iad na cnoic i bhfad uainn.\n",
        )

    def test_single_sentence_without_terminator(self):
        """An unterminated sentence is still written followed by a newline."""
        self._assert_split(
            ["Is glas iad na cnoic i bhfad uainn"],
            "Is glas iad na cnoic i bhfad uainn\n",
        )

    def test_non_abbreviation(self):
        """A full stop after an unknown word splits, whatever the next case."""
        lines = [
            "aaaaa. bbbbb",
            "ccccc. DDDDD",
        ]
        self._assert_split(lines, "aaaaa.\nbbbbb\nccccc.\nDDDDD\n")

    def test_abbreviation_followed_by_lowercase(self):
        """A known abbreviation followed by lowercase does not split."""
        self._assert_split(["a, b, srl. agus c"], "a, b, srl. agus c\n")

    def test_abbreviation_followed_by_uppercase(self):
        """A known abbreviation followed by uppercase splits."""
        self._assert_split(["a, b, srl. Agus c"], "a, b, srl.\nAgus c\n")

    def test_abbreviation_expecting_additional_followed_by_lowercase(self):
        """"gCo." followed by lowercase does not split."""
        self._assert_split(["i gCo. an Chláir"], "i gCo. an Chláir\n")

    def test_abbreviation_expecting_additional_followed_by_uppercase(self):
        """"gCo." followed by uppercase does not split either."""
        self._assert_split(["i gCo. Chill Dara"], "i gCo. Chill Dara\n")

    def test_abbreviation_at_end_of_line(self):
        """An abbreviation ending the line yields exactly one sentence."""
        self._assert_split(["a, b, srl."], "a, b, srl.\n")

    def test_abbreviation_followed_by_numeral(self):
        """An abbreviation followed by a number does not split."""
        self._assert_split(["Uimh. 9924 de 2027."], "Uimh. 9924 de 2027.\n")

    def test_multichar_abbreviation(self):
        """A dotted form ("I.R.") of a listed abbreviation does not split."""
        self._assert_split(["I.R. 9924 de 2027."], "I.R. 9924 de 2027.\n")

    def test_chained_abbreviations(self):
        """Consecutive abbreviations stay within one sentence."""
        self._assert_split(
            ["I.R. Uimh. 9924 de 2027."],
            "I.R. Uimh. 9924 de 2027.\n",
        )

    def test_multiple_sentence(self):
        """'.', '?' and '!' each terminate a sentence."""
        lines = [
            "Is glas iad na cnoic i bhfad uainn. Nach leor nod don eolach? Fillean an feall ar an bhfeallaire!",
        ]
        self._assert_split(
            lines,
            "Is glas iad na cnoic i bhfad uainn.\nNach leor nod don eolach?\nFillean an feall ar an bhfeallaire!\n",
        )

    def test_newline_termination(self):
        """Trailing newlines on input lines do not create extra output lines."""
        lines = [
            "Líne 1\n",
            "Líne 2?\n",
            "Líne 3!\n",
            "Líne 4.\n",
        ]
        expected = "\n".join([
            "Líne 1",
            "Líne 2?",
            "Líne 3!",
            "Líne 4.\n",
        ])
        self._assert_split(lines, expected)

    def test_list_items(self):
        """Short list markers are kept with their item; "2021." is split off."""
        lines = [
            "1. líne 1. a",
            "39. Líne 39",
            "iv. Ítim iv",
            "X. Ítim X",
            "2021. Líne eile",
        ]
        expected = "\n".join([
            "1. líne 1.",
            "a",
            "39. Líne 39",
            "iv. Ítim iv",
            "X. Ítim X",
            "2021.",
            "Líne eile\n",
        ])
        self._assert_split(lines, expected)

    def test_mixed(self):
        """All of the above cases combined in a single input."""
        lines = [
            "Is glas iad na cnoic i bhfad uainn. Nach leor nod don eolach?",
            "",
            "Fillean an feall ar an bhfeallaire! I.R. 6742 de 2039. Uimh. 39 de 1382. I.R. Uimh. 9924 de 2027.",
            "a, b, srl. agus c? d, e, srl. Agus f! g, h, srl.",
            "1. líne 1. a",
            "39. Líne 39",
            "iv. Ítim iv",
            "X. Ítim X",
            "2021. Líne eile",
        ]
        expected = "\n".join([
            "Is glas iad na cnoic i bhfad uainn.",
            "Nach leor nod don eolach?",
            "Fillean an feall ar an bhfeallaire!",
            "I.R. 6742 de 2039.",
            "Uimh. 39 de 1382.",
            "I.R. Uimh. 9924 de 2027.",
            "a, b, srl. agus c?",
            "d, e, srl.",
            "Agus f!",
            "g, h, srl.",
            "1. líne 1.",
            "a",
            "39. Líne 39",
            "iv. Ítim iv",
            "X. Ítim X",
            "2021.",
            "Líne eile\n",
        ])
        self._assert_split(lines, expected)


if __name__ == "__main__":
    unittest.main()