|
- import io
- import unittest
- from toolchain.splitters.editable_sentence_splitter import EditableSentenceSplitter
class TestEditableSentenceSplitter(unittest.TestCase):
    """Tests for ``EditableSentenceSplitter.split_sentences`` on Irish text.

    Every test passes the shared abbreviation table, a list of input lines
    and an in-memory text stream to the splitter, then compares everything
    written to that stream with the expected one-sentence-per-line output.
    """

    def setUp(self):
        """Create a fresh output buffer, abbreviation table and splitter."""
        self.output = io.StringIO()
        # Tab-separated entries: abbreviation, expansion and, for "gCo", a
        # third "true" field.
        # NOTE(review): the third field presumably marks an abbreviation
        # that always expects more text after it -- confirm against the
        # splitter implementation.
        self.abbreviations = [
            "gCo\tgContae\ttrue",
            "IR\tIonstraim Reachtúil",
            "srl\tagus araile",
            "Uimh\tUimhir",
        ]
        self.splitter = EditableSentenceSplitter()

    def tearDown(self):
        """Release the in-memory output buffer."""
        self.output.close()

    def _assert_split(self, lines, expected):
        """Split *lines* and assert the captured output equals *expected*.

        Args:
            lines: Input lines handed to ``split_sentences``.
            expected: Exact text the splitter must have written to
                ``self.output``.
        """
        self.splitter.split_sentences(self.abbreviations, lines, self.output)
        self.assertEqual(self.output.getvalue(), expected)

    def test_empty(self):
        """No input lines produce no output."""
        self._assert_split([], "")

    def test_line_empty(self):
        """A single empty line produces no output."""
        self._assert_split([""], "")

    def test_single_sentence_with_terminator(self):
        """A sentence ending in a full stop is written followed by a newline."""
        self._assert_split(
            ["Is glas iad na cnoic i bhfad uainn."],
            "Is glas iad na cnoic i bhfad uainn.\n",
        )

    def test_single_sentence_without_terminator(self):
        """A line with no terminating punctuation is still written out."""
        self._assert_split(
            ["Is glas iad na cnoic i bhfad uainn"],
            "Is glas iad na cnoic i bhfad uainn\n",
        )

    def test_non_abbreviation(self):
        """A full stop after a word not in the table always splits.

        The split happens whether the next word is lower- or upper-case.
        """
        self._assert_split(
            [
                "aaaaa. bbbbb",
                "ccccc. DDDDD",
            ],
            "aaaaa.\nbbbbb\nccccc.\nDDDDD\n",
        )

    def test_abbreviation_followed_by_lowercase(self):
        """A known abbreviation followed by a lower-case word does not split."""
        self._assert_split(["a, b, srl. agus c"], "a, b, srl. agus c\n")

    def test_abbreviation_followed_by_uppercase(self):
        """A known abbreviation followed by an upper-case word splits."""
        self._assert_split(["a, b, srl. Agus c"], "a, b, srl.\nAgus c\n")

    def test_abbreviation_expecting_additional_followed_by_lowercase(self):
        """``gCo.`` followed by a lower-case word does not split."""
        self._assert_split(["i gCo. an Chláir"], "i gCo. an Chláir\n")

    def test_abbreviation_expecting_additional_followed_by_uppercase(self):
        """``gCo.`` does not split even before an upper-case word."""
        self._assert_split(["i gCo. Chill Dara"], "i gCo. Chill Dara\n")

    def test_abbreviation_at_end_of_line(self):
        """An abbreviation that ends the line is written with that line."""
        self._assert_split(["a, b, srl."], "a, b, srl.\n")

    def test_abbreviation_followed_by_numeral(self):
        """A known abbreviation followed by a number does not split."""
        self._assert_split(["Uimh. 9924 de 2027."], "Uimh. 9924 de 2027.\n")

    def test_multichar_abbreviation(self):
        """The dotted form ``I.R.`` of the table entry ``IR`` does not split."""
        self._assert_split(["I.R. 9924 de 2027."], "I.R. 9924 de 2027.\n")

    def test_chained_abbreviations(self):
        """Two abbreviations in a row stay within one sentence."""
        self._assert_split(
            ["I.R. Uimh. 9924 de 2027."],
            "I.R. Uimh. 9924 de 2027.\n",
        )

    def test_multiple_sentence(self):
        """One line is split after each of ``.``, ``?`` and ``!``."""
        self._assert_split(
            ["Is glas iad na cnoic i bhfad uainn. Nach leor nod don eolach? Fillean an feall ar an bhfeallaire!"],
            "Is glas iad na cnoic i bhfad uainn.\nNach leor nod don eolach?\nFillean an feall ar an bhfeallaire!\n",
        )

    def test_newline_termination(self):
        """Input lines that already end in a newline add no blank lines."""
        self._assert_split(
            [
                "Líne 1\n",
                "Líne 2?\n",
                "Líne 3!\n",
                "Líne 4.\n",
            ],
            "\n".join([
                "Líne 1",
                "Líne 2?",
                "Líne 3!",
                "Líne 4.\n",
            ]),
        )

    def test_list_items(self):
        """Leading list markers stay attached to their item text.

        Per the expectations below, ``1.``, ``39.``, ``iv.`` and ``X.`` at
        the start of a line do not split, while ``2021.`` at the start of a
        line and the ``1.`` inside the first line are sentence ends.
        """
        self._assert_split(
            [
                "1. líne 1. a",
                "39. Líne 39",
                "iv. Ítim iv",
                "X. Ítim X",
                "2021. Líne eile",
            ],
            "\n".join([
                "1. líne 1.",
                "a",
                "39. Líne 39",
                "iv. Ítim iv",
                "X. Ítim X",
                "2021.",
                "Líne eile\n",
            ]),
        )

    def test_mixed(self):
        """Plain sentences, a blank line, abbreviations and list items together.

        The empty input line contributes nothing to the output.
        """
        self._assert_split(
            [
                "Is glas iad na cnoic i bhfad uainn. Nach leor nod don eolach?",
                "",
                "Fillean an feall ar an bhfeallaire! I.R. 6742 de 2039. Uimh. 39 de 1382. I.R. Uimh. 9924 de 2027.",
                "a, b, srl. agus c? d, e, srl. Agus f! g, h, srl.",
                "1. líne 1. a",
                "39. Líne 39",
                "iv. Ítim iv",
                "X. Ítim X",
                "2021. Líne eile",
            ],
            "\n".join([
                "Is glas iad na cnoic i bhfad uainn.",
                "Nach leor nod don eolach?",
                "Fillean an feall ar an bhfeallaire!",
                "I.R. 6742 de 2039.",
                "Uimh. 39 de 1382.",
                "I.R. Uimh. 9924 de 2027.",
                "a, b, srl. agus c?",
                "d, e, srl.",
                "Agus f!",
                "g, h, srl.",
                "1. líne 1.",
                "a",
                "39. Líne 39",
                "iv. Ítim iv",
                "X. Ítim X",
                "2021.",
                "Líne eile\n",
            ]),
        )
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|