# test_tmx_parser.py

import io
import unittest

import defusedxml.ElementTree as ET

from toolchain.parsers.parsing_error import ParsingError
from toolchain.parsers.tmx_parser import TmxParser


  6. class TestTmxParser(unittest.TestCase):
  7. ROOT_TEMPLATE = \
  8. "<tmx version=\"1.4\">\
  9. <header/>\
  10. <body>\
  11. {0}\
  12. </body>\
  13. </tmx>"
  14. LANGUAGE_CODE_SRC = "en"
  15. LANGUAGE_CODE_TGT = "ga"
  16. def setUp(self):
  17. self.output_src = io.StringIO()
  18. self.output_tgt = io.StringIO()
  19. self.parser = TmxParser(self.LANGUAGE_CODE_SRC, self.LANGUAGE_CODE_TGT)
  20. def tearDown(self):
  21. self.output_src.close()
  22. self.output_tgt.close()
  23. def make_document(self, content):
  24. return ET.fromstring(TestTmxParser.ROOT_TEMPLATE.format(content))
  25. def test_empty_body(self):
  26. document = self.make_document("")
  27. self.parser.parse_content(document, self.output_src, self.output_tgt)
  28. self.assertEqual(self.output_src.getvalue(), "")
  29. self.assertEqual(self.output_tgt.getvalue(), "")
  30. def test_absent_tgt(self):
  31. document = self.make_document("\
  32. <tu>\
  33. <tuv xml:lang=\"en\">\
  34. <seg>yellow</seg>\
  35. </tuv>\
  36. </tu>\
  37. ")
  38. self.parser.parse_content(document, self.output_src, self.output_tgt)
  39. self.assertEqual(self.output_src.getvalue(), "")
  40. self.assertEqual(self.output_tgt.getvalue(), "")
  41. def test_absent_src(self):
  42. document = self.make_document("\
  43. <tu>\
  44. <tuv xml:lang=\"ga\">\
  45. <seg>buí</seg>\
  46. </tuv>\
  47. </tu>\
  48. ")
  49. self.parser.parse_content(document, self.output_src, self.output_tgt)
  50. self.assertEqual(self.output_src.getvalue(), "")
  51. self.assertEqual(self.output_tgt.getvalue(), "")
  52. def test_empty_tgt(self):
  53. document = self.make_document("\
  54. <tu>\
  55. <tuv xml:lang=\"en\">\
  56. <seg>yellow</seg>\
  57. </tuv>\
  58. <tuv xml:lang=\"ga\">\
  59. <seg></seg>\
  60. </tuv>\
  61. </tu>\
  62. ")
  63. self.parser.parse_content(document, self.output_src, self.output_tgt)
  64. self.assertEqual(self.output_src.getvalue(), "")
  65. self.assertEqual(self.output_tgt.getvalue(), "")
  66. def test_single_simple(self):
  67. document = self.make_document("\
  68. <tu>\
  69. <tuv xml:lang=\"en\">\
  70. <seg>yellow</seg>\
  71. </tuv>\
  72. <tuv xml:lang=\"ga\">\
  73. <seg>buí</seg>\
  74. </tuv>\
  75. </tu>\
  76. ")
  77. self.parser.parse_content(document, self.output_src, self.output_tgt)
  78. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  79. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  80. def test_language_variants(self):
  81. document = self.make_document("\
  82. <tu>\
  83. <tuv xml:lang=\"en-GB\">\
  84. <seg>yellow</seg>\
  85. </tuv>\
  86. <tuv xml:lang=\"ga-IE\">\
  87. <seg>buí</seg>\
  88. </tuv>\
  89. </tu>\
  90. ")
  91. self.parser.parse_content(document, self.output_src, self.output_tgt)
  92. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  93. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  94. def test_language_missing(self):
  95. document = self.make_document("\
  96. <tu>\
  97. <tuv xml:lang=\"en\">\
  98. <seg>yellow</seg>\
  99. </tuv>\
  100. <tuv>\
  101. <seg>buí</seg>\
  102. </tuv>\
  103. </tu>\
  104. ")
  105. with self.assertRaises(ParsingError):
  106. self.parser.parse_content(document, self.output_src, self.output_tgt)
  107. self.assertEqual(self.output_src.getvalue(), "")
  108. self.assertEqual(self.output_tgt.getvalue(), "")
  109. def test_inner_node_empty(self):
  110. document = self.make_document("\
  111. <tu>\
  112. <tuv xml:lang=\"en\">\
  113. <seg><inner/>yellow</seg>\
  114. </tuv>\
  115. <tuv xml:lang=\"ga\">\
  116. <seg>buí</seg>\
  117. </tuv>\
  118. </tu>\
  119. ")
  120. self.parser.parse_content(document, self.output_src, self.output_tgt)
  121. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  122. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  123. def test_inner_node_nonempty_preceding(self):
  124. document = self.make_document("\
  125. <tu>\
  126. <tuv xml:lang=\"en\">\
  127. <seg><inner>ye</inner>llow</seg>\
  128. </tuv>\
  129. <tuv xml:lang=\"ga\">\
  130. <seg>buí</seg>\
  131. </tuv>\
  132. </tu>\
  133. ")
  134. self.parser.parse_content(document, self.output_src, self.output_tgt)
  135. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  136. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  137. def test_inner_node_nonempty_following(self):
  138. document = self.make_document("\
  139. <tu>\
  140. <tuv xml:lang=\"en\">\
  141. <seg>yell<inner>ow</inner></seg>\
  142. </tuv>\
  143. <tuv xml:lang=\"ga\">\
  144. <seg>buí</seg>\
  145. </tuv>\
  146. </tu>\
  147. ")
  148. self.parser.parse_content(document, self.output_src, self.output_tgt)
  149. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  150. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  151. def test_extra_whitespace_leading_trailing(self):
  152. document = self.make_document("\
  153. <tu>\
  154. <tuv xml:lang=\"en\">\
  155. <seg>yellow </seg>\
  156. </tuv>\
  157. <tuv xml:lang=\"ga\">\
  158. <seg> \tbuí</seg>\
  159. </tuv>\
  160. </tu>\
  161. ")
  162. self.parser.parse_content(document, self.output_src, self.output_tgt)
  163. self.assertEqual(self.output_src.getvalue(), "yellow \n")
  164. self.assertEqual(self.output_tgt.getvalue(), " \tbuí\n")
  165. def test_extra_whitespace_contained(self):
  166. document = self.make_document("\
  167. <tu>\
  168. <tuv xml:lang=\"en\">\
  169. <seg>cake</seg>\
  170. </tuv>\
  171. <tuv xml:lang=\"ga\">\
  172. <seg>cáca \tmilis</seg>\
  173. </tuv>\
  174. </tu>\
  175. ")
  176. self.parser.parse_content(document, self.output_src, self.output_tgt)
  177. self.assertEqual(self.output_src.getvalue(), "cake\n")
  178. self.assertEqual(self.output_tgt.getvalue(), "cáca \tmilis\n")
  179. def test_newline_contained(self):
  180. document = self.make_document("\
  181. <tu>\
  182. <tuv xml:lang=\"en\">\
  183. <seg>cake</seg>\
  184. </tuv>\
  185. <tuv xml:lang=\"ga\">\
  186. <seg>cáca\nmilis</seg>\
  187. </tuv>\
  188. </tu>\
  189. ")
  190. self.parser.parse_content(document, self.output_src, self.output_tgt)
  191. self.assertEqual(self.output_src.getvalue(), "cake\n")
  192. self.assertEqual(self.output_tgt.getvalue(), "cácamilis\n")
  193. def test_only_whitespace(self):
  194. document = self.make_document("\
  195. <tu>\
  196. <tuv xml:lang=\"en\">\
  197. <seg>yellow</seg>\
  198. </tuv>\
  199. <tuv xml:lang=\"ga\">\
  200. <seg> </seg>\
  201. </tuv>\
  202. </tu>\
  203. ")
  204. self.parser.parse_content(document, self.output_src, self.output_tgt)
  205. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  206. self.assertEqual(self.output_tgt.getvalue(), " \n")
  207. def test_only_newline(self):
  208. document = self.make_document("\
  209. <tu>\
  210. <tuv xml:lang=\"en\">\
  211. <seg>yellow</seg>\
  212. </tuv>\
  213. <tuv xml:lang=\"ga\">\
  214. <seg>\n</seg>\
  215. </tuv>\
  216. </tu>\
  217. ")
  218. self.parser.parse_content(document, self.output_src, self.output_tgt)
  219. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  220. self.assertEqual(self.output_tgt.getvalue(), "\n")
  221. def test_multiple(self):
  222. document = self.make_document("\
  223. <tu>\
  224. <tuv xml:lang=\"en\">\
  225. <seg>horse</seg>\
  226. </tuv>\
  227. <tuv xml:lang=\"ga\">\
  228. <seg>capall</seg>\
  229. </tuv>\
  230. </tu>\
  231. <tu>\
  232. <tuv xml:lang=\"en-IE\">\
  233. <seg>eat</seg>\
  234. </tuv>\
  235. <tuv xml:lang=\"ga-IE\">\
  236. <seg>ith</seg>\
  237. </tuv>\
  238. </tu>\
  239. <tu>\
  240. <tuv xml:lang=\"en-GB\">\
  241. <seg>cake</seg>\
  242. </tuv>\
  243. <tuv xml:lang=\"ga-IE\">\
  244. <seg>cáca\nmilis</seg>\
  245. </tuv>\
  246. </tu>\
  247. ")
  248. self.parser.parse_content(document, self.output_src, self.output_tgt)
  249. self.assertEqual(self.output_src.getvalue(), "horse\neat\ncake\n")
  250. self.assertEqual(self.output_tgt.getvalue(), "capall\nith\ncácamilis\n")
  251. if __name__ == "__main__":
  252. unittest.main()