test_xliff_parser.py 24 KB


  1. import io
  2. import unittest
  3. import xml.etree.ElementTree as ET
  4. from toolchain.parsers.parsing_error import ParsingError
  5. from toolchain.parsers.xliff_parser import Xliff12Parser, Xliff20Parser
  6. class TestXliff12Parser(unittest.TestCase):
  7. LANGUAGE_CODE_SRC = "en"
  8. LANGUAGE_CODE_TGT = "ga"
  9. ROOT_TEMPLATE = \
  10. "<xliff xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\">\
  11. <file original=\"/path/to/original\" source-language=\"{0}\" target-language=\"{1}\" datatype=\"datatype\">\
  12. <header/>\
  13. <body>\
  14. {2}\
  15. </body>\
  16. </file>\
  17. </xliff>"
  18. def setUp(self):
  19. self.output_src = io.StringIO()
  20. self.output_tgt = io.StringIO()
  21. self.parser = Xliff12Parser(self.LANGUAGE_CODE_SRC, self.LANGUAGE_CODE_TGT)
  22. def tearDown(self):
  23. self.output_src.close()
  24. self.output_tgt.close()
  25. def make_document(self, source_language_code, target_language_code, content):
  26. return ET.fromstring(TestXliff12Parser.ROOT_TEMPLATE.format(source_language_code, target_language_code, content))
  27. def test_empty_body(self):
  28. document = self.make_document("en", "ga", "")
  29. self.parser.parse_content(document, self.output_src, self.output_tgt)
  30. self.assertEqual(self.output_src.getvalue(), "")
  31. self.assertEqual(self.output_tgt.getvalue(), "")
  32. def test_absent_src(self):
  33. document = self.make_document("en", "ga", "\
  34. <trans-unit>\
  35. <target>capall</target>\
  36. </trans-unit>\
  37. ")
  38. self.parser.parse_content(document, self.output_src, self.output_tgt)
  39. self.assertEqual(self.output_src.getvalue(), "")
  40. self.assertEqual(self.output_tgt.getvalue(), "")
  41. def test_absent_tgt(self):
  42. document = self.make_document("en", "ga", "\
  43. <trans-unit>\
  44. <source>horse</source>\
  45. </trans-unit>\
  46. ")
  47. self.parser.parse_content(document, self.output_src, self.output_tgt)
  48. self.assertEqual(self.output_src.getvalue(), "")
  49. self.assertEqual(self.output_tgt.getvalue(), "")
  50. def test_empty_src(self):
  51. document = self.make_document("en", "ga", "\
  52. <trans-unit>\
  53. <source/>\
  54. <target>capall</target>\
  55. </trans-unit>\
  56. ")
  57. self.parser.parse_content(document, self.output_src, self.output_tgt)
  58. self.assertEqual(self.output_src.getvalue(), "")
  59. self.assertEqual(self.output_tgt.getvalue(), "")
  60. def test_empty_tgt(self):
  61. document = self.make_document("en", "ga", "\
  62. <trans-unit>\
  63. <source>horse</source>\
  64. <target/>\
  65. </trans-unit>\
  66. ")
  67. self.parser.parse_content(document, self.output_src, self.output_tgt)
  68. self.assertEqual(self.output_src.getvalue(), "")
  69. self.assertEqual(self.output_tgt.getvalue(), "")
  70. def test_single_valid(self):
  71. document = self.make_document("en", "ga", "\
  72. <trans-unit>\
  73. <source>horse</source>\
  74. <target>capall</target>\
  75. </trans-unit>\
  76. ")
  77. self.parser.parse_content(document, self.output_src, self.output_tgt)
  78. self.assertEqual(self.output_src.getvalue(), "horse\n")
  79. self.assertEqual(self.output_tgt.getvalue(), "capall\n")
  80. def test_single_valid_language_variants(self):
  81. document = self.make_document("en-GB", "ga-IE", "\
  82. <trans-unit>\
  83. <source>horse</source>\
  84. <target>capall</target>\
  85. </trans-unit>\
  86. ")
  87. self.parser.parse_content(document, self.output_src, self.output_tgt)
  88. self.assertEqual(self.output_src.getvalue(), "horse\n")
  89. self.assertEqual(self.output_tgt.getvalue(), "capall\n")
  90. def test_single_valid_languages_reversed(self):
  91. document = self.make_document("ga", "en", "\
  92. <trans-unit>\
  93. <source>capall</source>\
  94. <target>horse</target>\
  95. </trans-unit>\
  96. ")
  97. self.parser.parse_content(document, self.output_src, self.output_tgt)
  98. self.assertEqual(self.output_src.getvalue(), "horse\n")
  99. self.assertEqual(self.output_tgt.getvalue(), "capall\n")
  100. def test_single_valid_inner_tags_all(self):
  101. document = self.make_document("en", "ga", "\
  102. <trans-unit>\
  103. <source><inner>yellow</inner></source>\
  104. <target><g>buí</g></target>\
  105. </trans-unit>\
  106. ")
  107. self.parser.parse_content(document, self.output_src, self.output_tgt)
  108. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  109. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  110. def test_single_valid_inner_tags_start(self):
  111. document = self.make_document("en", "ga", "\
  112. <trans-unit>\
  113. <source><inner>ye</inner>llow</source>\
  114. <target><g>buí</g></target>\
  115. </trans-unit>\
  116. ")
  117. self.parser.parse_content(document, self.output_src, self.output_tgt)
  118. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  119. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  120. def test_single_valid_inner_tags_all(self):
  121. document = self.make_document("en", "ga", "\
  122. <trans-unit>\
  123. <source>yell<inner>ow</inner></source>\
  124. <target><g>buí</g></target>\
  125. </trans-unit>\
  126. ")
  127. self.parser.parse_content(document, self.output_src, self.output_tgt)
  128. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  129. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  130. def test_single_valid_with_group(self):
  131. document = self.make_document("en", "ga", "\
  132. <group>\
  133. <trans-unit>\
  134. <source>horse</source>\
  135. <target>capall</target>\
  136. </trans-unit>\
  137. </group>\
  138. ")
  139. self.parser.parse_content(document, self.output_src, self.output_tgt)
  140. self.assertEqual(self.output_src.getvalue(), "horse\n")
  141. self.assertEqual(self.output_tgt.getvalue(), "capall\n")
  142. def test_single_valid_multiple_group(self):
  143. document = self.make_document("en", "ga", "\
  144. <group>\
  145. <group>\
  146. <trans-unit>\
  147. <source>horse</source>\
  148. <target>capall</target>\
  149. </trans-unit>\
  150. </group>\
  151. </group>\
  152. ")
  153. self.parser.parse_content(document, self.output_src, self.output_tgt)
  154. self.assertEqual(self.output_src.getvalue(), "horse\n")
  155. self.assertEqual(self.output_tgt.getvalue(), "capall\n")
  156. def test_extra_whitespace_leading_trailing(self):
  157. document = self.make_document("en", "ga", "\
  158. <trans-unit>\
  159. <source>yellow </source>\
  160. <target> \tbuí</target>\
  161. </trans-unit>\
  162. ")
  163. self.parser.parse_content(document, self.output_src, self.output_tgt)
  164. self.assertEqual(self.output_src.getvalue(), "yellow \n")
  165. self.assertEqual(self.output_tgt.getvalue(), " \tbuí\n")
  166. def test_extra_whitespace_contained(self):
  167. document = self.make_document("en", "ga", "\
  168. <trans-unit>\
  169. <source>cake</source>\
  170. <target>cáca \tmilis</target>\
  171. </trans-unit>\
  172. ")
  173. self.parser.parse_content(document, self.output_src, self.output_tgt)
  174. self.assertEqual(self.output_src.getvalue(), "cake\n")
  175. self.assertEqual(self.output_tgt.getvalue(), "cáca \tmilis\n")
  176. def test_newline_contained(self):
  177. document = self.make_document("en", "ga", "\
  178. <trans-unit>\
  179. <source>cake</source>\
  180. <target>cáca\nmilis</target>\
  181. </trans-unit>\
  182. ")
  183. self.parser.parse_content(document, self.output_src, self.output_tgt)
  184. self.assertEqual(self.output_src.getvalue(), "cake\n")
  185. self.assertEqual(self.output_tgt.getvalue(), "cácamilis\n")
  186. def test_only_whitespace(self):
  187. document = self.make_document("en", "ga", "\
  188. <trans-unit>\
  189. <source>yellow</source>\
  190. <target> </target>\
  191. </trans-unit>\
  192. ")
  193. self.parser.parse_content(document, self.output_src, self.output_tgt)
  194. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  195. self.assertEqual(self.output_tgt.getvalue(), " \n")
  196. def test_only_newline(self):
  197. document = self.make_document("en", "ga", "\
  198. <trans-unit>\
  199. <source>yellow</source>\
  200. <target>\n</target>\
  201. </trans-unit>\
  202. ")
  203. self.parser.parse_content(document, self.output_src, self.output_tgt)
  204. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  205. self.assertEqual(self.output_tgt.getvalue(), "\n")
  206. def test_multiple_valid(self):
  207. document = self.make_document("en", "ga", "\
  208. <trans-unit>\
  209. <source>horse</source>\
  210. <target>capall</target>\
  211. </trans-unit>\
  212. <group>\
  213. <trans-unit>\
  214. <source>eat</source>\
  215. </trans-unit>\
  216. </group>\
  217. <group>\
  218. <group>\
  219. <trans-unit>\
  220. <source>yell<inner>ow</inner></source>\
  221. <target><g>buí</g></target>\
  222. </trans-unit>\
  223. </group>\
  224. </group>\
  225. ")
  226. self.parser.parse_content(document, self.output_src, self.output_tgt)
  227. self.assertEqual(self.output_src.getvalue(), "horse\nyellow\n")
  228. self.assertEqual(self.output_tgt.getvalue(), "capall\nbuí\n")
  229. def test_no_target_language(self):
  230. document = ET.fromstring("\
  231. <xliff xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\">\
  232. <file original=\"/path/to/original\" source-language=\"en\" datatype=\"datatype\">\
  233. <header/>\
  234. <body>\
  235. <trans-unit>\
  236. <source>horse</source>\
  237. <target>capall</target>\
  238. </trans-unit>\
  239. </body>\
  240. </file>\
  241. </xliff>\
  242. ")
  243. self.parser.parse_content(document, self.output_src, self.output_tgt)
  244. self.assertEqual(self.output_src.getvalue(), "")
  245. self.assertEqual(self.output_tgt.getvalue(), "")
  246. def test_no_source_language(self):
  247. document = ET.fromstring("\
  248. <xliff xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\">\
  249. <file original=\"/path/to/original\" target-language=\"ga\" datatype=\"datatype\">\
  250. <header/>\
  251. <body>\
  252. <trans-unit>\
  253. <source>horse</source>\
  254. <target>capall</target>\
  255. </trans-unit>\
  256. </body>\
  257. </file>\
  258. </xliff>\
  259. ")
  260. with self.assertRaises(ParsingError):
  261. self.parser.parse_content(document, self.output_src, self.output_tgt)
  262. self.assertEqual(self.output_src.getvalue(), "")
  263. self.assertEqual(self.output_tgt.getvalue(), "")
  264. class TestXliff20Parser(unittest.TestCase):
  265. LANGUAGE_CODE_SRC = "en"
  266. LANGUAGE_CODE_TGT = "ga"
  267. ROOT_TEMPLATE = \
  268. "<xliff xmlns=\"urn:oasis:names:tc:xliff:document:2.0\" version=\"2.0\" srcLang=\"{0}\" trgLang=\"{1}\">\
  269. <file>\
  270. {2}\
  271. </file>\
  272. </xliff>"
  273. def setUp(self):
  274. self.output_src = io.StringIO()
  275. self.output_tgt = io.StringIO()
  276. self.parser = Xliff20Parser(self.LANGUAGE_CODE_SRC, self.LANGUAGE_CODE_TGT)
  277. def tearDown(self):
  278. self.output_src.close()
  279. self.output_tgt.close()
  280. def make_document(self, source_language_code, target_language_code, content):
  281. return ET.fromstring(TestXliff20Parser.ROOT_TEMPLATE.format(source_language_code, target_language_code, content))
  282. def test_empty_body(self):
  283. document = self.make_document("en", "ga", "")
  284. self.parser.parse_content(document, self.output_src, self.output_tgt)
  285. self.assertEqual(self.output_src.getvalue(), "")
  286. self.assertEqual(self.output_tgt.getvalue(), "")
  287. def test_absent_src(self):
  288. document = self.make_document("en", "ga", "\
  289. <unit id=\"7\">\
  290. <segment>\
  291. <target>capall</target>\
  292. </segment>\
  293. </unit>\
  294. ")
  295. self.parser.parse_content(document, self.output_src, self.output_tgt)
  296. self.assertEqual(self.output_src.getvalue(), "")
  297. self.assertEqual(self.output_tgt.getvalue(), "")
  298. def test_absent_tgt(self):
  299. document = self.make_document("en", "ga", "\
  300. <unit id=\"7\">\
  301. <segment>\
  302. <source>horse</source>\
  303. </segment>\
  304. </unit>\
  305. ")
  306. self.parser.parse_content(document, self.output_src, self.output_tgt)
  307. self.assertEqual(self.output_src.getvalue(), "")
  308. self.assertEqual(self.output_tgt.getvalue(), "")
  309. def test_empty_src(self):
  310. document = self.make_document("en", "ga", "\
  311. <unit id=\"7\">\
  312. <segment>\
  313. <source/>\
  314. <target>capall</target>\
  315. </segment>\
  316. </unit>\
  317. ")
  318. self.parser.parse_content(document, self.output_src, self.output_tgt)
  319. self.assertEqual(self.output_src.getvalue(), "")
  320. self.assertEqual(self.output_tgt.getvalue(), "")
  321. def test_empty_tgt(self):
  322. document = self.make_document("en", "ga", "\
  323. <unit id=\"7\">\
  324. <segment>\
  325. <source>horse</source>\
  326. <target/>\
  327. </segment>\
  328. </unit>\
  329. ")
  330. self.parser.parse_content(document, self.output_src, self.output_tgt)
  331. self.assertEqual(self.output_src.getvalue(), "")
  332. self.assertEqual(self.output_tgt.getvalue(), "")
  333. def test_single_valid(self):
  334. document = self.make_document("en", "ga", "\
  335. <unit id=\"7\">\
  336. <segment>\
  337. <source>horse</source>\
  338. <target>capall</target>\
  339. </segment>\
  340. </unit>\
  341. ")
  342. self.parser.parse_content(document, self.output_src, self.output_tgt)
  343. self.assertEqual(self.output_src.getvalue(), "horse\n")
  344. self.assertEqual(self.output_tgt.getvalue(), "capall\n")
  345. def test_single_valid_language_variants(self):
  346. document = self.make_document("en-GB", "ga-IE", "\
  347. <unit id=\"7\">\
  348. <segment>\
  349. <source>horse</source>\
  350. <target>capall</target>\
  351. </segment>\
  352. </unit>\
  353. ")
  354. self.parser.parse_content(document, self.output_src, self.output_tgt)
  355. self.assertEqual(self.output_src.getvalue(), "horse\n")
  356. self.assertEqual(self.output_tgt.getvalue(), "capall\n")
  357. def test_single_valid_languages_reversed(self):
  358. document = self.make_document("ga", "en", "\
  359. <unit id=\"7\">\
  360. <segment>\
  361. <source>capall</source>\
  362. <target>horse</target>\
  363. </segment>\
  364. </unit>\
  365. ")
  366. self.parser.parse_content(document, self.output_src, self.output_tgt)
  367. self.assertEqual(self.output_src.getvalue(), "horse\n")
  368. self.assertEqual(self.output_tgt.getvalue(), "capall\n")
  369. def test_single_valid_inner_tags_all(self):
  370. document = self.make_document("en", "ga", "\
  371. <unit id=\"7\">\
  372. <segment>\
  373. <source><inner>yellow</inner></source>\
  374. <target><g>buí</g></target>\
  375. </segment>\
  376. </unit>\
  377. ")
  378. self.parser.parse_content(document, self.output_src, self.output_tgt)
  379. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  380. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  381. def test_single_valid_inner_tags_start(self):
  382. document = self.make_document("en", "ga", "\
  383. <unit id=\"7\">\
  384. <segment>\
  385. <source><inner>ye</inner>llow</source>\
  386. <target><g>buí</g></target>\
  387. </segment>\
  388. </unit>\
  389. ")
  390. self.parser.parse_content(document, self.output_src, self.output_tgt)
  391. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  392. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  393. def test_single_valid_inner_tags_all(self):
  394. document = self.make_document("en", "ga", "\
  395. <unit id=\"7\">\
  396. <segment>\
  397. <source>yell<inner>ow</inner></source>\
  398. <target><g>buí</g></target>\
  399. </segment>\
  400. </unit>\
  401. ")
  402. self.parser.parse_content(document, self.output_src, self.output_tgt)
  403. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  404. self.assertEqual(self.output_tgt.getvalue(), "buí\n")
  405. def test_single_valid_with_group(self):
  406. document = self.make_document("en", "ga", "\
  407. <group>\
  408. <unit id=\"7\">\
  409. <segment>\
  410. <source>horse</source>\
  411. <target>capall</target>\
  412. </segment>\
  413. </unit>\
  414. </group>\
  415. ")
  416. self.parser.parse_content(document, self.output_src, self.output_tgt)
  417. self.assertEqual(self.output_src.getvalue(), "horse\n")
  418. self.assertEqual(self.output_tgt.getvalue(), "capall\n")
  419. def test_single_valid_multiple_group(self):
  420. document = self.make_document("en", "ga", "\
  421. <group>\
  422. <group>\
  423. <group>\
  424. <unit id=\"7\">\
  425. <segment>\
  426. <source>horse</source>\
  427. <target>capall</target>\
  428. </segment>\
  429. </unit>\
  430. </group>\
  431. </group>\
  432. </group>\
  433. ")
  434. self.parser.parse_content(document, self.output_src, self.output_tgt)
  435. self.assertEqual(self.output_src.getvalue(), "horse\n")
  436. self.assertEqual(self.output_tgt.getvalue(), "capall\n")
  437. def test_extra_whitespace_leading_trailing(self):
  438. document = self.make_document("en", "ga", "\
  439. <unit id=\"7\">\
  440. <segment>\
  441. <source>yellow </source>\
  442. <target> \tbuí</target>\
  443. </segment>\
  444. </unit>\
  445. ")
  446. self.parser.parse_content(document, self.output_src, self.output_tgt)
  447. self.assertEqual(self.output_src.getvalue(), "yellow \n")
  448. self.assertEqual(self.output_tgt.getvalue(), " \tbuí\n")
  449. def test_extra_whitespace_contained(self):
  450. document = self.make_document("en", "ga", "\
  451. <unit id=\"7\">\
  452. <segment>\
  453. <source>cake</source>\
  454. <target>cáca \tmilis</target>\
  455. </segment>\
  456. </unit>\
  457. ")
  458. self.parser.parse_content(document, self.output_src, self.output_tgt)
  459. self.assertEqual(self.output_src.getvalue(), "cake\n")
  460. self.assertEqual(self.output_tgt.getvalue(), "cáca \tmilis\n")
  461. def test_newline_contained(self):
  462. document = self.make_document("en", "ga", "\
  463. <unit id=\"7\">\
  464. <segment>\
  465. <source>cake</source>\
  466. <target>cáca\nmilis</target>\
  467. </segment>\
  468. </unit>\
  469. ")
  470. self.parser.parse_content(document, self.output_src, self.output_tgt)
  471. self.assertEqual(self.output_src.getvalue(), "cake\n")
  472. self.assertEqual(self.output_tgt.getvalue(), "cácamilis\n")
  473. def test_only_whitespace(self):
  474. document = self.make_document("en", "ga", "\
  475. <unit id=\"7\">\
  476. <segment>\
  477. <source>yellow</source>\
  478. <target> </target>\
  479. </segment>\
  480. </unit>\
  481. ")
  482. self.parser.parse_content(document, self.output_src, self.output_tgt)
  483. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  484. self.assertEqual(self.output_tgt.getvalue(), " \n")
  485. def test_only_newline(self):
  486. document = self.make_document("en", "ga", "\
  487. <unit id=\"7\">\
  488. <segment>\
  489. <source>yellow</source>\
  490. <target>\n</target>\
  491. </segment>\
  492. </unit>\
  493. ")
  494. self.parser.parse_content(document, self.output_src, self.output_tgt)
  495. self.assertEqual(self.output_src.getvalue(), "yellow\n")
  496. self.assertEqual(self.output_tgt.getvalue(), "\n")
  497. def test_multiple_valid(self):
  498. document = self.make_document("en", "ga", "\
  499. <unit id=\"7\">\
  500. <segment>\
  501. <source>horse</source>\
  502. <target>capall</target>\
  503. </segment>\
  504. </unit>\
  505. <group>\
  506. <unit id=\"13\">\
  507. <segment>\
  508. <source>eat</source>\
  509. </segment>\
  510. </unit>\
  511. </group>\
  512. <group>\
  513. <group>\
  514. <group>\
  515. <unit id=\"49\">\
  516. <segment>\
  517. <source>yell<inner>ow</inner></source>\
  518. <target><g>buí</g></target>\
  519. </segment>\
  520. </unit>\
  521. </group>\
  522. </group>\
  523. </group>\
  524. ")
  525. self.parser.parse_content(document, self.output_src, self.output_tgt)
  526. self.assertEqual(self.output_src.getvalue(), "horse\nyellow\n")
  527. self.assertEqual(self.output_tgt.getvalue(), "capall\nbuí\n")
  528. def test_no_target_language(self):
  529. document = ET.fromstring("\
  530. <xliff xmlns=\"urn:oasis:names:tc:xliff:document:2.0\" version=\"2.0\" srcLang=\"en\">\
  531. <file>\
  532. <unit id=\"7\">\
  533. <segment>\
  534. <source>horse</source>\
  535. <target>capall</target>\
  536. </segment>\
  537. </unit>\
  538. </file>\
  539. </xliff>\
  540. ")
  541. self.parser.parse_content(document, self.output_src, self.output_tgt)
  542. self.assertEqual(self.output_src.getvalue(), "")
  543. self.assertEqual(self.output_tgt.getvalue(), "")
  544. def test_no_source_language(self):
  545. document = ET.fromstring("\
  546. <xliff xmlns=\"urn:oasis:names:tc:xliff:document:2.0\" version=\"2.0\" trgLang=\"ga\">\
  547. <file>\
  548. <unit id=\"7\">\
  549. <segment>\
  550. <source>horse</source>\
  551. <target>capall</target>\
  552. </segment>\
  553. </unit>\
  554. </file>\
  555. </xliff>\
  556. ")
  557. with self.assertRaises(ParsingError):
  558. self.parser.parse_content(document, self.output_src, self.output_tgt)
  559. self.assertEqual(self.output_src.getvalue(), "")
  560. self.assertEqual(self.output_tgt.getvalue(), "")
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()