
Initial commit

Órla Ní Loinsigh 2 years ago
commit
be6ebb2ec8
100 changed files with 4371 additions and 0 deletions
  1. 11 0
      .gitignore
  2. 26 0
      Dockerfile
  3. 503 0
      README.md
  4. 10 0
      entrypoint.sh
  5. 2 0
      gunicorn.conf.py
  6. 16 0
      requirements.txt
  7. 0 0
      res/abbreviations/abbreviations_en.txt
  8. 0 0
      res/abbreviations/abbreviations_ga.txt
  9. 0 0
      res/dictionaries/docalign_en-ga.txt
  10. 0 0
      res/dictionaries/sentalign_en-ga.txt
  11. 0 0
      res/langdetect_models/ga
  12. 20 0
      res/tmx_templates/generic_tmx_template.xml
  13. 5 0
      test-res/docalign/closest_linecount/1.src.txt
  14. 4 0
      test-res/docalign/closest_linecount/1.tgt.txt
  15. 3 0
      test-res/docalign/closest_linecount/2.tgt.txt
  16. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/1.tgt.txt
  17. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/2.src.txt
  18. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/2.tgt.txt
  19. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/3.src.txt
  20. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/3.tgt.txt
  21. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/4.src.txt
  22. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/4.tgt.txt
  23. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/5.src.txt
  24. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/5.tgt.txt
  25. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/6.src.txt
  26. 1 0
      test-res/docalign/five_pairs_one_tgt_unmatched/6.tgt.txt
  27. 2 0
      test-res/docalign/mismatched_linecounts/1.src.txt
  28. 2 0
      test-res/docalign/mismatched_linecounts/1.tgt.txt
  29. 5 0
      test-res/docalign/mismatched_linecounts/2.tgt.txt
  30. 1 0
      test-res/docalign/one_pair_one_src_tgt_unmatched/1.src.txt
  31. 1 0
      test-res/docalign/one_pair_one_src_tgt_unmatched/2.src.txt
  32. 1 0
      test-res/docalign/one_pair_one_src_tgt_unmatched/2.tgt.txt
  33. 1 0
      test-res/docalign/one_pair_one_src_tgt_unmatched/3.tgt.txt
  34. 1 0
      test-res/docalign/three_pairs_one_src_unmatched/1.src.txt
  35. 1 0
      test-res/docalign/three_pairs_one_src_unmatched/2.src.txt
  36. 1 0
      test-res/docalign/three_pairs_one_src_unmatched/2.tgt.txt
  37. 1 0
      test-res/docalign/three_pairs_one_src_unmatched/3.src.txt
  38. 1 0
      test-res/docalign/three_pairs_one_src_unmatched/3.tgt.txt
  39. 1 0
      test-res/docalign/three_pairs_one_src_unmatched/4.src.txt
  40. 1 0
      test-res/docalign/three_pairs_one_src_unmatched/4.tgt.txt
  41. BIN
      test-res/extractors/doc.doc
  42. 1 0
      test-res/extractors/doc_expected.txt
  43. BIN
      test-res/extractors/docx.docx
  44. 1 0
      test-res/extractors/docx_expected.txt
  45. BIN
      test-res/extractors/empty.odt
  46. BIN
      test-res/extractors/empty.pdf
  47. 1 0
      test-res/extractors/empty_editable_expected.txt
  48. 1 0
      test-res/extractors/empty_pdf_expected.txt
  49. BIN
      test-res/extractors/odt.odt
  50. 1 0
      test-res/extractors/odt_expected.txt
  51. BIN
      test-res/extractors/pdf.pdf
  52. 3 0
      test-res/extractors/pdf_expected.txt
  53. 18 0
      test-res/extractors/rtf.rtf
  54. 1 0
      test-res/extractors/rtf_expected.txt
  55. BIN
      test-res/extractors/upper.ODT
  56. 1 0
      test-res/extractors/upper_expected.txt
  57. 5 0
      test-res/sentalign/aligned.src.txt
  58. 5 0
      test-res/sentalign/aligned.tgt.txt
  59. 0 0
      test-res/sentalign/dictionary.txt
  60. 0 0
      test-res/sentalign/empty.src.txt
  61. 0 0
      test-res/sentalign/empty.tgt.txt
  62. 5 0
      test-res/sentalign/unaligned.src.txt
  63. 5 0
      test-res/sentalign/unaligned.tgt.txt
  64. 0 0
      test/cleaners/__init__.py
  65. 102 0
      test/cleaners/test_monolingual_cleaner.py
  66. 165 0
      test/cleaners/test_post_alignment_cleaner.py
  67. 0 0
      test/common/__init__.py
  68. 47 0
      test/common/test_file_size_counter.py
  69. 192 0
      test/common/test_language_detector.py
  70. 30 0
      test/common/test_raw_file_indexer.py
  71. 0 0
      test/docalign/__init__.py
  72. 71 0
      test/docalign/test_document_aligner.py
  73. 0 0
      test/extractors/__init__.py
  74. 84 0
      test/extractors/test_editable_text_extractor.py
  75. 56 0
      test/extractors/test_pdf_text_extractor.py
  76. 0 0
      test/integration/__init__.py
  77. 48 0
      test/integration/test_processors.py
  78. 0 0
      test/normalizer/__init__.py
  79. 98 0
      test/normalizer/test_unicode_normalizer.py
  80. 0 0
      test/parsers/__init__.py
  81. 218 0
      test/parsers/test_sdltm_parser.py
  82. 327 0
      test/parsers/test_tmx_parser.py
  83. 750 0
      test/parsers/test_xliff_parser.py
  84. 0 0
      test/sentalign/__init__.py
  85. 83 0
      test/sentalign/test_sentence_aligner.py
  86. 0 0
      test/splitters/__init__.py
  87. 207 0
      test/splitters/test_editable_sentence_splitter.py
  88. 452 0
      test/splitters/test_pdf_sentence_splitter.py
  89. 0 0
      test/writers/__init__.py
  90. 50 0
      test/writers/test_file_concatenator.py
  91. 124 0
      test/writers/test_tmx_creator.py
  92. 73 0
      toolchain/cleaners/monolingual_cleaner.py
  93. 90 0
      toolchain/cleaners/post_alignment_cleaner.py
  94. 31 0
      toolchain/common/file_size_counter.py
  95. 113 0
      toolchain/common/language_detector.py
  96. 21 0
      toolchain/common/raw_file_indexer.py
  97. 9 0
      toolchain/common/templates.py
  98. 2 0
      toolchain/common/toolchain_error.py
  99. 245 0
      toolchain/doc_to_tmx_processor.py
  100. 4 0
      toolchain/docalign/docalign_error.py

+ 11 - 0
.gitignore

@@ -0,0 +1,11 @@
+__pycache__/
+
+*.pyc
+
+venv/
+
+.coverage
+.coveragerc
+htmlcov/
+
+test-res/integration

+ 26 - 0
Dockerfile

@@ -0,0 +1,26 @@
+# syntax=docker/dockerfile:1
+FROM ubuntu:22.04
+
+RUN useradd -u 4991 toolchain
+
+WORKDIR /home/toolchain
+SHELL ["/bin/bash", "-c"]
+
+RUN apt update && apt install -y python3-pip libreoffice poppler-utils
+COPY requirements.txt requirements.txt
+RUN pip3 install -r requirements.txt
+
+COPY toolchain toolchain
+COPY lib lib
+COPY res res
+COPY toolchains.cfg gunicorn.conf.py entrypoint.sh ./
+RUN chmod +x entrypoint.sh
+
+RUN cd lib/hunalign/src/hunalign && make && cd -
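+# Locate the installed langdetect package and copy the custom Irish (ga) profile into its profiles directory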
+RUN IFS=' ' read -r -a location_line <<< $(pip3 show langdetect | grep Location) && langdetect_profile_dir="${location_line[1]}/langdetect/profiles" && cp res/langdetect_models/ga "$langdetect_profile_dir"
+
+RUN chown -R toolchain:toolchain ./
+USER toolchain
+
+EXPOSE 8001
+ENTRYPOINT ["./entrypoint.sh"]

+ 503 - 0
README.md

@@ -0,0 +1,503 @@
+# STÓR Toolchain
+
+A rewrite of the TM-to-TMX and Doc-to-TMX STÓR (formerly NRS) toolchains.
+
+## Dependencies
+
+* Python 3.8+ (developed/tested with Python 3.8.10)
+* python3-venv (for other Python dependencies)
+* libreoffice (text extraction)
+* pdftotext (text extraction)
+* A C++ compiler (to build hunalign)
+* hunalign (sentence alignment; see below)
+* Access to certain input resources as required by specific components (see below)
+
+If not already present, `libreoffice` is available in the Debian/Ubuntu repositories through apt, e.g.:
+```
+apt install libreoffice
+```
+
+## Setup
+
+All of the following instruction blocks assume you are starting from a specific base directory, referred to as `$BASE`. Changes to the working directory are noted as needed.
+
+While the dependencies above may have needed root/sudo to install, the remaining setup should be done as a regular user.
+
+```
+cd <some directory>
+export BASE=`pwd`
+```
+
+### Check out repositories
+
+Firstly, check out all repositories; from the base directory run:
+```
+cd $BASE
+git clone https://github.com/danielvarga/hunalign
+git clone https://opengogs.adaptcentre.ie/Oniloinsigh/stor-toolchain
+git clone <location of toolchain-internal>
+```
+
+### Build hunalign
+```
+cd $BASE/hunalign/src/hunalign
+make
+```
+
+### Install python dependencies
+```
+cd $BASE/toolchain
+python3 -m venv venv
+. venv/bin/activate
+pip install --upgrade pip
+pip install -r requirements.txt
+cp res/langdetect_models/ga venv/lib/python3.8/site-packages/langdetect/profiles/
+```
+
+### Copy internal resources
+```
+cd $BASE/toolchain
+cp -r $BASE/toolchain-internal/res/abbreviations/ ./res/
+cp -r $BASE/toolchain-internal/res/dictionaries/ ./res/
+cp -r $BASE/toolchain-internal/test-res/integration/ ./test-res/
+```
+
+### Export environment variables
+```
+export PYTHONPATH=$BASE/toolchain:$PYTHONPATH
+export HUNALIGNPATH=$BASE/hunalign/src/hunalign/hunalign
+export PDFTOTEXTPATH=$(which pdftotext)
+export LIBREOFFICEPATH=$(which libreoffice)
+```
+
+## Testing
+
+Note that a few of the text extraction tests are somewhat slow due to a forced sleep (in turn, this is due to a limitation of LibreOffice that apparently cannot be worked around). However, the rest should run relatively quickly.
+
+To run the tests:
+```
+cd $BASE/toolchain
+coverage run -m unittest discover -s test
+```
+
+## Components
+
+Each toolchain consists of a set of components run in sequence. Each individual component may also be run stand-alone. Not every component is used in any given run of a toolchain; different components may be chosen depending on input file type, whether the data is parallel or monolingual, the condition of the data, etc.
+
+### Parsers
+
+Parsers are used to extract text from aligned input types (i.e. translation memory files).
+
+#### Parser Types
+
+Three aligned file types are accepted.
+
+* SDLTM
+* TMX (version 1.4)
+* XLIFF (versions 1.2 and 2.0)
+
+#### Usage
+
+All parsers have the same usage.
+
+```
+python toolchain/parsers/{sdltm_parser|tmx_parser|xliff_parser}.py lang_src lang_tgt input_path output_path_src output_path_tgt
+```
+
+Where:
+* `lang_src`: ISO 639-1 code of source language; variants are accepted but not mandatory
+* `lang_tgt`: ISO 639-1 code of target language; variants are accepted but not mandatory
+* `input_path`: path to input file
+* `output_path_src`: path to output file of source language
+* `output_path_tgt`: path to output file of target language
+
+#### Output
+
+All parsers output two plaintext files, one for the source language and one for the target. Language variant information, if present in the input, will not be preserved.
+
+### Extractors
+
+Extractors are used to extract text from unaligned input types (i.e. raw corpus documents). They each expect a single document at a time. The language of the text to be extracted need not be specified.
+
+#### Extractor Types
+
+There are three types of extractor.
+
+* `PlainTextExtractor` for plaintext files (in reality, this is just a file copy)
+* `EditableTextExtractor` for other editable files (.doc, .docx, .odt, .rtf)
+* `PdfTextExtractor` for PDFs (note that this currently only works with PDFs that contain a text layer)
+
+#### Usage
+
+Because the plain text extractor is a trivial file copy, it has no main entry point of its own. Each of the other extractors relies on a system dependency whose location must be passed to it.
+
+```
+python toolchain/extractors/editable_text_extractor.py libreoffice_path input_path output_path
+python toolchain/extractors/pdf_text_extractor.py pdftotext_path input_path output_path
+```
+
+Where:
+* `libreoffice_path`: path to libreoffice
+* `pdftotext_path`: path to pdftotext
+* `input_path`: path to input file
+* `output_path`: path to output file
+
+#### Output
+
+All extractors output a single plaintext file. Note that no normalization, segmentation, sentence splitting, etc. is performed in this step; it is text extraction alone.
+
+### Unicode Normalizer
+
+The Unicode normalizer may be used to perform Unicode normalization on plaintext files. It resolves both NFC- and NFD-encoded input to NFC. It also optionally supports configurable character substitution.
+
+#### Usage
+
+To perform basic normalization, the tool may be run from the command line.
+```
+python toolchain/normalizer/unicode_normalizer.py input_path output_path
+```
+
+Where:
+* `input_path`: input plaintext file
+* `output_path`: output plaintext file
+
+To configure character substitution, it is easiest to create a `UnicodeNormalizer` object directly.
+
+```
+UnicodeNormalizer().normalize(input_path, output_path, custom_substitutions)
+```
+
+Where:
+* `custom_substitutions`: a list of tuples; in each tuple, the first element is the character to be replaced and the second is its substitution
+
+Character substitution is a simple character/string replacement; regex replacement is not supported. Note that two character substitutions are configured by default and may not be overridden:
+
+| Original | Substitution | Notes |
+| -------- | -------- | ------ |
+| \ufeff   | [empty]  | Byte Order Mark (BOM) |
+| ı́ | í | NFD i with accent, uncaught by the `unicodedata` library as it combines the dotless i |
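+
+For example, typographic apostrophes and quotes could be mapped to their ASCII equivalents with a custom substitution list (an illustrative call; these substitutions are not defaults of the tool):
+
+```
+UnicodeNormalizer().normalize(input_path, output_path, [("\u2019", "'"), ("\u201c", "\""), ("\u201d", "\"")])
+```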
+
+#### Output
+
+The normalizer outputs a single plaintext file encoded as UTF-8 NFC.
+
+### Language Detector
+
+Language can be detected on a string or on an entire file. If detecting the language of an entire file, not every line is scanned, as the process is quite slow. Instead, a number of lines is read at the start of the file, after which lines are sampled at an interval. This can be configured using the command-line arguments detailed below.
+
+The language detector is a wrapper around [langdetect](https://pypi.org/project/langdetect/), which is itself a Python port of [Nakatani Shuyo's language-detection](https://github.com/shuyo/language-detection).
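+
+The line-selection logic is roughly as follows (a simplified sketch of the behaviour described above, not the actual implementation):
+
+```
+def lines_to_test(lines, min_file_line_length=0, min_initial_lines=50, sampling_interval=100):
+    for i, line in enumerate(lines):
+        # Blank or too-short lines count towards the total but are never tested.
+        if not line.strip() or len(line) < min_file_line_length:
+            continue
+        # Test every line in the initial block, then one line per sampling interval.
+        if i < min_initial_lines or (i - min_initial_lines) % sampling_interval == 0:
+            yield line
+```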
+
+#### Usage
+
+```
+python toolchain/common/language_detector.py [--file] [--min_file_line_length MIN_FILE_LINE_LENGTH] [--min_initial_lines MIN_INITIAL_LINES] [--sampling_interval SAMPLING_INTERVAL] input_line
+```
+
+Where:
+* `file`: the input is a filename, i.e. the language is to be detected on the whole file; default false
+* `min_file_line_length`: if detecting in a file, only check the language of lines with this many characters or more; default 0
+* `min_initial_lines`: if detecting in a file, test this many lines at the start of the file; default 50
+* `sampling_interval`: if detecting in a file, after the initial lines, test one line per this many lines; default 100
+* `input_line`: the input string or filename
+
+#### Output
+
+The detected language will be output in the form of an ISO 639-1 two-letter code, without variants, to stdout.
+
+### Sentence Splitters
+
+Sentence splitters are used to reconstruct sentence boundary information. They take in plain text files as input and create files with a single sentence on each line.
+
+#### Types
+
+There are two types of sentence splitter.
+
+* `EditableSentenceSplitter` for text that originated from editable files (.doc, .docx, .odt, .rtf, .txt)
+* `PdfSentenceSplitter` for text that originated from PDFs
+
+The main distinction between them is that for editable files, line endings are assumed to also be sentence endings, whereas for PDFs lines may end anywhere in a sentence.
+
+#### Abbreviation lists
+
+Both splitters require lists of abbreviations in the source and target languages. Abbreviation lists must come in the form of TSV files. Each line in an abbreviations file consists of two or three fields.
+
+The first field is the abbreviation itself. It is case-sensitive and should have no terminal full stop.
+
+The second field is the expansion of the abbreviation. It is present only as a convenience and is not actually used by the splitters.
+
+The third field is an optional boolean. A True value indicates that the abbreviation is expected to be followed by another word. If absent, it defaults to False.
+
+Example snippet from an English abbreviation list:
+
+```
+Aug\tAugust
+Co\tCounty\tTrue
+Dr\tDoctor\tTrue
+etc\tet cetera
+Ms\tMs\tTrue
+SI\tStatutory Instrument
+```
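+
+(The `\t` above stands for a literal tab character.) A minimal loader for this format might look like the following sketch (the function name and return shape are illustrative, not the toolchain's API):
+
+```
+def load_abbreviations(path):
+    abbreviations = {}
+    with open(path, encoding="utf-8") as f:
+        for line in f:
+            fields = line.rstrip("\n").split("\t")
+            if not fields[0]:
+                continue
+            # Map abbreviation -> whether another word is expected to follow (third field).
+            abbreviations[fields[0]] = len(fields) > 2 and fields[2] == "True"
+    return abbreviations
+```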
+
+#### Usage
+
+Because abbreviation lists are language-specific, a sentence splitter should only be run on files whose language is known.
+
+```
+python toolchain/splitters/{editable_sentence_splitter|pdf_sentence_splitter}.py abbreviations_path input_path output_path
+```
+
+Where:
+* `abbreviations_path`: path to abbreviations file appropriate to language of file
+* `input_path`: path to input file
+* `output_path`: path to output file
+
+#### Output
+
+Both splitters output a single plaintext file with one sentence per line. Fragments that do not obviously belong to any sentence (e.g. section headings) should ideally also be output on their own lines.
+
+Note that there is room for error here, as all determinations are ultimately heuristic. This is particularly the case for PDFs.
+
+### Document Aligner
+
+The document aligner examines lists of files whose languages have been identified and attempts to determine which files correspond to one another.
+
+#### Usage
+
+Document alignment should be performed on files whose language is known and whose lines have been sentence-split.
+
+```
+python toolchain/docalign/document_aligner.py [--refalignments REFALIGNMENTS] file_list_path_src file_list_path_tgt output_dir
+```
+
+Where:
+* `file_list_path_src`: path to file listing source documents
+* `file_list_path_tgt`: path to file listing target documents
+* `output_dir`: path to directory to write results and artefacts to
+* `--refalignments`: path to reference alignments file for evaluation
+
+#### Output
+
+Not all documents are guaranteed to be aligned. The aligner will return three lists:
+* `alignments`: a list of tuples/pairs of aligned documents
+* `unmatched_src`: a list of documents in the source language for which no match could be found
+* `unmatched_tgt`: a list of documents in the target language for which no match could be found
+
+These three lists will also be written to file.
+
+### Sentence Aligner
+
+The sentence aligner aligns a single pair of files at a sentence level. The aligner is a wrapper around [hunalign](https://github.com/danielvarga/hunalign).
+
+The wrapper adds a timeout to the external hunalign call, and splits the output into separate source and target files.
+
+The hunalign project must be built and available as a binary in order to run this component.
+
+#### Usage
+
+The sentence aligner should be run only on a single pair of files that are known to correspond to one another, i.e. after document alignment.
+
+```
+python toolchain/sentalign/sentence_aligner.py [--subprocess_timeout SUBPROCESS_TIMEOUT] hunalign dictionary input_path_src input_path_tgt output_path_src output_path_tgt output_artefact_dirname
+```
+
+Where:
+* `hunalign`: path to hunalign binary
+* `dictionary`: path to dictionary file
+* `input_path_src`: path to source input text file
+* `input_path_tgt`: path to target input text file
+* `output_path_src`: path to source output text file
+* `output_path_tgt`: path to target output text file
+* `output_artefact_dirname`: path to output artefact dir
+* `--subprocess_timeout`: timeout limit in seconds for running hunalign subprocess
+
+#### Output
+
+Output will be in the form of two plaintext files, one each for source and target, where the lines correspond to one another. Note that lines are not guaranteed to be non-empty on either side.
+
+### Monolingual Text Cleaner
+
+The monolingual text cleaner may be used to remove unwanted lines from a plaintext file. The file should be in a single language throughout. This cleaner removes empty lines and lines that are not of the expected language.
+
+#### Usage
+
+```
+python toolchain/cleaners/monolingual_cleaner.py [--langdetect_threshold LANGDETECT_THRESHOLD] [--rejected_line_delimiter REJECTED_LINE_DELIMITER] lang input_path output_path_retained output_path_rejected
+```
+
+Where:
+* `langdetect_threshold`: the minimum length in characters that a line must be in order for language detection to be performed; default 40
+* `rejected_line_delimiter`: a string used to delimit fields in the output report; default "@@@"
+* `lang`: ISO 639-1 code of expected language
+* `input_path`: input plaintext file
+* `output_path_retained`: output file of accepted lines
+* `output_path_rejected`: output file of rejected lines
+
+#### Output
+
+This cleaner outputs two files. One consists of the lines found acceptable; the other is a structured report detailing the lines that were rejected and why.
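+
+With the default `@@@` delimiter, entries in the report take the form `reason@@@line`, e.g.:
+
+```
+unexpected_language_[de]@@@Pferd
+empty_segment@@@
+```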
+
+### Post-Alignment Text Cleaner
+
+The post-alignment text cleaner may be used to remove unwanted pairs of lines from parallel files whose languages are known. The files should be segmented and aligned. This cleaner removes line pairs where either the source or the target line is empty or not of the expected language, as well as pairs that contain no alphabetic characters.
+
+#### Usage
+
+```
+python toolchain/cleaners/post_alignment_cleaner.py [--langdetect_threshold LANGDETECT_THRESHOLD] [--rejected_line_delimiter REJECTED_LINE_DELIMITER] lang_src lang_tgt input_path_src input_path_tgt output_path_src output_path_tgt output_path_rejected
+```
+
+Where:
+* `langdetect_threshold`: the minimum length in characters that a line must be in order for language detection to be performed; default 40
+* `rejected_line_delimiter`: a string used to delimit fields in the output report; default "@@@"
+* `lang_src`: ISO 639-1 code of expected language of source file
+* `lang_tgt`: ISO 639-1 code of expected language of target file
+* `input_path_src`: input plaintext file of source language
+* `input_path_tgt`: input plaintext file of target language
+* `output_path_src`: output file of accepted source lines
+* `output_path_tgt`: output file of accepted target lines
+* `output_path_rejected`: output file of rejected lines
+
+#### Output
+
+This cleaner outputs three files. Two consist of the lines found acceptable for each language, and remain aligned. The third is a structured report detailing the line pairs that were rejected and why.
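+
+Here the report entries take the form `reason@@@source_line@@@target_line`, e.g.:
+
+```
+unexpected_language_[en:de]@@@horse@@@Pferd
+empty_segment@@@@@@capall
+```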
+
+### TMX Creator
+
+The TMX creator creates a single TMX file from a pair of plaintext files. The input files are expected to have the same number of lines, and to be aligned. The language of each input file must be known.
+
+The TMX creator requires a Jinja template. A basic one is provided as a project resource under `res/tmx_templates/generic_tmx_template.xml`.
+
+#### Usage
+
+Command-line usage of this tool requires a large number of mandatory arguments in order to comply with the TMX specification.
+
+```
+python toolchain/writers/tmx_creator.py template_path input_path_src input_path_tgt output_path adminlang datatype o_tmf segtype srclang tgtlang
+```
+
+Where:
+* `template_path`: path to Jinja template file
+* `input_path_src`: path to input file for source language
+* `input_path_tgt`: path to input file for target language
+* `output_path`: path to write TMX file to
+* `adminlang`: ISO 639-1 code of administrative language
+* `datatype`: type of data contained
+* `o_tmf`: original translation memory format
+* `segtype`: segmentation type
+* `srclang`: ISO 639-1 code of source language
+* `tgtlang`: ISO 639-1 code of target language
+
+Alternatively, it is possible to create a `TmxCreator` object and pass the mandatory arguments in as a dictionary. The `additional_args` dictionary may also be used to pass any other non-mandatory arguments that might be required by a custom template.
+
+```
+TmxCreator().create(template_path, input_path_src, input_path_tgt, output_path, additional_args)
+```
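+
+For instance, with the generic template provided, the call might look as follows (values are illustrative, and this assumes the mandatory attributes and any extras share the same `additional_args` dictionary, as the snippet above suggests; `distributor`, `disclaimer` and `licence` are non-mandatory arguments consumed by that template):
+
+```
+args = {
+    "adminlang": "en",
+    "datatype": "plaintext",
+    "o_tmf": "sdltm",
+    "segtype": "sentence",
+    "srclang": "en",
+    "tgtlang": "ga",
+    "distributor": "STÓR",  # non-mandatory; used by the generic template
+}
+TmxCreator().create(template_path, input_path_src, input_path_tgt, output_path, args)
+```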
+
+#### Output
+
+A single TMX file is generated. It should be compliant with TMX schema 1.4b. In addition to the mandatory attributes listed above, a creation date attribute is also generated in the file header.
+
+## Running end-to-end toolchains
+
+The alternative to running components individually is to run them in a toolchain. There are two toolchains:
+* `TM-to-TMX` for processing file types that are already aligned and creating cleaned TMX files from them
+* `Doc-to-TMX` for creating TMX files from raw corpus documents
+
+### Usage
+
+Although they work differently, both toolchains are called the same way.
+
+```
+cd $BASE/toolchain
+python toolchain/{tm_to_tmx_processor|doc_to_tmx_processor}.py id input_dir artefact_dir output_dir config_path
+```
+
+Where:
+* `id`: LR identifier; this will be used to generate filenames
+* `input_dir`: path to input directory
+* `artefact_dir`: path to artefact directory
+* `output_dir`: path to output directory
+* `config_path`: path to config; a sample has been included as `toolchains.cfg`
+
+The `artefact_dir` and `output_dir` need not already exist, but they must be in a location where there is permission to create them.
+
+### Output
+
+TMX file(s) will be created in the output directory specified.
+
+The TM-to-TMX toolchain creates a single file for each TM file found.
+
+The Doc-to-TMX toolchain produces a single combined file for all the input it processed. In addition, it may produce monolingual text files for any documents that were unmatched at the end of document alignment. Whether these are retained for the source language, the target language, or both is configurable.
+
+## Running the Flask app
+
+A simple Flask app is available to handle toolchain requests. To run it for development, in addition to the above instructions, run:
+
+```
+cd $BASE/toolchain
+export FLASK_APP=$BASE/toolchain/toolchain/toolchain_app
+export FLASK_ENV=development
+export FLASK_RUN_PORT=5001
+python -m flask run
+```
+
+There are two endpoints, one for each toolchain. In a local setup, they will look something like this:
+
+```
+http://127.0.0.1:5001/tm
+http://127.0.0.1:5001/doc
+```
+
+Again, the interface for both is the same. Request JSON:
+
+```
+{
+  "id": <id>,
+  "input": <input_dir>,
+  "artefact": <artefact_dir>,
+  "output": <output_dir>
+}
+```
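+
+For example, using the `requests` library (already in `requirements.txt`), a request might be issued as follows (assuming the endpoints accept POST with a JSON body; paths are illustrative):
+
+```
+import requests
+
+response = requests.post("http://127.0.0.1:5001/tm", json={
+    "id": "lr-0001",
+    "input": "/data/input",
+    "artefact": "/data/artefacts",
+    "output": "/data/output",
+})
+print(response.json())
+```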
+
+Response JSON:
+
+```
+{
+  "file_infos": [<information about each file produced>],
+  "rejected": <no. of files rejected as unprocessable>,
+  "success": <ran without errors>
+}
+```
+
+A toolchain will return a small amount of information about each file it produced. As indicated above, this is returned as a list of records, each of the following form:
+
+```
+{
+  "encoding": <character encoding>,
+  "format": <file format>,
+  "languages": [<two-letter ISO 639-1 language code>],
+  "linguality_type": <bilingual|monolingual>,
+  "multilinguality_type": <parallel|comparable|...>,
+  "size": <size>,
+  "size_unit": <size unit>
+}
+```
+
+Sample response:
+
+```
+{
+  "file_infos": [{
+    "encoding": "utf8",
+    "format": "tmx",
+    "languages": ["en", "ga"],
+    "linguality_type": "bilingual",
+    "multilinguality_type": "parallel",
+    "size": 35,
+    "size_unit": "translation_units"
+  }],
+  "rejected": 0,
+  "success": true
+}
+```

+ 10 - 0
entrypoint.sh

@@ -0,0 +1,10 @@
+#!/bin/bash
+
+export BASE=/home/toolchain
+
+export PYTHONPATH=$BASE/toolchain:$PYTHONPATH
+export HUNALIGNPATH=$BASE/lib/hunalign/src/hunalign/hunalign
+export PDFTOTEXTPATH=$(which pdftotext)
+export LIBREOFFICEPATH=$(which libreoffice)
+
+exec gunicorn -c gunicorn.conf.py toolchain.toolchain_app:app

+ 2 - 0
gunicorn.conf.py

@@ -0,0 +1,2 @@
+bind = "0.0.0.0:8001"
+workers = 4

+ 16 - 0
requirements.txt

@@ -0,0 +1,16 @@
+certifi==2022.6.15
+charset-normalizer==2.1.1
+click==8.1.3
+coverage==6.4.4
+defusedxml==0.7.1
+Flask==2.2.2
+gunicorn==20.1.0
+idna==3.3
+itsdangerous==2.1.2
+Jinja2==3.1.2
+langdetect==1.0.9
+MarkupSafe==2.1.1
+requests==2.28.1
+six==1.16.0
+urllib3==1.26.12
+Werkzeug==2.2.2

+ 0 - 0
res/abbreviations/abbreviations_en.txt


+ 0 - 0
res/abbreviations/abbreviations_ga.txt


+ 0 - 0
res/dictionaries/docalign_en-ga.txt


+ 0 - 0
res/dictionaries/sentalign_en-ga.txt


File diff suppressed because it is too large
+ 0 - 0
res/langdetect_models/ga


+ 20 - 0
res/tmx_templates/generic_tmx_template.xml

@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?>
+<tmx version="1.4">
+  <header creationtool="tmx_creator" creationtoolversion="1.0" segtype="{{ segtype }}" o-tmf="{{ o_tmf }}" adminlang="{{ adminlang }}" srclang="{{ srclang }}" datatype="{{ datatype }}" creationdate="{{ creation_date }}">
+    <prop type="distributor">{{ distributor }}</prop>
+    <prop type="disclaimer">{{ disclaimer }}</prop>
+    <prop type="licence">{{ licence }}</prop>
+  </header>
+  <body>
+    {% for (src, tgt) in tus -%}
+    <tu tuid="{{ loop.index }}">
+      <tuv xml:lang="{{ srclang }}">
+        <seg>{{ src }}</seg>
+      </tuv>
+      <tuv xml:lang="{{ tgtlang }}">
+        <seg>{{ tgt }}</seg>
+      </tuv>
+    </tu>
+    {% endfor -%}
+  </body>
+</tmx>

+ 5 - 0
test-res/docalign/closest_linecount/1.src.txt

@@ -0,0 +1,5 @@
+a
+a
+a
+a
+a

+ 4 - 0
test-res/docalign/closest_linecount/1.tgt.txt

@@ -0,0 +1,4 @@
+bb
+bb
+bb
+b

+ 3 - 0
test-res/docalign/closest_linecount/2.tgt.txt

@@ -0,0 +1,3 @@
+bbb
+bbb
+bbb

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/1.tgt.txt

@@ -0,0 +1 @@
+b

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/2.src.txt

@@ -0,0 +1 @@
+aa

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/2.tgt.txt

@@ -0,0 +1 @@
+bb

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/3.src.txt

@@ -0,0 +1 @@
+aaa

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/3.tgt.txt

@@ -0,0 +1 @@
+bbb

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/4.src.txt

@@ -0,0 +1 @@
+aaaa

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/4.tgt.txt

@@ -0,0 +1 @@
+bbbb

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/5.src.txt

@@ -0,0 +1 @@
+aaaaa

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/5.tgt.txt

@@ -0,0 +1 @@
+bbbbb

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/6.src.txt

@@ -0,0 +1 @@
+aaaaaa

+ 1 - 0
test-res/docalign/five_pairs_one_tgt_unmatched/6.tgt.txt

@@ -0,0 +1 @@
+bbbbbb

+ 2 - 0
test-res/docalign/mismatched_linecounts/1.src.txt

@@ -0,0 +1,2 @@
+aaa
+aaaaa

+ 2 - 0
test-res/docalign/mismatched_linecounts/1.tgt.txt

@@ -0,0 +1,2 @@
+bbbb
+bb

+ 5 - 0
test-res/docalign/mismatched_linecounts/2.tgt.txt

@@ -0,0 +1,5 @@
+b
+b
+b
+b
+b

+ 1 - 0
test-res/docalign/one_pair_one_src_tgt_unmatched/1.src.txt

@@ -0,0 +1 @@
+a

+ 1 - 0
test-res/docalign/one_pair_one_src_tgt_unmatched/2.src.txt

@@ -0,0 +1 @@
+aaaaaa

+ 1 - 0
test-res/docalign/one_pair_one_src_tgt_unmatched/2.tgt.txt

@@ -0,0 +1 @@
+bbbbbbb

+ 1 - 0
test-res/docalign/one_pair_one_src_tgt_unmatched/3.tgt.txt

@@ -0,0 +1 @@
+bbbb

+ 1 - 0
test-res/docalign/three_pairs_one_src_unmatched/1.src.txt

@@ -0,0 +1 @@
+b

+ 1 - 0
test-res/docalign/three_pairs_one_src_unmatched/2.src.txt

@@ -0,0 +1 @@
+aa

+ 1 - 0
test-res/docalign/three_pairs_one_src_unmatched/2.tgt.txt

@@ -0,0 +1 @@
+bb

+ 1 - 0
test-res/docalign/three_pairs_one_src_unmatched/3.src.txt

@@ -0,0 +1 @@
+aaa

+ 1 - 0
test-res/docalign/three_pairs_one_src_unmatched/3.tgt.txt

@@ -0,0 +1 @@
+bbb

+ 1 - 0
test-res/docalign/three_pairs_one_src_unmatched/4.src.txt

@@ -0,0 +1 @@
+aaaa

+ 1 - 0
test-res/docalign/three_pairs_one_src_unmatched/4.tgt.txt

@@ -0,0 +1 @@
+bbbb

BIN
test-res/extractors/doc.doc


+ 1 - 0
test-res/extractors/doc_expected.txt

@@ -0,0 +1 @@
+.doc test

BIN
test-res/extractors/docx.docx


+ 1 - 0
test-res/extractors/docx_expected.txt

@@ -0,0 +1 @@
+.docx test

BIN
test-res/extractors/empty.odt


BIN
test-res/extractors/empty.pdf


+ 1 - 0
test-res/extractors/empty_editable_expected.txt

@@ -0,0 +1 @@
+

+ 1 - 0
test-res/extractors/empty_pdf_expected.txt

@@ -0,0 +1 @@
+

BIN
test-res/extractors/odt.odt


+ 1 - 0
test-res/extractors/odt_expected.txt

@@ -0,0 +1 @@
+.odt test

BIN
test-res/extractors/pdf.pdf


+ 3 - 0
test-res/extractors/pdf_expected.txt

@@ -0,0 +1,3 @@
+.pdf test
+
+

+ 18 - 0
test-res/extractors/rtf.rtf

@@ -0,0 +1,18 @@
+{\rtf1\ansi\deff3\adeflang1025
+{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fnil\fprq2\fcharset0 Noto Sans CJK SC;}{\f6\fnil\fprq2\fcharset0 Lohit Devanagari;}{\f7\fnil\fprq0\fcharset128 Lohit Devanagari;}}
+{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
+{\stylesheet{\s0\snext0\widctlpar\hyphpar0\cf0\kerning1\dbch\af5\langfe2052\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang6153 Normal;}
+{\s15\sbasedon0\snext16\sb240\sa120\keepn\dbch\af5\dbch\af6\afs28\loch\f4\fs28 Heading;}
+{\s16\sbasedon0\snext16\sl276\slmult1\sb0\sa140 Text Body;}
+{\s17\sbasedon16\snext17\sl276\slmult1\sb0\sa140\dbch\af7 List;}
+{\s18\sbasedon0\snext18\sb120\sa120\noline\i\dbch\af7\afs24\ai\fs24 Caption;}
+{\s19\sbasedon0\snext19\noline\dbch\af7 Index;}
+}{\*\generator LibreOffice/6.0.7.3$Linux_X86_64 LibreOffice_project/00m0$Build-3}{\info{\creatim\yr2021\mo9\dy27\hr14\min8}{\revtim\yr2021\mo9\dy27\hr14\min8}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab709
+
+{\*\pgdsctbl
+{\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\pgdscnxt0 Default Style;}}
+\formshade\paperh16838\paperw11906\margl1134\margr1134\margt1134\margb1134\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1134\margrsxn1134\margtsxn1134\margbsxn1134\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc
+{\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\widctlpar\hyphpar0\cf0\kerning1\dbch\af5\langfe2052\dbch\af6\afs24\alang1081\loch\f3\hich\af3\fs24\lang6153{\rtlch \ltrch\loch
+.}{\rtlch \ltrch\loch
+rtf test}
+\par }

+ 1 - 0
test-res/extractors/rtf_expected.txt

@@ -0,0 +1 @@
+.rtf test

BIN
test-res/extractors/upper.ODT


+ 1 - 0
test-res/extractors/upper_expected.txt

@@ -0,0 +1 @@
+.ODT test

+ 5 - 0
test-res/sentalign/aligned.src.txt

@@ -0,0 +1,5 @@
+aniseed
+basil
+cinnamon dill
+elderflower fennel ginger
+horseradish

+ 5 - 0
test-res/sentalign/aligned.tgt.txt

@@ -0,0 +1,5 @@
+ánísééd
+básíl
+cínnámón díll
+éldérflówér fénnél gíngér
+hórsérádísh

+ 0 - 0
test-res/sentalign/dictionary.txt


+ 0 - 0
test-res/sentalign/empty.src.txt


+ 0 - 0
test-res/sentalign/empty.tgt.txt


+ 5 - 0
test-res/sentalign/unaligned.src.txt

@@ -0,0 +1,5 @@
+aniseed
+
+basil cinnamon dill elderflower
+fennel
+ginger horseradish

+ 5 - 0
test-res/sentalign/unaligned.tgt.txt

@@ -0,0 +1,5 @@
+ánísééd
+básíl cínnámón díll éldérflówér
+fénnél
+gíngér
+hórsérádísh

+ 0 - 0
test/cleaners/__init__.py


+ 102 - 0
test/cleaners/test_monolingual_cleaner.py

@@ -0,0 +1,102 @@
+import io
+import unittest
+from unittest.mock import call, Mock
+
+from toolchain.cleaners.monolingual_cleaner import MonolingualCleaner
+
+class TestMonolingualCleaner(unittest.TestCase):
+
+    def setUp(self):
+        self.language_detector = Mock()
+        self.output_retained = io.StringIO()
+        self.output_rejected = io.StringIO()
+        config = {
+            "language_detection_threshold" : "5",
+            "rejected_line_delimiter" : "@@@",
+        }
+        self.cleaner = MonolingualCleaner("ga", config=config, language_detector=self.language_detector)
+
+
+    def tearDown(self):
+        self.output_retained.close()
+        self.output_rejected.close()
+
+
+    def test_empty(self):
+        self.cleaner.clean_text([], self.output_retained, self.output_rejected)
+
+        self.assertEqual(self.output_retained.getvalue(), "")
+        self.assertEqual(self.output_rejected.getvalue(), "")
+        self.language_detector.detect.assert_has_calls([])
+
+
+    def test_single_valid(self):
+        self.language_detector.detect.side_effect = ["ga"]
+
+        self.cleaner.clean_text(["capall"], self.output_retained, self.output_rejected)
+
+        self.assertEqual(self.output_retained.getvalue(), "capall\n")
+        self.assertEqual(self.output_rejected.getvalue(), "")
+        self.language_detector.detect.assert_has_calls([call("capall")])
+
+
+    def test_newline_termination(self):
+        self.language_detector.detect.side_effect = ["ga"]
+
+        self.cleaner.clean_text(["capall\n"], self.output_retained, self.output_rejected)
+
+        self.assertEqual(self.output_retained.getvalue(), "capall\n")
+        self.assertEqual(self.output_rejected.getvalue(), "")
+        self.language_detector.detect.assert_has_calls([call("capall")])
+
+
+    def test_single_language_mismatched_short(self):
+        self.cleaner.clean_text(["dó"], self.output_retained, self.output_rejected)
+
+        self.assertEqual(self.output_retained.getvalue(), "dó\n")
+        self.assertEqual(self.output_rejected.getvalue(), "")
+        self.language_detector.detect.assert_has_calls([])
+
+
+    def test_single_target_language_mismatched(self):
+        self.language_detector.detect.side_effect = ["de"]
+
+        self.cleaner.clean_text(["Pferd"], self.output_retained, self.output_rejected)
+
+        self.assertEqual(self.output_retained.getvalue(), "")
+        self.assertEqual(self.output_rejected.getvalue(), "unexpected_language_[de]@@@Pferd\n")
+        self.language_detector.detect.assert_has_calls([call("Pferd")])
+
+
+    def test_single_empty(self):
+        self.cleaner.clean_text([""], self.output_retained, self.output_rejected)
+
+        self.assertEqual(self.output_retained.getvalue(), "")
+        self.assertEqual(self.output_rejected.getvalue(), "empty_segment@@@\n")
+        self.language_detector.detect.assert_has_calls([])
+
+
+    def test_single_blank(self):
+        self.cleaner.clean_text(["   "], self.output_retained, self.output_rejected)
+
+        self.assertEqual(self.output_retained.getvalue(), "")
+        self.assertEqual(self.output_rejected.getvalue(), "empty_segment@@@\n")
+        self.language_detector.detect.assert_has_calls([])
+
+
+    def test_multiple_mixed(self):
+        self.language_detector.detect.side_effect = ["ga", "de"]
+
+        self.cleaner.clean_text(["capall", "dó", "Pferd", "", "   "], self.output_retained, self.output_rejected)
+
+        self.assertEqual(self.output_retained.getvalue(), "capall\ndó\n")
+        self.assertEqual(self.output_rejected.getvalue(), "\n".join([
+            "unexpected_language_[de]@@@Pferd",
+            "empty_segment@@@",
+            "empty_segment@@@\n"
+        ]))
+        self.language_detector.detect.assert_has_calls([call("capall"), call("Pferd")])
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 165 - 0
test/cleaners/test_post_alignment_cleaner.py

@@ -0,0 +1,165 @@
+import io
+import unittest
+from unittest.mock import call, Mock
+
+from toolchain.cleaners.post_alignment_cleaner import PostAlignmentCleaner
+
+class TestPostAlignmentCleaner(unittest.TestCase):
+
+    def setUp(self):
+        self.language_detector = Mock()
+        self.output_src = io.StringIO()
+        self.output_tgt = io.StringIO()
+        self.output_rejected = io.StringIO()
+        config = {
+            "language_detection_threshold" : "5",
+            "rejected_line_delimiter" : "@@@",
+        }
+        self.cleaner = PostAlignmentCleaner("en", "ga", config=config, language_detector=self.language_detector)
+
+
+    def tearDown(self):
+        self.output_src.close()
+        self.output_tgt.close()
+        self.output_rejected.close()
+
+
+    def test_empty(self):
+        self.cleaner.clean_text([], [], self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+        self.assertEqual(self.output_rejected.getvalue(), "")
+        self.language_detector.detect.assert_has_calls([])
+
+
+    def test_single_valid(self):
+        self.language_detector.detect.side_effect = ["en", "ga"]
+
+        self.cleaner.clean_text(["horse"], ["capall"], self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+        self.assertEqual(self.output_rejected.getvalue(), "")
+        self.language_detector.detect.assert_has_calls([call("horse"), call("capall")])
+
+
+    def test_newline_termination(self):
+        self.language_detector.detect.side_effect = ["en", "ga"]
+
+        self.cleaner.clean_text(["horse\n"], ["capall\n"], self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+        self.assertEqual(self.output_rejected.getvalue(), "")
+        self.language_detector.detect.assert_has_calls([call("horse"), call("capall")])
+
+
+    def test_single_language_mismatched_short(self):
+        self.cleaner.clean_text(["two"], ["dó"], self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "two\n")
+        self.assertEqual(self.output_tgt.getvalue(), "dó\n")
+        self.assertEqual(self.output_rejected.getvalue(), "")
+        self.language_detector.detect.assert_has_calls([])
+
+
+    def test_single_target_language_mismatched(self):
+        self.language_detector.detect.side_effect = ["en", "de"]
+
+        self.cleaner.clean_text(["horse"], ["Pferd"], self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+        self.assertEqual(self.output_rejected.getvalue(), "unexpected_language_[en:de]@@@horse@@@Pferd\n")
+        self.language_detector.detect.assert_has_calls([call("horse"), call("Pferd")])
+
+
+    def test_single_source_empty(self):
+        self.language_detector.detect.side_effect = ["en", "ga"]
+
+        self.cleaner.clean_text([""], ["capall"], self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+        self.assertEqual(self.output_rejected.getvalue(), "empty_segment@@@@@@capall\n")
+        self.language_detector.detect.assert_has_calls([])
+
+
+    def test_single_target_blank(self):
+        self.language_detector.detect.side_effect = ["en", "ga"]
+
+        self.cleaner.clean_text(["horse"], ["   "], self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+        self.assertEqual(self.output_rejected.getvalue(), "empty_segment@@@horse@@@\n")
+        self.language_detector.detect.assert_has_calls([])
+
+
+    def test_source_partially_nonalpha(self):
+        self.language_detector.detect.side_effect = ["en", "ga"]
+
+        self.cleaner.clean_text(["7 people."], ["seachtar"], self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "7 people.\n")
+        self.assertEqual(self.output_tgt.getvalue(), "seachtar\n")
+        self.assertEqual(self.output_rejected.getvalue(), "")
+        self.language_detector.detect.assert_has_calls([call("7 people."), call("seachtar")])
+
+
+    def test_target_entirely_nonalpha(self):
+        self.language_detector.detect.side_effect = ["en", "ga"]
+
+        self.cleaner.clean_text(["thirty-two"], ["32"], self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "thirty-two\n")
+        self.assertEqual(self.output_tgt.getvalue(), "32\n")
+        self.assertEqual(self.output_rejected.getvalue(), "")
+        self.language_detector.detect.assert_has_calls([call("thirty-two"), call("32")])
+
+
+    def test_single_source_and_target_src_entirely_nonalpha(self):
+        self.language_detector.detect.side_effect = ["en", "ga"]
+
+        self.cleaner.clean_text(["123"], ["?["], self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+        self.assertEqual(self.output_rejected.getvalue(), "nonalpha@@@123@@@?[\n")
+        self.language_detector.detect.assert_has_calls([])
+
+
+    def test_multiple_mixed(self):
+        self.language_detector.detect.side_effect = ["en", "ga", "en", "de", "en", "ga", "en", "ga", "en", "ga", "en", "ga", "en", "ga"]
+
+        self.cleaner.clean_text(["horse",  "horse", "",       "horse", "7 people.", "fifty-two", "123"],
+                           ["capall", "Pferd", "capall", "   ",   "seachtar",  "52"     ,   "?["],
+                           self.output_src, self.output_tgt, self.output_rejected)
+
+        self.assertEqual(self.output_src.getvalue(), "\n".join([
+            "horse",
+            "7 people.",
+            "fifty-two\n"
+        ]))
+        self.assertEqual(self.output_tgt.getvalue(), "\n".join([
+            "capall",
+            "seachtar",
+            "52\n"
+        ]))
+        self.assertEqual(self.output_rejected.getvalue(), "\n".join([
+            "unexpected_language_[en:de]@@@horse@@@Pferd",
+            "empty_segment@@@@@@capall",
+            "empty_segment@@@horse@@@",
+            "nonalpha@@@123@@@?[\n"
+        ]))
+        self.language_detector.detect.assert_has_calls([
+            call("horse"), call("capall"),
+            call("horse"), call("Pferd"),
+            call("7 people."), call("seachtar"),
+            call("fifty-two"), call("52"),
+        ])
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 0 - 0
test/common/__init__.py


+ 47 - 0
test/common/test_file_size_counter.py

@@ -0,0 +1,47 @@
+import io
+import logging
+import unittest
+
+from toolchain.common.file_size_counter import FileSizeCounter
+
+class TestFileSizeCounter(unittest.TestCase):
+
+    def setUp(self):
+        logging.disable(level=logging.CRITICAL)
+        self.counter = FileSizeCounter()
+
+
+    def tearDown(self):
+        pass
+
+
+    def test_empty(self):
+        result = self.counter.count_sizes([])
+
+        self.assertEqual(result.lines, 0)
+        self.assertEqual(result.words, 0)
+
+
+    def test_single_line_single_word(self):
+        result = self.counter.count_sizes(["aniseed"])
+
+        self.assertEqual(result.lines, 1)
+        self.assertEqual(result.words, 1)
+
+
+    def test_single_line_multiple_word(self):
+        result = self.counter.count_sizes(["aniseed\tbasil.   ' cinnamon \"dill\""])
+
+        self.assertEqual(result.lines, 1)
+        self.assertEqual(result.words, 5)
+
+
+    def test_multiple_line(self):
+        result = self.counter.count_sizes(["aniseed. basil? cinnamon", "dill \"elderflower \"", "\tfennel !"])
+
+        self.assertEqual(result.lines, 3)
+        self.assertEqual(result.words, 8)
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 192 - 0
test/common/test_language_detector.py

@@ -0,0 +1,192 @@
+import io
+import logging
+import unittest
+
+from toolchain.common.language_detector import LanguageDetector
+
+class TestLanguageDetector(unittest.TestCase):
+
+    def setUp(self):
+        logging.disable(level=logging.CRITICAL)
+        self.detector = LanguageDetector()
+
+
+    def tearDown(self):
+        pass
+
+
+    def test_empty_string(self):
+        result = self.detector.detect_language("")
+
+        self.assertEqual(result.language, "")
+        self.assertEqual(result.probability, -1.0)
+        self.assertEqual(result.total, 1)
+        self.assertEqual(result.tested, 0)
+
+
+    def test_nonempty_string(self):
+        result = self.detector.detect_language("you cannot burn a candle at both ends")
+
+        self.assertEqual(result.language, "en")
+        self.assertEqual(result.total, 1)
+        self.assertEqual(result.tested, 1)
+
+
+    def test_upper_string(self):
+        result = self.detector.detect_language("IS GLAS IAD NA CNOIC I bhFAD UAINN")
+
+        self.assertEqual(result.language, "ga")
+        self.assertEqual(result.total, 1)
+        self.assertEqual(result.tested, 1)
+
+
+    def test_file_empty_input(self):
+        result = self.detector.detect_language_in_file([])
+
+        self.assertEqual(result.language, "")
+        self.assertEqual(result.probability, -1.0)
+        self.assertEqual(result.total, 0)
+        self.assertEqual(result.tested, 0)
+
+
+    def test_file_single_line_invalid(self):
+        result = self.detector.detect_language_in_file([""])
+
+        self.assertEqual(result.language, "")
+        self.assertEqual(result.total, 1)
+        self.assertEqual(result.tested, 0)
+
+
+    def test_file_single_line_valid(self):
+        input = ["you cannot burn a candle at both ends"]
+
+        result = self.detector.detect_language_in_file(input)
+
+        self.assertEqual(result.language, "en")
+        self.assertEqual(result.total, 1)
+        self.assertEqual(result.tested, 1)
+
+
+    def test_file_single_line_upper(self):
+        input = ["IS GLAS IAD NA CNOIC I bhFAD UAINN"]
+
+        result = self.detector.detect_language_in_file(input)
+
+        self.assertEqual(result.language, "ga")
+        self.assertEqual(result.total, 1)
+        self.assertEqual(result.tested, 1)
+
+
+    def test_file_multiple_line_invalid(self):
+        input = [
+            "",
+            "",
+            "",
+            "",
+            "",
+        ]
+
+        result = self.detector.detect_language_in_file(input)
+
+        self.assertEqual(result.language, "")
+        self.assertEqual(result.total, 5)
+        self.assertEqual(result.tested, 0)
+
+
+    def test_file_multiple_line_mixed(self):
+        input = [
+            "is glas iad na cnoic i bhfad uainn",
+            "tús maith leath na hoibre",
+            "is binn béal ina thost",
+            "",
+            "is leor nod don eolach",
+        ]
+
+        result = self.detector.detect_language_in_file(input)
+
+        self.assertEqual(result.language, "ga")
+        self.assertEqual(result.total, 5)
+        self.assertEqual(result.tested, 4)
+
+
+    def test_file_multiple_line_min_line_length(self):
+        input = [
+            "is glas iad na cnoic i bhfad uainn",
+            "tús maith leath na hoibre",
+            "is binn béal ina thost",
+            "",
+            "is leor nod don eolach",
+        ]
+        config = {
+            "min_file_line_length" : "25",
+        }
+
+        result = self.detector.detect_language_in_file(input, config)
+
+        self.assertEqual(result.language, "ga")
+        self.assertEqual(result.total, 5)
+        self.assertEqual(result.tested, 2)
+
+
+    def test_file_limited_initial_lines(self):
+        input = [
+            "is glas iad na cnoic i bhfad uainn",
+            "tús maith leath na hoibre",
+            "is binn béal ina thost",
+            "",
+            "is leor nod don eolach",
+        ]
+        config = {
+            "min_initial_lines" : "2",
+        }
+
+        result = self.detector.detect_language_in_file(input, config)
+
+        self.assertEqual(result.language, "ga")
+        self.assertEqual(result.total, 5)
+        self.assertEqual(result.tested, 2)
+
+
+    def test_file_limited_sampling_all_valid(self):
+        input = [
+            "is glas iad na cnoic i bhfad uainn",
+            "tús maith leath na hoibre",
+            "is binn béal ina thost",
+            "bíonn an fhírinne searbh",
+            "is leor nod don eolach",
+            "fillean an feall ar an bhfeallaire",
+        ]
+        config = {
+            "min_initial_lines" : "2",
+            "sampling_interval" : "2",
+        }
+
+        result = self.detector.detect_language_in_file(input, config)
+
+        self.assertEqual(result.language, "ga")
+        self.assertEqual(result.total, 6)
+        self.assertEqual(result.tested, 4)
+
+
+    def test_file_limited_sampling_including_invalid(self):
+        input = [
+            "is glas iad na cnoic i bhfad uainn",
+            "tús maith leath na hoibre",
+            "is binn béal ina thost",
+            "",
+            "is leor nod don eolach",
+        ]
+        config = {
+            "min_initial_lines" : "1",
+            "sampling_interval" : "2",
+        }
+
+        result = self.detector.detect_language_in_file(input, config)
+
+        self.assertEqual(result.language, "ga")
+        self.assertEqual(result.total, 5)
+        self.assertEqual(result.tested, 2)
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 30 - 0
test/common/test_raw_file_indexer.py

@@ -0,0 +1,30 @@
+import io
+import unittest
+
+from toolchain.common.raw_file_indexer import RawFileIndexer
+
+class TestRawFileIndexer(unittest.TestCase):
+
+    def setUp(self):
+        self.output = io.StringIO()
+        self.indexer = RawFileIndexer()
+
+
+    def tearDown(self):
+        self.output.close()
+
+
+    def test_empty(self):
+        self.indexer.index([], self.output)
+
+        self.assertEqual(self.output.getvalue(), "")
+
+
+    def test_nonempty(self):
+        self.indexer.index(["/absolute/path/to/file.tmx", "relative/path/to/file.docx"], self.output)
+
+        self.assertEqual(self.output.getvalue(), "1\t/absolute/path/to/file.tmx\n2\trelative/path/to/file.docx\n")
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 0 - 0
test/docalign/__init__.py


+ 71 - 0
test/docalign/test_document_aligner.py

@@ -0,0 +1,71 @@
+import pathlib
+import tempfile
+import unittest
+
+from toolchain.docalign.document_aligner import DocumentAligner
+
+class TestDocumentAligner(unittest.TestCase):
+
+    INPUT_BASE_DIR = pathlib.Path("test-res/docalign").resolve()
+
+    def setUp(self):
+        self.config = {}
+        self.aligner = DocumentAligner(self.config)
+
+
+    def tearDown(self):
+        pass
+
+
+    def run_test(self, test_dir):
+        file_list_src = list(test_dir.glob("*.src.txt"))
+        file_list_tgt = list(test_dir.glob("*.tgt.txt"))
+
+        expected_matches = []
+        expected_unmatched_src = []
+        expected_unmatched_tgt = []
+
+        for file_src in file_list_src:
+            file_tgt = pathlib.Path(str(file_src).replace("src.txt", "tgt.txt"))
+            if file_tgt in file_list_tgt:
+                expected_matches.append((file_src, file_tgt))
+            else:
+                expected_unmatched_src.append(file_src)
+
+        for file_tgt in file_list_tgt:
+            if not any(match[1] == file_tgt for match in expected_matches):
+                expected_unmatched_tgt.append(file_tgt)
+
+        with tempfile.TemporaryDirectory() as docalign_artefact_dir:
+            docalign_artefact_dir = pathlib.Path(docalign_artefact_dir)
+            matches, unmatched_src, unmatched_tgt = self.aligner.align(file_list_src, file_list_tgt, docalign_artefact_dir)
+
+            self.assertEqual(set(matches), set(expected_matches))
+            self.assertEqual(set(unmatched_src), set(expected_unmatched_src))
+            self.assertEqual(set(unmatched_tgt), set(expected_unmatched_tgt))
+
+            self.check_length_matches(docalign_artefact_dir.joinpath("alignments.txt"), matches)
+            self.check_length_matches(docalign_artefact_dir.joinpath("unmatched_src.txt"), unmatched_src)
+            self.check_length_matches(docalign_artefact_dir.joinpath("unmatched_tgt.txt"), unmatched_tgt)
+
+
+    def check_length_matches(self, filepath, document_list):
+        with open(filepath) as f:
+            linecount = sum(1 for _ in f)
+            assert linecount == len(document_list)
+
+
+    def test_document_aligner(self):
+        test_count = 0
+
+        for test_dir in pathlib.Path(self.INPUT_BASE_DIR).iterdir():
+            if test_dir.is_dir():
+                test_count += 1
+                with self.subTest(msg=test_dir.name):
+                    self.run_test(test_dir)
+
+        print("\nTests run for document aligner: {0}".format(test_count))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 0 - 0
test/extractors/__init__.py


+ 84 - 0
test/extractors/test_editable_text_extractor.py

@@ -0,0 +1,84 @@
+import filecmp
+import os
+import pathlib
+import tempfile
+import unittest
+
+from toolchain.extractors.editable_text_extractor import EditableTextExtractor
+from toolchain.extractors.extraction_error import ExtractionError
+
+class TestEditableTextExtractor(unittest.TestCase):
+
+    INPUT_BASE_DIR = pathlib.Path("test-res/extractors").resolve()
+
+    def setUp(self):
+        self.extractor = EditableTextExtractor(os.environ["LIBREOFFICEPATH"])
+        self.config = {
+            "libreoffice_subprocess_timeout" : "20",
+            "libreoffice_file_write_timeout" : "20",
+        }
+
+
+    def tearDown(self):
+        pass
+
+
+    def run_test(self, input_filename, output_filename_expected):
+        input_path = self.INPUT_BASE_DIR.joinpath(input_filename)
+        output_path_expected = self.INPUT_BASE_DIR.joinpath(output_filename_expected)
+        output_path_actual = tempfile.NamedTemporaryFile(delete=False).name
+
+        self.extractor.extract(input_path, output_path_actual, self.config)
+
+        self.assertTrue(filecmp.cmp(output_path_actual, output_path_expected))
+        pathlib.Path(output_path_actual).unlink()
+
+
+    def test_empty(self):
+        self.run_test("empty.odt", "empty_editable_expected.txt")
+
+
+    def test_odt(self):
+        self.run_test("odt.odt", "odt_expected.txt")
+
+
+    def test_upper(self):
+        self.run_test("upper.ODT", "upper_expected.txt")
+
+
+    def test_doc(self):
+        self.run_test("doc.doc", "doc_expected.txt")
+
+
+    def test_docx(self):
+        self.run_test("docx.docx", "docx_expected.txt")
+
+
+    def test_rtf(self):
+        self.run_test("rtf.rtf", "rtf_expected.txt")
+
+
+    def test_subprocess_timeout(self):
+        self.config["libreoffice_subprocess_timeout"] = "0"
+        input_path = self.INPUT_BASE_DIR.joinpath("odt.odt")
+        output_path_actual = tempfile.NamedTemporaryFile(delete=False).name
+
+        with self.assertRaises(ExtractionError):
+            self.extractor.extract(input_path, output_path_actual, self.config)
+
+        pathlib.Path(output_path_actual).unlink()
+
+
+    def test_file_write_timeout(self):
+        self.config["libreoffice_file_write_timeout"] = "0"
+        input_path = self.INPUT_BASE_DIR.joinpath("odt.odt")
+        output_path_actual = tempfile.NamedTemporaryFile(delete=False).name
+
+        with self.assertRaises(ExtractionError):
+            self.extractor.extract(input_path, output_path_actual, self.config)
+
+        pathlib.Path(output_path_actual).unlink()
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 56 - 0
test/extractors/test_pdf_text_extractor.py

@@ -0,0 +1,56 @@
+import filecmp
+import os
+import pathlib
+import tempfile
+import unittest
+
+from toolchain.extractors.extraction_error import ExtractionError
+from toolchain.extractors.pdf_text_extractor import PdfTextExtractor
+
+class TestPdfTextExtractor(unittest.TestCase):
+
+    INPUT_BASE_DIR = pathlib.Path("test-res/extractors").resolve()
+
+    def setUp(self):
+        self.extractor = PdfTextExtractor(os.environ["PDFTOTEXTPATH"])
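+        # As with the editable extractor, the subprocess timeout is a string
+        # and is set to "0" in test_timeout to provoke an ExtractionError.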
+        self.config = {
+            "pdftotext_subprocess_timeout" : "20",
+        }
+
+
+    def tearDown(self):
+        pass
+
+
+    def run_test(self, input_filename, output_filename_expected):
+        input_path = self.INPUT_BASE_DIR.joinpath(input_filename)
+        output_path_expected = self.INPUT_BASE_DIR.joinpath(output_filename_expected)
+        output_path_actual = tempfile.NamedTemporaryFile(delete=False).name
+
+        self.extractor.extract(input_path, output_path_actual, self.config)
+
+        self.assertTrue(filecmp.cmp(output_path_actual, output_path_expected))
+        pathlib.Path(output_path_actual).unlink()
+
+
+    def test_empty(self):
+        self.run_test("empty.pdf", "empty_pdf_expected.txt")
+
+
+    def test_pdf(self):
+        self.run_test("pdf.pdf", "pdf_expected.txt")
+
+
+    def test_timeout(self):
+        self.config["pdftotext_subprocess_timeout"] = "0"
+        input_path = self.INPUT_BASE_DIR.joinpath("pdf.pdf")
+        output_path_actual = tempfile.NamedTemporaryFile(delete=False).name
+
+        with self.assertRaises(ExtractionError):
+            self.extractor.extract(input_path, output_path_actual, self.config)
+
+        pathlib.Path(output_path_actual).unlink()
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 0 - 0
test/integration/__init__.py


+ 48 - 0
test/integration/test_processors.py

@@ -0,0 +1,48 @@
+import json
+import pathlib
+import tempfile
+import unittest
+
+from toolchain.doc_to_tmx_processor import DocToTmxProcessor
+from toolchain.tm_to_tmx_processor import TmToTmxProcessor
+
+class TestToolchainProcessors(unittest.TestCase):
+
+    CONFIG_PATH = pathlib.Path("toolchains.cfg").resolve()
+
+    def run_test(self, toolchain, test_dir):
+        test_id = test_dir.name
+        input_dir = test_dir.joinpath("input")
+        expectations_filepath = test_dir.joinpath("expect", "expectations.json")
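+        # Each integration test directory provides an "input" tree plus
+        # "expect/expectations.json", whose "meta" object holds the expected
+        # rejections and per-file infos checked below.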
+
+        with tempfile.TemporaryDirectory() as artefact_dir, tempfile.TemporaryDirectory() as output_dir,\
+                open(expectations_filepath) as expectations_file:
+            rejected, file_infos = toolchain(self.CONFIG_PATH).process(test_id, input_dir, artefact_dir, output_dir)
+
+            expectations = json.load(expectations_file)
+            meta_expectations = expectations["meta"]
+            self.assertEqual(rejected, meta_expectations["rejected"])
+            self.assertEqual(file_infos, meta_expectations["file_infos"])
+            # TODO: add content checking
+
+
+    def run_tests(self, toolchain, base_dir):
+        test_count = 0
+        for test_dir in pathlib.Path(base_dir).iterdir():
+            if test_dir.is_dir():
+                test_count += 1
+                with self.subTest(msg=test_dir.name):
+                    self.run_test(toolchain, test_dir)
+        print("\nTests run for toolchain {0}: {1}".format(toolchain.__name__, test_count))
+
+
+    def test_tm_to_tmx(self):
+        self.run_tests(TmToTmxProcessor, "test-res/integration/tm")
+
+
+    def test_doc_to_tmx(self):
+        self.run_tests(DocToTmxProcessor, "test-res/integration/doc")
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 0 - 0
test/normalizer/__init__.py


+ 98 - 0
test/normalizer/test_unicode_normalizer.py

@@ -0,0 +1,98 @@
+import io
+import unittest
+
+from toolchain.normalizer.unicode_normalizer import UnicodeNormalizer
+
+class TestUnicodeNormalizer(unittest.TestCase):
+
+    def setUp(self):
+        self.output = io.StringIO()
+        self.normalizer = UnicodeNormalizer()
+
+
+    def tearDown(self):
+        self.output.close()
+
+
+    def test_empty(self):
+        self.normalizer.normalize_text([], self.output)
+
+        self.assertEqual(self.output.getvalue(), "")
+
+
+    def test_ascii(self):
+        lines = ["1234567890_[](){}<>.,;:?!|ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"]
+
+        self.normalizer.normalize_text(lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "1234567890_[](){}<>.,;:?!|ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\n")
+
+
+    def test_nfc(self):
+        lines = ["ÁÉÍÓÚáéíóú"]
+
+        self.normalizer.normalize_text(lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "ÁÉÍÓÚáéíóú\n")
+
+
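+    # The input below mixes precomposed characters with a dotless i followed
+    # by a combining acute accent; the normalizer is expected to emit the
+    # fully precomposed í in its place.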
+    def test_nfd(self):
+        lines = ["ÁÉÍÓÚáéı́óú"]
+
+        self.normalizer.normalize_text(lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "ÁÉÍÓÚáéíóú\n")
+
+
+    def test_bom(self):
+        lines = ["\ufeffabcde"]
+
+        self.normalizer.normalize_text(lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "abcde\n")
+
+
+    def test_multiple_global_sub(self):
+        lines = ["ı́éı́óúı́"]
+
+        self.normalizer.normalize_text(lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "íéíóúí\n")
+
+
+    def test_custom_sub(self):
+        lines = ["\u008045 \u0080\u0080"]
+
+        self.normalizer.normalize_text(lines, self.output, [("\u0080", "€")])
+
+        self.assertEqual(self.output.getvalue(), "€45 €€\n")
+
+
+    def test_newline_termination(self):
+        lines = [
+            "\ufeffabcde\n",
+            "ÁÉÍÓÚáéíóú\n",
+            "ÁÉÍÓÚáéı́óú\n",
+        ]
+
+        self.normalizer.normalize_text(lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "abcde\nÁÉÍÓÚáéíóú\nÁÉÍÓÚáéíóú\n")
+
+
+    def test_mixed(self):
+        lines = [
+            "123+_ABcdEf",
+            "ÁÉÍÓÚáéíóú",
+            "ÁÉÍÓÚáéı́óú",
+            "ı́éı́óúı́\u0080",
+            "123ÚáÚ áı́óúı́"
+        ]
+
+        self.normalizer.normalize_text(lines, self.output, [("\u0080", "€")])
+
+        self.assertEqual(self.output.getvalue(), "123+_ABcdEf\nÁÉÍÓÚáéíóú\nÁÉÍÓÚáéíóú\níéíóúí€\n123ÚáÚ áíóúí\n")
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 0 - 0
test/parsers/__init__.py


+ 218 - 0
test/parsers/test_sdltm_parser.py

@@ -0,0 +1,218 @@
+import io
+import unittest
+import defusedxml.ElementTree as ET
+
+from toolchain.parsers.sdltm_parser import SdltmParser
+
+class TestSdltmParser(unittest.TestCase):
+
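+    # Minimal stand-ins for the <Segment> XML stored in an SDLTM translation
+    # memory: {0} receives the <Text> elements and {1} the CultureName
+    # (language) code.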
+    ROOT_TEMPLATE_VALID = \
+        "<Segment>\
+            <Elements>{0}</Elements>\
+            <CultureName>{1}</CultureName>\
+         </Segment>"
+    ROOT_TEMPLATE_MISSING_LANGUAGE = \
+        "<Segment>\
+            <Elements>{0}</Elements>\
+         </Segment>"
+
+    LANGUAGE_CODE_SRC = "en"
+    LANGUAGE_CODE_TGT = "ga"
+
+    def setUp(self):
+        self.output_src = io.StringIO()
+        self.output_tgt = io.StringIO()
+        self.parser = SdltmParser(self.LANGUAGE_CODE_SRC, self.LANGUAGE_CODE_TGT)
+
+
+    def tearDown(self):
+        self.output_src.close()
+        self.output_tgt.close()
+
+
+    def make_root(self, template, content, lang_code):
+        return ET.fromstring(template.format(content, lang_code))
+
+
+    def test_missing_language(self):
+        first_texts = "<Text><Value>horse</Value></Text>"
+        second_texts = "<Text><Value>capall</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_MISSING_LANGUAGE, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_missing_value(self):
+        first_texts = "<Text><Value>horse</Value></Text>"
+        second_texts = "<Text></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_empty_language(self):
+        first_texts = "<Text><Value>horse</Value></Text>"
+        second_texts = "<Text><Value>capall</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_unknown_language(self):
+        first_texts = "<Text><Value>horse</Value></Text>"
+        second_texts = "<Text><Value>Pferd</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "de")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_no_texts(self):
+        first_texts = "<Text><Value>horse</Value></Text>"
+        second_texts = ""
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_empty_value(self):
+        first_texts = "<Text><Value></Value></Text>"
+        second_texts = "<Text><Value>capall</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_in_order(self):
+        first_texts = "<Text><Value>horse</Value></Text>"
+        second_texts = "<Text><Value>capall</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_reverse_order(self):
+        first_texts = "<Text><Value>capall</Value></Text>"
+        second_texts = "<Text><Value>horse</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "ga")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "en")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_language_variants(self):
+        first_texts = "<Text><Value>horse</Value></Text>"
+        second_texts = "<Text><Value>capall</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en-GB")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga-IE")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_extra_whitespace_leading_trailing(self):
+        first_texts = "<Text><Value>yellow </Value></Text>"
+        second_texts = "<Text><Value>    \tbuí</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow \n")
+        self.assertEqual(self.output_tgt.getvalue(), "    \tbuí\n")
+
+
+    def test_extra_whitespace_contained(self):
+        first_texts = "<Text><Value>cake</Value></Text>"
+        second_texts = "<Text><Value>cáca    \tmilis</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "cake\n")
+        self.assertEqual(self.output_tgt.getvalue(), "cáca    \tmilis\n")
+
+
+    def test_newline_contained(self):
+        first_texts = "<Text><Value>cake</Value></Text>"
+        second_texts = "<Text><Value>cáca\nmilis</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "cake\n")
+        self.assertEqual(self.output_tgt.getvalue(), "cácamilis\n")
+
+
+    def test_only_whitespace(self):
+        first_texts = "<Text><Value>yellow</Value></Text>"
+        second_texts = "<Text><Value>   </Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "   \n")
+
+
+    def test_only_newline(self):
+        first_texts = "<Text><Value>yellow</Value></Text>"
+        second_texts = "<Text><Value>\n</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "\n")
+
+
+    def test_multiple_texts(self):
+        first_texts = "<Text><Value>bread </Value></Text><Text><Value>and</Value></Text><Text><Value> jam</Value></Text>"
+        second_texts = "<Text><Value>arán</Value></Text><Text><Value> agus </Value></Text><Text><Value>subh</Value></Text>"
+        root_first = self.make_root(self.ROOT_TEMPLATE_VALID, first_texts, "en")
+        root_second = self.make_root(self.ROOT_TEMPLATE_VALID, second_texts, "ga")
+
+        self.parser.parse_segment(root_first, root_second, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "bread and jam\n")
+        self.assertEqual(self.output_tgt.getvalue(), "arán agus subh\n")
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 327 - 0
test/parsers/test_tmx_parser.py

@@ -0,0 +1,327 @@
+import io
+import unittest
+import defusedxml.ElementTree as ET
+
+from toolchain.parsers.parsing_error import ParsingError
+from toolchain.parsers.tmx_parser import TmxParser
+
+class TestTmxParser(unittest.TestCase):
+
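+    # Skeleton TMX 1.4 document; {0} is replaced with the <tu> elements
+    # under test.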
+    ROOT_TEMPLATE = \
+        "<tmx version=\"1.4\">\
+            <header/>\
+            <body>\
+            {0}\
+            </body>\
+         </tmx>"
+
+    LANGUAGE_CODE_SRC = "en"
+    LANGUAGE_CODE_TGT = "ga"
+
+    def setUp(self):
+        self.output_src = io.StringIO()
+        self.output_tgt = io.StringIO()
+        self.parser = TmxParser(self.LANGUAGE_CODE_SRC, self.LANGUAGE_CODE_TGT)
+
+
+    def tearDown(self):
+        self.output_src.close()
+        self.output_tgt.close()
+
+
+    def make_document(self, content):
+        return ET.fromstring(TestTmxParser.ROOT_TEMPLATE.format(content))
+
+
+    def test_empty_body(self):
+        document = self.make_document("")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_absent_tgt(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>yellow</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_absent_src(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>buí</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_empty_tgt(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>yellow</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg></seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_single_simple(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>yellow</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>buí</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_language_variants(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en-GB\">\
+                    <seg>yellow</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga-IE\">\
+                    <seg>buí</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_language_missing(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>yellow</seg>\
+                </tuv>\
+                <tuv>\
+                    <seg>buí</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        with self.assertRaises(ParsingError):
+            self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_inner_node_empty(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg><inner/>yellow</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>buí</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_inner_node_nonempty_preceding(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg><inner>ye</inner>llow</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>buí</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_inner_node_nonempty_following(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>yell<inner>ow</inner></seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>buí</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_extra_whitespace_leading_trailing(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>yellow </seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>    \tbuí</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow \n")
+        self.assertEqual(self.output_tgt.getvalue(), "    \tbuí\n")
+
+
+    def test_extra_whitespace_contained(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>cake</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>cáca    \tmilis</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "cake\n")
+        self.assertEqual(self.output_tgt.getvalue(), "cáca    \tmilis\n")
+
+
+    def test_newline_contained(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>cake</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>cáca\nmilis</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "cake\n")
+        self.assertEqual(self.output_tgt.getvalue(), "cácamilis\n")
+
+
+    def test_only_whitespace(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>yellow</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>   </seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "   \n")
+
+
+    def test_only_newline(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>yellow</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>\n</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "\n")
+
+
+    def test_multiple(self):
+        document = self.make_document("\
+            <tu>\
+                <tuv xml:lang=\"en\">\
+                    <seg>horse</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga\">\
+                    <seg>capall</seg>\
+                </tuv>\
+            </tu>\
+            <tu>\
+                <tuv xml:lang=\"en-IE\">\
+                    <seg>eat</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga-IE\">\
+                    <seg>ith</seg>\
+                </tuv>\
+            </tu>\
+            <tu>\
+                <tuv xml:lang=\"en-GB\">\
+                    <seg>cake</seg>\
+                </tuv>\
+                <tuv xml:lang=\"ga-IE\">\
+                    <seg>cáca\nmilis</seg>\
+                </tuv>\
+            </tu>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\neat\ncake\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\nith\ncácamilis\n")
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 750 - 0
test/parsers/test_xliff_parser.py

@@ -0,0 +1,750 @@
+import io
+import unittest
+import xml.etree.ElementTree as ET
+
+from toolchain.parsers.parsing_error import ParsingError
+from toolchain.parsers.xliff_parser import Xliff12Parser, Xliff20Parser
+
+class TestXliff12Parser(unittest.TestCase):
+
+    LANGUAGE_CODE_SRC = "en"
+    LANGUAGE_CODE_TGT = "ga"
+
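+    # XLIFF 1.2 declares the language pair per <file> via the
+    # source-language and target-language attributes, which is what the
+    # missing-language tests at the bottom of this class exercise.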
+    ROOT_TEMPLATE = \
+        "<xliff xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\">\
+            <file original=\"/path/to/original\" source-language=\"{0}\" target-language=\"{1}\" datatype=\"datatype\">\
+                <header/>\
+                <body>\
+                {2}\
+                </body>\
+            </file>\
+         </xliff>"
+
+    def setUp(self):
+        self.output_src = io.StringIO()
+        self.output_tgt = io.StringIO()
+        self.parser = Xliff12Parser(self.LANGUAGE_CODE_SRC, self.LANGUAGE_CODE_TGT)
+
+
+    def tearDown(self):
+        self.output_src.close()
+        self.output_tgt.close()
+
+
+    def make_document(self, source_language_code, target_language_code, content):
+        return ET.fromstring(TestXliff12Parser.ROOT_TEMPLATE.format(source_language_code, target_language_code, content))
+
+
+    def test_empty_body(self):
+        document = self.make_document("en", "ga", "")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_absent_src(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <target>capall</target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_absent_tgt(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source>horse</source>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_empty_src(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source/>\
+                <target>capall</target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_empty_tgt(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source>horse</source>\
+                <target/>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_single_valid(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source>horse</source>\
+                <target>capall</target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_single_valid_language_variants(self):
+        document = self.make_document("en-GB", "ga-IE", "\
+            <trans-unit>\
+                <source>horse</source>\
+                <target>capall</target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_single_valid_languages_reversed(self):
+        document = self.make_document("ga", "en", "\
+            <trans-unit>\
+                <source>capall</source>\
+                <target>horse</target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_single_valid_inner_tags_all(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source><inner>yellow</inner></source>\
+                <target><g>buí</g></target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_single_valid_inner_tags_start(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source><inner>ye</inner>llow</source>\
+                <target><g>buí</g></target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_single_valid_inner_tags_end(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source>yell<inner>ow</inner></source>\
+                <target><g>buí</g></target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_single_valid_with_group(self):
+        document = self.make_document("en", "ga", "\
+            <group>\
+                <trans-unit>\
+                    <source>horse</source>\
+                    <target>capall</target>\
+                </trans-unit>\
+            </group>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_single_valid_multiple_group(self):
+        document = self.make_document("en", "ga", "\
+            <group>\
+                <group>\
+                    <trans-unit>\
+                        <source>horse</source>\
+                        <target>capall</target>\
+                    </trans-unit>\
+                </group>\
+            </group>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_extra_whitespace_leading_trailing(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source>yellow </source>\
+                <target>    \tbuí</target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow \n")
+        self.assertEqual(self.output_tgt.getvalue(), "    \tbuí\n")
+
+
+    def test_extra_whitespace_contained(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source>cake</source>\
+                <target>cáca    \tmilis</target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "cake\n")
+        self.assertEqual(self.output_tgt.getvalue(), "cáca    \tmilis\n")
+
+
+    def test_newline_contained(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source>cake</source>\
+                <target>cáca\nmilis</target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "cake\n")
+        self.assertEqual(self.output_tgt.getvalue(), "cácamilis\n")
+
+
+    def test_only_whitespace(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source>yellow</source>\
+                <target>   </target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "   \n")
+
+
+    def test_only_newline(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source>yellow</source>\
+                <target>\n</target>\
+            </trans-unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "\n")
+
+
+    def test_multiple_valid(self):
+        document = self.make_document("en", "ga", "\
+            <trans-unit>\
+                <source>horse</source>\
+                <target>capall</target>\
+            </trans-unit>\
+            <group>\
+                <trans-unit>\
+                    <source>eat</source>\
+                </trans-unit>\
+            </group>\
+            <group>\
+                <group>\
+                    <trans-unit>\
+                        <source>yell<inner>ow</inner></source>\
+                        <target><g>buí</g></target>\
+                    </trans-unit>\
+                </group>\
+            </group>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\nyellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\nbuí\n")
+
+
+    def test_no_target_language(self):
+        document = ET.fromstring("\
+            <xliff xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\">\
+                <file original=\"/path/to/original\" source-language=\"en\" datatype=\"datatype\">\
+                    <header/>\
+                    <body>\
+                        <trans-unit>\
+                            <source>horse</source>\
+                            <target>capall</target>\
+                        </trans-unit>\
+                    </body>\
+                </file>\
+            </xliff>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_no_source_language(self):
+        document = ET.fromstring("\
+            <xliff xmlns=\"urn:oasis:names:tc:xliff:document:1.2\" version=\"1.2\">\
+                <file original=\"/path/to/original\" target-language=\"ga\" datatype=\"datatype\">\
+                    <header/>\
+                    <body>\
+                        <trans-unit>\
+                            <source>horse</source>\
+                            <target>capall</target>\
+                        </trans-unit>\
+                    </body>\
+                </file>\
+            </xliff>\
+        ")
+
+        with self.assertRaises(ParsingError):
+            self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+class TestXliff20Parser(unittest.TestCase):
+
+    LANGUAGE_CODE_SRC = "en"
+    LANGUAGE_CODE_TGT = "ga"
+
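+    # XLIFF 2.0 moves the language pair to the srcLang/trgLang attributes on
+    # the root <xliff> element, and nests segments in <unit>/<segment>
+    # rather than <trans-unit>.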
+    ROOT_TEMPLATE = \
+        "<xliff xmlns=\"urn:oasis:names:tc:xliff:document:2.0\" version=\"2.0\" srcLang=\"{0}\" trgLang=\"{1}\">\
+            <file>\
+            {2}\
+            </file>\
+         </xliff>"
+
+    def setUp(self):
+        self.output_src = io.StringIO()
+        self.output_tgt = io.StringIO()
+        self.parser = Xliff20Parser(self.LANGUAGE_CODE_SRC, self.LANGUAGE_CODE_TGT)
+
+
+    def tearDown(self):
+        self.output_src.close()
+        self.output_tgt.close()
+
+
+    def make_document(self, source_language_code, target_language_code, content):
+        return ET.fromstring(TestXliff20Parser.ROOT_TEMPLATE.format(source_language_code, target_language_code, content))
+
+
+    def test_empty_body(self):
+        document = self.make_document("en", "ga", "")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_absent_src(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <target>capall</target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_absent_tgt(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>horse</source>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_empty_src(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source/>\
+                    <target>capall</target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_empty_tgt(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>horse</source>\
+                    <target/>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_single_valid(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>horse</source>\
+                    <target>capall</target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_single_valid_language_variants(self):
+        document = self.make_document("en-GB", "ga-IE", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>horse</source>\
+                    <target>capall</target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_single_valid_languages_reversed(self):
+        document = self.make_document("ga", "en", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>capall</source>\
+                    <target>horse</target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_single_valid_inner_tags_all(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source><inner>yellow</inner></source>\
+                    <target><g>buí</g></target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_single_valid_inner_tags_start(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source><inner>ye</inner>llow</source>\
+                    <target><g>buí</g></target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_single_valid_inner_tags_end(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>yell<inner>ow</inner></source>\
+                    <target><g>buí</g></target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "buí\n")
+
+
+    def test_single_valid_with_group(self):
+        document = self.make_document("en", "ga", "\
+            <group>\
+                <unit id=\"7\">\
+                    <segment>\
+                        <source>horse</source>\
+                        <target>capall</target>\
+                    </segment>\
+                </unit>\
+            </group>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_single_valid_multiple_group(self):
+        document = self.make_document("en", "ga", "\
+            <group>\
+                <group>\
+                    <group>\
+                        <unit id=\"7\">\
+                            <segment>\
+                                <source>horse</source>\
+                                <target>capall</target>\
+                            </segment>\
+                        </unit>\
+                    </group>\
+                </group>\
+            </group>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\n")
+
+
+    def test_extra_whitespace_leading_trailing(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>yellow </source>\
+                    <target>    \tbuí</target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow \n")
+        self.assertEqual(self.output_tgt.getvalue(), "    \tbuí\n")
+
+
+    def test_extra_whitespace_contained(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>cake</source>\
+                    <target>cáca    \tmilis</target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "cake\n")
+        self.assertEqual(self.output_tgt.getvalue(), "cáca    \tmilis\n")
+
+
+    def test_newline_contained(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>cake</source>\
+                    <target>cáca\nmilis</target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "cake\n")
+        self.assertEqual(self.output_tgt.getvalue(), "cácamilis\n")
+
+
+    def test_only_whitespace(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>yellow</source>\
+                    <target>   </target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "   \n")
+
+
+    def test_only_newline(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>yellow</source>\
+                    <target>\n</target>\
+                </segment>\
+            </unit>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "yellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "\n")
+
+
+    def test_multiple_valid(self):
+        document = self.make_document("en", "ga", "\
+            <unit id=\"7\">\
+                <segment>\
+                    <source>horse</source>\
+                    <target>capall</target>\
+                </segment>\
+            </unit>\
+            <group>\
+                <unit id=\"13\">\
+                    <segment>\
+                        <source>eat</source>\
+                    </segment>\
+                </unit>\
+            </group>\
+            <group>\
+                <group>\
+                    <group>\
+                        <unit id=\"49\">\
+                            <segment>\
+                                <source>yell<inner>ow</inner></source>\
+                                <target><g>buí</g></target>\
+                            </segment>\
+                        </unit>\
+                    </group>\
+                </group>\
+            </group>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "horse\nyellow\n")
+        self.assertEqual(self.output_tgt.getvalue(), "capall\nbuí\n")
+
+
+    def test_no_target_language(self):
+        document = ET.fromstring("\
+            <xliff xmlns=\"urn:oasis:names:tc:xliff:document:2.0\" version=\"2.0\" srcLang=\"en\">\
+                <file>\
+                    <unit id=\"7\">\
+                        <segment>\
+                            <source>horse</source>\
+                            <target>capall</target>\
+                        </segment>\
+                    </unit>\
+                </file>\
+            </xliff>\
+        ")
+
+        self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_no_source_language(self):
+        document = ET.fromstring("\
+            <xliff xmlns=\"urn:oasis:names:tc:xliff:document:2.0\" version=\"2.0\" trgLang=\"ga\">\
+                <file>\
+                    <unit id=\"7\">\
+                        <segment>\
+                            <source>horse</source>\
+                            <target>capall</target>\
+                        </segment>\
+                    </unit>\
+                </file>\
+            </xliff>\
+        ")
+
+        with self.assertRaises(ParsingError):
+            self.parser.parse_content(document, self.output_src, self.output_tgt)
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 0 - 0
test/sentalign/__init__.py


+ 83 - 0
test/sentalign/test_sentence_aligner.py

@@ -0,0 +1,83 @@
+import io
+import os
+import pathlib
+import subprocess
+import tempfile
+import unittest
+
+from toolchain.sentalign.sentalign_error import SentalignError
+from toolchain.sentalign.sentence_aligner import SentenceAligner
+
+class TestSentenceAligner(unittest.TestCase):
+
+    INPUT_BASE_PATH = pathlib.Path("test-res/sentalign").resolve()
+
+    def setUp(self):
+        self.output_src = io.StringIO()
+        self.output_tgt = io.StringIO()
+        self.aligner = SentenceAligner(os.environ["HUNALIGNPATH"])
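+        # The aligner shells out to the external hunalign binary named by
+        # HUNALIGNPATH; "dictionary" supplies hunalign's bilingual dictionary
+        # and "subprocess_timeout" bounds the subprocess call.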
+        self.config = {
+            "dictionary" : self.INPUT_BASE_PATH.joinpath("dictionary.txt"),
+            "subprocess_timeout" : "20",
+        }
+
+
+    def tearDown(self):
+        self.output_src.close()
+        self.output_tgt.close()
+
+
+    def run_test(self, input_file_src, input_file_tgt, output_expected_src, output_expected_tgt):
+        input_path_src = self.INPUT_BASE_PATH.joinpath(input_file_src)
+        input_path_tgt = self.INPUT_BASE_PATH.joinpath(input_file_tgt)
+
+        with tempfile.TemporaryDirectory() as output_artefact_dir:
+            self.aligner.align_files(input_path_src, input_path_tgt,
+                    self.output_src, self.output_tgt, output_artefact_dir, self.config)
+
+        self.assertEqual(self.output_src.getvalue(), output_expected_src)
+        self.assertEqual(self.output_tgt.getvalue(), output_expected_tgt)
+
+
+    def test_empty(self):
+        self.run_test("empty.src.txt", "empty.tgt.txt", "\n", "\n")
+
+
+    def test_already_aligned(self):
+        self.run_test("aligned.src.txt", "aligned.tgt.txt",
+                "aniseed\nbasil\ncinnamon dill\nelderflower fennel ginger\nhorseradish\n\n",
+                "ánísééd\nbásíl\ncínnámón díll\néldérflówér fénnél gíngér\nhórsérádísh\n\n")
+
+
+    def test_unaligned(self):
+        self.run_test("unaligned.src.txt", "unaligned.tgt.txt",
+                "aniseed\nbasil cinnamon dill elderflower\nfennel\nginger horseradish\n\n",
+                "ánísééd\nbásíl cínnámón díll éldérflówér\nfénnél\ngíngér hórsérádísh\n\n")
+
+
+    def test_timeout(self):
+        self.config["subprocess_timeout"] = 0
+
+        with self.assertRaises(SentalignError):
+            self.run_test("unaligned.src.txt", "unaligned.tgt.txt",
+                    "aniseed\nbasil cinnamon dill elderflower\nfennel\nginger horseradish\n\n",
+                    "ánísééd\nbásíl cínnámón díll éldérflówér\nfénnél\ngíngér hórsérádísh\n\n")
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+    def test_subprocess_error(self):
+        self.config["dictionary"] = self.INPUT_BASE_PATH.joinpath("nonexistent-dictionary.txt")
+
+        with self.assertRaises(SentalignError):
+            self.run_test("unaligned.src.txt", "unaligned.tgt.txt",
+                    "aniseed\nbasil cinnamon dill elderflower\nfennel\nginger horseradish\n\n",
+                    "ánísééd\nbásíl cínnámón díll éldérflówér\nfénnél\ngíngér hórsérádísh\n\n")
+
+        self.assertEqual(self.output_src.getvalue(), "")
+        self.assertEqual(self.output_tgt.getvalue(), "")
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 0 - 0
test/splitters/__init__.py


+ 207 - 0
test/splitters/test_editable_sentence_splitter.py

@@ -0,0 +1,207 @@
+import io
+import unittest
+
+from toolchain.splitters.editable_sentence_splitter import EditableSentenceSplitter
+
+class TestEditableSentenceSplitter(unittest.TestCase):
+
+    def setUp(self):
+        self.output = io.StringIO()
+        self.abbreviations = ["gCo\tgContae\ttrue", "IR\tIonstraim Reachtúil", "srl\tagus araile", "Uimh\tUimhir"]
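+        # Abbreviation entries are tab-separated: abbreviation, expansion,
+        # and an optional flag ("true" for gCo) that appears to mark
+        # abbreviations expecting a following word, so a capitalised token
+        # after them does not end the sentence (see the gCo tests below).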
+        self.splitter = EditableSentenceSplitter()
+
+
+    def tearDown(self):
+        self.output.close()
+
+
+    def test_empty(self):
+        self.splitter.split_sentences(self.abbreviations, [], self.output)
+
+        self.assertEqual(self.output.getvalue(), "")
+
+
+    def test_line_empty(self):
+        lines = [""]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "")
+
+
+    def test_single_sentence_with_terminator(self):
+        lines = ["Is glas iad na cnoic i bhfad uainn."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Is glas iad na cnoic i bhfad uainn.\n")
+
+
+    def test_single_sentence_without_terminator(self):
+        lines = ["Is glas iad na cnoic i bhfad uainn"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Is glas iad na cnoic i bhfad uainn\n")
+
+
+    def test_non_abbreviation(self):
+        lines = [
+            "aaaaa. bbbbb",
+            "ccccc. DDDDD",
+        ]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "aaaaa.\nbbbbb\nccccc.\nDDDDD\n")
+
+
+    def test_abbreviation_followed_by_lowercase(self):
+        lines = ["a, b, srl. agus c"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "a, b, srl. agus c\n")
+
+
+    def test_abbreviation_followed_by_uppercase(self):
+        lines = ["a, b, srl. Agus c"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "a, b, srl.\nAgus c\n")
+
+
+    def test_abbreviation_expecting_additional_followed_by_lowercase(self):
+        lines = ["i gCo. an Chláir"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "i gCo. an Chláir\n")
+
+
+    def test_abbreviation_expecting_additional_followed_by_uppercase(self):
+        lines = ["i gCo. Chill Dara"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "i gCo. Chill Dara\n")
+
+
+    def test_abbreviation_at_end_of_line(self):
+        lines = ["a, b, srl."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "a, b, srl.\n")
+
+
+    def test_abbreviation_followed_by_numeral(self):
+        lines = ["Uimh. 9924 de 2027."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Uimh. 9924 de 2027.\n")
+
+
+    def test_multichar_abbreviation(self):
+        lines = ["I.R. 9924 de 2027."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "I.R. 9924 de 2027.\n")
+
+
+    def test_chained_abbreviations(self):
+        lines = ["I.R. Uimh. 9924 de 2027."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "I.R. Uimh. 9924 de 2027.\n")
+
+
+    def test_multiple_sentence(self):
+        lines = ["Is glas iad na cnoic i bhfad uainn. Nach leor nod don eolach? Fillean an feall ar an bhfeallaire!"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Is glas iad na cnoic i bhfad uainn.\nNach leor nod don eolach?\nFillean an feall ar an bhfeallaire!\n")
+
+
+    def test_newline_termination(self):
+        lines = [
+            "Líne 1\n",
+            "Líne 2?\n",
+            "Líne 3!\n",
+            "Líne 4.\n",
+        ]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "\n".join([
+            "Líne 1",
+            "Líne 2?",
+            "Líne 3!",
+            "Líne 4.\n",
+        ]))
+
+
+    def test_list_items(self):
+        lines = [
+            "1. líne 1. a",
+            "39. Líne 39",
+            "iv. Ítim iv",
+            "X. Ítim X",
+            "2021. Líne eile",
+        ]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "\n".join([
+            "1. líne 1.",
+            "a",
+            "39. Líne 39",
+            "iv. Ítim iv",
+            "X. Ítim X",
+            "2021.",
+            "Líne eile\n"
+        ]))
+
+
+    def test_mixed(self):
+        lines = [
+            "Is glas iad na cnoic i bhfad uainn. Nach leor nod don eolach?",
+            "",
+            "Fillean an feall ar an bhfeallaire! I.R. 6742 de 2039. Uimh. 39 de 1382. I.R. Uimh. 9924 de 2027.",
+            "a, b, srl. agus c? d, e, srl. Agus f! g, h, srl.",
+            "1. líne 1. a",
+            "39. Líne 39",
+            "iv. Ítim iv",
+            "X. Ítim X",
+            "2021. Líne eile",
+        ]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "\n".join([
+            "Is glas iad na cnoic i bhfad uainn.",
+            "Nach leor nod don eolach?",
+            "Fillean an feall ar an bhfeallaire!",
+            "I.R. 6742 de 2039.",
+            "Uimh. 39 de 1382.",
+            "I.R. Uimh. 9924 de 2027.",
+            "a, b, srl. agus c?",
+            "d, e, srl.",
+            "Agus f!",
+            "g, h, srl.",
+            "1. líne 1.",
+            "a",
+            "39. Líne 39",
+            "iv. Ítim iv",
+            "X. Ítim X",
+            "2021.",
+            "Líne eile\n"
+        ]))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 452 - 0
test/splitters/test_pdf_sentence_splitter.py

@@ -0,0 +1,452 @@
+import io
+import unittest
+
+from toolchain.splitters.pdf_sentence_splitter import PdfSentenceSplitter
+
+class TestPdfSentenceSplitter(unittest.TestCase):
+
+    def setUp(self):
+        self.output = io.StringIO()
+        self.abbreviations = ["gCo\tgContae\ttrue", "IR\tIonstraim Reachtúil", "srl\tagus araile", "Teo\tTeoranta", "Uimh\tUimhir"]
+        self.splitter = PdfSentenceSplitter()
+
+
+    def tearDown(self):
+        self.output.close()
+
+
+    def test_empty(self):
+        self.splitter.split_sentences(self.abbreviations, [], self.output)
+
+        self.assertEqual(self.output.getvalue(), "")
+
+
+    def test_line_empty(self):
+        lines = [""]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "")
+
+
+    def test_single_sentence_start_upper_with_terminator(self):
+        lines = ["Is glas iad na cnoic i bhfad uainn."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Is glas iad na cnoic i bhfad uainn.\n")
+
+
+    def test_single_sentence_start_lower_with_terminator(self):
+        lines = ["is glas iad na cnoic i bhfad uainn."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "is glas iad na cnoic i bhfad uainn.\n")
+
+
+    def test_single_sentence_without_terminator(self):
+        lines = ["Is glas iad na cnoic i bhfad uainn"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Is glas iad na cnoic i bhfad uainn\n")
+
+
+    def test_non_abbreviation(self):
+        lines = [
+            "aaaaa. bbbbb",
+            "ccccc. DDDDD",
+        ]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "aaaaa.\nbbbbb ccccc.\nDDDDD\n")
+
+
+    def test_abbreviation_followed_by_lowercase(self):
+        lines = ["a, b, srl. agus c"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "a, b, srl. agus c\n")
+
+
+    def test_abbreviation_followed_by_uppercase(self):
+        lines = ["a, b, srl. Agus c"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "a, b, srl.\nAgus c\n")
+
+
+    def test_abbreviation_at_end_of_line(self):
+        lines = ["a, b, srl."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "a, b, srl.\n")
+
+
+    def test_abbreviation_followed_by_numeral(self):
+        lines = ["Uimh. 9924 de 2027."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Uimh. 9924 de 2027.\n")
+
+
+    def test_multichar_abbreviation(self):
+        lines = ["I.R. 9924 de 2027."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "I.R. 9924 de 2027.\n")
+
+
+    def test_chained_abbreviations(self):
+        lines = ["I.R. Uimh. 9924 de 2027."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "I.R. Uimh. 9924 de 2027.\n")
+
+
+    def test_abbreviation_expecting_additional_followed_by_lowercase_same_line(self):
+        lines = ["i gCo. an Chláir"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "i gCo. an Chláir\n")
+
+
+    def test_abbreviation_expecting_additional_followed_by_uppercase_same_line(self):
+        lines = ["i gCo. Chill Dara"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "i gCo. Chill Dara\n")
+
+
+    def test_abbreviation_expecting_additional_followed_by_lowercase_across_lines(self):
+        lines = [
+            "i gCo.",
+            "an Chláir",
+        ]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "i gCo. an Chláir\n")
+
+
+    def test_abbreviation_expecting_additional_followed_by_uppercase_across_lines(self):
+        lines = [
+            "i gCo.",
+            "Chill Dara",
+        ]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "i gCo. Chill Dara\n")
+
+
+    def test_multiple_sentence(self):
+        lines = ["Is glas iad na cnoic i bhfad uainn. Nach leor nod don eolach? Fillean an feall ar an bhfeallaire!"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Is glas iad na cnoic i bhfad uainn.\nNach leor nod don eolach?\nFillean an feall ar an bhfeallaire!\n")
+
+
+    def test_newline_termination(self):
+        lines = [
+            "Líne 1\n",
+            "Líne 2?\n",
+            "Líne 3!\n",
+            "Líne 4.\n",
+        ]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "\n".join([
+            "Líne 1",
+            "Líne 2?",
+            "Líne 3!",
+            "Líne 4.\n",
+        ]))
+
+
+    def test_list_items(self):
+        lines = [
+            "1. líne 1. a",
+            "39. Líne 39",
+            "iv. Ítim iv",
+            "X. Ítim X",
+            "2021. Líne eile",
+        ]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "\n".join([
+            "1. líne 1.",
+            "a",
+            "39. Líne 39",
+            "iv. Ítim iv",
+            "X. Ítim X 2021.",
+            "Líne eile\n"
+        ]))
+
+
+    def test_multiple_lines_single_sentence_without_trailing_whitespace(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel,", "ginger."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel, ginger.\n")
+
+
+    def test_multiple_lines_single_sentence_with_single_trailing_whitespace(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel, ", "ginger."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel, ginger.\n")
+
+
+    def test_multiple_lines_single_sentence_with_multiple_trailing_whitespace(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel,   ", "ginger."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel, ginger.\n")
+
+
+    def test_multiple_lines_single_sentence_followed_by_numbered_list_single_digit(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "5. ginger"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel\n5. ginger\n")
+
+
+    def test_multiple_lines_single_sentence_followed_by_numbered_list_double_digit(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "75. ginger"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel\n75. ginger\n")
+
+
+    def test_multiple_lines_single_sentence_followed_by_numbered_list_single_letter(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "(a) ginger"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel\n(a) ginger\n")
+
+
+    def test_multiple_lines_single_sentence_followed_by_numbered_list_digit_and_letter(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "5a. ginger"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel\n5a. ginger\n")
+
+
+    def test_multiple_lines_single_sentence_followed_by_numbered_list_letter_and_digit(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "(A2) ginger"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel\n(A2) ginger\n")
+
+
+    def test_multiple_lines_single_sentence_followed_by_numbered_list_two_letters(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "(AB) ginger"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel (AB) ginger\n")
+
+
+    def test_multiple_lines_multiple_sentences_matching_linebreaks(self):
+        lines = ["Hello.", "World."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Hello.\nWorld.\n")
+
+
+    def test_multiple_lines_multiple_sentences_non_matching_linebreaks(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel, ", "ginger. Good day."]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel, ginger.\nGood day.\n")
+
+
+    def test_abbreviation_followed_by_numeral_without_terminator(self):
+        lines = ["Uimh. 7"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Uimh. 7\n")
+
+
+    def test_abbreviation_in_brackets(self):
+        lines = ["[Uimh. 7]"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "[Uimh. 7]\n")
+
+
+    def test_line_followed_by_single_empty_line(self):
+        lines = ["end of one paragraph", "", "start of next paragraph"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "end of one paragraph\nstart of next paragraph\n")
+
+
+    def test_line_followed_by_multiple_empty_lines(self):
+        lines = ["end of one paragraph", "", "", "", "start of next paragraph"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "end of one paragraph\nstart of next paragraph\n")
+
+
+    def test_long_line_without_sentence_terminator(self):
+        lines = ["this is a long line without a full stop at the end of it", "start of next paragraph"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "this is a long line without a full stop at the end of it start of next paragraph\n")
+
+
+    def test_short_line_without_sentence_terminator_next_uppercase(self):
+        lines = ["short line", "Start of next paragraph"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "short line\nStart of next paragraph\n")
+
+
+    def test_short_line_ending_previous_line(self):
+        lines = ["this is a long line without a full stop at the end of it", "short line", "Start of next paragraph"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "this is a long line without a full stop at the end of it short line\nStart of next paragraph\n")
+
+
+    def test_short_line_without_sentence_terminator_next_lowercase(self):
+        lines = ["short line", "rest of line"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "short line rest of line\n")
+
+
+    def test_abbreviation_split_by_stops_non_upper(self):
+        lines = ["T.e.o. 7"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "T.e.o.\n7\n")
+
+
+    def test_abbreviation_split_by_stops_upper(self):
+        lines = ["I.R. 7"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "I.R. 7\n")
+
+
+    def test_two_abbreviations_in_sequence(self):
+        lines = ["I.R. Uimh. 7"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "I.R. Uimh. 7\n")
+
+
+    def test_single_word_line_starting_with_list_number_in_parentheses(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "(1)"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel (1)\n")
+
+
+    def test_multi_word_line_starting_with_single_character_list_number_in_parentheses(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "(1) ginger"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel\n(1) ginger\n")
+
+
+    def test_multi_word_line_starting_with_multi_character_list_number_in_parentheses(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "(7c) ginger"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel\n(7c) ginger\n")
+
+
+    def test_single_word_line_starting_with_word_in_parentheses(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "(ginger)"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel (ginger)\n")
+
+
+    def test_multi_word_line_starting_with_word_in_parentheses(self):
+        lines = ["Aniseed, basil, cinnamon, dill, elder, fennel", "(ginger) horseradish"]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "Aniseed, basil, cinnamon, dill, elder, fennel (ginger) horseradish\n")
+
+
+    def test_mixed(self):
+        lines = [
+            "Is glas iad na cnoic i bhfad uainn. Nach leor nod don eolach?",
+            "",
+            "Fillean an feall ar an bhfeallaire! I.R. 6742 de 2039. Uimh. 39 de 1382. I.R. Uimh. 9924 de 2027.",
+            "a, b, srl. agus c? d, e, srl. Agus f! g, h, srl.",
+            "1. líne 1. a",
+            "39. Líne 39",
+            "iv. Ítim iv",
+            "X. Ítim X",
+            "2021. Líne eile",
+        ]
+
+        self.splitter.split_sentences(self.abbreviations, lines, self.output)
+
+        self.assertEqual(self.output.getvalue(), "\n".join([
+            "Is glas iad na cnoic i bhfad uainn.",
+            "Nach leor nod don eolach?",
+            "Fillean an feall ar an bhfeallaire!",
+            "I.R. 6742 de 2039.",
+            "Uimh. 39 de 1382.",
+            "I.R. Uimh. 9924 de 2027.",
+            "a, b, srl. agus c?",
+            "d, e, srl.",
+            "Agus f!",
+            "g, h, srl.",
+            "1. líne 1.",
+            "a",
+            "39. Líne 39",
+            "iv. Ítim iv",
+            "X. Ítim X 2021.",
+            "Líne eile\n"
+        ]))
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 0 - 0
test/writers/__init__.py


+ 50 - 0
test/writers/test_file_concatenator.py

@@ -0,0 +1,50 @@
+import io
+import unittest
+
+from toolchain.writers.file_concatenator import FileConcatenator
+
+class TestFileConcatenator(unittest.TestCase):
+
+    def setUp(self):
+        self.output = io.StringIO()
+        self.concatenator = FileConcatenator()
+
+
+    def tearDown(self):
+        self.output.close()
+
+
+    def make_input(self, lines):
+        input = io.StringIO()
+        for line in lines:
+            input.write(line + "\n")
+        input.seek(0)
+        return input
+
+
+    def test_empty(self):
+        input = self.make_input([])
+
+        self.concatenator.concatenate_file(input, self.output)
+
+        self.assertEqual(self.output.getvalue(), "")
+
+
+    def test_single_line(self):
+        input = self.make_input(["aniseed"])
+
+        self.concatenator.concatenate_file(input, self.output)
+
+        self.assertEqual(self.output.getvalue(), "aniseed\n")
+
+
+    def test_multiple_line(self):
+        input = self.make_input(["aniseed", "basil", "cinnamon dill", "elderflower"])
+
+        self.concatenator.concatenate_file(input, self.output)
+
+        self.assertEqual(self.output.getvalue(), "aniseed\nbasil\ncinnamon dill\nelderflower\n")
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 124 - 0
test/writers/test_tmx_creator.py

@@ -0,0 +1,124 @@
+import io
+import unittest
+
+from toolchain.writers.tmx_creator import TmxCreator
+
+class TestTmxCreator(unittest.TestCase):
+
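+    # Inline Jinja2-style TMX template, so the expected outputs below are self-contained.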
+    TEMPLATE_SOURCE = (
+        '<?xml version="1.0" encoding="utf-8"?>\n'
+        '<tmx version="1.4" creationtool="tmx_creator" creationtoolversion="1.0" segtype="{{ segtype }}" o-tmf="{{ o_tmf }}" adminlang="{{ adminlang }}" srclang="{{ srclang }}" datatype="{{ datatype }}">\n'
+        '    <header>\n'
+        '        <prop type="x-TMName">{{ tm_name }}</prop>\n'
+        '        <prop type="distributor">{{ distributor }}</prop>\n'
+        '        <prop type="disclaimer">{{ disclaimer }}</prop>\n'
+        '        <prop type="licence">{{ licence }}</prop>\n'
+        '    </header>\n'
+        '    <body>\n'
+        '        {% for (src, tgt) in tus -%}\n'
+        '        <tu tuid="{{ loop.index }}">\n'
+        '            <tuv xml:lang="{{ srclang }}">\n'
+        '                <seg>{{ src }}</seg>\n'
+        '            </tuv>\n'
+        '            <tuv xml:lang="{{ tgtlang }}">\n'
+        '                <seg>{{ tgt }}</seg>\n'
+        '            </tuv>\n'
+        '        </tu>\n'
+        '        {% endfor -%}\n'
+        '    </body>\n'
+        '</tmx>\n'
+    )
+
+    ADDITIONAL_TMX_ARGS = {
+        "adminlang" : "en",
+        "datatype" : "plaintext",
+        "disclaimer" : "This file was produced by automated means",
+        "distributor" : "Distributor Name",
+        "licence" : "CC-BY-4.0",
+        "o_tmf" : "unknown",
+        "segtype" : "sentence",
+        "tm_name" : "sample",
+        "srclang" : "en",
+        "tgtlang" : "ga",
+    }
+
+    def setUp(self):
+        self.output = io.StringIO()
+        self.creator = TmxCreator()
+
+
+    def tearDown(self):
+        self.output.close()
+
+
+    def test_empty(self):
+        self.creator.create_tmx(self.TEMPLATE_SOURCE, [], [], self.output, self.ADDITIONAL_TMX_ARGS)
+
+        self.assertEqual(self.output.getvalue(),
+            '<?xml version="1.0" encoding="utf-8"?>\n'
+            '<tmx version="1.4" creationtool="tmx_creator" creationtoolversion="1.0" segtype="sentence" o-tmf="unknown" adminlang="en" srclang="en" datatype="plaintext">\n'
+            '    <header>\n'
+            '        <prop type="x-TMName">sample</prop>\n'
+            '        <prop type="distributor">Distributor Name</prop>\n'
+            '        <prop type="disclaimer">This file was produced by automated means</prop>\n'
+            '        <prop type="licence">CC-BY-4.0</prop>\n'
+            '    </header>\n'
+            '    <body>\n'
+            '        </body>\n'
+            '</tmx>'
+        )
+
+
+    def test_single_tu_non_empty(self):
+        self.creator.create_tmx(self.TEMPLATE_SOURCE, ["window"], ["fuinneog"], self.output, self.ADDITIONAL_TMX_ARGS)
+
+        self.assertEqual(self.output.getvalue(),
+            '<?xml version="1.0" encoding="utf-8"?>\n'
+            '<tmx version="1.4" creationtool="tmx_creator" creationtoolversion="1.0" segtype="sentence" o-tmf="unknown" adminlang="en" srclang="en" datatype="plaintext">\n'
+            '    <header>\n'
+            '        <prop type="x-TMName">sample</prop>\n'
+            '        <prop type="distributor">Distributor Name</prop>\n'
+            '        <prop type="disclaimer">This file was produced by automated means</prop>\n'
+            '        <prop type="licence">CC-BY-4.0</prop>\n'
+            '    </header>\n'
+            '    <body>\n'
+            '        <tu tuid="1">\n'
+            '            <tuv xml:lang="en">\n'
+            '                <seg>window</seg>\n'
+            '            </tuv>\n'
+            '            <tuv xml:lang="ga">\n'
+            '                <seg>fuinneog</seg>\n'
+            '            </tuv>\n'
+            '        </tu>\n'
+            '        </body>\n'
+            '</tmx>'
+        )
+
+
+    def test_single_tu_escape(self):
+        self.creator.create_tmx(self.TEMPLATE_SOURCE, ["window&"], ["fuinneog<"], self.output, self.ADDITIONAL_TMX_ARGS)
+
+        self.assertEqual(self.output.getvalue(),
+            '<?xml version="1.0" encoding="utf-8"?>\n'
+            '<tmx version="1.4" creationtool="tmx_creator" creationtoolversion="1.0" segtype="sentence" o-tmf="unknown" adminlang="en" srclang="en" datatype="plaintext">\n'
+            '    <header>\n'
+            '        <prop type="x-TMName">sample</prop>\n'
+            '        <prop type="distributor">Distributor Name</prop>\n'
+            '        <prop type="disclaimer">This file was produced by automated means</prop>\n'
+            '        <prop type="licence">CC-BY-4.0</prop>\n'
+            '    </header>\n'
+            '    <body>\n'
+            '        <tu tuid="1">\n'
+            '            <tuv xml:lang="en">\n'
+            '                <seg>window&amp;</seg>\n'
+            '            </tuv>\n'
+            '            <tuv xml:lang="ga">\n'
+            '                <seg>fuinneog&lt;</seg>\n'
+            '            </tuv>\n'
+            '        </tu>\n'
+            '        </body>\n'
+            '</tmx>'
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()

+ 73 - 0
toolchain/cleaners/monolingual_cleaner.py

@@ -0,0 +1,73 @@
+import argparse
+import logging
+
+from toolchain.common.language_detector import LanguageDetector
+
+logger = logging.getLogger(__name__)
+
+class MonolingualCleaner:
+
+    DEFAULT_LANGUAGE_DETECTION_THRESHOLD = 40
+    DEFAULT_REJECTED_LINE_DELIMITER = "@@@"
+    REJECTION_EMPTY = "empty_segment"
+    REJECTION_UNEXPECTED_LANGUAGE = "unexpected_language_[{0}]"
+
+    def __init__(self, lang, config={}, language_detector=LanguageDetector()):
+        self.lang = lang
+        self.language_detector = language_detector
+        self.language_detection_threshold = int(config.get("language_detection_threshold", self.DEFAULT_LANGUAGE_DETECTION_THRESHOLD))
+        rejected_line_delimiter = config.get("rejected_line_delimiter", self.DEFAULT_REJECTED_LINE_DELIMITER)
+        self.rejected_line_template = rejected_line_delimiter.join(["{0}", "{1}"])
+
+
+    def clean(self, input_path, output_path_retained, output_path_rejected):
+        logger.info("Cleaning {0} to {1} with rejections to {2}.".format(input_path, output_path_retained, output_path_rejected))
+        with open(input_path) as input, open(output_path_retained, "w") as output_retained, open(output_path_rejected, "w") as output_rejected:
+            self.clean_text(input, output_retained, output_rejected)
+
+
+    def clean_text(self, input, output_retained, output_rejected):
+        for line in input:
+            term = line.rstrip("\n")
+            should_include, message = self.should_include(term.strip())
+            if should_include:
+                self.write_file_line(output_retained, term)
+            else:
+                self.write_file_line(output_rejected, message)
+
+
+    def should_include(self, term):
+        if not term:
+            message = self.rejected_line_template.format(self.REJECTION_EMPTY, term)
+            return False, message
+
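+        # Only language-check terms of at least language_detection_threshold characters; shorter strings are too short to classify reliably.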
+        if len(term) >= self.language_detection_threshold:
+            detected_lang = self.language_detector.detect(term)
+            if detected_lang != self.lang:
+                reason = self.REJECTION_UNEXPECTED_LANGUAGE.format(detected_lang)
+                message = self.rejected_line_template.format(reason, term)
+                return False, message
+
+        return True, ""
+
+
+    def write_file_line(self, file, text):
+        file.write(text + "\n")
+
+
+if __name__ == "__main__":
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("lang", help="language code")
+    argparser.add_argument("input_path", help="path to input file")
+    argparser.add_argument("output_path_retained", help="path to output file")
+    argparser.add_argument("output_path_rejected", help="path to output rejection file")
+    argparser.add_argument("--langdetect_threshold", type=int, default=40, help="check language of only lines of this number of characters or more")
+    argparser.add_argument("--rejected_line_delimiter", type=str, default="@@@", help="string to use to delimit fields of rejection lines")
+    args = argparser.parse_args()
+
+    config = {
+        "language_detection_threshold" : args.langdetect_threshold,
+        "rejected_line_delimiter" : args.rejected_line_delimiter,
+    }
+
+    MonolingualCleaner(args.lang, config=config).clean(args.input_path, args.output_path_retained, args.output_path_rejected)

+ 90 - 0
toolchain/cleaners/post_alignment_cleaner.py

@@ -0,0 +1,90 @@
+import argparse
+import logging
+
+from toolchain.common.language_detector import LanguageDetector
+
+logger = logging.getLogger(__name__)
+
+class PostAlignmentCleaner:
+
+    DEFAULT_LANGUAGE_DETECTION_THRESHOLD = 40
+    DEFAULT_REJECTED_LINE_DELIMITER = "@@@"
+    REJECTION_EMPTY = "empty_segment"
+    REJECTION_NONALPHA = "nonalpha"
+    REJECTION_UNEXPECTED_LANGUAGE = "unexpected_language_[{0}:{1}]"
+
+    def __init__(self, lang_src, lang_tgt, config={}, language_detector=LanguageDetector()):
+        self.lang_src = lang_src
+        self.lang_tgt = lang_tgt
+        self.language_detector = language_detector
+        self.language_detection_threshold = int(config.get("language_detection_threshold", self.DEFAULT_LANGUAGE_DETECTION_THRESHOLD))
+        rejected_line_delimiter = config.get("rejected_line_delimiter", self.DEFAULT_REJECTED_LINE_DELIMITER)
+        self.rejected_line_template = rejected_line_delimiter.join(["{0}", "{1}", "{2}"])
+
+
+    def clean(self, input_path_src, input_path_tgt, output_path_src, output_path_tgt, output_path_rejected):
+        logger.info("Cleaning {0} and {1} to {2} and {3} with rejections to {4}.".format(input_path_src, input_path_tgt, output_path_src, output_path_tgt, output_path_rejected))
+        with open(input_path_src) as input_src, open(input_path_tgt) as input_tgt,\
+                open(output_path_src, "w") as output_src, open(output_path_tgt, "w") as output_tgt, open(output_path_rejected, "w") as output_rejected:
+            self.clean_text(input_src, input_tgt, output_src, output_tgt, output_rejected)
+
+
+    def clean_text(self, input_src, input_tgt, output_src, output_tgt, output_rejected):
+        for input_pair in zip(input_src, input_tgt):
+            term_src, term_tgt = input_pair[0].rstrip("\n"), input_pair[1].rstrip("\n")
+            should_include, message = self.should_include(term_src.strip(), term_tgt.strip())
+            if should_include:
+                self.write_file_line(output_src, term_src)
+                self.write_file_line(output_tgt, term_tgt)
+            else:
+                self.write_file_line(output_rejected, message)
+
+
+    def should_include(self, term_src, term_tgt):
+        if not term_src or not term_tgt:
+            message = self.rejected_line_template.format(self.REJECTION_EMPTY, term_src, term_tgt)
+            return False, message
+
+        if not self.contains_alpha(term_src) and not self.contains_alpha(term_tgt):
+            message = self.rejected_line_template.format(self.REJECTION_NONALPHA, term_src, term_tgt)
+            return False, message
+
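+        # Language-check segments long enough to classify reliably, plus identical src/tgt pairs of any length (often a sign of untranslated text).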
+        if len(term_src) >= self.language_detection_threshold or len(term_tgt) >= self.language_detection_threshold or term_src == term_tgt:
+            detected_lang_src = self.language_detector.detect(term_src)
+            detected_lang_tgt = self.language_detector.detect(term_tgt)
+            if detected_lang_src != self.lang_src or detected_lang_tgt != self.lang_tgt:
+                reason = self.REJECTION_UNEXPECTED_LANGUAGE.format(detected_lang_src, detected_lang_tgt)
+                message = self.rejected_line_template.format(reason, term_src, term_tgt)
+                return False, message
+
+        return True, ""
+
+
+    def contains_alpha(self, token):
+        return any(c.isalpha() for c in token)
+
+
+    def write_file_line(self, file, text):
+        file.write(text + "\n")
+
+
+if __name__ == "__main__":
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("lang_src", help="source language code")
+    argparser.add_argument("lang_tgt", help="target language code")
+    argparser.add_argument("input_path_src", help="path to input file of source language")
+    argparser.add_argument("input_path_tgt", help="path to input file of target language")
+    argparser.add_argument("output_path_src", help="path to output file of source language")
+    argparser.add_argument("output_path_tgt", help="path to output file of target language")
+    argparser.add_argument("output_path_rejected", help="path to output rejection file")
+    argparser.add_argument("--langdetect_threshold", type=int, default=40, help="check language of only lines of this number of characters or more")
+    argparser.add_argument("--rejected_line_delimiter", type=str, default="@@@", help="string to use to delimit fields of rejection lines")
+    args = argparser.parse_args()
+
+    config = {
+        "language_detection_threshold" : args.langdetect_threshold,
+        "rejected_line_delimiter" : args.rejected_line_delimiter,
+    }
+
+    PostAlignmentCleaner(args.lang_src, args.lang_tgt, config=config).clean(args.input_path_src, args.input_path_tgt,
+            args.output_path_src, args.output_path_tgt, args.output_path_rejected)

+ 31 - 0
toolchain/common/file_size_counter.py

@@ -0,0 +1,31 @@
+import argparse
+from collections import namedtuple
+import logging
+
+logger = logging.getLogger(__name__)
+
+FileSize = namedtuple("FileSize", ["lines", "words"])
+
+class FileSizeCounter:
+
+    def count(self, input_path):
+        logger.info("Counting size of {0}.".format(input_path))
+        with open(input_path) as input_file:
+            return self.count_sizes(input_file)
+
+
+    def count_sizes(self, input_file):
+        line_count = 0
+        word_count = 0
+        for line in input_file:
+            line_count += 1
+            word_count += len(line.strip().split())
+        return FileSize(lines=line_count, words=word_count)
+
+
+if __name__ == "__main__":
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("input_path", help="path to input file")
+    args = argparser.parse_args()
+
+    print(FileSizeCounter().count(args.input_path))

+ 113 - 0
toolchain/common/language_detector.py

@@ -0,0 +1,113 @@
+import argparse
+from collections import namedtuple
+from langdetect import detect_langs, DetectorFactory
+from langdetect.lang_detect_exception import LangDetectException
+import logging
+
+logger = logging.getLogger(__name__)
+
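+# Fix the langdetect RNG seed so detection results are deterministic across runs.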
+DetectorFactory.seed = 0
+
+DetectionResult = namedtuple("DetectionResult", ["language", "probability", "total", "tested"])
+
+class LanguageDetector:
+
+    DEFAULT_MIN_FILE_LINE_LENGTH = 0
+    DEFAULT_MIN_INITIAL_LINES = 50
+    DEFAULT_SAMPLING_INTERVAL = 100
+    DEFAULT_LANGUAGE = ""
+    DEFAULT_PROBABILITY = -1.0
+
+    def detect(self, text):
+        detected = self.detect_language(text)
+        return detected.language
+
+
+    def detect_in_file(self, input_path, config={}):
+        logger.info("Detecting language in file {0}.".format(input_path))
+        with open(input_path) as input_file:
+            detected = self.detect_language_in_file(input_file, config)
+            return detected.language
+
+
+    def detect_language(self, text):
+        try:
+            detected_langs = detect_langs(text.lower())
+            logger.debug("Detected language(s) with probabilities {0}.".format(detected_langs))
+            detected_lang = max(detected_langs, key=lambda x: x.prob)
+            return DetectionResult(language=detected_lang.lang, probability=detected_lang.prob, total=1, tested=1)
+        except LangDetectException as lde:
+            # langdetect produces a lot of these, mostly for empty or otherwise irrelevant lines
+            logger.warning("Language could not be detected, exception: {0}.".format(lde))
+            return DetectionResult(language=self.DEFAULT_LANGUAGE, probability=self.DEFAULT_PROBABILITY, total=1, tested=0)
+        except Exception as e:
+            logger.exception("Language could not be detected, exception: {0}.".format(e))
+            return DetectionResult(language=self.DEFAULT_LANGUAGE, probability=self.DEFAULT_PROBABILITY, total=1, tested=0)
+
+
+    def detect_language_in_file(self, lines, config={}):
+        min_file_line_length = int(config.get("min_file_line_length", self.DEFAULT_MIN_FILE_LINE_LENGTH))
+        min_initial_lines = int(config.get("min_initial_lines", self.DEFAULT_MIN_INITIAL_LINES))
+        sampling_interval = int(config.get("sampling_interval", self.DEFAULT_SAMPLING_INTERVAL))
+
+        combined_probabilities = {}
+        count = 0
+        tested_count = 0
+
+        for line in lines:
+            count += 1
+
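+            # Sample the first min_initial_lines lines, then every sampling_interval-th line, ignoring lines shorter than min_file_line_length.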
+            if len(line) >= min_file_line_length and (count <= min_initial_lines or count % sampling_interval == 0):
+                try:
+                    detected_langs = detect_langs(line.lower())
+                    tested_count += 1
+
+                    for detected_lang in detected_langs:
+                        code = detected_lang.lang
+                        if code not in combined_probabilities:
+                            combined_probabilities[code] = 0
+                        combined_probabilities[code] += detected_lang.prob
+
+                except LangDetectException as lde:
+                    # langdetect produces a lot of these, mostly for empty or otherwise irrelevant lines;
+                    logger.warning("Language not detected for line (line ignored), langdetect exception: \"{0}\".".format(str(lde)))
+                    continue
+
+                except Exception as e:
+                    logger.exception("Unexpected exception (line ignored): {0}.".format(e))
+                    continue
+
+        language, total_probability = self.find_max_probability(combined_probabilities)
+        normalized_probability = self.calculate_normalized_probability(total_probability, tested_count)
+        return DetectionResult(language=language, probability=normalized_probability, total=count, tested=tested_count)
+
+
+    def find_max_probability(self, combined_probabilities):
+        return max(combined_probabilities.items(), key=lambda x: x[1], default=(self.DEFAULT_LANGUAGE, self.DEFAULT_PROBABILITY))
+
+
+    def calculate_normalized_probability(self, total_probability, tested_count):
+        if tested_count <= 0:
+            return self.DEFAULT_PROBABILITY
+        return total_probability / tested_count
+
+
+if __name__ == "__main__":
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("input_line", help="input string or filename (default is string)")
+    argparser.add_argument("--file", action="store_true", help="input_line is a file")
+    argparser.add_argument("--min_file_line_length", type=int, default=0, help="minimum number of characters in a line of a file to check")
+    argparser.add_argument("--min_initial_lines", type=int, default=50, help="minimum number of lines to check at start of file")
+    argparser.add_argument("--sampling_interval", type=int, default=100, help="sampling interval of lines in file")
+    args = argparser.parse_args()
+
+    config = {
+        "min_file_line_length" : args.min_file_line_length,
+        "min_initial_lines" : args.min_initial_lines,
+        "sampling_interval" : args.sampling_interval,
+    }
+
+    if args.file:
+        print(LanguageDetector().detect_in_file(args.input_line, config))
+    else:
+        print(LanguageDetector().detect(args.input_line))

+ 21 - 0
toolchain/common/raw_file_indexer.py

@@ -0,0 +1,21 @@
+import logging
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+class RawFileIndexer:
+
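+    # Each index line is "<1-based index>\t<file path>".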
+    LINE_TEMPLATE = "{0}\t{1}\n"
+
+    def index_files(self, input_dir, extensions, output_file):
+        logger.info("Indexing files in directory {0} to {1}.".format(input_dir, output_file))
+
+        files = [file for file in Path(input_dir).rglob("*") if file.suffix.lower() in extensions]
+        with open(output_file, "w") as output:
+            self.index(files, output)
+
+
+    def index(self, files, output):
+        for index, file in enumerate(files):
+            logger.debug("Indexing file {0}.".format(file))
+            output.write(self.LINE_TEMPLATE.format(index + 1, file))

+ 9 - 0
toolchain/common/templates.py

@@ -0,0 +1,9 @@
+class OutputPathTemplate:
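+    # {0} is the output base path/stem and {1} a language code or label such as "rejected"; CONCATENATED takes only a language code.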
+    ALIGNED = "{0}.aligned.{1}.txt"
+    CLEANED = "{0}.cleaned.{1}.txt"
+    CONCATENATED = "concatenated.{0}.txt"
+    EXTRACTED = "{0}.extracted.txt"
+    NORMALIZED_KNOWN = "{0}.normalized.{1}.txt"
+    NORMALIZED_UNKNOWN = "{0}.normalized.txt"
+    PARSED = "{0}.parsed.{1}.txt"
+    SPLIT = "{0}.split.{1}.txt"

+ 2 - 0
toolchain/common/toolchain_error.py

@@ -0,0 +1,2 @@
+class ToolchainError(Exception):
+    pass

+ 245 - 0
toolchain/doc_to_tmx_processor.py

@@ -0,0 +1,245 @@
+import argparse
+import logging
+import os
+import pathlib
+import shutil
+from collections import namedtuple
+
+from toolchain.cleaners.monolingual_cleaner import MonolingualCleaner
+from toolchain.cleaners.post_alignment_cleaner import PostAlignmentCleaner
+from toolchain.common.file_size_counter import FileSizeCounter
+from toolchain.common.language_detector import LanguageDetector
+from toolchain.common.raw_file_indexer import RawFileIndexer
+from toolchain.common.templates import OutputPathTemplate
+from toolchain.common.toolchain_error import ToolchainError
+from toolchain.docalign.document_aligner import DocumentAligner
+from toolchain.extractors.editable_text_extractor import EditableTextExtractor
+from toolchain.extractors.pdf_text_extractor import PdfTextExtractor
+from toolchain.extractors.plain_text_extractor import PlainTextExtractor
+from toolchain.normalizer.unicode_normalizer import UnicodeNormalizer
+from toolchain.sentalign.sentence_aligner import SentenceAligner
+from toolchain.splitters.editable_sentence_splitter import EditableSentenceSplitter
+from toolchain.splitters.pdf_sentence_splitter import PdfSentenceSplitter
+from toolchain.toolchain_processor import ToolchainProcessor
+from toolchain.writers.tmx_creator import TmxCreator
+from toolchain.writers.file_concatenator import FileConcatenator
+
+logger = logging.getLogger(__name__)
+
+ParserTypes = namedtuple("ParserTypes", ["extractor", "extraction_tool", "splitter"])
+
+class DocToTmxProcessor(ToolchainProcessor):
+
+    HUNALIGN = pathlib.Path(os.environ["HUNALIGNPATH"]).resolve()
+    PDFTOTEXT = pathlib.Path(os.environ["PDFTOTEXTPATH"]).resolve()
+    LIBREOFFICE = pathlib.Path(os.environ["LIBREOFFICEPATH"]).resolve()
+
+    BASE_OUTPUT_FILESTEM = "doc_{0}"
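+    # Extraction/splitting strategy per input extension: editable formats go through LibreOffice and keep
+    # their line structure; PDFs go through pdftotext and need line-rejoining heuristics when split.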
+    PARSERS = {
+        ".doc" : ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
+        ".docx" : ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
+        ".odt" : ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
+        ".pdf" : ParserTypes(extractor=PdfTextExtractor, extraction_tool=PDFTOTEXT, splitter=PdfSentenceSplitter),
+        ".rtf" : ParserTypes(extractor=EditableTextExtractor, extraction_tool=LIBREOFFICE, splitter=EditableSentenceSplitter),
+        ".txt" : ParserTypes(extractor=PlainTextExtractor, extraction_tool=None, splitter=EditableSentenceSplitter),
+    }
+    EXTENSIONS = PARSERS.keys()
+
+    def process(self, id, input_dir, artefact_dir, output_dir):
+        logger.info("Starting Doc to TMX toolchain for input directory {0}.".format(input_dir))
+
+        try:
+            input_dirpath = pathlib.Path(input_dir).resolve()
+            artefact_dirpath = pathlib.Path(artefact_dir).resolve()
+            output_dirpath = pathlib.Path(output_dir).resolve()
+
+            artefact_dirpath.mkdir(parents=True, exist_ok=True)
+            output_dirpath.mkdir(parents=True, exist_ok=True)
+
+            return self.process_directory(id, input_dirpath, artefact_dirpath, output_dirpath)
+        except Exception as e:
+            logger.exception(e)
+            logger.error("Error processing documents: {0}.".format(e))
+            raise
+
+
+    def process_directory(self, id, input_basedir, artefact_basedir, output_basedir):
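+        # Pipeline: index the input files, extract and normalize text, detect each file's language, split into sentences,
+        # align documents, then sentences, clean the aligned pairs, concatenate them, and write a TMX; unmatched files may be kept as monolingual output.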
+        langident_dir = self.create_directory(artefact_basedir, "langident")
+        rejected_dir = self.create_directory(artefact_basedir, "rejected")
+        docalign_dir = self.create_directory(artefact_basedir, "docalign")
+        sentalign_dir = self.create_directory(artefact_basedir, "sentalign")
+        clean_dir = self.create_directory(artefact_basedir, "clean")
+        concatenate_dir = self.create_directory(artefact_basedir, "concatenate")
+        bilingual_dir = self.create_directory(output_basedir, "bilingual")
+        monolingual_dir = self.create_directory(output_basedir, "monolingual")
+
+        index_file = artefact_basedir.joinpath(self.INDEX_FILENAME)
+        RawFileIndexer().index_files(input_basedir, self.EXTENSIONS, index_file)
+        logger.info("Indexed {0} to {1}.".format(input_basedir, index_file))
+
+        additional_tmx_args = dict(self.global_tmx_args)
+        additional_tmx_args["o_tmf"] = "Corpus"
+
+        rejected = 0
+        files_by_language = {}
+        with open(index_file) as index:
+            for line in index.read().splitlines():
+                file_rejected = self.preprocess_file(line, rejected_dir, artefact_basedir, files_by_language)
+                if file_rejected:
+                    rejected += 1
+
+        unaligned_file_list_src = files_by_language.get(self.lang_src, [])
+        unaligned_file_list_tgt = files_by_language.get(self.lang_tgt, [])
+
+        aligned_document_pairs, unmatched_list_src, unmatched_list_tgt = DocumentAligner(self.docalign_config).align(
+                unaligned_file_list_src, unaligned_file_list_tgt, docalign_dir)
+        monolingual_file_info_src = self.resolve_unmatched_files(id, unmatched_list_src, self.lang_src, clean_dir,
+                rejected_dir, monolingual_dir, self.keep_unmatched_src, self.monolingual_filename_template_source)
+        monolingual_file_info_tgt = self.resolve_unmatched_files(id, unmatched_list_tgt, self.lang_tgt, clean_dir,
+                rejected_dir, monolingual_dir, self.keep_unmatched_tgt, self.monolingual_filename_template_target)
+
+        cleaned_file_list_src = []
+        cleaned_file_list_tgt = []
+        for index, document_pair in enumerate(aligned_document_pairs):
+            pair_label = "sa_{0}".format(index+1)
+            path_docaligned_src = document_pair[0]
+            path_docaligned_tgt = document_pair[1]
+
+            sentalign_pair_dir = self.create_directory(sentalign_dir, pair_label)
+            sentalign_pair_base_path = sentalign_pair_dir.joinpath(pair_label)
+            path_sentaligned_src = pathlib.Path(OutputPathTemplate.ALIGNED.format(sentalign_pair_base_path, self.lang_src))
+            path_sentaligned_tgt = pathlib.Path(OutputPathTemplate.ALIGNED.format(sentalign_pair_base_path, self.lang_tgt))
+            SentenceAligner(self.HUNALIGN).align(path_docaligned_src, path_docaligned_tgt,
+                    path_sentaligned_src, path_sentaligned_tgt, sentalign_pair_dir, self.sentalign_config)
+            logger.info("Sentence aligned to {0} and {1}.".format(path_sentaligned_src, path_sentaligned_tgt))
+
+            clean_base_path = clean_dir.joinpath(pair_label)
+            path_cleaned_src = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, self.lang_src))
+            path_cleaned_tgt = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, self.lang_tgt))
+            path_cleaned_rejected = pathlib.Path(OutputPathTemplate.CLEANED.format(clean_base_path, "rejected"))
+            PostAlignmentCleaner(self.lang_src, self.lang_tgt, config=self.cleaner_config).clean(
+                    path_sentaligned_src, path_sentaligned_tgt, path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected)
+            shutil.copy(path_cleaned_rejected, rejected_dir)
+            cleaned_file_list_src.append(path_cleaned_src)
+            cleaned_file_list_tgt.append(path_cleaned_tgt)
+            logger.info("Cleaned to {0} and {1} with rejections in {2}.".format(path_cleaned_src, path_cleaned_tgt, path_cleaned_rejected))
+
+        path_concatenated_src = concatenate_dir.joinpath(OutputPathTemplate.CONCATENATED.format(self.lang_src))
+        path_concatenated_tgt = concatenate_dir.joinpath(OutputPathTemplate.CONCATENATED.format(self.lang_tgt))
+
+        FileConcatenator().concatenate(cleaned_file_list_src, path_concatenated_src)
+        FileConcatenator().concatenate(cleaned_file_list_tgt, path_concatenated_tgt)
+        logger.info("Concatenated to {0} and {1}.".format(path_concatenated_src, path_concatenated_tgt))
+
+        parallel_file_sizes = FileSizeCounter().count(path_concatenated_src)
+        logger.info("Counted {0} lines in file {1}.".format(parallel_file_sizes.lines, path_concatenated_src))
+
+        parallel_file_info = None
+        if parallel_file_sizes.lines > 0:
+            parallel_file_info = self.create_file_info("bilingual", [self.lang_src, self.lang_tgt],
+                    parallel_file_sizes.lines, "translation_units")
+            output_filestem = self.BASE_OUTPUT_FILESTEM.format(id)
+            path_output_tmx = self.tmx_filename_template.format(bilingual_dir.joinpath(output_filestem))
+            TmxCreator().create(self.tmx_template, path_concatenated_src, path_concatenated_tgt, path_output_tmx, additional_tmx_args)
+            logger.info("Created TMX file at {0}.".format(path_output_tmx))
+        else:
+            logger.info("No parallel data found for resource {0}, skipping TMX creation.".format(id))
+
+        return rejected, list(filter(None, [monolingual_file_info_src, monolingual_file_info_tgt, parallel_file_info]))
+
+
+    def preprocess_file(self, index_line, rejected_dir, artefact_basedir, files_by_language):
+        try:
+            index_tokens = index_line.split("\t")
+            index_no = index_tokens[0]
+            input_path = index_tokens[1]
+
+            lower_extension = pathlib.Path(input_path).suffix.lower()
+            parser_types = self.PARSERS.get(lower_extension, None)
+
+            if not parser_types:
+                shutil.copy(input_path, rejected_dir)
+                logger.error("No extractor found for type {0}, skipping.".format(lower_extension))
+                return True
+
+            logger.info("Preprocessing index {0} file {1}.".format(index_no, input_path))
+
+            artefact_dir = self.create_directory(artefact_basedir, index_no)
+            artefact_basepath = artefact_dir.joinpath(index_no)
+
+            path_extracted = OutputPathTemplate.EXTRACTED.format(artefact_basepath)
+            path_normalized = OutputPathTemplate.NORMALIZED_UNKNOWN.format(artefact_basepath)
+
+            logger.info("Selecting parsers for {0}: extractor: {1}, extraction tool: {2}, sentence splitter: {3}.".format(
+                    input_path, parser_types.extractor, parser_types.extraction_tool, parser_types.splitter))
+            parser_types.extractor(parser_types.extraction_tool).extract(input_path, path_extracted, config=self.extractor_config)
+            logger.info("Extracted to {0}.".format(path_extracted))
+
+            UnicodeNormalizer().normalize(path_extracted, path_normalized, self.CUSTOM_CHARACTER_SUBSTITUTIONS)
+            logger.info("Normalized to {0}.".format(path_normalized))
+
+            language = LanguageDetector().detect_in_file(path_normalized, self.langdetect_config)
+            logger.info("Detected {0} as language [{1}].".format(path_normalized, language))
+
+            if language not in [self.lang_src, self.lang_tgt]:
+                shutil.copy(path_normalized, rejected_dir)
+                logger.info("Rejected {0} with invalid language [{1}].".format(path_normalized, language))
+            else:
+                path_split = OutputPathTemplate.SPLIT.format(artefact_basepath, language)
+                if language not in files_by_language:
+                    files_by_language[language] = []
+                files_by_language[language].append(path_split)
+                path_abbreviations = self.abbreviations_paths[language]
+                parser_types.splitter().split(path_abbreviations, path_normalized, path_split)
+                logger.info("Sentence-split to {0}.".format(path_split))
+
+            return False
+
+        except ToolchainError as te:
+            shutil.copy(input_path, rejected_dir)
+            logger.error("Error preprocessing file {0}, skipping: {1}.".format(input_path, te))
+            return True
+
+
+    def resolve_unmatched_files(self, id, file_list, lang, clean_dir, rejected_dir, monolingual_dir, keep_unmatched, monolingual_filename_template):
+        monolingual_paths = []
+        for filename in file_list:
+            if keep_unmatched:
+                stem = pathlib.Path(filename).stem
+                clean_base_path = clean_dir.joinpath(stem)
+                path_cleaned_retained = OutputPathTemplate.CLEANED.format(clean_base_path, lang)
+                path_cleaned_rejected = OutputPathTemplate.CLEANED.format(clean_base_path, "rejected")
+
+                MonolingualCleaner(lang, config=self.cleaner_config).clean(filename, path_cleaned_retained, path_cleaned_rejected)
+                logger.info("Cleaned monolingual file {0} to {1}.".format(filename, path_cleaned_retained))
+
+                monolingual_paths.append(path_cleaned_retained)
+                shutil.copy(path_cleaned_rejected, rejected_dir)
+            else:
+                shutil.copy(filename, rejected_dir)
+                logger.info("Rejected unmatched file {0}.".format(filename))
+
+        if monolingual_paths:
+            concatenated_monolingual_path = monolingual_filename_template.format(monolingual_dir.joinpath(str(id)))
+            FileConcatenator().concatenate(monolingual_paths, concatenated_monolingual_path)
+            logger.info("Concatenated file(s) {0} to combined monolingual file at {1}.".format(str(monolingual_paths), concatenated_monolingual_path))
+
+            file_sizes = FileSizeCounter().count(concatenated_monolingual_path)
+            logger.info("Counted {0} lines and {1} words in file {2}.".format(file_sizes.lines, file_sizes.words, concatenated_monolingual_path))
+
+            return self.create_file_info("monolingual", [lang], file_sizes.words, "words")
+
+        return None
+
+
+if __name__ == "__main__":
+    argparser = argparse.ArgumentParser()
+    argparser.add_argument("id", help="LR identifier")
+    argparser.add_argument("input_dir", help="path to input directory")
+    argparser.add_argument("artefact_dir", help="path to artefact directory")
+    argparser.add_argument("output_dir", help="path to output directory")
+    argparser.add_argument("config_path", help="path to config")
+    args = argparser.parse_args()
+
+    DocToTmxProcessor(args.config_path).process(args.id, args.input_dir, args.artefact_dir, args.output_dir)
+    print("Output written to {0}".format(args.output_dir))

+ 4 - 0
toolchain/docalign/docalign_error.py

@@ -0,0 +1,4 @@
+from toolchain.common.toolchain_error import ToolchainError
+
+class DocalignError(ToolchainError):
+    pass

Some files were not shown because too many files changed in this diff