123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225 |
- #! /usr/bin/python
- """
- This script deduplicates single or parallel corpora.
-
- Originally developed to deduplicate Symantec translation memory corpora.
-
- Version 1.4
- - Outputs indexes of the output lines in the original corpus.
- Deduplicating using UNIX sort command is optional now for regression
- test purposes.
-
- Version 1.3
- - Renamed from dedup-parallel-corpora.py to dedup-corpus.py
- - Now supports the same processes over a single corpus
-
- Version 1.2
- - Arbitrary number of input corpora is supported.
- - Optional lowercasing before deduplicating is supported.
- - Optional strip/split/merge before deduplicating
- - Results can be shuffled.
-
- Version 1.1
- - The default delimiter was changed from ### to ~#^, due to problems
- observed for lines ending with #
-
- """
- import sys, optparse, tempfile, subprocess, os.path , random, itertools
def mergeFiles(plFiles, pDelim, pflgLowercase, pflgSSM):
    '''
    Merges the open files in plFiles horizontally into one temporary file,
    joining the parallel lines of each row with pDelim.

    Every field is stripped of surrounding whitespace. If pflgLowercase is
    true, fields are lowercased; if pflgSSM is true, internal whitespace
    runs are collapsed to single spaces (strip/split/merge).

    Returns the temporary file object, rewound to the beginning.
    '''

    try:
        vfMerged = tempfile.NamedTemporaryFile()
    except IOError:
        sys.exit("Cannot create temporary merge file in the current path")

    for vtRow in zip(*plFiles):
        vlFields = []
        for vField in vtRow:
            vField = vField.strip()
            if pflgLowercase:
                vField = vField.lower()
            if pflgSSM:
                vField = ' '.join(vField.split())
            vlFields.append(vField)
        vfMerged.write(pDelim.join(vlFields) + '\n')

    # Rewind so callers can read the merged corpus from the start.
    vfMerged.seek(0, 0)
    return vfMerged
-
def dedupUnixSort(pfInput):
    '''
    Deduplicates pfInput using the UNIX sort command.

    pfInput must be a file object with a valid .name on disk (e.g. a
    tempfile.NamedTemporaryFile). Returns a temporary file object holding
    the sorted, deduplicated lines, rewound to the beginning.
    '''

    try:
        tmpDedupl = tempfile.NamedTemporaryFile()
    except IOError:
        # Consistent with the sibling functions: message via sys.exit
        # instead of print + explicit exit code.
        sys.exit("Cannot create temporary deduplicated file in the current path")

    # Pass arguments as a list and redirect stdout directly rather than
    # building a shell command string: no shell process, and file names
    # containing spaces or shell metacharacters are handled safely.
    subprocess.call(["sort", "-u", pfInput.name], stdout=tmpDedupl)

    # The child process advanced the shared file offset; rewind so callers
    # can read the result from the start.
    tmpDedupl.seek(0, 0)
    return tmpDedupl
-
def dedupWithIdx(pfInput):
    '''
    Deduplicates the lines of pfInput while keeping track of where each
    surviving line first occurred in the original corpus.

    pfInput: any iterable of lines (an open file or a list).

    Returns a list of (line, (line, index)) tuples, sorted by line, where
    index is the 1-based position of the line's first occurrence. The
    nested (line, index) pair mirrors the historical output format that
    main() relies on (item[1][1] extracts the index).
    '''

    # Pair every line with its 1-based position in the corpus.
    vlInput = [(vLine, vIdx) for vIdx, vLine in enumerate(pfInput, start=1)]

    # Sorting groups identical lines together; ties break on the index,
    # so the first element of each group is the earliest occurrence.
    vlSorted = sorted(vlInput)

    vlDedupLinesAndIdxs = []
    for vItem, vIndexGroup in itertools.groupby(vlSorted, key=lambda pair: pair[0]):
        # Take only the first (line, index) pair of each duplicate group;
        # next() avoids materializing the whole group as list(...)[0] did.
        vIndex = next(vIndexGroup)
        vlDedupLinesAndIdxs.append((vItem, vIndex))

    return vlDedupLinesAndIdxs
-
def splitLines(plInputLines, pDelim, plOutFilesnames):
    '''
    Splits every merged line on pDelim and writes the i-th field of each
    line, stripped, to the i-th output file (one field per line).

    plInputLines: iterable of merged lines (output of deduplication)
    pDelim: the delimiter used by mergeFiles()
    plOutFilesnames: one output file name per input corpus

    Exits with an error message if an output file cannot be opened.
    '''

    vlfOutFiles = []
    for vOutFilename in plOutFilesnames:
        try:
            vlfOutFiles.append(open(vOutFilename, 'w'))
        except IOError:
            sys.exit("Cannot open output file: %s" % vOutFilename)
    try:
        for vLine in plInputLines:
            vlFields = vLine.split(pDelim)
            # Distinct loop variable: the original reused vLine here,
            # shadowing the merged line being split.
            for vfOut, vField in zip(vlfOutFiles, vlFields):
                vfOut.write(vField.strip() + '\n')
    finally:
        # Close all output files even if writing fails midway.
        for vfOut in vlfOutFiles:
            vfOut.close()
-
def writeOrgIdxs(plIdxs, pOutFilename):
    '''
    Writes the original-corpus indexes of the deduplicated lines into
    pOutFilename, one index per line.

    Exits with an error message if the file cannot be created.
    '''

    try:
        vfOut = open(pOutFilename, 'w')
    except IOError:
        # Fixed typo in the message ("creat" -> "create").
        sys.exit("Cannot create index output file: %s" % pOutFilename)

    # try/finally guarantees the handle is closed even if the write fails.
    try:
        vfOut.write('\n'.join([str(vIdx) for vIdx in plIdxs]) + '\n')
    finally:
        vfOut.close()
-
- ##======================================================================
- ## main
def main(argv=None):
    '''
    Command-line entry point.

    Merges the input corpora line by line, deduplicates the merged lines
    (optionally lowercased and whitespace-normalized first), then splits
    the surviving lines back into one output file per input corpus under
    the given output directory.

    argv: full argument vector including the program name; defaults to
    sys.argv. Returns None (process exit status 0) on success.
    '''
    if argv is None:
        # Bug fix: the original read "arv = sys.argv", leaving the argv
        # parameter unset and unused.
        argv = sys.argv

    parser = optparse.OptionParser(usage="%prog <INPUT FILE 1> [<INPUT FILE 2> [..]] <OUTPUT DIR>" +
                                   "\nThis script deduplicates single or parallel corpora.", version="%prog 1.4")

    parser.add_option("-d", "--delimiter", help="a delimiter string which does not appear in any of files", metavar="DELIMITER", dest="delim", default="~#^", action="store")
    parser.add_option("-u", "--unix-sort", help="use UNIX sort command", dest="unixSort", action="store_true")
    parser.add_option("-i", "--index-out", help="output file name for indexes in the original corpus (not including path)", dest="idxOutFile", default="org.idx", action="store")
    parser.add_option("-l", "--lowercase", help="lowercase before deduplication", dest="lower", action="store_true")
    parser.add_option("-s", "--ssm", help="strip, split and merge before deduplication", dest="ssm", action="store_true")
    parser.add_option("-r", "--shuffle", help="shuffle the deduplicated lines", dest="shuff", action="store_true")
    parser.add_option("-x", "--out-suffix", help="output files suffix", dest="outSuffix", action="store")

    # Honor the argv parameter (the original always parsed sys.argv);
    # behavior is unchanged for the default script invocation.
    (opts, posargs) = parser.parse_args(argv[1:])

    if len(posargs) < 2:
        parser.error("At least 2 arguments (one corpus and one output directory) are required.")

    # Open the input files and collect their extensions for output naming.
    vlInFilenames = posargs[:-1]
    vlfInFiles = []
    vlOutfileExts = []
    for vFilename in vlInFilenames:
        try:
            vlfInFiles.append(open(vFilename, 'r'))
            vlOutfileExts.append(os.path.splitext(vFilename)[1])
        except IOError:
            sys.exit("Cannot open input file: %s " % vFilename)

    vOutDir = posargs[-1]

    # Output names are derived from the extensions, so duplicates would
    # make parallel corpora overwrite each other.
    if len(set(vlOutfileExts)) != len(vlOutfileExts):
        sys.exit("Please provide files with different extensions")

    if opts.outSuffix is None:
        # Encode the applied preprocessing in the default suffix:
        # leading 'l' = lowercased; trailing 'r' = shuffled, 's' = sorted.
        vSuffix = "dp"
        if opts.lower:
            vSuffix = 'l' + vSuffix
        if opts.shuff:
            vSuffix = vSuffix + 'r'
        else:
            vSuffix = vSuffix + 's'
        vSuffix = '.' + vSuffix
    else:
        vSuffix = '.' + opts.outSuffix

    # Build one output file name per input corpus.
    vlOutFilenames = []
    for vFilename, vOutFileExt in zip(vlInFilenames, vlOutfileExts):
        vBase = os.path.basename(os.path.splitext(vFilename)[0])
        vlOutFilenames.append(os.path.join(vOutDir, vBase + vSuffix + vOutFileExt))

    # Merge the input files line by line into one temporary file.
    tmpMerged = mergeFiles(vlfInFiles, opts.delim, opts.lower, opts.ssm)

    try:
        ## Deduplicate the temporary merged file.
        ## NOTE: when the indexes of the surviving lines in the original
        ## corpus are requested, UNIX sort cannot be used; dedupUnixSort()
        ## is kept only for backward-compatibility regression tests.
        if opts.unixSort:
            tmpDedupl = dedupUnixSort(tmpMerged)
            vlDedupLines = tmpDedupl.read().splitlines()
            if opts.shuff:
                random.shuffle(vlDedupLines)
        else:
            vlDedupLinesAndIdxs = dedupWithIdx(tmpMerged)
            if opts.shuff:
                random.shuffle(vlDedupLinesAndIdxs)
            vlDedupLines = [vItem[0] for vItem in vlDedupLinesAndIdxs]
            # vItem[1] is the (line, index) pair; [1] extracts the index.
            writeOrgIdxs([vItem[1][1] for vItem in vlDedupLinesAndIdxs],
                         os.path.join(vOutDir, opts.idxOutFile))

        # Split the deduplicated lines back into per-corpus output files.
        splitLines(vlDedupLines, opts.delim, vlOutFilenames)
    finally:
        for vfIn in vlfInFiles:
            vfIn.close()
        tmpMerged.close()
        if opts.unixSort:
            tmpDedupl.close()
-
- ##======================================================================
- ## calling main
- if __name__ == "__main__":
- sys.exit(main())
|