dedup-corpus.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225
  1. #! /usr/bin/python
  2. """
This script deduplicates single or parallel corpora.
Originally developed to deduplicate Symantec translation memory corpora.

Version 1.4
- Outputs the indexes of the output lines in the original corpus.
  Deduplicating with the UNIX sort command is now optional, and is kept
  for regression test purposes.
Version 1.3
- Renamed from dedup-parallel-corpora.py to dedup-corpus.py
- Now supports the same processes over a single corpus
Version 1.2
- An arbitrary number of input corpora is supported.
- Optional lowercasing before deduplicating is supported.
- Optional strip/split/merge before deduplicating
- Results can be shuffled.
Version 1.1
- The default delimiter was changed from ### to ~#^, due to problems
  observed for lines ending with #
  20. """
  21. import sys, optparse, tempfile, subprocess, os.path , random, itertools
  22. def mergeFiles(plFiles, pDelim, pflgLowercase, pflgSSM):
  23. '''
  24. Merges files horizontally and preprocess them based on arguments.
  25. '''
  26. try:
  27. tmpMerged = tempfile.NamedTemporaryFile()
  28. except IOError:
  29. sys.exit("Cannot create temporary merge file in the current path")
  30. for vlLines in zip(*plFiles):
  31. vlLines = [line.strip() for line in vlLines]
  32. if pflgLowercase:
  33. vlLines = [line.lower() for line in vlLines]
  34. if pflgSSM:
  35. vlLines = [' '.join(line.split()) for line in vlLines]
  36. tmpMerged.write(pDelim.join(vlLines) + '\n')
  37. tmpMerged.seek(0,0)
  38. return tmpMerged
  39. def dedupUnixSort(pfInput):
  40. '''
  41. Deduplicates pfInput using UNIX sort command
  42. '''
  43. try:
  44. tmpDedupl = tempfile.NamedTemporaryFile()
  45. except IOError:
  46. print 'Can\'t create temporary deduplicated file in the current path'
  47. sys.exit(2)
  48. subprocess.call("sort -u " + pfInput.name + " > " + tmpDedupl.name, shell=True)
  49. return tmpDedupl
  50. def dedupWithIdx(pfInput):
  51. '''
  52. Deduplicates pfInput and keep track of indexes in the original corpus
  53. '''
  54. # reading lines to a list of tuples
  55. vlInput = []
  56. for i, vLine in enumerate(pfInput, start=1):
  57. vlInput.append((vLine, i))
  58. # sorting the list
  59. vlSorted = sorted(vlInput)
  60. # deduplicating
  61. vlDedupLinesAndIdxs = []
  62. for vItem, vIndexGroup in itertools.groupby(vlSorted, key=lambda line: line[0]):
  63. # only the first of duplicate indexes is taken
  64. vIndex = list(vIndexGroup)[0]
  65. vlDedupLinesAndIdxs.append((vItem, vIndex))
  66. return vlDedupLinesAndIdxs
  67. def splitLines(plInputLines, pDelim, plOutFilesnames):
  68. '''
  69. Splits input lines based on pDelim and writes into specified files.
  70. '''
  71. vlfOutFiles = []
  72. for vOutFilename in plOutFilesnames:
  73. try:
  74. vlfOutFiles.append(open(vOutFilename, 'w'))
  75. except IOError:
  76. sys.exit("Cannot open output file: %s" % vOutFilename)
  77. try:
  78. for vLine in plInputLines:
  79. vlSplitLines = vLine.split(pDelim)
  80. for vfOut, vLine in zip(vlfOutFiles, vlSplitLines):
  81. vfOut.write(vLine.strip() + '\n')
  82. finally:
  83. for vfOut in vlfOutFiles:
  84. vfOut.close()
  85. def writeOrgIdxs(plIdxs, pOutFilename):
  86. '''
  87. Writes indexes of deduplicated lines in the original corpus into a
  88. file .
  89. '''
  90. try:
  91. vfOut = open(pOutFilename, 'w')
  92. except IOError:
  93. sys.exit("Cannot creat index output file: %s" % pOutFilename)
  94. vfOut.write('\n'.join([str(item) for item in plIdxs]) + '\n')
  95. vfOut.close()
  96. ##======================================================================
  97. ## main
  98. def main(argv=None):
  99. if argv is None:
  100. arv = sys.argv
  101. parser = optparse.OptionParser(usage="%prog <INPUT FILE 1> [<INPUT FILE 2> [..]] <OUTPUT DIR>" +
  102. "\nThis script deduplicates single or parallel corpora.", version="%prog 1.4")
  103. parser.add_option("-d", "--delimiter", help="a delimiter string which does not appear in any of files", metavar="DELIMITER", dest="delim", default="~#^", action="store")
  104. parser.add_option("-u", "--unix-sort", help="use UNIX sort command", dest="unixSort", action="store_true")
  105. parser.add_option("-i", "--index-out", help="output file name for intexes in the original corpus (not including path)", dest="idxOutFile", default="org.idx", action="store")
  106. parser.add_option("-l", "--lowercase", help="lowercase before deduplication", dest="lower", action="store_true")
  107. parser.add_option("-s", "--ssm", help="strip, split and merge before deduplication", dest="ssm", action="store_true")
  108. parser.add_option("-r", "--shuffle", help="shuffle the deduplicated lines", dest="shuff", action="store_true")
  109. parser.add_option("-x", "--out-suffix", help="output files suffix", dest="outSuffix", action="store")
  110. (opts, posargs) = parser.parse_args()
  111. if len(posargs) < 2:
  112. parser.error("At least 2 arguments (one corpus and one output directory) are required.")
  113. # opening the input files and preparing output file names
  114. vlInFilenames = posargs[:-1]
  115. vlfInFiles = []
  116. vlOutfileExts = []
  117. for vFilename in vlInFilenames:
  118. try:
  119. vlfInFiles.append(open(vFilename, 'r'))
  120. vlOutfileExts.append(os.path.splitext(vFilename)[1])
  121. except IOError:
  122. sys.exit("Cannot open input file: %s " % vFilename)
  123. vOutDir = posargs[-1]
  124. # checking for identical file extensions
  125. vsOutfileExts = set(vlOutfileExts)
  126. if len(vsOutfileExts) != len(vlOutfileExts):
  127. sys.exit("Please provide files with different extensions")
  128. if opts.outSuffix == None:
  129. vSuffix = "dp"
  130. if opts.lower:
  131. vSuffix = 'l' + vSuffix
  132. if opts.shuff:
  133. vSuffix = vSuffix + 'r'
  134. else:
  135. vSuffix = vSuffix + 's'
  136. vSuffix = '.' + vSuffix
  137. else:
  138. vSuffix = '.' + opts.outSuffix
  139. # creating output file names
  140. vlOutFilenames = []
  141. for vFilename, vOutFileExt in zip(vlInFilenames, vlOutfileExts):
  142. vlOutFilenames.append(vOutDir + '/' + os.path.basename(os.path.splitext(vFilename)[0]) + vSuffix + vOutFileExt)
  143. # merging input files line by line
  144. tmpMerged = mergeFiles(vlfInFiles, opts.delim, opts.lower, opts.ssm)
  145. try:
  146. ## deduplicating the temporary merged file
  147. ## NOTE: if the indexes of the new lines in the original corpus is
  148. ## requested, UNIX sort cann't be used as it was in the previous
  149. ## version. While both functions do the same, deduplUnixSort() is
  150. ## kept only for backward-compatibilty test of output when needed.
  151. if opts.unixSort:
  152. tmpDedupl = dedupUnixSort(tmpMerged)
  153. vlDedupLines = tmpDedupl.read().splitlines()
  154. if opts.shuff:
  155. random.shuffle(vlDedupLines)
  156. else:
  157. vlDedupLinesAndIdxs = dedupWithIdx(tmpMerged)
  158. if opts.shuff:
  159. random.shuffle(vlDedupLinesAndIdxs)
  160. vlDedupLines = [item[0] for item in vlDedupLinesAndIdxs]
  161. writeOrgIdxs([item[1][1] for item in vlDedupLinesAndIdxs], vOutDir + '/' + opts.idxOutFile)
  162. # spliting the deduplicated lines
  163. splitLines(vlDedupLines, opts.delim, vlOutFilenames)
  164. finally:
  165. for vfIn in vlfInFiles:
  166. vfIn.close()
  167. tmpMerged.close()
  168. if opts.unixSort:
  169. tmpDedupl.close()
  170. ##======================================================================
  171. ## calling main
  172. if __name__ == "__main__":
  173. sys.exit(main())