extract-sentences.py 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. #! /usr/bin/python
  2. """
  3. NOTE: THIS SCRIPT IS NOW OBSOLETE. IT WILL BE REWRITTEN AS FUNCTIONS
  4. INTO OTHER CORRESPONDING MODULES. SOME FUNCTIONS CAN BE IMPROVED.
  5. FOR EXAMPLE, LEFTOVER CAN BE EXTRACTED MORE EFFICIENTLY KEEPING
  6. THE ORDER INTACT. (24-Feb-2013)
  7. This script extracts a number of sentences from an input file.
  8. Originally developed to divide a corpus into development and test
  9. datasets.
  10. Sentence selection is based on the order specified by -o option:
  11. -(r)andom: a number of sentences specified by -n option is randomly
  12. selected
  13. -(l)isted: sentences corresponding to line numbers specified in a
  14. comma-separated list of indices (starting from 1) are extracted
  15. To Do: -(s)traight: a number of sentences specified by -n option is
  16. selected starting from the beginning of the input
  17. file
  18. To Do: -re(v)erse: a number of sentences specified by -n option is
  19. selected starting from the end of the input file
  20. Note:
  21. It may not be the most efficient way to load the entire file into memory
  22. for large files. However, in random sampling, other methods, such as
  23. sampling a file position in terms of bytes and selecting the next line, have
  24. their own deficiencies in terms of the quality of selection. For example,
  25. in the mentioned method, if the selected position is in the last line,
  26. that line cannot be selected.
  27. NOTE: THIS SCRIPT IS NOW OBSOLETE. IT WILL BE REWRITTEN AS FUNCTIONS
  28. INTO OTHER CORRESPONDING MODULES. SOME FUNCTIONS CAN BE IMPROVED.
  29. FOR EXAMPLE, LEFTOVER CAN BE EXTRACTED MORE EFFICIENTLY KEEPING
  30. THE ORDER INTACT. (24-Feb-2013)
  31. Version: 1.5 (20-Feb-2013)
  32. - Original order is added to extraction orders. The aim is to select
  33. sentences with special criteria like length but from the whole set
  34. and in the original order.
  35. Version: 1.4 (29-Jan-2013)
  36. - Minimum length limit for selected sentences is added.
  37. Version: 1.3 (19-Oct-2012)
  38. - Delimiter in index list file can be custom.
  39. Version: 1.2 (04-Oct-2012)
  40. - Arguments and options are simplified.
  41. - Complement selection is added (-c).
  42. Version: 1.1
  43. - Maximum length limit for selected sentences is added.
  44. """
  45. from random import shuffle
  46. import sys, optparse, random, time
  47. def randomSelection(inputLength, sntcCount):
  48. '''
  49. Randomly select numbers which represent 0-based indexes of the lines
  50. in the input corpus.
  51. '''
  52. selection = []
  53. # random sampling
  54. population = range(0, inputLength)
  55. selection = random.sample(population, sntcCount)
  56. selection.sort()
  57. return selection
  58. def indexSelection(idxFileName, pflgComplement, pCorpusSize, pDelim):
  59. '''
  60. Returns 0-based indexes of the 1-based indexes in idxFilename or its
  61. complement.
  62. '''
  63. vsIndexes = []
  64. # opening the index list file
  65. try:
  66. idxFile = open(idxFileName, 'r')
  67. except IOError:
  68. print 'Can\'t open index list file: ' + idxFileName
  69. sys.exit(2)
  70. ## treating \n as special case, since passing it through command line
  71. ## options is tricky
  72. if pDelim == "\\n":
  73. pDelim = '\n'
  74. vIndexes = idxFile.read().strip()
  75. vsIndexes = set([int(idx) - 1 for idx in vIndexes.split(pDelim)])
  76. if pflgComplement:
  77. vsIndexes = set(range(pCorpusSize)) - vsIndexes
  78. vlIndexes = list(vsIndexes)
  79. # Though it seems set() sorts the elements, we sort just in case.
  80. vlIndexes.sort()
  81. return vlIndexes
  82. ##======================================================================
  83. ## main
  84. def main(argv=None):
  85. if argv is None:
  86. argv = sys.argv
  87. parser = optparse.OptionParser(usage="%prog <INPUT FILE>" +
  88. "\nExtracts sentences from input file according to the options provided.", version="%prog 1.2")
  89. parser.add_option("--sel-out-name", help="output file name for selected sentences", dest="selOutFilename", action="store")
  90. parser.add_option("--left-out-name", help="output file name for leftover sentences (no leftover is output if not provided)", dest="leftOutFilename", action="store")
  91. parser.add_option("-o", "--order", help="extraction order: (r)andom [requires -n]; (l)ist [requires -i or -c]; (o)riginal [used by some criteria]", metavar="ORDER", dest="order", default="r", action="store")
  92. parser.add_option("-n", "--number", help="number of sentences to be extracted (default: 1)", metavar="NUMBER", dest="sntcCount", type="int", default=1, action="store")
  93. parser.add_option("-i", "--indices", help="CSV list of sentence indexes to be selected (starting from 1)", metavar="INDICES FILE", dest="idxFileName", action="store")
  94. parser.add_option("-d", "--idxdelim", help="delimiter used in CVS list of sentence indexes", metavar="INDEX CSV DELIM", dest="idxDelim", default=',', action="store")
  95. parser.add_option("-c", "--complement", help="select complement sentences of index list provided instead of themselves (used with -i)", dest="idxComplement", action="store_true")
  96. parser.add_option("-m", "--minlength", help="minimum length of sentences to be selected", metavar="MIN LENGTH", dest="minLength", default=0, action="store")
  97. parser.add_option("-x", "--maxlength", help="maximum length of sentences to be selected", metavar="Max LENGTH", dest="maxLength", default=0, action="store")
  98. (opts, posargs) = parser.parse_args()
  99. if len(posargs) < 1:
  100. parser.error("Input file is not provided.")
  101. vMinLength = int(opts.minLength)
  102. vMaxLength = int(opts.maxLength)
  103. # openning the input file
  104. inFileName = posargs[0]
  105. try:
  106. inFile = open(inFileName, 'r')
  107. except IOError:
  108. print 'Can\'t open input file: ' + inFileName
  109. sys.exit(2)
  110. # loading file into memory to have the number of sampling population
  111. inputLines = []
  112. for i, vLine in enumerate(inFile, start = 1):
  113. #vValid = True
  114. vLen = len(vLine.split())
  115. if vMinLength > 0:
  116. if vLen < vMinLength:
  117. continue
  118. if vMaxLength > 0:
  119. if vLen > vMaxLength:
  120. continue
  121. inputLines.append((i, vLine))
  122. inputLength = len(inputLines)
  123. inFile.close()
  124. # processing options
  125. selection = []
  126. if opts.order == "o":
  127. selection = range(inputLength)
  128. elif opts.order == "r":
  129. selection = randomSelection(inputLength, opts.sntcCount)
  130. elif opts.order == "l":
  131. if opts.idxFileName is None:
  132. print '-i option is missing'
  133. parser.print_help()
  134. sys.exit(2)
  135. else:
  136. selection = indexSelection(opts.idxFileName, opts.idxComplement, inputLength, opts.idxDelim)
  137. # writing selected and leftover sentences into files
  138. if opts.selOutFilename == None:
  139. vSelOutFileName = inFileName + ".sel"
  140. else:
  141. vSelOutFileName = opts.selOutFilename
  142. try:
  143. selOutFile = open(vSelOutFileName, 'w')
  144. except IOError:
  145. sys.exit('Can\'t create output file for selected sentences: ' + vSelOutFileName)
  146. if opts.leftOutFilename != None:
  147. vLeftOutFileName = opts.leftOutFilename
  148. try:
  149. leftOutFile = open(vLeftOutFileName, 'w')
  150. except IOError:
  151. sys.exit('Can\'t create output file for leftover sentences: ' + vLeftOutFileName)
  152. vlSel = []
  153. vCntr = 0
  154. for idx in selection:
  155. # write leftover if requested
  156. if opts.leftOutFilename != None:
  157. for count in range(vCntr, idx):
  158. leftOutFile.write(inputLines[count][1])
  159. vCntr = idx + 1
  160. # write selection
  161. selOutFile.write(inputLines[idx][1])
  162. vlSel.append(str(inputLines[idx][0]))
  163. # writing the rest of leftover sentences (after last selection idx)
  164. if opts.leftOutFilename != None:
  165. for count in range(vCntr, inputLength):
  166. leftOutFile.write(inputLines[count][1])
  167. leftOutFile.close()
  168. selOutFile.close()
  169. # if the selection is not based on index list, output selection indexes
  170. if opts.order != "l":
  171. try:
  172. vSelIdxOutFilename = vSelOutFileName + ".idx"
  173. vfSelIdxOut = open(vSelIdxOutFilename, 'w')
  174. except IOError:
  175. sys.exit("Cannot create selection index output file: " + vSelIdxOutFilename)
  176. vfSelIdxOut.write(','.join(vlSel) + '\n')
  177. vfSelIdxOut.close()
##======================================================================
## calling main
## Runs main() only when the file is executed as a script; whatever main()
## returns is handed to sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())