123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239 |
- #! /usr/bin/python
- """
- NOTE: THIS SCRIPT IS NOW OBSOLETE. IT WILL BE REWRITTEN AS FUNCTIONS
- INTO OTHER CORRESPONDING MODULES. SOME FUNCTIONS CAN BE IMPROVED.
- FOR EXAMPLE, LEFTOVER CAN BE EXTRACTED MORE EFFICIENTLY KEEPING
- THE ORDER INTACT. (24-Feb-2013)
-
- This script extracts a number of sentences from an input file.
-
- Originally developed to divide a corpus into development and test
- datasets.
-
- Sentence selection is based on the order specified by -o option:
- -(r)andom: a number of sentences specified by -n option is randomly
- selected
- -(l)isted: sentences corresponding to line numbers specified in a
- comma-separated list of indices (starting from 1) are extracted
- To Do: -(s)traight: a number of sentences specified by -n option is
- selected starting from the beginning of the input
- file
- To Do: -re(v)erse: a number of sentences specified by -n option is
- selected starting from the end of the input file
-
- Note:
- Loading the entire file into memory may not be the most efficient
- approach for large files. However, for random sampling, other methods,
- such as sampling a file position in terms of bytes and selecting the
- next line, have their own deficiencies in terms of the quality of
- selection. For example, with that method, if the sampled position falls
- in the last line, there is no next line to select.
-
- NOTE: THIS SCRIPT IS NOW OBSOLETE. IT WILL BE REWRITTEN AS FUNCTIONS
- INTO OTHER CORRESPONDING MODULES. SOME FUNCTIONS CAN BE IMPROVED.
- FOR EXAMPLE, LEFTOVER CAN BE EXTRACTED MORE EFFICIENTLY KEEPING
- THE ORDER INTACT. (24-Feb-2013)
-
- Version: 1.5 (20-Feb-2013)
- - Original order is added to extraction orders. The aim is to select
- sentences with special criteria like length but from the whole set
- and in the original order.
-
- Version: 1.4 (29-Jan-2013)
- - Minimum length limit for selected sentences is added.
-
- Version: 1.3 (19-Oct-2012)
- - Delimiter in index list file can be custom.
-
- Version: 1.2 (04-Oct-2012)
- - Arguments and options are simplified.
- - Complement selection is added (-c).
-
- Version: 1.1
- - Maximum length limit for selected sentences is added.
-
- """
- from random import shuffle
- import sys, optparse, random, time
def randomSelection(inputLength, sntcCount):
    '''
    Pick sntcCount distinct positions at random out of inputLength lines.

    The positions are 0-based indexes into the input corpus and are
    returned as a list in ascending order. The draw is delegated to
    random.sample, which raises ValueError when sntcCount is negative or
    larger than inputLength.
    '''

    # sample without replacement, then hand the positions back sorted
    picked = random.sample(range(0, inputLength), sntcCount)
    return sorted(picked)
-
def indexSelection(idxFileName, pflgComplement, pCorpusSize, pDelim):
    '''
    Returns the sorted 0-based indexes of the 1-based indexes listed in
    idxFileName, or their complement.

    idxFileName    -- path of the file holding the delimited index list
    pflgComplement -- if true, return every index in range(pCorpusSize)
                      that is NOT listed in the file
    pCorpusSize    -- number of lines in the corpus (only used to build
                      the complement)
    pDelim         -- delimiter between indexes; the two-character string
                      backslash + "n" stands for a newline

    Empty entries (an empty file, a trailing delimiter, blank lines) are
    skipped. A non-numeric entry raises ValueError. If the file cannot be
    opened, a message is printed and the process exits with status 2.
    '''

    # opening the index list file

    try:
        idxFile = open(idxFileName, 'r')
    except IOError:
        print('Can\'t open index list file: ' + idxFileName)
        sys.exit(2)

    # make sure the handle is released even if reading fails
    try:
        vIndexes = idxFile.read().strip()
    finally:
        idxFile.close()

    ## treating \n as special case, since passing it through command line
    ## options is tricky
    if pDelim == "\\n":
        pDelim = '\n'

    vsIndexes = set()
    for vToken in vIndexes.split(pDelim):
        vToken = vToken.strip()
        # skip empty entries instead of crashing in int('')
        if vToken:
            vsIndexes.add(int(vToken) - 1)

    if pflgComplement:
        vsIndexes = set(range(pCorpusSize)) - vsIndexes

    # Sets are unordered, so the result has to be sorted explicitly.
    return sorted(vsIndexes)
-
- ##======================================================================
- ## main
def _loadFilteredLines(inFile, minLength, maxLength):
    '''
    Returns a list of (1-based line number, line text) for the lines of
    inFile whose whitespace-separated token count lies within the limits.
    A limit of 0 (or less) disables that bound.
    '''
    vlKept = []
    for vLineNo, vLine in enumerate(inFile, start=1):
        vLen = len(vLine.split())
        if minLength > 0 and vLen < minLength:
            continue
        if maxLength > 0 and vLen > maxLength:
            continue
        vlKept.append((vLineNo, vLine))
    return vlKept


def _openOutput(fileName, errorPrefix):
    '''
    Opens fileName for writing; on IOError exits with errorPrefix + fileName.
    '''
    try:
        return open(fileName, 'w')
    except IOError:
        sys.exit(errorPrefix + fileName)


def main(argv=None):
    '''
    Command-line entry point: extracts sentences (lines) from an input file.

    argv -- full argument vector including the program name at position 0;
            defaults to sys.argv.

    Lines failing the -m/-x token-count limits are dropped before any
    selection takes place, so selection indexes (random, listed or
    original) refer to positions in the filtered list, and dropped lines
    appear in neither the selection nor the leftover output.

    Selected lines go to --sel-out-name (default: <input>.sel); lines not
    selected go to --left-out-name when given. Unless the order is (l)ist,
    the original 1-based line numbers of the selected lines are written as
    one comma-separated line to <selection file>.idx.
    '''
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="%prog <INPUT FILE>" +
        "\nExtracts sentences from input file according to the options provided.", version="%prog 1.5")

    parser.add_option("--sel-out-name", help="output file name for selected sentences", dest="selOutFilename", action="store")
    parser.add_option("--left-out-name", help="output file name for leftover sentences (no leftover is output if not provided)", dest="leftOutFilename", action="store")
    parser.add_option("-o", "--order", help="extraction order: (r)andom [requires -n]; (l)ist [requires -i or -c]; (o)riginal [used by some criteria]", metavar="ORDER", dest="order", default="r", action="store")
    parser.add_option("-n", "--number", help="number of sentences to be extracted (default: 1)", metavar="NUMBER", dest="sntcCount", type="int", default=1, action="store")
    parser.add_option("-i", "--indices", help="CSV list of sentence indexes to be selected (starting from 1)", metavar="INDICES FILE", dest="idxFileName", action="store")
    parser.add_option("-d", "--idxdelim", help="delimiter used in CSV list of sentence indexes", metavar="INDEX CSV DELIM", dest="idxDelim", default=',', action="store")
    parser.add_option("-c", "--complement", help="select complement sentences of index list provided instead of themselves (used with -i)", dest="idxComplement", action="store_true")
    # type="int" lets optparse reject non-numeric limits with a usage error
    # instead of an int() traceback later on
    parser.add_option("-m", "--minlength", help="minimum length of sentences to be selected", metavar="MIN LENGTH", dest="minLength", type="int", default=0, action="store")
    parser.add_option("-x", "--maxlength", help="maximum length of sentences to be selected", metavar="Max LENGTH", dest="maxLength", type="int", default=0, action="store")

    # parse the vector we were given (not implicitly sys.argv), skipping
    # the program name
    (opts, posargs) = parser.parse_args(argv[1:])
    if len(posargs) < 1:
        parser.error("Input file is not provided.")

    # an unrecognised order used to select nothing without any warning
    if opts.order not in ("o", "r", "l"):
        parser.error("Unknown extraction order: " + opts.order)

    # opening the input file

    inFileName = posargs[0]
    try:
        inFile = open(inFileName, 'r')
    except IOError:
        print('Can\'t open input file: ' + inFileName)
        sys.exit(2)

    # loading file into memory to have the number of sampling population

    try:
        inputLines = _loadFilteredLines(inFile, opts.minLength, opts.maxLength)
    finally:
        inFile.close()

    inputLength = len(inputLines)

    # processing options

    if opts.order == "o":
        selection = list(range(inputLength))
    elif opts.order == "r":
        # random.sample would raise ValueError; report it cleanly instead
        if opts.sntcCount < 0 or opts.sntcCount > inputLength:
            sys.exit('Cannot select %d sentences out of %d available.'
                     % (opts.sntcCount, inputLength))
        selection = randomSelection(inputLength, opts.sntcCount)
    else:
        if opts.idxFileName is None:
            print('-i option is missing')
            parser.print_help()
            sys.exit(2)
        selection = indexSelection(opts.idxFileName, opts.idxComplement, inputLength, opts.idxDelim)
        # a listed 0 would wrap around to the last line via negative
        # indexing and a too-large number would raise IndexError below
        vlBad = [str(idx + 1) for idx in selection if idx < 0 or idx >= inputLength]
        if vlBad:
            sys.exit('Index list contains line numbers outside 1..%d: %s'
                     % (inputLength, ','.join(vlBad)))

    # writing selected and leftover sentences into files

    if opts.selOutFilename is None:
        vSelOutFileName = inFileName + ".sel"
    else:
        vSelOutFileName = opts.selOutFilename
    selOutFile = _openOutput(vSelOutFileName, 'Can\'t create output file for selected sentences: ')

    leftOutFile = None
    vlSel = []
    try:
        if opts.leftOutFilename is not None:
            leftOutFile = _openOutput(opts.leftOutFilename, 'Can\'t create output file for leftover sentences: ')

        # selection is in ascending order, so the leftover lines are the
        # gaps between consecutive selected positions
        vCntr = 0
        for idx in selection:
            # write leftover if requested
            if leftOutFile is not None:
                for count in range(vCntr, idx):
                    leftOutFile.write(inputLines[count][1])
                vCntr = idx + 1
            # write selection
            selOutFile.write(inputLines[idx][1])
            vlSel.append(str(inputLines[idx][0]))

        # writing the rest of leftover sentences (after last selection idx)
        if leftOutFile is not None:
            for count in range(vCntr, inputLength):
                leftOutFile.write(inputLines[count][1])
    finally:
        # close both outputs on success and on any error path
        selOutFile.close()
        if leftOutFile is not None:
            leftOutFile.close()

    # if the selection is not based on index list, output selection indexes
    if opts.order != "l":
        vSelIdxOutFilename = vSelOutFileName + ".idx"
        vfSelIdxOut = _openOutput(vSelIdxOutFilename, "Cannot create selection index output file: ")
        try:
            vfSelIdxOut.write(','.join(vlSel) + '\n')
        finally:
            vfSelIdxOut.close()
-
- ##======================================================================
- ## calling main
# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
-
|