rszk
/
scripts


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
							#! /usr/bin/python

"""
	NOTE: THIS SCRIPT IS NOW OBSOLETE. IT WILL BE REWRITTEN AS FUNCTIONS 
	      INTO OTHER CORRESPONDING MODULES. SOME FUNCTIONS CAN BE IMPROVED.
	      FOR EXAMPLE, LEFTOVER CAN BE EXTRACTED MORE EFFICIENTLY KEEPING
	      THE ORDER INTACT.									(24-Feb-2013)
	
	This script extracts a number of sentences from an input file.
	
	Originally developed to divide a corpus into development and test 
	datasets.
	
	Sentence selection is based on the order specified by -o option:
	-(r)andom: a number of sentences specified by -n option is randomly 
               selected
	-(l)isted: sentences corresponding to line numbers specified in a 
               comma-separated list of indices (starting from 1) are extracted
	To Do: -(s)traight: a number of sentences specified by -n option is 
                        selected starting from the beginning of the input
                        file
	To Do: -re(v)erse: a number of sentences specified by -n option is 
                       selected starting from the end of the input file
	
	Note:
	Loading the entire file into memory may not be the most efficient 
	approach for large files. However, for random sampling, other methods, 
	such as sampling a file position in terms of bytes and selecting the 
	next line, have their own deficiencies in terms of the quality of 
	selection. For example, with the mentioned method, if the selected 
	position is in the last line, that line cannot be selected.
	
	NOTE: THIS SCRIPT IS NOW OBSOLETE. IT WILL BE REWRITTEN AS FUNCTIONS 
	      INTO OTHER CORRESPONDING MODULES. SOME FUNCTIONS CAN BE IMPROVED.
	      FOR EXAMPLE, LEFTOVER CAN BE EXTRACTED MORE EFFICIENTLY KEEPING
	      THE ORDER INTACT.									(24-Feb-2013)
	
	Version: 1.5											(20-Feb-2013)
	- Original order is added to extraction orders. The aim is to select
	sentences with special criteria like length but from the whole set 
	and in the original order.
	
	Version: 1.4											(29-Jan-2013)
	- Minimum length limit for selected sentences is added.
	
	Version: 1.3											(19-Oct-2012)
	- Delimiter in index list file can be custom.
	
	Version: 1.2											(04-Oct-2012)
	- Arguments and options are simplified.
	- Complement selection is added (-c).
	
	Version: 1.1
	- Maximum length limit for selected sentences is added.
	
"""

from random import shuffle
import sys, optparse, random, time

def randomSelection(inputLength, sntcCount):
	'''
	Draw sntcCount distinct 0-based line indexes out of the inputLength
	lines of the input corpus, uniformly at random and without
	replacement.
	
	The indexes are returned as a list in ascending order. The underlying
	random.sample() call raises ValueError when sntcCount is negative or
	larger than inputLength.
	'''
	
	# sample without replacement, then restore the corpus order
	picked = random.sample(range(inputLength), sntcCount)
	return sorted(picked)
	

def indexSelection(idxFileName, pflgComplement, pCorpusSize, pDelim):
	'''
	Returns the sorted 0-based indexes corresponding to the 1-based indexes
	listed in idxFileName, or their complement.
	
	idxFileName    -- path of the file holding the delimiter-separated
	                  1-based indexes
	pflgComplement -- if true, return every index in range(pCorpusSize)
	                  that is NOT listed in the file
	pCorpusSize    -- number of lines in the corpus (only used to build
	                  the complement)
	pDelim         -- delimiter separating the indexes; the two-character
	                  string backslash + 'n' stands for a newline
	
	Duplicate indexes are collapsed. Empty fields (e.g. produced by a
	trailing delimiter, blank lines or an empty file) are ignored. A field
	which is not an integer raises ValueError. If the file cannot be
	opened, a message is printed and the program exits with status 2.
	'''
	
	# opening the index list file
	
	try:
		idxFile = open(idxFileName, 'r')
	except IOError:
		print('Can\'t open index list file: ' + idxFileName)
		sys.exit(2)
	
	# make sure the handle is released even if reading fails
	try:
		vIndexes = idxFile.read().strip()
	finally:
		idxFile.close()
	
	## treating \n as special case, since passing it through command line
	## options is tricky
	if pDelim == "\\n":
		pDelim = '\n'
	
	vsIndexes = set()
	for vField in vIndexes.split(pDelim):
		vField = vField.strip()
		# skip empty fields instead of crashing on int('')
		if vField:
			vsIndexes.add(int(vField) - 1)
	
	if pflgComplement:
		vsIndexes = set(range(pCorpusSize)) - vsIndexes
	
	# sets are unordered, so the result is sorted explicitly
	return sorted(vsIndexes)
	

##======================================================================
## main
def main(argv=None):
	'''
	Command-line entry point: extracts sentences (lines) from an input file.
	
	argv -- full argument vector including the program name at position 0;
	        defaults to sys.argv.
	
	Steps:
	1. The input file is loaded into memory, keeping only the lines whose
	   whitespace-separated token count respects --minlength/--maxlength
	   (a limit of 0 disables the check). Each kept line remembers its
	   1-based line number in the input file.
	2. A list of 0-based positions into the kept lines is built according
	   to --order: (o)riginal selects all of them, (r)andom samples
	   --number of them, (l)ist reads them from the --indices file.
	3. The selected lines are written to --sel-out-name (default:
	   <input>.sel) and, if --left-out-name is given, the remaining kept
	   lines are written there.
	4. Unless the order is (l)ist, the 1-based input line numbers of the
	   selected lines are written, comma-separated, to <selected file>.idx.
	
	Note that the positions used in (l)ist order refer to the lines kept
	after length filtering, not to the raw lines of the input file.
	
	Returns None. Usage errors are reported through the option parser;
	files which cannot be opened terminate the program via sys.exit().
	'''
	if argv is None:
		argv = sys.argv
	
	parser = optparse.OptionParser(usage="%prog <INPUT FILE>" +
										 "\nExtracts sentences from input file according to the options provided.", version="%prog 1.2")
	
	parser.add_option("--sel-out-name", help="output file name for selected sentences", dest="selOutFilename", action="store")
	parser.add_option("--left-out-name", help="output file name for leftover sentences (no leftover is output if not provided)", dest="leftOutFilename", action="store")
	parser.add_option("-o", "--order", help="extraction order: (r)andom [requires -n]; (l)ist [requires -i or -c]; (o)riginal [used by some criteria]", metavar="ORDER", dest="order", default="r", action="store")
	parser.add_option("-n", "--number", help="number of sentences to be extracted (default: 1)", metavar="NUMBER", dest="sntcCount", type="int", default=1, action="store")
	parser.add_option("-i", "--indices", help="CSV list of sentence indexes to be selected (starting from 1)", metavar="INDICES FILE", dest="idxFileName", action="store")
	parser.add_option("-d", "--idxdelim", help="delimiter used in CVS list of sentence indexes", metavar="INDEX CSV DELIM", dest="idxDelim", default=',', action="store")
	parser.add_option("-c", "--complement", help="select complement sentences of index list provided instead of themselves (used with -i)", dest="idxComplement", action="store_true")
	# type="int" lets the parser reject non-numeric limits with a usage error
	parser.add_option("-m", "--minlength", help="minimum length of sentences to be selected", metavar="MIN LENGTH", dest="minLength", type="int", default=0, action="store")
	parser.add_option("-x", "--maxlength", help="maximum length of sentences to be selected", metavar="Max LENGTH", dest="maxLength", type="int", default=0, action="store")

	# parse the argument vector we were given (minus the program name)
	# instead of implicitly falling back to sys.argv
	(opts, posargs) = parser.parse_args(argv[1:])

	if len(posargs) < 1:
		parser.error("Input file is not provided.")
	
	vMinLength = opts.minLength
	vMaxLength = opts.maxLength
	
	# opening the input file
	
	inFileName = posargs[0]
	try:
		inFile = open(inFileName, 'r')
	except IOError:
		print('Can\'t open input file: ' + inFileName)
		sys.exit(2)
	
	# loading file into memory to have the number of sampling population;
	# every kept line is stored together with its 1-based line number
	
	inputLines = []
	
	try:
		for i, vLine in enumerate(inFile, start = 1):
			vLen = len(vLine.split())
			if vMinLength > 0 and vLen < vMinLength:
				continue
			if vMaxLength > 0 and vLen > vMaxLength:
				continue
			
			inputLines.append((i, vLine))
	finally:
		inFile.close()
	
	inputLength = len(inputLines)
	
	# processing options
	
	selection = []
	
	if opts.order == "o":
		selection = range(inputLength)
	elif opts.order == "r":
		# random.sample() would raise ValueError on these values
		if opts.sntcCount < 0 or opts.sntcCount > inputLength:
			parser.error("Cannot randomly select %d sentences out of %d available." % (opts.sntcCount, inputLength))
		selection = randomSelection(inputLength, opts.sntcCount)
	elif opts.order == "l":
		if opts.idxFileName is None:
			print('-i option is missing')
			parser.print_help()
			sys.exit(2)
		selection = indexSelection(opts.idxFileName, opts.idxComplement, inputLength, opts.idxDelim)
		# the list is sorted, so checking both ends is enough; a negative
		# position would otherwise silently select lines from the end
		if selection and (selection[0] < 0 or selection[-1] >= inputLength):
			parser.error("Index list contains indexes outside the range 1 to %d." % inputLength)
	else:
		parser.error("Unknown extraction order: " + opts.order)
	
	# writing selected and leftover sentences into files
	
	if opts.selOutFilename is None:
		vSelOutFileName = inFileName + ".sel"
	else:
		vSelOutFileName = opts.selOutFilename
	try:
		selOutFile = open(vSelOutFileName, 'w')
	except IOError:
		sys.exit('Can\'t create output file for selected sentences: ' + vSelOutFileName)
	
	# leftOutFile stays None when no leftover output is requested
	leftOutFile = None
	if opts.leftOutFilename is not None:
		vLeftOutFileName = opts.leftOutFilename
		try:
			leftOutFile = open(vLeftOutFileName, 'w')
		except IOError:
			selOutFile.close()
			sys.exit('Can\'t create output file for leftover sentences: ' + vLeftOutFileName)
	
	vlSel = []
	# position of the first kept line not yet written to either output
	vCntr = 0
	try:
		for idx in selection:
			# write leftover (the lines between the previous selection
			# and this one) if requested
			if leftOutFile is not None:
				for count in range(vCntr, idx):
					leftOutFile.write(inputLines[count][1])
				vCntr = idx + 1
			# write selection
			selOutFile.write(inputLines[idx][1])
			vlSel.append(str(inputLines[idx][0]))
		
		# writing the rest of leftover sentences (after last selection idx)
		if leftOutFile is not None:
			for count in range(vCntr, inputLength):
				leftOutFile.write(inputLines[count][1])
	finally:
		selOutFile.close()
		if leftOutFile is not None:
			leftOutFile.close()
	
	# if the selection is not based on index list, output selection indexes
	if opts.order != "l":
		vSelIdxOutFilename = vSelOutFileName + ".idx"
		try:
			vfSelIdxOut = open(vSelIdxOutFilename, 'w')
		except IOError:
			sys.exit("Cannot create selection index output file: " + vSelIdxOutFilename)
		
		try:
			vfSelIdxOut.write(','.join(vlSel) + '\n')
		finally:
			vfSelIdxOut.close()
	

##======================================================================
## calling main
if __name__ == "__main__":
	# main() has no explicit return value, so on success sys.exit() receives
	# None and the process terminates with exit status 0.
	sys.exit(main())