## rszk/scripts
							#! /usr/bin/python

## This script extracts the N best parses according to the scores predicted
## by the parser accuracy predictor. It takes a parse file in Treebank format
## (one parse tree per line) and a file containing the predictions (currently
## weka classification output) for every parse tree, then prints the
## requested top parses, selected by those scores, to standard output.
##
## Current version: 1
##

from collections import namedtuple
from operator import itemgetter
import sys, optparse, weka


##======================================================================
## main
def main(argv=None):
    """Print the N best-scoring parses of a parse file to standard output.

    Positional command-line arguments:
      1. parse file, one parse tree per line;
      2. weka classifier output holding one prediction per parse tree;
      3. N, the number of best parses to extract (a non-negative integer).

    The selected parses are printed in the order they appear in the parse
    file, not in score order. N = 0 prints nothing; an N larger than the
    number of scores selects every scored parse.

    Args:
        argv: full argument vector including the program name at index 0.
            Defaults to sys.argv.

    Returns:
        None, so that sys.exit(main()) exits with status 0 on success.

    Raises:
        SystemExit: on invalid arguments (raised by optparse with status 2)
            or when the parse file cannot be opened.
    """
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(
        usage="%prog <PARSE FILE> <WEKA CLASSIFIER OUTPUT> <NUMBER OF BEST PARSES TO EXTRACT>" +
              "\nThis script extracts the N best parses according to the scores predicted by parser accuracy predictor.",
        version="%prog 1.0")

    # Parse the vector we were handed (minus the program name) rather than
    # letting optparse fall back to sys.argv, so main() honours its argument.
    (opts, posArgs) = parser.parse_args(argv[1:])

    # checking arguments
    if len(posArgs) < 3:
        parser.error("At least 3 arguments are required!")

    if not posArgs[2].isdigit():
        parser.error("The third argument should be a number!")
    vBestNum = int(posArgs[2])

    # extracting scores from classifier output
    # we supposed the 2nd column contains predictions
    # NOTE(review): assumes the helper returns one score per parse, in parse
    # file order; the meaning of the final False flag is defined in the weka
    # module -- confirm there.
    vlScores = weka.extractClassifierPreds(posArgs[1], 2, False)

    vlBestParseNo = _best_sentence_numbers(vlScores, vBestNum)

    # opening parse file
    vParseFileName = posArgs[0]
    try:
        vfParse = open(vParseFileName, 'r')
    except IOError:
        sys.exit('Can\'t open parse file: ' + vParseFileName)

    # The with-block closes the file even if writing the output fails.
    with vfParse:
        _write_selected_lines(vfParse, vlBestParseNo, sys.stdout)


def _best_sentence_numbers(scores, best_num):
    """Return the 1-based positions of the best_num highest scores, ascending.

    Ties keep their original relative order because the sort is stable.
    Slicing (rather than counting inside a loop) makes best_num == 0 yield
    an empty selection instead of the whole list.
    """
    ranked = sorted(enumerate(scores, 1), key=itemgetter(1), reverse=True)
    return sorted(sentence_no for sentence_no, _score in ranked[:best_num])


def _write_selected_lines(lines, line_numbers, out):
    """Write to out the lines whose 1-based number is in line_numbers.

    line_numbers must be sorted ascending without duplicates. The input is
    streamed in a single pass, so the parse file is never loaded into memory
    as a whole and each line is compared against only one wanted number.
    Wanted numbers beyond the end of the input are silently ignored.
    """
    wanted = iter(line_numbers)
    target = next(wanted, None)
    if target is None:
        return

    for line_no, line in enumerate(lines, 1):
        if line_no != target:
            continue
        # Each line keeps its own line terminator, so write it verbatim.
        out.write(line)
        target = next(wanted, None)
        if target is None:
            # Nothing left to extract; stop reading the input.
            break


##======================================================================
## calling main
if __name__ == "__main__":
    # Executed as a script: run main() and hand its return value to the
    # interpreter as the process exit status.
    exit_status = main()
    sys.exit(exit_status)