123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111 |
- #! /usr/bin/python
- ## This script extracts the N best parses according to the scores predicted
- ## by a parser accuracy predictor. It takes a parse file in Treebank format
- ## (one parse tree per line) and a file containing the predictions (currently
- ## Weka classifier output) for every parse tree, then prints the requested
- ## top parses, chosen by those scores, to standard output.
- ##
- ## Current version: 1
- ##
- from collections import namedtuple
- from operator import itemgetter
- import sys, optparse, weka
- ##======================================================================
- ## main
def _selectBestParseNos(vlScores, vBestNum):
    """Return the sentence numbers of the vBestNum top-scored parses.

    Sentence numbers are 1-based positions in vlScores.  The result is
    sorted in ascending order so that the parse file can be read in a
    single forward pass.  If vBestNum is 0 the result is empty; if it
    exceeds the number of scores, every sentence number is returned.
    """
    # data structure to store scores with sentence number
    ParseScore = namedtuple('ParseScore', 'sentenceNo score')

    vlParseScores = [ParseScore(vSentenceNo, vScore)
                     for vSentenceNo, vScore in enumerate(vlScores, 1)]

    # Descending by score.  list.sort() is stable even with reverse=True,
    # so parses with equal scores keep their original relative order.
    vlParseScores.sort(key=itemgetter(1), reverse=True)

    # Slicing (rather than counting up to vBestNum) also handles
    # vBestNum == 0, which must select nothing.
    return sorted(vParseScore.sentenceNo
                  for vParseScore in vlParseScores[:vBestNum])


def _writeSelectedParses(vfParse, vlBestParseNo, vfOut):
    """Copy the requested lines of vfParse to vfOut.

    vlBestParseNo holds 1-based line numbers and must be sorted in
    ascending order.  Lines are written unchanged, in file order.
    """
    ## Instead of loading all parses into memory and then selecting those
    ## indices which exist in vlBestParseNo, or iterating through the parses
    ## in the file and searching for each sentence number in vlBestParseNo,
    ## we iterate through the sorted vlBestParseNo and, for each parse
    ## number, advance through the file ignoring all parses until that parse
    ## number is reached.  The file iterator is shared by all passes of the
    ## inner loop, so the file is read only once and reading stops right
    ## after the last requested parse.
    vParseCntr = 0
    for vBestParseNo in vlBestParseNo:
        for vParse in vfParse:
            vParseCntr += 1
            if vBestParseNo == vParseCntr:
                vfOut.write(vParse)
                break


def main(argv=None):
    """Print the N best-scored parses of a parse file to standard output.

    argv is the full argument vector including the program name; it
    defaults to sys.argv.  Three positional arguments are expected: the
    parse file (one parse per line), the classifier output file and the
    number of best parses to extract.

    Exits through parser.error() on invalid arguments and through
    sys.exit() with a message if the parse file cannot be opened.
    Returns None on success.
    """
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="%prog <PARSE FILE> <WEKA CLASSIFIER OUTPUT> <NUMBER OF BEST PARSES TO EXTRACT>" +
                                   "\nThis script extracts the N best parses according to the scores predicted by parser accuracy predictor.", version="%prog 1.0")
    # Parse the argv we were given (minus the program name) so that callers
    # passing their own argument list are honoured.
    (_, posArgs) = parser.parse_args(argv[1:])

    # checking arguments
    if len(posArgs) < 3:
        parser.error("At least 3 arguments are required!")

    if not posArgs[2].isdigit():
        parser.error("The third argument should be a number!")
    vBestNum = int(posArgs[2])

    # extracting scores from classifier output
    # (we suppose the 2nd column contains predictions)
    vlScores = weka.extractClassifierPreds(posArgs[1], 2, False)

    # extracting the sentence numbers of the best parses, in ascending order
    vlBestParseNo = _selectBestParseNos(vlScores, vBestNum)

    # opening parse file
    vParseFileName = posArgs[0]
    try:
        vfParse = open(vParseFileName, 'r')
    except IOError:
        sys.exit('Can\'t open parse file: ' + vParseFileName)

    # The with-block closes the file even if reading or writing fails.
    with vfParse:
        _writeSelectedParses(vfParse, vlBestParseNo, sys.stdout)
- ##======================================================================
- ## calling main
if __name__ == "__main__":
    # Run the script and hand main()'s return value to the interpreter
    # as the process exit status.
    vExitStatus = main()
    sys.exit(vExitStatus)
|