#! /usr/bin/python

## This script extracts best reranked parses from Charniak & Johansson
## (2005) output which are arranged in packages. It also replaces S1 with
## TOP.
##
## Current version: 1.1
##
## Added in current version:
##   Optionally, it now replaces un-re-ranked parses, which are identified
##   by UNRERANKED mask, with the corresponding best first-stage parse.
##   These parses are provided as optional argument to the script.
##   Note that currently this feature is not supported with multiple packages
##   input by wildcards.

from operator import itemgetter
import sys, optparse, brownparses, codecs

##======================================================================
## writing a single parse tree

def _writeTree(vfOutput, vTree):
    """Write one parse tree, followed by a newline, to vfOutput.

    The tree is decoded from UTF-8 before writing. If the decode raises
    UnicodeEncodeError the tree is written unchanged instead.
    """
    try:
        vfOutput.write(vTree.decode('UTF-8') + '\n')
    except UnicodeEncodeError:
        # NOTE(review): under Python 2 this presumably happens when the tree
        # is already a unicode object holding non-ASCII text -- confirm
        # against what brownparses returns.
        vfOutput.write(vTree + '\n')

##======================================================================
## main

def main(argv=None):
    """Merge the best parses of all packages into one output file.

    Positional command-line arguments (after the options):
      1. package size  -- integer; a sentence number inside a package must
                          be smaller than this value
      2. corpus size   -- integer; the output is padded with dummy parses
                          up to this many sentences
      3. output file   -- created/overwritten, written as UTF-8
      4+. package files, processed in sorted name order

    Option -f/--firststage names a first-stage parser output file; when it
    is given, it is passed on to brownparses.extractBestParse() together
    with a flag asking for un-re-ranked parses to be replaced.

    argv -- full argument vector including the program name; defaults to
            sys.argv.

    Exits through sys.exit()/parser.error() on bad arguments, when the
    output file cannot be created, when a sentence number exceeds the
    package size, or when the sentence numbers cannot be lined up.
    """
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="%prog [options]" +
        "\nExtracts best reranked parses from Charniak & Johansson (2005) output which are arranged in packeges.", version="%prog 1.0")

    parser.add_option("-f", "--firststage", help="first-stage parser output", metavar="FIRST-STAGE PARSER OUTPUT", dest="fsParses", action="store")

    # processing input arguments; argv[1:] is passed explicitly so that an
    # argv handed to main() is honoured instead of always reading sys.argv
    (opts, posArgs) = parser.parse_args(argv[1:])

    if len(posArgs) < 4:
        parser.error("At least 4 arguments are required")

    try:
        vPackageSize = int(posArgs[0])
        vCorpusSize = int(posArgs[1])
    except ValueError:
        # report a usage error rather than a raw traceback
        parser.error("Package size and corpus size must be integers")

    # if first-stage parse file option is not provided set flag to false
    if opts.fsParses is not None:
        vFSParseFileName = opts.fsParses
        vReplaceUnReranked = True
    else:
        vFSParseFileName = ""
        vReplaceUnReranked = False

    # creating output file
    try:
        vfOutput = codecs.open(posArgs[2], mode='w', encoding='utf-8')
    except IOError:
        sys.exit('Can\'t create output file: ' + posArgs[2])

    # the finally clause closes the output file on the error exits as well
    try:
        # sorting packages by name
        vlPackages = posArgs[3:]
        vlPackages.sort()

        # extracting best parses from all packages.
        vlBestParses = []
        for vPackIdx, vPackage in enumerate(vlPackages):
            print(vPackage)

            ## wrapping the content of output file of the reranker into a root node
            ## to avoid problem by brownparses.extractBestParse() which uses XML
            ## format to extract parses.
            ## Uncomment when using brownparses.extractBestParseXML()
            #brownparses.wrapContent(vPackage, str(vPackIdx + 1))

            vlPackageParses = brownparses.extractBestParse(vPackage, True, False, True, vReplaceUnReranked, vFSParseFileName)

            ## adjusting sentence numbers based on package number (packages are
            ## sorted above): in-package numbers are shifted by the number of
            ## sentences in the preceding packages and made 1-based
            vOffset = vPackageSize * vPackIdx
            for vParseTree in vlPackageParses:
                if vParseTree.sentenceNo >= vPackageSize:
                    sys.exit("Sentence number exceeds the package size! package: " + vPackage)
                vlBestParses.append(brownparses.Parse(vOffset + vParseTree.sentenceNo + 1, vParseTree.tree))

        # inserting dummy parse trees for missing sentences
        vDummyParse = '(TOP (UNPARSED UNPARSED))'

        # With no parses at all there is no last sentence number; the first
        # loop is then skipped and the whole corpus is filled with dummies.
        if vlBestParses:
            vLastSentenceNo = vlBestParses[-1].sentenceNo
        else:
            vLastSentenceNo = 0

        vTreeCntr = 1

        # this first loop treats missing trees from the middle
        while vTreeCntr <= vLastSentenceNo:
            if vlBestParses[vTreeCntr - 1].sentenceNo > vTreeCntr:
                # gap: the parse at this position belongs to a later sentence
                vlBestParses.insert(vTreeCntr - 1, brownparses.Parse(vTreeCntr, vDummyParse))
            elif vlBestParses[vTreeCntr - 1].sentenceNo < vTreeCntr:
                sys.exit("Unknown error: sentence match problem")

            _writeTree(vfOutput, vlBestParses[vTreeCntr - 1].tree)
            print(vTreeCntr)
            vTreeCntr += 1

        # this last loop treats missing trees from the end
        while vTreeCntr <= vCorpusSize:
            vlBestParses.append(brownparses.Parse(vTreeCntr, vDummyParse))
            _writeTree(vfOutput, vlBestParses[vTreeCntr - 1].tree)
            print(vTreeCntr)
            vTreeCntr += 1
    finally:
        vfOutput.close()

##======================================================================
## calling main

if __name__ == "__main__":
    sys.exit(main())