123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129 |
- #! /usr/bin/python
- ## This script extracts the best reranked parses from Charniak & Johnson
- ## (2005) output, which is arranged in packages. It also replaces S1 with
- ## TOP.
- ##
- ## Current version: 1.1
- ##
- ## Added in current version:
- ## Optionally, it now replaces un-re-ranked parses, which are identified
- ## by the UNRERANKED mask, with the corresponding best first-stage parses.
- ## These parses are provided as an optional argument to the script.
- ## Note that this feature is currently not supported when multiple
- ## packages are given as input via wildcards.
- from operator import itemgetter
- import sys, optparse, brownparses, codecs
- ##======================================================================
- ## main
def _write_tree(vfOutput, vTree):
    """Write one parse tree, followed by a newline, to the output stream.

    Byte strings are decoded as UTF-8 before writing; text (unicode) strings
    are written unchanged.  This replaces the former
    ``decode`` / ``except UnicodeEncodeError`` fallback with an explicit
    type check.
    """
    if isinstance(vTree, bytes):
        vTree = vTree.decode('UTF-8')
    vfOutput.write(vTree + '\n')


def main(argv=None):
    """Collect the best parses of all packages into a single output file.

    Positional command-line arguments (after the program name):
        PACKAGE SIZE   -- integer; per-package sentence numbers must be
                          smaller than this value
        CORPUS SIZE    -- integer; the output is padded with dummy parses up
                          to this sentence number
        OUTPUT FILE    -- path of the UTF-8 file to create
        PACKAGE FILES  -- one or more package files, processed in sorted
                          name order

    Option ``-f/--firststage`` names a first-stage parser output file; the
    name and an "enabled" flag are forwarded to
    ``brownparses.extractBestParse``.

    Args:
        argv: full argument vector including the program name at index 0.
            Defaults to ``sys.argv``.

    Returns:
        None.

    Raises:
        SystemExit: on invalid arguments, when the output file cannot be
            created, when a sentence number is not smaller than the package
            size, or when sentence numbers do not line up.
    """
    if argv is None:
        argv = sys.argv
    parser = optparse.OptionParser(usage="%prog <PACKAGE SIZE> <CORPUS SIZE> <OUTPUT FILE NAME> <PACKAGES FILES NAME PATTERN> [options]" +
                                   "\nExtracts best reranked parses from Charniak & Johansson (2005) output which are arranged in packeges.", version="%prog 1.0")
    parser.add_option("-f", "--firststage", help="first-stage parser output", metavar="FIRST-STAGE PARSER OUTPUT", dest="fsParses", action="store")

    # processing input arguments; honour the argv parameter instead of
    # silently falling back to sys.argv
    (opts, posArgs) = parser.parse_args(argv[1:])
    if len(posArgs) < 4:
        parser.error("At least 4 arguments are required")
    try:
        vPackageSize = int(posArgs[0])
        vCorpusSize = int(posArgs[1])
    except ValueError:
        parser.error("PACKAGE SIZE and CORPUS SIZE must be integers")

    # first-stage replacement is enabled only when the option was given
    vReplaceUnReranked = opts.fsParses is not None
    vFSParseFileName = opts.fsParses if vReplaceUnReranked else ""

    # creating output file
    try:
        vfOutput = codecs.open(posArgs[2], mode='w', encoding='utf-8')
    except IOError:
        sys.exit('Can\'t create output file: ' + posArgs[2])

    # the finally clause closes the file on every sys.exit path below
    try:
        # sorting packages by name; the package position determines the
        # sentence number offset
        vlPackages = sorted(posArgs[3:])

        # extracting best parses from all packages
        vlBestParses = []
        for vPackCntr, vPackage in enumerate(vlPackages, 1):
            print(vPackage)
            ## NOTE: when switching to brownparses.extractBestParseXML(), the
            ## reranker output has to be wrapped into a root node first with
            ## brownparses.wrapContent(vPackage, str(vPackCntr)).
            vlPackageParses = brownparses.extractBestParse(vPackage, True, False, True, vReplaceUnReranked, vFSParseFileName)
            ## adjusting sentence numbers based on package number: the
            ## per-package number is shifted by the sizes of all preceding
            ## packages and incremented by one
            vOffset = vPackageSize * (vPackCntr - 1)
            for vParseTree in vlPackageParses:
                if vParseTree.sentenceNo >= vPackageSize:
                    sys.exit("Sentence number exceeds the package size! package: " + vPackage)
                vlBestParses.append(brownparses.Parse(vOffset + vParseTree.sentenceNo + 1, vParseTree.tree))

        # inserting dummy parse trees for missing sentences
        vDummyParse = '(TOP (UNPARSED UNPARSED))'
        # with no parse at all, everything up to the corpus size is padded
        # (indexing [-1] on the empty list used to raise IndexError)
        vLastSentenceNo = vlBestParses[-1].sentenceNo if vlBestParses else 0

        # this first loop treats missing trees from the middle
        for vTreeCntr in range(1, vLastSentenceNo + 1):
            vCurrentNo = vlBestParses[vTreeCntr - 1].sentenceNo
            if vCurrentNo > vTreeCntr:
                # gap: put a placeholder at this position
                vlBestParses.insert(vTreeCntr - 1, brownparses.Parse(vTreeCntr, vDummyParse))
            elif vCurrentNo < vTreeCntr:
                sys.exit("Unknown error: sentence match problem")
            _write_tree(vfOutput, vlBestParses[vTreeCntr - 1].tree)
            print(vTreeCntr)

        # this last loop treats missing trees from the end
        for vTreeCntr in range(vLastSentenceNo + 1, vCorpusSize + 1):
            vlBestParses.append(brownparses.Parse(vTreeCntr, vDummyParse))
            _write_tree(vfOutput, vlBestParses[-1].tree)
            print(vTreeCntr)
    finally:
        vfOutput.close()
- ##======================================================================
- ## calling main
if __name__ == "__main__":
    # Run the command-line entry point and hand its result to the
    # interpreter as the process exit status.
    vExitStatus = main()
    sys.exit(vExitStatus)
|