#!/usr/bin/python
## This module defines functions to process Charniak & Johnson (2005)
## parser output.
##
## Current version: 1.3
##
## Added:
## - replaceUnscorable: replaces the problematic parses which evalb couldn't
##   process with a dummy parse tree
## - extractBestParse() was refactored to not use XML, to avoid the unicode
##   problem. The previous code was not thrown away but renamed to
##   extractBestParseXML()
##
## Changed:
## - extractBestParse() was renamed to extractBestParseXML() so that the
##   former name can be used for another version of the function which
##   doesn't use XML and thus does not have the unicode problem
##
## Version 1.2:
##
## Added:
## - replaceUnrerankable: replaces the n-best parses of the sentence which
##   could not be re-ranked with a dummy parse tree
##
## Changed:
## - extractBestParse: changed to support replacing the UNRERANKED parses
##   with the best first-stage parse of the equivalent sentence.
##
##
## Version 1.0:
##
## - wrapContent(): wraps a root tag around the file content, which also
##   includes a number as an attribute
## - extractBestParse: extracts the best parse trees from reranker output
##
import os
import shutil
import sys
from collections import namedtuple
from decimal import *
from operator import itemgetter
from xml.etree.ElementTree import ElementTree, tostring
##----------------------------------------------------------------------
## Types
# Record returned for the best parse of one sentence: the integer sentence
# ID (sentenceNo) and the bracketed parse tree as a string (tree).
Parse = namedtuple('Parse', 'sentenceNo tree')
##----------------------------------------------------------------------
def wrapContent(pRRParseFileName, pPackageID):
    """Wrap the content of a reranker output file in a <pack> root element.

    The reranker output lacks a root node, which is a problem for the
    ElementTree XML parser used in this module. The file is rewritten in
    place so that its content is enclosed in
    '<pack id="...">' ... '</pack>'. The id can serve as a package
    identifier when parses are divided into packages, one file per package.
    A file whose content already starts with '<pack id=' is left untouched.

    Args:
        pRRParseFileName: path of the re-ranked parses file to wrap.
        pPackageID: value of the id attribute; converted with str(), so
            both numbers and strings are accepted.

    Exits through sys.exit() with a message if the file cannot be opened
    for reading or for writing.
    """
    # reading the file content
    try:
        vfRRParse = open(pRRParseFileName, 'r')
    except IOError:
        sys.exit('Can\'t open re-ranked parses file: ' + pRRParseFileName)
    try:
        vContent = vfRRParse.read()
    finally:
        vfRRParse.close()

    # the root tag is already there: nothing to add, so don't rewrite
    if vContent.startswith('<pack id='):
        return

    vContent = '<pack id="' + str(pPackageID) + '">\n' + vContent + '</pack>\n'

    # overwriting the file content
    try:
        vfRRParse = open(pRRParseFileName, 'w')
    except IOError:
        sys.exit('Can\'t open re-ranked parses file: ' + pRRParseFileName)
    try:
        vfRRParse.write(vContent)
    finally:
        vfRRParse.close()
##----------------------------------------------------------------------
def extractBestParse(pRRParseFileName, pflgReplaceTOP, pflgReplaceBrackets, pflgSort,
                     pflgRplcUnreranked, pFSParseFileName):
    """Extract the best parse tree of every sentence from reranker output.

    Only lines starting with '<s id=' are processed; on each of them the
    text from the first '(S1 ' up to the first '</parse>' is taken, as the
    first parse is supposed to be the best one.

    Args:
        pRRParseFileName: path of the re-ranked parse file.
        pflgReplaceTOP: if true, the leading '(S1' is replaced with '(TOP'.
        pflgReplaceBrackets: if true, bracket masks used as terminals are
            replaced with the actual brackets (e.g. '-RRB-)' with '))').
        pflgSort: if true, the result is sorted by sentence ID.
        pflgRplcUnreranked: if true, a tree containing
            'UNRERANKED UNRERANKED' is replaced with the first parse found
            on the corresponding line of the first-stage parse file.
        pFSParseFileName: path of the first-stage parse file; only used
            when pflgRplcUnreranked is true. Both files are supposed to be
            sentence-aligned: one first-stage line is read per re-ranked
            sentence and the sentence IDs must match.

    Returns:
        A list of Parse(sentenceNo, tree) tuples.

    Exits through sys.exit() with a message if a file cannot be opened or
    if the sentence IDs of the two files do not correspond.
    """
    vlBestParseList = []

    # opening re-ranked parse file
    try:
        vfRRParse = open(pRRParseFileName, 'r')
    except IOError:
        sys.exit('Can\'t open re-ranked parse file: ' + pRRParseFileName)

    # stays None unless the first-stage file is requested, so that the
    # clean-up below never touches an unopened file
    vfFSParse = None
    try:
        # opening first-stage parse file to replace UNRERANKED if requested
        if pflgRplcUnreranked:
            try:
                vfFSParse = open(pFSParseFileName, 'r')
            except IOError:
                sys.exit('Can\'t open first-stage parse file: ' + pFSParseFileName)

        # sentence iteration
        for vSentence in vfRRParse:
            # if the line doesn't start with '<s id=' just skip
            if not vSentence.startswith("<s id="):
                continue

            # extracting the sentence ID (it starts right after '<s id="')
            vSentenceID = int(vSentence[7:vSentence.find('"', 7)])

            # extracting the first parse
            vStartPos = vSentence.find("(S1 ")
            vEndPos = vSentence.find("</parse>")
            vBestTree = vSentence[vStartPos:vEndPos]

            # replacing the UNRERANKED if requested
            if pflgRplcUnreranked:
                vFSSentence = vfFSParse.readline()
                if not vFSSentence:
                    # first-stage file ran out of lines: files not aligned
                    sys.exit("First-stage parse file has no sentence matching reranked sentence ID: " + str(vSentenceID))

                # checking the sentence ID match
                vFSSentenceID = int(vFSSentence[7:vFSSentence.find('">', 7)])
                if vFSSentenceID != vSentenceID:
                    sys.exit("Sentence ID mismatch between first-stage and reranked parses: " + str(vSentenceID) + ' & ' + str(vFSSentenceID))
                elif "UNRERANKED UNRERANKED" in vBestTree:
                    vBestTree = vFSSentence[vFSSentence.find('">(S1 ') + 2: vFSSentence.find('</parse>')]

            # replacing S1 with TOP
            if pflgReplaceTOP:
                vBestTree = "(TOP" + vBestTree[3:]

            # replacing bracket masks; the trailing ')' restricts the
            # replacement to masks in terminal position
            if pflgReplaceBrackets:
                for vMask, vBracket in (('-LCB-)', '{)'), ('-RCB-)', '})'),
                                        ('-LRB-)', '()'), ('-RRB-)', '))')):
                    vBestTree = vBestTree.replace(vMask, vBracket)

            vlBestParseList.append(Parse(vSentenceID, vBestTree))
    finally:
        vfRRParse.close()
        if vfFSParse is not None:
            vfFSParse.close()

    # optional sorting
    if pflgSort:
        vlBestParseList.sort(key=itemgetter(0), reverse=False)

    return vlBestParseList
##----------------------------------------------------------------------
def extractBestParseXML(pRRParseFileName, pflgReplaceTOP, pflgReplaceBrackets, pflgSort,
                        pflgRplcUnreranked, pFSParseFileName):
    """XML version of extractBestParse().

    The re-ranked parse file is parsed with ElementTree, so it must have a
    single root element (see wrapContent()). For every <s> element, the
    text of its first <parse> child is taken as the best parse and the
    'id' attribute as the sentence ID.

    Args:
        pRRParseFileName: path of the re-ranked parse file (XML).
        pflgReplaceTOP: if true, the leading '(S1' is replaced with '(TOP'.
        pflgReplaceBrackets: if true, bracket masks used as terminals are
            replaced with the actual brackets (e.g. '-RRB-)' with '))').
        pflgSort: if true, the result is sorted by sentence ID.
        pflgRplcUnreranked: if true, a tree containing
            'UNRERANKED UNRERANKED' is replaced with the first parse found
            on the corresponding line of the first-stage parse file.
        pFSParseFileName: path of the first-stage parse file; only used
            when pflgRplcUnreranked is true. One first-stage line is read
            per <s> element and the sentence IDs must match.

    Returns:
        A list of Parse(sentenceNo, tree) tuples.

    Exits through sys.exit() with a message if the first-stage file cannot
    be opened or if the sentence IDs of the two files do not correspond.
    """
    vlBestParseList = []

    # parsing XML format of the reranked parse file
    rrParses = ElementTree()
    rrParses.parse(pRRParseFileName)

    # opening first-stage parse file to replace UNRERANKED if requested;
    # stays None otherwise so that the clean-up below is always safe
    vfFSParse = None
    if pflgRplcUnreranked:
        try:
            vfFSParse = open(pFSParseFileName, 'r')
        except IOError:
            sys.exit('Can\'t open first-stage parses file: ' + pFSParseFileName)

    try:
        vPack = rrParses.getroot()

        # Element.getiterator() was removed in Python 3.9; use iter() when
        # it exists and fall back to getiterator() on old interpreters
        if hasattr(vPack, 'iter'):
            vSentences = vPack.iter("s")
        else:
            vSentences = vPack.getiterator("s")

        # sentence iteration
        for vSentence in vSentences:
            # extracting the first parse
            vBestTree = vSentence.find("parse").text
            vSentenceID = int(vSentence.attrib["id"])

            # replacing the UNRERANKED if requested
            if pflgRplcUnreranked:
                vFSSentence = vfFSParse.readline()
                if not vFSSentence:
                    # first-stage file ran out of lines: files not aligned
                    sys.exit("First-stage parse file has no sentence matching reranked sentence ID: " + str(vSentenceID))

                # checking the sentence ID match
                vFSSentenceID = int(vFSSentence[7:vFSSentence.find('">', 7)])
                if vFSSentenceID != vSentenceID:
                    sys.exit("Sentence ID mismatch between first-stage and reranked parses: " + str(vSentenceID) + ' & ' + str(vFSSentenceID))
                elif "UNRERANKED UNRERANKED" in vBestTree:
                    vBestTree = vFSSentence[vFSSentence.find('">(S1 ') + 2: vFSSentence.find('</parse>')]

            # replacing S1 with TOP
            if pflgReplaceTOP:
                vBestTree = "(TOP" + vBestTree[3:]

            # replacing bracket masks; the trailing ')' restricts the
            # replacement to masks in terminal position
            if pflgReplaceBrackets:
                for vMask, vBracket in (('-LCB-)', '{)'), ('-RCB-)', '})'),
                                        ('-LRB-)', '()'), ('-RRB-)', '))')):
                    vBestTree = vBestTree.replace(vMask, vBracket)

            vlBestParseList.append(Parse(vSentenceID, vBestTree))
    finally:
        if vfFSParse is not None:
            vfFSParse.close()

    # optional sorting
    if pflgSort:
        vlBestParseList.sort(key=itemgetter(0), reverse=False)

    return vlBestParseList
##----------------------------------------------------------------------
def replaceUnrerankable(pFSParseFileName, pSentenceID, pflgOverwrite):
    """Replace the parses of one first-stage sentence with a dummy tree.

    Takes the output of the first-stage parser and replaces the n-best
    parses of the sentence which could not be re-ranked with the dummy
    parse '(S1 (UNRERANKED UNRERANKED))'. Unlike extractBestParseXML(),
    XML is not used here to keep it efficient, as the task is simple.

    The result is written to a new file named like the input file followed
    by the '.new' suffix. If pflgOverwrite is true, that file then
    overwrites the input file and is removed.

    Args:
        pFSParseFileName: path of the first-stage parses file.
        pSentenceID: ID of the target sentence (zero-based index); values
            below 10 are matched with a leading zero (e.g. 3 -> "03").
        pflgOverwrite: whether to overwrite the input file with the result.

    Exits through sys.exit() if a file cannot be opened or if the target
    line does not contain '">(S1 '.
    """
    # opening the input file
    try:
        vfFSParse = open(pFSParseFileName, 'r')
    except IOError:
        sys.exit('Can\'t open first-stage parses file: ' + pFSParseFileName)

    # creating the output file
    vReplacedFileName = pFSParseFileName + '.new'
    try:
        vfReplaced = open(vReplacedFileName, 'w')
    except IOError:
        vfFSParse.close()
        print('Can\'t create output file')
        sys.exit(2)

    # the prefix identifying the target line does not depend on the line
    # being read, so it is built once before the loop
    vSentenceIDStr = str(pSentenceID)
    if pSentenceID < 10:
        vSentenceIDStr = '0' + vSentenceIDStr
    vTargetPrefix = '<s id="' + vSentenceIDStr + '"'

    try:
        for vSentence in vfFSParse:
            if vSentence.startswith(vTargetPrefix):
                vPos = vSentence.find('">(S1 ')
                if vPos == -1:
                    sys.exit("Format looks invalid!: " + vSentence)
                # keep everything up to and including '">(S1 ' (6 chars),
                # then close the line with the dummy parse
                vReplacement = vSentence[:vPos + 6] + "(UNRERANKED UNRERANKED))</parse></s>"
                # write new sentence into file
                vfReplaced.write(vReplacement + '\n')
            else:
                # if this is not the target sentence, write it as it is
                vfReplaced.write(vSentence)
    finally:
        vfFSParse.close()
        vfReplaced.close()

    # if pflgOverwrite is true, the input file will be overwritten
    if pflgOverwrite:
        shutil.copy(vReplacedFileName, pFSParseFileName)
        os.remove(vReplacedFileName)
##----------------------------------------------------------------------
def replaceUnscorable(pParseFileName, pSentenceLineNo, pflgOverwrite):
    """Replace one line of a tree-per-line parse file with a dummy tree.

    Takes the parse file (one tree per line) and replaces the problematic
    parse which evalb couldn't process with the dummy parse
    '(TOP (UNSCORABLE UNSCORABLE))'. Currently the sentence is identified
    by its line number, given as an argument; the function does not detect
    such sentences itself. It is not specific to Brown output, as its
    input is considered to be in a tree-per-line format.

    The result is written to a new file named like the input file followed
    by the '.new' suffix. If pflgOverwrite is true, that file then
    overwrites the input file and is removed.

    Args:
        pParseFileName: path of the parse file.
        pSentenceLineNo: line number of the parse to replace; the first
            line of the file is line 1.
        pflgOverwrite: whether to overwrite the input file with the result.

    Exits through sys.exit() if a file cannot be opened.
    """
    # opening the input file
    try:
        vfParse = open(pParseFileName, 'r')
    except IOError:
        sys.exit('Can\'t open parse file: ' + pParseFileName)

    # creating the output file
    vReplacedFileName = pParseFileName + '.new'
    try:
        vfReplaced = open(vReplacedFileName, 'w')
    except IOError:
        vfParse.close()
        print('Can\'t create output file')
        sys.exit(2)

    try:
        # line numbers are counted from 1
        for vLineNo, vSentence in enumerate(vfParse, 1):
            if vLineNo == pSentenceLineNo:
                # write the dummy parse into file
                vfReplaced.write("(TOP (UNSCORABLE UNSCORABLE))\n")
            else:
                # if this is not the target sentence, write it as it is
                vfReplaced.write(vSentence)
    finally:
        vfParse.close()
        vfReplaced.close()

    # if pflgOverwrite is true, the input file will be overwritten
    if pflgOverwrite:
        shutil.copy(vReplacedFileName, pParseFileName)
        os.remove(vReplacedFileName)
|