rszk
/
scripts


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
							#! /usr/bin/python

## This module defines functions to process Charniak & Johnson (2005)
## parser output.
##
## Current version: 1.3
##
## Added:
## - replaceUnScorable: replaces the problematic parses which evalb couldn't
##   process with a dummy parse tree
## - extractBestParse() was refactored to not use XML to avoid unicode problem.
##   The previous code was not thrown away and renamed to extractBestParseXML()
##
## Changed:
## - extractBestParse() was renamed to extractBestParseXML() to use the
##   former for another version of function which doesn't use XML and thus
##   does not have unicode problem
##
## Version 1.2:
##
## Added:
## - replaceUnRerankable: replaces the n-best parses of the sentence which
##   could not be re-ranked with a dummy parse tree
##
## Changed:
## - extractBestParse: changed to support replacing the UNRERANKED parses
##   with best first-stage parse of equivalent sentence.
##
##
## Version 1.0:
##
## - wrapContent(): wraps a root tag around the file content, which also
##   includes a number as an attribute
## - extractBestParse: extracts the best parse trees from reranker output
##

from operator import itemgetter
from xml.etree.ElementTree import ElementTree, tostring
from collections import namedtuple
from decimal import *
import sys, shutil, os


##----------------------------------------------------------------------
## Types

# Record returned for every sentence: the sentence number (first field)
# paired with its best parse tree (second field).
Parse = namedtuple('Parse', ['sentenceNo', 'tree'])


##----------------------------------------------------------------------
## Since the output file of the reranker lacks a root node, it causes
## problems for the ElementTree XML parser we use here. This function wraps
## a root tag around the file content, which also includes a number as an
## attribute. This number can serve as a package identifier when parses
## are divided into packages and each file is a package.
def wrapContent(pRRParseFileName, pPackageID):
 """Wrap the content of a re-ranked parse file in a <pack id="..."> root tag.

 The file is modified in place. If its content already starts with
 '<pack id=' it is left untouched, so the function can safely be called
 more than once on the same file.

 Parameters:
  pRRParseFileName -- path of the re-ranked parse file to wrap
  pPackageID       -- value of the 'id' attribute of the root tag; it is
                      converted with str(), so both numbers and strings
                      are accepted

 Exits the program (sys.exit) with an error message if the file cannot
 be opened for reading or for writing.
 """

 # reading the file content
 try:
  vfRRParse = open(pRRParseFileName, 'r')
 except IOError:
  sys.exit('Can\'t open re-ranked parses file: ' + pRRParseFileName)

 with vfRRParse:
  vContent = vfRRParse.read()

 # the root tag is already there: nothing to add, nothing to rewrite
 if vContent.startswith('<pack id='):
  return

 # str() is needed because the package ID is typically a number
 vContent = '<pack id="' + str(pPackageID) + '">\n' + vContent + '</pack>\n'

 # overwriting the file content
 try:
  vfRRParse = open(pRRParseFileName, 'w')
 except IOError:
  sys.exit('Can\'t open re-ranked parses file: ' + pRRParseFileName)

 with vfRRParse:
  vfRRParse.write(vContent)


##----------------------------------------------------------------------
## Extracts the best parse trees from reranker output, and optionally
## sorts parses. It also optionally replaces the S1 with TOP, and bracket
## masks with actual brackets (e.g. -RRB- with ')').
## It also optionally replaces the UNRERANKED mask with the best first-stage
## parse of the sentence from the equivalent first-stage parse file provided.
## Note that both files are assumed to be sentence-aligned, so that the
## sentence IDs in both files correspond. Otherwise, it will stop with the
## appropriate error message.
def extractBestParse(pRRParseFileName, pflgReplaceTOP, pflgReplaceBrackets, pflgSort,
                     pflgRplcUnreranked, pFSParseFileName):
 """Extract the best (first) parse of every sentence of a reranker output.

 The file is scanned line by line without an XML parser; only lines
 starting with '<s id=' are treated as sentences.

 Parameters:
  pRRParseFileName    -- path of the re-ranked parse file
  pflgReplaceTOP      -- if true, the leading '(S1' of each tree becomes
                         '(TOP'
  pflgReplaceBrackets -- if true, the bracket-mask tokens -LCB-, -RCB-,
                         -LRB- and -RRB- are replaced with the actual
                         bracket characters
  pflgSort            -- if true, the result is sorted by sentence ID
  pflgRplcUnreranked  -- if true, trees containing
                         'UNRERANKED UNRERANKED' are replaced with the
                         tree found on the corresponding line of the
                         first-stage parse file
  pFSParseFileName    -- path of the first-stage parse file; only opened
                         when pflgRplcUnreranked is true

 Returns a list of Parse(sentenceNo, tree) records.

 Exits the program (sys.exit) with an error message if a file cannot be
 opened, if the first-stage file runs out of lines, or if the sentence
 IDs of the two files do not match.
 """
 vlBestParseList = []
 # stays None unless the first-stage file is actually requested, so the
 # cleanup below never touches an undefined name
 vfFSParse = None

 # opening re-ranked parse file
 try:
  vfRRParse = open(pRRParseFileName, 'r')
 except IOError:
  sys.exit('Can\'t open re-ranked parse file: ' + pRRParseFileName)

 try:
  # opening first-stage parse file to replace UNRERANKED if requested
  if pflgRplcUnreranked:
   try:
    vfFSParse = open(pFSParseFileName, 'r')
   except IOError:
    sys.exit('Can\'t open first-stage parse file: ' + pFSParseFileName)

  ## extracting best parse for each sentence: the first parse is supposed
  ## to be the best one
  for vSentence in vfRRParse:
   # lines that don't start with '<s id=' are not sentences: skip them
   if not vSentence.startswith("<s id="):
    continue

   # the sentence ID sits between the quotes following '<s id='
   vSentenceID = int(vSentence[7:vSentence.find('"', 7)])

   # extracting the first parse
   vStartPos = vSentence.find("(S1 ")
   vEndPos = vSentence.find("</parse>")
   vBestTree = vSentence[vStartPos:vEndPos]

   # replacing the UNRERANKED if requested
   if vfFSParse is not None:
    # one first-stage line is consumed per re-ranked sentence
    vFSSentence = vfFSParse.readline()
    if not vFSSentence:
     sys.exit("First-stage parse file has fewer sentences than re-ranked parse file: " + pFSParseFileName)
    # checking the sentence ID match
    vFSSentenceID = int(vFSSentence[7:vFSSentence.find('">', 7)])
    if vFSSentenceID != vSentenceID:
     sys.exit("Sentence ID mismatch between first-stage and reranked parses: " + str(vSentenceID) + ' & ' + str(vFSSentenceID))
    if "UNRERANKED UNRERANKED" in vBestTree:
     vBestTree = vFSSentence[vFSSentence.find('">(S1 ') + 2: vFSSentence.find('</parse>')]

   # replacing S1 with TOP
   if pflgReplaceTOP:
    vBestTree = "(TOP" + vBestTree[3:]

   # replacing bracket masks; the trailing ')' in each pattern restricts
   # the replacement to the occurrence directly before a closing bracket
   if pflgReplaceBrackets:
    vBestTree = vBestTree.replace('-LCB-)', '{)')
    vBestTree = vBestTree.replace('-RCB-)', '})')
    vBestTree = vBestTree.replace('-LRB-)', '()')
    vBestTree = vBestTree.replace('-RRB-)', '))')

   vlBestParseList.append(Parse(vSentenceID, vBestTree))
 finally:
  # files are closed on every path, including the sys.exit() ones
  vfRRParse.close()
  if vfFSParse is not None:
   vfFSParse.close()

 # optional sorting by sentence ID
 if pflgSort:
  vlBestParseList.sort(key=itemgetter(0))

 return vlBestParseList


##----------------------------------------------------------------------
## XML version of extractBestParse() (see above)
def extractBestParseXML(pRRParseFileName, pflgReplaceTOP, pflgReplaceBrackets, pflgSort,
                     pflgRplcUnreranked, pFSParseFileName):
 """XML-based variant of extractBestParse().

 The re-ranked parse file is parsed with ElementTree, so it must be a
 well-formed XML document. For every <s> element, the text of its first
 <parse> child is taken as the best parse.

 Parameters:
  pRRParseFileName    -- path of the re-ranked parse file (XML)
  pflgReplaceTOP      -- if true, the leading '(S1' of each tree becomes
                         '(TOP'
  pflgReplaceBrackets -- if true, the bracket-mask tokens -LCB-, -RCB-,
                         -LRB- and -RRB- are replaced with the actual
                         bracket characters
  pflgSort            -- if true, the result is sorted by sentence ID
  pflgRplcUnreranked  -- if true, trees containing
                         'UNRERANKED UNRERANKED' are replaced with the
                         tree found on the corresponding line of the
                         first-stage parse file
  pFSParseFileName    -- path of the first-stage parse file; only opened
                         when pflgRplcUnreranked is true

 Returns a list of Parse(sentenceNo, tree) records.

 Exits the program (sys.exit) with an error message if the first-stage
 file cannot be opened, if it runs out of lines, or if the sentence IDs
 of the two files do not match.
 """
 vlBestParseList = []
 # stays None unless the first-stage file is actually requested, so the
 # cleanup below never touches an undefined name
 vfFSParse = None

 # parsing XML format of the reranked parse file
 rrParses = ElementTree()
 rrParses.parse(pRRParseFileName)

 # opening first-stage parse file to replace UNRERANKED if requested
 if pflgRplcUnreranked:
  try:
   vfFSParse = open(pFSParseFileName, 'r')
  except IOError:
   sys.exit('Can\'t open first-stage parses file: ' + pFSParseFileName)

 try:
  ## extracting best parse for each sentence: the first parse is supposed
  ## to be the best one
  vPack = rrParses.getroot()
  # iter() replaces getiterator(), which no longer exists in recent
  # Python versions
  for vSentence in vPack.iter("s"):
   # extracting the first parse
   vBestTree = vSentence.find("parse").text

   vSentenceID = int(vSentence.attrib["id"])

   # replacing the UNRERANKED if requested
   if vfFSParse is not None:
    # one first-stage line is consumed per re-ranked sentence
    vFSSentence = vfFSParse.readline()
    if not vFSSentence:
     sys.exit("First-stage parse file has fewer sentences than re-ranked parse file: " + pFSParseFileName)
    # checking the sentence ID match
    vFSSentenceID = int(vFSSentence[7:vFSSentence.find('">', 7)])
    if vFSSentenceID != vSentenceID:
     sys.exit("Sentence ID mismatch between first-stage and reranked parses: " + str(vSentenceID) + ' & ' + str(vFSSentenceID))
    if "UNRERANKED UNRERANKED" in vBestTree:
     vBestTree = vFSSentence[vFSSentence.find('">(S1 ') + 2: vFSSentence.find('</parse>')]

   # replacing S1 with TOP
   if pflgReplaceTOP:
    vBestTree = "(TOP" + vBestTree[3:]

   # replacing bracket masks; the trailing ')' in each pattern restricts
   # the replacement to the occurrence directly before a closing bracket
   if pflgReplaceBrackets:
    vBestTree = vBestTree.replace('-LCB-)', '{)')
    vBestTree = vBestTree.replace('-RCB-)', '})')
    vBestTree = vBestTree.replace('-LRB-)', '()')
    vBestTree = vBestTree.replace('-RRB-)', '))')

   vlBestParseList.append(Parse(vSentenceID, vBestTree))
 finally:
  # the first-stage file is closed on every path, if it was opened at all
  if vfFSParse is not None:
   vfFSParse.close()

 # optional sorting by sentence ID
 if pflgSort:
  vlBestParseList.sort(key=itemgetter(0))

 return vlBestParseList


##----------------------------------------------------------------------
## This function takes the output of the first-stage parser and replaces
## the n-best parses of the sentence which could not be re-ranked with a
## dummy parse tree. The ID of the sentence (zero-based index) is provided
## to the function.
def replaceUnrerankable(pFSParseFileName, pSentenceID, pflgOverwrite):
 """Replace the parses of one sentence with an UNRERANKED dummy tree.

 Every line of the first-stage parse file is copied to a new file named
 after the input file plus a '.new' suffix. The line starting with
 '<s id="ID"' is cut right after its first '">(S1 ' and completed with
 '(UNRERANKED UNRERANKED))</parse></s>'. IDs below 10 are matched with a
 leading zero (e.g. 7 matches '<s id="07"').

 Parameters:
  pFSParseFileName -- path of the first-stage parse file
  pSentenceID      -- number identifying the sentence to replace
  pflgOverwrite    -- if true, the '.new' file is copied over the input
                      file and then removed

 Exits the program (sys.exit) if the input file cannot be opened, if the
 output file cannot be created (exit status 2), or if the target line
 does not contain '">(S1 '.
 """
 ## XML is not used here to keep it efficient, as the task is simple.

 # opening the input file
 try:
  vfFSParse = open(pFSParseFileName, 'r')
 except IOError:
  sys.exit('Can\'t open first-stage parses file: ' + pFSParseFileName)

 ## creating the output file
 ## The new file is named the same as input file followed by .new suffix.
 ## However, if pflgOverwrite is set to true, it will later overwrite the
 ## input file
 vReplacedFileName = pFSParseFileName + '.new'
 try:
  vfReplaced = open(vReplacedFileName, 'w')
 except IOError:
  vfFSParse.close()
  # parenthesized form prints the same text under Python 2 and Python 3
  print('Can\'t create output file')
  sys.exit(2)

 # the prefix identifying the target line does not change while scanning,
 # so it is built once; IDs below 10 are zero-padded to two digits
 if pSentenceID < 10:
  vSentenceIDStr = '0' + str(pSentenceID)
 else:
  vSentenceIDStr = str(pSentenceID)
 vTargetPrefix = '<s id="' + vSentenceIDStr + '"'

 # both files are closed on every path, including the sys.exit() one
 with vfFSParse, vfReplaced:
  for vSentence in vfFSParse:
   if not vSentence.startswith(vTargetPrefix):
    # if this is not the target sentence, write it as it is
    vfReplaced.write(vSentence)
    continue

   vPos = vSentence.find('">(S1 ')
   if vPos == -1:
    sys.exit("Format looks invalid!: " + vSentence)

   # keep everything up to and including '">(S1 ' (6 characters), then
   # close the line with the dummy parse
   vReplacement = vSentence[:vPos + 6] + "(UNRERANKED UNRERANKED))</parse></s>"
   vfReplaced.write(vReplacement + '\n')

 ## if pflgOverwrite is true, the input file will be overwritten
 if pflgOverwrite:
  shutil.copy(vReplacedFileName, pFSParseFileName)
  os.remove(vReplacedFileName)


##----------------------------------------------------------------------
## This function takes the parse file (tree per line) and replaces the
## problematic parses which evalb couldn't process with a dummy parse tree.
## Currently, the sentence is identified by its line number in the file,
## which is an argument to this function. It can be improved to detect such
## sentences itself!
## Note that this function is not specific to Brown output, as its input
## is considered in a tree-per-line format.
def replaceUnscorable(pParseFileName, pSentenceLineNo, pflgOverwrite):
 """Replace one line of a tree-per-line parse file with a dummy tree.

 Every line of the input file is copied to a new file named after the
 input file plus a '.new' suffix, except the line whose 1-based number
 equals pSentenceLineNo, which is written as
 '(TOP (UNSCORABLE UNSCORABLE))'.

 Parameters:
  pParseFileName  -- path of the parse file (one tree per line)
  pSentenceLineNo -- 1-based number of the line to replace
  pflgOverwrite   -- if true, the '.new' file is copied over the input
                     file and then removed

 Exits the program (sys.exit) if the input file cannot be opened or if
 the output file cannot be created (exit status 2).
 """
 # opening the input file
 try:
  vfParse = open(pParseFileName, 'r')
 except IOError:
  sys.exit('Can\'t open parse file: ' + pParseFileName)

 ## creating the output file
 ## The new file is named the same as input file followed by .new suffix.
 ## However, if pflgOverwrite is set to true, it will later overwrite the
 ## input file
 vReplacedFileName = pParseFileName + '.new'
 try:
  vfReplaced = open(vReplacedFileName, 'w')
 except IOError:
  vfParse.close()
  # parenthesized form prints the same text under Python 2 and Python 3
  print('Can\'t create output file')
  sys.exit(2)

 # both files are closed even if reading or writing fails midway
 with vfParse, vfReplaced:
  # line numbers are counted from 1
  for vLineNo, vSentence in enumerate(vfParse, 1):
   if vLineNo == pSentenceLineNo:
    # write the dummy parse into file
    vfReplaced.write("(TOP (UNSCORABLE UNSCORABLE))\n")
   else:
    # if this is not the target sentence, write it as it is
    vfReplaced.write(vSentence)

 ## if pflgOverwrite is true, the input file will be overwritten
 if pflgOverwrite:
  shutil.copy(vReplacedFileName, pParseFileName)
  os.remove(vReplacedFileName)