#! /usr/bin/python
## rszk / scripts

## This module defines functions to process LORG parser output.
##
## Current version: 1.1
##
## Added:
## - replaceUnscorable: replaces the problematic parses which evalb couldn't
##   process with a dummy parse tree
##

from collections import namedtuple
import sys, shutil, os


##----------------------------------------------------------------------
## Types

# Record returned for each extracted parse: the sentence number the parse
# was assigned (sentenceNo) and the parse tree text itself (tree).
Parse = namedtuple('Parse', ['sentenceNo', 'tree'])


##----------------------------------------------------------------------
## Extracts the parse trees from LORG output, and optionally adds the TOP
## tag. It also optionally ignores the lines marked by LORG as comments
## (lines starting with "### comment: "). LORG considers input sentences
## starting with # as comments, skips parsing them and outputs
## "### comment: " followed by the sentence. This option is useful (by
## setting it to false) when this treatment by LORG is problematic. In this
## case, the sentence numbering is advanced by one, so that the calling
## program can recognize that a parse tree is missing and take the necessary
## action. This function also treats lines starting with 'no parse found'
## as sentences that could not be parsed and advances the sentence number
## by one.
def extractParses(pParseFileName, pflgAddTOP, pflgIgnoreComment):
    """Collect the parse trees found in a LORG output file.

    Every line of the file is classified by its prefix:

    - '( ('            -- a parse tree; the sentence counter advances and
                          the tree is collected under the new number.
    - 'no parse found' -- nothing is collected, but the sentence counter
                          advances, leaving a gap in the numbering.
    - '### comment:'   -- advances the sentence counter only when
                          pflgIgnoreComment is false; otherwise skipped.
    - any other '###'  -- skipped.
    - anything else    -- the program is terminated through sys.exit().

    Args:
        pParseFileName: path of the LORG output file to read.
        pflgAddTOP: when true, the outer '(' of each tree is replaced with
            '(TOP', e.g. '( (S ...))' becomes '(TOP (S ...))'.
        pflgIgnoreComment: when true, '### comment:' lines do not consume
            a sentence number.

    Returns:
        A list of Parse(sentenceNo, tree) tuples in file order, with the
        trailing newline removed from each tree.

    Raises:
        SystemExit: if the file cannot be opened or a line with an
            unrecognised prefix is met.
    """
    try:
        vfParse = open(pParseFileName, 'r')
    except IOError:
        sys.exit('Can\'t open parse file: ' + pParseFileName)

    vlParses = []
    vParseCntr = 0

    # The with-block guarantees the file is closed even when sys.exit()
    # raises SystemExit from inside the loop.
    with vfParse:
        for line in vfParse:
            if line.startswith('### comment:') and not pflgIgnoreComment:
                # a commented-out sentence still occupies a sentence number
                vParseCntr += 1
            elif line.startswith('no parse found'):
                # unparsed sentence: skip a number so the caller sees the gap
                vParseCntr += 1
            elif line.startswith('( ('):
                vParseCntr += 1
                vTree = line.rstrip('\n')
                if pflgAddTOP:
                    # drop the bare outer '(' and label the root as TOP
                    vTree = "(TOP" + vTree[1:]
                vlParses.append(Parse(vParseCntr, vTree))
            elif not line.startswith('###'):
                sys.exit("Unknown line opening: " + line)

    return vlParses

 
##----------------------------------------------------------------------
## This function takes the parse file (tree per line) and replaces the
## problematic parses which evalb couldn't process with a dummy parse tree.
## Currently, the sentence is identified by its line number in the file,
## which is passed as an argument to this function. It could be improved to
## detect such sentences itself!
## Note that this function is not specific to LORG output, as its input
## is assumed to be in a tree-per-line format.
def replaceUnscorable(pParseFileName, pSentenceLineNo, pflgOverwrite):
    """Replace one line of a tree-per-line parse file with a dummy tree.

    The input is copied line by line to '<pParseFileName>.new'; the line
    whose 1-based number equals pSentenceLineNo is written as
    '(TOP (UNSCORABLE UNSCORABLE))' instead of its original content. If no
    line has that number, the copy is identical to the input.

    Args:
        pParseFileName: path of the parse file (one tree per line).
        pSentenceLineNo: 1-based line number of the parse to replace. It is
            compared with == against an integer counter, so it must be an
            int for a match to occur.
        pflgOverwrite: when true, the '.new' file is copied over the input
            file and then removed; when false, the '.new' file is kept and
            the input file is left untouched.

    Raises:
        SystemExit: if the input file cannot be opened, or (exit code 2,
            after printing a message on standard output) if the output
            file cannot be created.
    """
    # opening the input file
    try:
        vfParse = open(pParseFileName, 'r')
    except IOError:
        sys.exit('Can\'t open parse file: ' + pParseFileName)

    # creating the output file: same name as the input plus a .new suffix
    vReplacedFileName = pParseFileName + '.new'
    try:
        vfReplaced = open(vReplacedFileName, 'w')
    except IOError:
        # do not leak the already opened input file on the way out
        vfParse.close()
        # sys.stdout.write keeps the original message, stream and exit code
        # while being valid on both Python 2 and Python 3
        sys.stdout.write('Can\'t create output file\n')
        sys.exit(2)

    # both files are closed on leaving the with-blocks, even on an error
    with vfParse:
        with vfReplaced:
            for vLineNo, vSentence in enumerate(vfParse, 1):
                if vLineNo == pSentenceLineNo:
                    # write the dummy parse in place of the target sentence
                    vfReplaced.write("(TOP (UNSCORABLE UNSCORABLE))\n")
                else:
                    # any other sentence is copied unchanged
                    vfReplaced.write(vSentence)

    # if pflgOverwrite is true, the input file is overwritten with the result
    if pflgOverwrite:
        shutil.copy(vReplacedFileName, pParseFileName)
        os.remove(vReplacedFileName)