brownparses.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. #! /usr/bin/python
  2. ## This module defines functions to process Charniak & Johnson (2005)
  3. ## parser output.
  4. ##
  5. ## Current version: 1.3
  6. ##
  7. ## Added:
  8. ## - replaceUnScorable: replaces the problematic parses which evalb couldn't
  9. ## process with a dummy parse tree
  10. ## - extractBestParse() was refactored to not use XML to avoid unicode problem.
  11. ## The previous code was not thrown away and renamed to extractBestParseXML()
  12. ##
  13. ## Changed:
  14. ## - extractBestParse() was renamed to extractBestParseXML() to use the
  15. ## former for another version of function which doesn't use XML and thus
  16. ## does not have unicode problem
  17. ##
  18. ## Version 1.2:
  19. ##
  20. ## Added:
  21. ## - replaceUnRerankable: replaces the n-best parses of the sentence which
  22. ## could not be re-ranked with a dummy parse tree
  23. ##
  24. ## Changed:
  25. ## - extractBestParse: changed to support replacing the UNRERANKED parses
  26. ## with best first-stage parse of equivalent sentence.
  27. ##
  28. ##
  29. ## Version 1.0:
  30. ##
  31. ## - wrapContent(): wraps a root tag around the file content, which also
  32. ## includes a number as an attribute
  33. ## - extractBestParse: extracts the best parse trees from reranker output
  34. ##
  35. from operator import itemgetter
  36. from xml.etree.ElementTree import ElementTree, tostring
  37. from collections import namedtuple
  38. from decimal import *
  39. import sys, shutil, os
  40. ##----------------------------------------------------------------------
  41. ## Types
  42. # data structure for returning best parse
  43. Parse = namedtuple('Parse', 'sentenceNo tree')
  44. ##----------------------------------------------------------------------
  45. ## Since the output file of the reranker lacks a root node, it causes
  46. ## problems for the ElementTree XML parser we use here. This function wraps
  47. ## a root tag around the file content, which also includes a number as an
  48. ## attribute. This number can serve as a package identifier when parses
  49. ## are divided into packages and each file is a package.
  50. def wrapContent(pRRParseFileName, pPackageID):
  51. # reading the file content
  52. try:
  53. vfRRParse = open(pRRParseFileName, 'r')
  54. except IOError:
  55. sys.exit('Can\'t open re-ranked parses file: ' + pRRParseFileName)
  56. vContent = vfRRParse.read();
  57. vfRRParse.close()
  58. # adding root tag if it's not already there
  59. if not vContent.startswith('<pack id='):
  60. vContent = '<pack id="' + pPackageID + '">\n' + vContent + '</pack>\n';
  61. # overwritting the file content
  62. try:
  63. vfRRParse = open(pRRParseFileName, 'w')
  64. except IOError:
  65. sys.exit('Can\'t open re-ranked parses file: ' + pRRParseFileName)
  66. vfRRParse.write(vContent);
  67. vfRRParse.close()
  68. vfRRParse.close()
  69. ##----------------------------------------------------------------------
  70. ## Extracts the best parse trees from reranker output, and optionally
  71. ## sorts parses. It also optionally replaces the S1 with TOP, and bracket
  72. ## masks with actual brackets (e.g. -RRB- with ')').
  73. ## It also optionally replaces UNRERANKED mask with best first-stage parse
  74. ## of the sentence from its equivalent first-stage parse file provided.
  75. ## Note that both files are assumed to be sentence-aligned, so that the
  76. ## sentence IDs in both files correspond. Otherwise, it will stop with an
  77. ## appropriate error message.
  78. def extractBestParse(pRRParseFileName, pflgReplaceTOP, pflgReplaceBrackets, pflgSort,
  79. pflgRplcUnreranked, pFSParseFileName):
  80. vlBestParseList = []
  81. # opening re-ranked parse file
  82. try:
  83. vfRRParse = open(pRRParseFileName, 'r')
  84. except IOError:
  85. sys.exit('Can\'t open re-ranked parse file: ' + pRRParseFileName)
  86. # opening first-stage parse file to replace UNRERANKED if requested
  87. if pflgRplcUnreranked:
  88. try:
  89. vfFSParse = open(pFSParseFileName, 'r')
  90. except IOError:
  91. sys.exit('Can\'t open first-stage parse file: ' + pFSParseFileName)
  92. ## extracting best parse for each sentence: the first parse is supposed
  93. ## to be the best one
  94. # sentence iteration
  95. for vSentence in vfRRParse:
  96. # if the line doesn't start with '<s id=' jus skip
  97. if not vSentence.startswith("<s id="):
  98. continue
  99. # extracting the sentence ID
  100. vSentenceID = int(vSentence[7:vSentence.find('"', 7)])
  101. # extracting the first parse
  102. vStartPos = vSentence.find("(S1 ")
  103. vEndPos = vSentence.find("</parse>")
  104. vBestTree = vSentence[vStartPos:vEndPos]
  105. # replacing the UNRERANKED if requested
  106. if pflgRplcUnreranked:
  107. vFSSentence = vfFSParse.readline()
  108. # checking the sentence ID match
  109. vFSSentenceID = int(vFSSentence[7:vFSSentence.find('">', 7)])
  110. if vFSSentenceID != vSentenceID:
  111. sys.exit("Sentence ID mismatch between first-stage and reranked parses: " + str(vSentenceID) + ' & ' + str(vFSSentenceID))
  112. elif vBestTree.find("UNRERANKED UNRERANKED") != -1:
  113. vBestTree = vFSSentence[vFSSentence.find('">(S1 ') + 2: vFSSentence.find('</parse>')]
  114. # replacing S1 with TOP
  115. if pflgReplaceTOP:
  116. vBestTree = "(TOP" + vBestTree[3:]
  117. # replacing barcket masks
  118. if pflgReplaceBrackets:
  119. vBestTree = vBestTree.replace('-LCB-)', '{)')
  120. vBestTree = vBestTree.replace('-RCB-)', '})')
  121. vBestTree = vBestTree.replace('-LRB-)', '()')
  122. vBestTree = vBestTree.replace('-RRB-)', '))')
  123. vlBestParseList.append(Parse(vSentenceID, vBestTree))
  124. # optional sorting
  125. if pflgSort:
  126. vlBestParseList.sort(key=itemgetter(0), reverse=False)
  127. vfRRParse.close()
  128. vfFSParse.close()
  129. return vlBestParseList
  130. ##----------------------------------------------------------------------
  131. ## XML version of extractBestParse() (see above)
  132. def extractBestParseXML(pRRParseFileName, pflgReplaceTOP, pflgReplaceBrackets, pflgSort,
  133. pflgRplcUnreranked, pFSParseFileName):
  134. vlBestParseList = []
  135. # parsing XML format of the reranked parse file
  136. rrParses = ElementTree()
  137. rrParses.parse(pRRParseFileName)
  138. # opening first-stage parse files to replace UNRERANKED if requested
  139. if pflgRplcUnreranked:
  140. try:
  141. vfFSParse = open(pFSParseFileName, 'r')
  142. except IOError:
  143. sys.exit('Can\'t open first-stage parses file: ' + pFSParseFileName)
  144. ## extracting best parse for each sentence: the first parse is supposed
  145. ## to be the best one
  146. vPack = rrParses.getroot()
  147. # sentence iteration
  148. for vSentence in list(vPack.getiterator("s")):
  149. # extracting the first parse
  150. vBestTree = vSentence.find("parse").text
  151. vSentenceID = int(vSentence.attrib["id"])
  152. # replacing the UNRERANKED if requested
  153. if pflgRplcUnreranked:
  154. vFSSentence = vfFSParse.readline()
  155. # checking the sentence ID match
  156. vFSSentenceID = int(vFSSentence[7:vFSSentence.find('">', 7)])
  157. if vFSSentenceID != vSentenceID:
  158. sys.exit("Sentence ID mismatch between first-stage and reranked parses: " + str(vSentenceID) + ' & ' + str(vFSSentenceID))
  159. elif vBestTree.find("UNRERANKED UNRERANKED") != -1:
  160. vBestTree = vFSSentence[vFSSentence.find('">(S1 ') + 2: vFSSentence.find('</parse>')]
  161. # replacing S1 with TOP
  162. if pflgReplaceTOP:
  163. vBestTree = "(TOP" + vBestTree[3:]
  164. # replacing barcket masks
  165. if pflgReplaceBrackets:
  166. vBestTree = vBestTree.replace('-LCB-)', '{)')
  167. vBestTree = vBestTree.replace('-RCB-)', '})')
  168. vBestTree = vBestTree.replace('-LRB-)', '()')
  169. vBestTree = vBestTree.replace('-RRB-)', '))')
  170. vlBestParseList.append(Parse(vSentenceID, vBestTree))
  171. # optional sorting
  172. if pflgSort:
  173. vlBestParseList.sort(key=itemgetter(0), reverse=False)
  174. vfFSParse.close()
  175. return vlBestParseList
  176. ##----------------------------------------------------------------------
  177. ## This function takes the output of first-stage parser and replaces the
  178. ## n-best parses of the sentence which could not be re-ranked with a
  179. ## dummy parse tree. The id of the sentence (zero-based index) is provided
  180. ## to the function.
  181. def replaceUnrerankable(pFSParseFileName, pSentenceID, pflgOverwrite):
  182. ## Unlike extractBestParse(), XML is not used here to keep it efficient,
  183. ## as the task is simple.
  184. # opening the input file
  185. try:
  186. vfFSParse = open(pFSParseFileName, 'r')
  187. except IOError:
  188. sys.exit('Can\'t open first-stage parses file: ' + pFSParseFileName)
  189. ## creating the output file
  190. ## The new file is named the same as input file followed by .new suffix.
  191. ## However, if pflgOverwrite is set to true, it will later overwrite the
  192. ## input file
  193. try:
  194. vReplacedFileName = pFSParseFileName + '.new'
  195. vfReplaced = open(vReplacedFileName, 'w')
  196. except IOError:
  197. print 'Can\'t create output file'
  198. sys.exit(2)
  199. for vSentence in vfFSParse:
  200. if pSentenceID < 10:
  201. vSentenceIDStr = '0' + str(pSentenceID)
  202. else:
  203. vSentenceIDStr = str(pSentenceID)
  204. if vSentence.startswith('<s id="' + vSentenceIDStr + '"'):
  205. vPos = vSentence.find('">(S1 ')
  206. if vPos == -1:
  207. sys.exit("Format looks invalid!: " + vSentence)
  208. else:
  209. vReplacement = vSentence[:vPos + 6] + "(UNRERANKED UNRERANKED))</parse></s>"
  210. # write new sentence into file
  211. vfReplaced.write(vReplacement + '\n')
  212. else:
  213. # if this is not the target sentence, write it as it is
  214. vfReplaced.write(vSentence)
  215. vfFSParse.close()
  216. vfReplaced.close()
  217. ## if pflgOverwrite is true, the input file will be overwritten
  218. if pflgOverwrite:
  219. shutil.copy(vReplacedFileName, pFSParseFileName)
  220. os.remove(vReplacedFileName)
  221. ##----------------------------------------------------------------------
  222. ## This function takes the parse file (tree per line) and replaces the
  223. ## problematic parses which evalb couldn't process with a dummy parse tree.
  224. ## Currently, the sentence is identified by its line number in the file,
  225. ## which is an argument to this function. It can be improved to detect such
  226. ## sentences itself!
  227. ## Note that this function is not specific to Brown output, as its input
  228. ## is considered in a tree-per-line format.
  229. def replaceUnscorable(pParseFileName, pSentenceLineNo, pflgOverwrite):
  230. # opening the input file
  231. try:
  232. vfParse = open(pParseFileName, 'r')
  233. except IOError:
  234. sys.exit('Can\'t open parse file: ' + pParseFileName)
  235. ## creating the output file
  236. ## The new file is named the same as input file followed by .new suffix.
  237. ## However, if pflgOverwrite is set to true, it will later overwrite the
  238. ## input file
  239. try:
  240. vReplacedFileName = pParseFileName + '.new'
  241. vfReplaced = open(vReplacedFileName, 'w')
  242. except IOError:
  243. print 'Can\'t create output file'
  244. sys.exit(2)
  245. vLineCntr = 1
  246. for vSentence in vfParse:
  247. if vLineCntr == pSentenceLineNo:
  248. # write the dummy pasre into file
  249. vfReplaced.write("(TOP (UNSCORABLE UNSCORABLE))\n")
  250. else:
  251. # if this is not the target sentence, write it as it is
  252. vfReplaced.write(vSentence)
  253. vLineCntr += 1
  254. vfParse.close()
  255. vfReplaced.close()
  256. ## if pflgOverwrite is true, the input file will be overwritten
  257. if pflgOverwrite:
  258. shutil.copy(vReplacedFileName, pParseFileName)
  259. os.remove(vReplacedFileName)