extract-best-reranked-parses.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. #! /usr/bin/python
## This script extracts the best reranked parses from Charniak & Johansson
## (2005) output, which is arranged in packages. It also replaces S1 with
## TOP.
  5. ##
  6. ## Current version: 1.1
  7. ##
  8. ## Added in current version:
## Optionally, it now replaces un-reranked parses, which are identified
## by the UNRERANKED mask, with the corresponding best first-stage parses.
## The first-stage parses are provided through an optional argument to the
## script. Note that this feature is currently not supported when multiple
## packages are given as input via wildcards.
  14. from operator import itemgetter
  15. import sys, optparse, brownparses, codecs
  16. ##======================================================================
  17. ## main
  18. def main(argv=None):
  19. if argv is None:
  20. argv = sys.argv
  21. parser = optparse.OptionParser(usage="%prog <PACKAGE SIZE> <CORPUS SIZE> <OUTPUT FILE NAME> <PACKAGES FILES NAME PATTERN> [options]" +
  22. "\nExtracts best reranked parses from Charniak & Johansson (2005) output which are arranged in packeges.", version="%prog 1.0")
  23. parser.add_option("-f", "--firststage", help="first-stage parser output", metavar="FIRST-STAGE PARSER OUTPUT", dest="fsParses", action="store")
  24. # processing input arguments
  25. (opts, posArgs) = parser.parse_args()
  26. if len(posArgs) < 4:
  27. parser.error("At least 4 arguments are required")
  28. vPackageSize = int(posArgs[0])
  29. vCorpusSize = int(posArgs[1])
  30. # if first-stage parse file option is not provided set flag to false
  31. if opts.fsParses != None:
  32. vFSParseFileName = opts.fsParses
  33. vReplaceUnReranked = True
  34. else:
  35. vFSParseFileName = ""
  36. vReplaceUnReranked = False
  37. # creating output file
  38. try:
  39. vfOutput = codecs.open(posArgs[2], mode='w', encoding='utf-8')
  40. except IOError:
  41. sys.exit('Can\'t create output file: ' + posArgs[2])
  42. # sorting packages by name
  43. vlPackages = posArgs[3:]
  44. vlPackages.sort()
  45. # extracting best parses from all packages.
  46. vlBestParses = []
  47. vPackCntr = 0
  48. for vPackage in vlPackages:
  49. print vPackage
  50. vPackCntr += 1
  51. vlPackageParses = []
  52. ## wrapping the content of output file of the reranker into a root node
  53. ## to avoid problem by brownparses.extractBestParse() which uses XML
  54. ## format to extract parses.
  55. ## Uncomment when using brownparses.extractBestParseXML()
  56. #brownparses.wrapContent(vPackage, str(vPackCntr))
  57. vlPackageParses += brownparses.extractBestParse(vPackage, True, False, True, vReplaceUnReranked, vFSParseFileName)
  58. ## adjusting sentence numbers based on package number (packages are sorted
  59. ## above)
  60. for vParseTree in vlPackageParses:
  61. if vParseTree.sentenceNo >= vPackageSize:
  62. sys.exit("Sentence number exceeds the package size! package: " + vPackage)
  63. else:
  64. vlBestParses.append(brownparses.Parse(vPackageSize * (vPackCntr - 1) + vParseTree.sentenceNo + 1, vParseTree.tree))
  65. vlPackageParses = []
  66. # inserting dummy parse trees for missing sentences
  67. vDummyParse = '(TOP (UNPARSED UNPARSED))'
  68. vTreeCntr = 1
  69. # this first loop treats missing trees from the middle
  70. while (vTreeCntr <= vlBestParses[-1].sentenceNo):
  71. if vlBestParses[vTreeCntr - 1].sentenceNo > vTreeCntr:
  72. vlBestParses.insert(vTreeCntr - 1, brownparses.Parse(vTreeCntr, vDummyParse))
  73. elif vlBestParses[vTreeCntr - 1].sentenceNo < vTreeCntr:
  74. sys.exit("Unknown error: sentence match problem")
  75. try:
  76. vfOutput.write(vlBestParses[vTreeCntr - 1].tree.decode('UTF-8') + '\n')
  77. except UnicodeEncodeError:
  78. vfOutput.write(vlBestParses[vTreeCntr - 1].tree + '\n')
  79. print vTreeCntr
  80. vTreeCntr += 1
  81. # this last loop treats missing trees from the end
  82. while (vTreeCntr <= vCorpusSize):
  83. vlBestParses.append(brownparses.Parse(vTreeCntr, vDummyParse))
  84. try:
  85. vfOutput.write(vlBestParses[vTreeCntr - 1].tree.decode('UTF-8') + '\n')
  86. except UnicodeEncodeError:
  87. vfOutput.write(vlBestParses[vTreeCntr - 1].tree + '\n')
  88. print vTreeCntr
  89. vTreeCntr += 1
  90. vfOutput.close()
  91. ##======================================================================
  92. ## calling main
  93. if __name__ == "__main__":
  94. sys.exit(main())