extract-best-pred-parses.py 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111
  1. #! /usr/bin/python
  2. ## This script extracts the N best parses according to the scores predicted
  3. ## by the parser accuracy predictor. It takes a parse file in Treebank format
  4. ## (one parse tree per line) and a file containing the predictions (currently
  5. ## Weka classifier output) for every parse tree, then prints the requested
  6. ## top-scoring parses to standard output.
  7. ##
  8. ## Current version: 1
  9. ##
  10. from collections import namedtuple
  11. from operator import itemgetter
  12. import sys, optparse, weka
  13. ##======================================================================
  14. ## main
  15. def main(argv=None):
  16. if argv is None:
  17. argv = sys.argv
  18. parser = optparse.OptionParser(usage="%prog <PARSE FILE> <WEKA CLASSIFIER OUTPUT> <NUMBER OF BEST PARSES TO EXTRACT>" +
  19. "\nThis script extracts the N best parses according to the scores predicted by parser accuracy predictor.", version="%prog 1.0")
  20. (opts, posArgs) = parser.parse_args()
  21. # checking arguments
  22. if len(posArgs) < 3:
  23. parser.error("At least 3 arguments are required!")
  24. if not posArgs[2].isdigit():
  25. parser.error("The second argument should be a number!")
  26. else:
  27. vBestNum = int(posArgs[2])
  28. # extracting scores from classifier output
  29. vlScores = []
  30. # we supposed the 2nd column contains predictions
  31. vlScores = weka.extractClassifierPreds(posArgs[1], 2, False)
  32. # sorting scores
  33. # data structure to store scores with sentence number
  34. ParseScore = namedtuple('ParseScore', 'sentenceNo score')
  35. vlParseScores = []
  36. vSentenceNo = 1
  37. for vScore in vlScores:
  38. vlParseScores.append(ParseScore(vSentenceNo, vScore))
  39. vSentenceNo += 1
  40. vlScores = []
  41. vlParseScores.sort(key=itemgetter(1), reverse=True)
  42. # exracting the sentence number of best parses
  43. vlBestParseNo = []
  44. vCntr = 1
  45. for vParseScore in vlParseScores:
  46. vlBestParseNo.append(vParseScore.sentenceNo)
  47. if vCntr == vBestNum:
  48. break
  49. else:
  50. vCntr += 1
  51. vlParseScores = []
  52. # opening parse file
  53. vParseFileName = posArgs[0]
  54. try:
  55. vfParse = open(vParseFileName, 'r')
  56. except IOError:
  57. sys.exit('Can\'t open parse file: ' + vParseFileName)
  58. ## Instead of loading all parses into memory and then selecting those
  59. ## indecies which exist in vlBestParsesNo, or iterating through parses
  60. ## in file and searching for each sentence number in vlBestParsesNo,
  61. ## we chose a more efficient way which sorts the vlBestParsesNo in
  62. ## ascending order, iterates through it and for each parse number,
  63. ## iterates the file ignoring all parses until reaches that pasrse number.
  64. vlBestParseNo.sort()
  65. vParseCntr = 0
  66. for vBestParseNo in vlBestParseNo:
  67. for vParse in vfParse:
  68. vParseCntr += 1
  69. if vBestParseNo == vParseCntr:
  70. print vParse,
  71. break
  72. vfParse.close()
  73. ##======================================================================
  74. ## calling main
  75. if __name__ == "__main__":
  76. sys.exit(main())