extract-parser-pred-features.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499
#! /usr/bin/python
## This script extracts a set of features for parser accuracy prediction
## and outputs them into an ARFF file to be used in the Weka machine
## learning toolkit.
##
## To extract all implemented features, option --all (-a) must be used.
## To select a subset of features, the option corresponding to each
## feature should be used.
##
## Option --all overrides individual feature options. If no individual
## feature option is provided, all features will be extracted.
##
## Current version: 1.4
## - The root and syntactic tag sets for the Tiger Treebank are updated.
## - Bug fixed for tags containing a comma.
##
## Version: 1.3
## - extractTreeDepth() and extractConstituentCnt() were moved to the parse.py module.
##
## Version: 1.2
## - Root and syntactic tags are customized for treebanks.
## - The tag sets for the French Treebank and Tiger are added in addition to
## the existing WSJ tag set.
## - Accordingly, an option is added to specify the tag set to be used.
##
## Version: 1.1
## - UNSCORABLE and UNRERANKED tags are added to the root tag set.
  28. import sys, optparse, weka, evalbproc, parse
  29. ##----------------------------------------------------------------------
  30. ## extracts input sentence length feature
  31. def extractLength(pSentence):
  32. return len(pSentence.split(' '))
  33. ##----------------------------------------------------------------------
  34. ## extracts number of words in parser test data unseen in parser training data
  35. def extractUnseenNum(pSentence, pParserTrainingYields):
  36. vNum = 0
  37. for word in pSentence.split():
  38. if word not in pParserTrainingYields:
  39. vNum += 1
  40. return vNum
  41. ##----------------------------------------------------------------------
  42. ## extract the root tag of parse tree
  43. def extractRootTag(pParseTree, pRootTagList):
  44. # root tag starts at index 6 following (TOP (
  45. vRootHeadChuck = pParseTree[6:]
  46. vRootTag = vRootHeadChuck.split(' ')[0]
  47. if ',' in vRootTag:
  48. vRootTag = '"' + vRootTag + '"'
  49. if vRootTag not in pRootTagList:
  50. sys.exit("Unknown root tag: " + vRootTag)
  51. else:
  52. return vRootTag
  53. ##----------------------------------------------------------------------
## extracts the number of occurrences of each syntactic tag in the parse tree
  55. def extractSynTagNum(pParseTree, pSynTagList):
  56. vSynTagNumsStr = ""
  57. ## Since Ravi et al. (2008) state that they used syntactic labels of
  58. ## INTERNAL nodes, we consider internal to be any node other than root.
  59. ## For this, we ignore first 2 items of split parse tree.
  60. vInternalTreeStart = pParseTree.find(' ')
  61. vInternalTreeStart += pParseTree[vInternalTreeStart + 1:].find(' ')
  62. vInternalTree = pParseTree[vInternalTreeStart + 2:]
  63. ## looping through syntactic tag set to count the occurrence of each tag
  64. ## in the parse tree
  65. for tag in pSynTagList:
  66. if ',' in tag:
  67. vTag = "'" + tag + "'"
  68. else:
  69. vTag = tag
  70. ## To secure that we only count the real node labels, not those which
  71. ## have the same spelling to tag but are part of text (e.g. you have
  72. ## parsed the text of a paper published in a parsing conference), we
  73. ## append '(' to the begining of tag. Also, a space is added to the
  74. ## end to avoid counting of e.g. NN for NNP as well.
  75. vSecureTag = '(' + vTag + ' '
  76. vSynTagNumsStr += str(vInternalTree.count(vSecureTag)) + ','
  77. return vSynTagNumsStr
  78. ##----------------------------------------------------------------------
## extracts the number of occurrences of each top-IG ranked word in the input sentence
  80. def extractIGLexCount(pSentence, pIGWordList):
  81. vIGLexCountsStr = ""
  82. ## looping through syntactic tag set to count the occurrence of each tag
  83. ## in the parse tree
  84. for word in pIGWordList:
  85. vInputTokens = pSentence.split()
  86. vIGLexCountsStr += str(vInputTokens.count(word)) + ','
  87. return vIGLexCountsStr
  88. ##----------------------------------------------------------------------
  89. ## extracts features and writes into output ARFF file
  90. def extractFeatures(pScores, pOutFile, pTxtInputFile, pParseFile, pInputPPLFile,
  91. pParserTrainingYields, pLength, pUnseenNum, pPPL, pRootTag, pRootTagList,
  92. pSynTagNum, pSynTagList, pIGLexCount, pIGWordList, pRef, pRefScores,
  93. pTreeDepth, pConstituentCnt):
  94. vLoopCntr = 0
  95. for vScore in pScores:
  96. if pLength or pUnseenNum or pIGLexCount:
  97. vSentence = pTxtInputFile.readline().splitlines()[0]
  98. if pLength:
  99. pOutFile.write(str(extractLength(vSentence)) + ',');
  100. if pUnseenNum:
  101. pOutFile.write(str(extractUnseenNum(vSentence, pParserTrainingYields)) + ',');
  102. if pPPL:
  103. ppl = pInputPPLFile.readline()
  104. if not ppl:
  105. exit("Perplexity score file does not match input file in line number")
  106. else:
  107. pOutFile.write(ppl.splitlines()[0] + ',');
  108. if pRootTag or pSynTagNum:
  109. parseTree = pParseFile.readline()
  110. if not parseTree:
  111. sys.exit("Parse file does not match input file in line number")
  112. if pRootTag:
  113. pOutFile.write(extractRootTag(parseTree, pRootTagList) + ',');
  114. if pSynTagNum:
  115. pOutFile.write(extractSynTagNum(parseTree, pSynTagList));
  116. if pIGLexCount:
  117. pOutFile.write(extractIGLexCount(vSentence, pIGWordList));
  118. if pRef:
  119. pOutFile.write(str(pRefScores[vLoopCntr]) + ',');
  120. # new features
  121. if pTreeDepth:
  122. pOutFile.write(str(parse.extractTreeDepth(parseTree)) + ',')
  123. if pConstituentCnt:
  124. pOutFile.write(str(parse.extractConstituentCnt(parseTree)) + ',')
  125. # writing outcome (score) feature value
  126. pOutFile.write(str(vScore) + '\n');
  127. vLoopCntr += 1
  128. pOutFile.write('\n')
  129. return
  130. ##----------------------------------------------------------------------
  131. ## writes the header of the output file according to the header file provided
  132. ## ignores any attribute provided in the header file
  133. ## also writes the @DATA at the end
  134. def writeHeader(pHeaderFile, pOutFile, pLength, pUnseenNum, pPPL, pRootTag, pRootTagList,
  135. pSynTagNum, pSynTagList, pIGLexCount, pIGWordList, pFRef,
  136. pTreeDepth, pConstituentCnt):
  137. for line in pHeaderFile:
  138. if line.upper().startswith("@RELATION"):
  139. pOutFile.write(line)
  140. break
  141. else:
  142. pOutFile.write(line)
  143. pOutFile.write('\n')
  144. # adding attribute declarations for features
  145. if pLength:
  146. pOutFile.write("@ATTRIBUTE\tlength\t\t\tinteger\n")
  147. if pUnseenNum:
  148. pOutFile.write("@ATTRIBUTE\tunseennum\t\tinteger\n")
  149. if pPPL:
  150. pOutFile.write("@ATTRIBUTE\tperplexity\t\treal\n")
  151. if pRootTag:
  152. pOutFile.write("@ATTRIBUTE\troottag\t\t\t{")
  153. vTagList = ""
  154. for tag in pRootTagList:
  155. vTagList += tag + ','
  156. vTagList = vTagList[:-1]
  157. pOutFile.write(vTagList + "}\n")
  158. if pSynTagNum:
  159. for tag in pSynTagList:
  160. pOutFile.write("@ATTRIBUTE\t\"syntag" + tag + "\"\t\tinteger\n")
  161. if pIGLexCount:
  162. for word in pIGWordList:
  163. if '"' in word:
  164. pOutFile.write("@ATTRIBUTE\t'iglex" + word + "'\t\tinteger\n")
  165. else:
  166. pOutFile.write("@ATTRIBUTE\t\"iglex" + word + "\"\t\tinteger\n")
  167. if pFRef:
  168. pOutFile.write("@ATTRIBUTE\tfref\t\treal\n")
  169. # new features
  170. if pTreeDepth:
  171. pOutFile.write("@ATTRIBUTE\ttreedepth\t\tinteger\n")
  172. if pConstituentCnt:
  173. pOutFile.write("@ATTRIBUTE\tconstcnt\t\tinteger\n")
  174. # adding attribute declarations for outcome (score)
  175. pOutFile.write("@ATTRIBUTE\tfscore\t\treal\n")
  176. pOutFile.write('\n')
  177. pOutFile.write("@DATA\n")
  178. ##======================================================================
  179. ## main
  180. def main(argv=None):
  181. if argv is None:
  182. argv = sys.argv
  183. parser = optparse.OptionParser(usage="%prog <ARFF FILE HEADER> <OUTPUT FILE NAME> [OPTIONS]" +
  184. "\nExtracts a set of features for parser accuracy prediction and output it into an ARFF file to be used in Weka machine learning toolkit.", version="%prog 1.4")
  185. parser.add_option("-s", "--scores", help="parser evalb score file", metavar="PARSER EVALB SCORE FILE", dest="scoreFileName", action="store")
  186. parser.add_option("-n", "--samplesize", help="number of instances if score file is not specified (for unlabeled instances)", metavar="SAMPLE SIZE", dest="sampleSize", action="store")
  187. parser.add_option("-a", "--all", help="extract all implemented features (overrides all other individual feature options)", dest="extAll", action="store_true")
  188. parser.add_option("--length", help="extract input sentence word length as feature", dest="extLength", action="store_true")
  189. parser.add_option("--unseennum", help="extract number of words in the input data unseen in parser training data as feature", dest="extUnseenNum", action="store_true")
  190. parser.add_option("-t", "--textinput", help="tokenized raw text input data set", metavar="TOKENIZED RAW TEXT INPUT DATA", dest="txtInputFileName", action="store")
  191. parser.add_option("--tagset", help="tag set of input parses: wsj, ftb, tiger", metavar="PARSER TAG SET", default="wsj", dest="tagSet", action="store")
  192. parser.add_option("--roottag", help="extract the root tag of parse tree as feature", dest="extRootTag", action="store_true")
  193. parser.add_option("--syntagnum", help="extract the number of syntactic tags seen in the parse tree as feature", dest="extSynTagNum", action="store_true")
  194. parser.add_option("-p", "--parse", help="parsed input data set", metavar="PARSED INPUT DATA", dest="parseFileName", action="store")
  195. parser.add_option("--perplexity", help="extract the perplexity of each sentence with respect to the parser training data (--ptrain) as feature", dest="extPPL", action="store_true")
  196. parser.add_option("-P", "--pplscores", help="perplexity scores for the input sentences", metavar="PERPLEXITY SCORES", dest="inputPPLFileName", action="store")
  197. parser.add_option("-T", "--ptraintext", help="tokenized raw training data set of the parser", metavar="TOKENIZED RAW TEXT PARSER TRAINING DATA", dest="txtParserTrainFileName", action="store")
  198. parser.add_option("--iglexcount", help="extract the number of top-ig words seen in the parse tree as feature", dest="extIGLexCount", action="store_true")
  199. parser.add_option("-g", "--igwords", help="top-ig words list file", metavar="TOP IG WORDS FILE", dest="topIGWordsFileName", action="store")
  200. parser.add_option("--fref", help="extract f-score of the input parse with respect to a reference parser output as feature", dest="extFRef", action="store_true")
  201. parser.add_option("-r", "--refscores", help="evalb score file of reference parser", metavar="REFERENCE PARSER EVALB SCORE FILE", dest="refEvalFileName", action="store")
  202. parser.add_option("--treedepth", help="tree depth", metavar="TREE DEPTH", dest="extTreeDepth", action="store_true")
  203. parser.add_option("--constcount", help="number of syntactic constituents", metavar="CONSTITUENT COUNT", dest="extConstituentCnt", action="store_true")
  204. (opts, posArgs) = parser.parse_args()
  205. # processing arguments
  206. if len(posArgs) < 2:
  207. parser.error("At least 2 arguments are required!")
  208. # opening header file
  209. try:
  210. vfHeader = open(posArgs[0], 'r')
  211. except IOError:
  212. sys.exit('Can\'t open header file: ' + posArgs[0])
  213. # creating output file
  214. try:
  215. vfOutput = open(posArgs[1] + '.arff', 'w')
  216. except IOError:
  217. sys.exit('Can\'t create output file: ' + posArgs[1])
  218. # processing options
  219. ## loading parse scores (outcome) into memory
  220. ## Note that for unlabeled test data, this is not given. Instead, the
  221. ## sample set size (-n or --samplesize option) must be provided. In this
  222. ## case, -1 is a outcome template for each instance.
  223. vlScore = []
  224. if opts.scoreFileName != None:
  225. vlScores = evalbproc.extractSentenceScores(opts.scoreFileName, False, False, False, False)
  226. for vScore in vlScores:
  227. vlScore.append(vScore.fscore)
  228. elif opts.sampleSize != None:
  229. vlScore = [-1] * int(opts.sampleSize)
  230. else:
  231. sys.exit('Either sample size or parse score file should be provided!')
  232. ## processing feature options (if no option, --all will be set)
  233. ## remember to add new feature options here
  234. if not (opts.extAll or opts.extLength or opts.extUnseenNum or opts.extPPL or
  235. opts.extRootTag or opts.extSynTagNum or opts.extIGLexCount or opts.extFRef or
  236. opts.extTreeDepth, opts.extConstituentCnt):
  237. opts.extAll = True
  238. ## overriding individual feature options if --all is set (i.e. all
  239. ## individual feature options are considered set to be extracted)
  240. ## remember to add new feature options here
  241. if opts.extAll:
  242. opts.extLength = True
  243. opts.extUnseenNum = True
  244. opts.extPPL = True
  245. opts.extRootTag = True
  246. opts.extSynTagNum = True
  247. opts.extIGLexCount = True
  248. opts.extFRef = True
  249. opts.extTreeDepth = True
  250. opts.extConstituentCnt = True
  251. ## opening raw text file, if any related feature is requested
  252. ## remember to add new such features hear
  253. vfTextInput = None
  254. if opts.extLength or opts.extUnseenNum or opts.extIGLexCount:
  255. if opts.txtInputFileName == None:
  256. sys.exit('Text input file is not provided!')
  257. else:
  258. try:
  259. vfTextInput = open(opts.txtInputFileName, 'r')
  260. except IOError:
  261. sys.exit('Can\'t open text input file: ' + opts.txtInputFileName)
  262. ## opening parse file, if any related feature is requested
  263. ## remember to add new such features hear
  264. vfParse = None
  265. if opts.extRootTag or opts.extSynTagNum or opts.extTreeDepth or opts.extConstituentCnt:
  266. if opts.parseFileName == None:
  267. sys.exit('Parse file is not provided!')
  268. else:
  269. try:
  270. vfParse = open(opts.parseFileName, 'r')
  271. except IOError:
  272. sys.exit('Can\'t open parse file: ' + opts.parseFileName)
  273. ## opening parser training raw text file, if any related feature is requested
  274. ## remember to add new such features hear
  275. vfParserTrainText = None
  276. if opts.extUnseenNum:
  277. if opts.txtParserTrainFileName == None:
  278. sys.exit('Parser training text file is not provided!')
  279. else:
  280. try:
  281. vfParserTrainText = open(opts.txtParserTrainFileName, 'r')
  282. except IOError:
  283. sys.exit('Can\'t open parser training text file: ' + opts.txtParserTrainFileName)
  284. ## opening perplexity scores file for the input sentences, if any related
  285. ## feature is requested
  286. vfInputPPL = None
  287. if opts.extPPL:
  288. if opts.inputPPLFileName == None:
  289. sys.exit('Perplexity scores file for the input sentences is not provided!')
  290. else:
  291. try:
  292. vfInputPPL = open(opts.inputPPLFileName, 'r')
  293. except IOError:
  294. sys.exit('Can\'t open perplexity scores file for the input sentences: ' + opts.inputPPLFileName)
  295. ## WSJ: The set is extracted from WSJ by Jennifer plus 1 UNPARSED tag for
  296. ## those sentences which could not be parsed, 1 UNSCORABLE tag for those
  297. ## sentences which could not be scored, and 1 UNRERANKED tag for those
  298. ## sentences which could not be re-ranked:
  299. ## FTB: The set is extracted from FTB plus the above additional tags and X,
  300. ## plus some found in the Berkeley output (trained on FTB).
  301. ## The only root tag found in FTB (train+dev+test) was SENT.
  302. ## TIGER: The set is extracted from Tiger plus the above additional tags,
  303. ## those found in Tiger.penn, and X.
  304. cRootTagList = [('WSJ', ['UNPARSED', 'UNSCORABLE', 'UNRERANKED', 'ADJP', 'ADVP', 'CONJP', 'FRAG', 'INTJ', 'LS', 'LST', 'NAC', 'NP', 'NX', 'PP', 'PRN', 'PRT', 'QP', 'RRC', 'S', 'SBAR', 'SBARQ', 'SINV', 'SQ', 'UCP', 'VP', 'WHADJP', 'WHADVP', 'WHNP', 'WHPP', 'X']),
  305. ('FTB', ['UNPARSED', 'UNSCORABLE', 'UNRERANKED', 'SENT', 'NP', 'PP', 'COORD', 'VPinf', 'X']),
  306. ('TIGER', ['UNPARSED', 'UNSCORABLE', 'UNRERANKED', '$', '$.', '"$,"', 'ADJA', 'ADJD', 'ADV', 'AP', 'ART', 'AVP', 'CAP', 'CARD', 'CAVP', 'CH', 'CNP', 'CO', 'CPP', 'CS', 'CVP', 'DL', 'ITJ', 'KON', '$*LRB*', 'NE', 'NN', 'NP', 'PN', 'PP', 'PTKANT', 'PWAV', 'PWS', 'S', 'VMFIN', 'VP', 'VVPP', 'XY', 'X'])]
  307. # fetching the root tag set based on user option
  308. vRootTagList = [tagset[1] for tagset in cRootTagList if tagset[0] == opts.tagSet.upper()][0]
  309. ## WSJ: The set is extracted from WSJ by Jennifer plus AUX and AUXJ used by
  310. ## Brown.
  311. ## FTB: The set is extracted from FTB plus 'X'.
  312. ## TIGER: The set is extracted from Tiger.penn (727!) plus 'X'.
  313. cSynTagList = [('WSJ', ['#', '$', "''", ',', '.', ':', 'ADJP', 'ADVP', 'AUX', 'AUXJ', 'CC', 'CD', 'CONJP', 'DT', 'EX', 'FRAG', 'FW', 'IN', 'INTJ', 'JJ', 'JJR', 'JJS', 'LS', 'LST', 'MD', 'NAC', 'NN', 'NNP', 'NNPS', 'NNS', 'NP', 'NX', 'PDT', 'POS', 'PP', 'PRN', 'PRP', 'PRP$', 'PRT', 'PRT|ADVP', 'QP', 'RB', 'RBR', 'RBS', 'RP', 'RRC', 'S', 'SBAR', 'SBARQ', 'SINV', 'SQ', 'SYM', 'TO', 'UCP', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'VP', 'WDT', 'WHADJP', 'WHADVP', 'WHNP', 'WHPP', 'WP', 'WP$', 'WRB', 'X', '``']),
  314. ('FTB', ['ADJ', 'ADJWH', 'AdP', 'ADV', 'ADVWH', 'AP', 'CC', 'CLO', 'CLR', 'CLS', 'COORD', 'CS', 'DET', 'DETWH', 'ET', 'I', 'NC', 'NP', 'NPP', 'P', 'P+D', 'PONCT', 'PP', 'P+PRO', 'PREF', 'PRO', 'PROREL', 'PROWH', 'SENT', 'Sint', 'Srel', 'Ssub', 'V', 'VIMP', 'VINF', 'VN', 'VPinf', 'VPP', 'VPpart', 'VPR', 'VS', 'X']),
  315. ('TIGER', ['$,', "'$.'", 'AA-CJ', 'AA-HD', 'AA-MO', 'AA-NK', 'AA-PD', 'ADJA', 'ADJA-ADC', 'ADJA-CJ', 'ADJA-HD', 'ADJA-MO', 'ADJA-NK', 'ADJA-NMC', 'ADJA-OA', 'ADJA-PD', 'ADJA-PNC', 'ADJA-SB', 'ADJD', 'ADJD-APP', 'ADJD-CJ', 'ADJD-CVC', 'ADJD-HD', 'ADJD-MNR', 'ADJD-MO', 'ADJD-NK', 'ADJD-OA', 'ADJD-PAR', 'ADJD-PD',
  316. 'ADJD-PH', 'ADJD-PNC', 'ADJD-SB', 'ADJD-UC', 'ADV', 'ADV-AC', 'ADV-APP', 'ADV-AVC', 'ADV-CD', 'ADV-CJ', 'ADV-DH', 'ADV-DM', 'ADV-HD', 'ADV-JU', 'ADV-MNR', 'ADV-MO', 'ADV-NG', 'ADV-NK', 'ADV-OA', 'ADV-OP', 'ADV-PAR', 'ADV-PH', 'ADV-PNC', 'ADV-SVP', 'ADV-UC', 'AP', 'AP-AG', 'AP-AMS', 'AP-APP', 'AP-CC',
  317. 'AP-CJ', 'AP-HD', 'AP-MNR', 'AP-MO', 'AP-NG', 'AP-NK', 'AP-NMC', 'AP-OA', 'AP-OC', 'AP-PAR', 'AP-PD', 'AP-PNC', 'APPO-AC', 'APPR', 'APPR-AC', 'APPRART', 'APPRART-AC', 'APPRART-CJ', 'APPRART-PNC', 'APPR-AVC', 'APPR-CD', 'APPR-CJ', 'APPR-CP', 'APPR-HD', 'APPR-MO', 'APPR-PNC', 'APPR-UC', 'AP-RE', 'AP-RS',
  318. 'AP-SB', 'APZR-AC', 'APZR-MO', 'ART', 'ART-CJ', 'ART-HD', 'ART-MO', 'ART-NK', 'ART-NMC', 'ART-PNC', 'AVP', 'AVP-APP', 'AVP-CC', 'AVP-CD', 'AVP-CJ', 'AVP-CP', 'AVP-CVC', 'AVP-DM', 'AVP-HD', 'AVP-JU', 'AVP-MNR', 'AVP-MO', 'AVP-NG', 'AVP-NK', 'AVP-OA', 'AVP-OC', 'AVP-OP', 'AVP-PAR', 'AVP-PD', 'AVP-PH',
  319. 'AVP-PNC', 'AVP-RE', 'AVP-RS', 'AVP-SB', 'CAC-AC', 'CAC-MO', 'CAP', 'CAP-APP', 'CAP-CJ', 'CAP-HD', 'CAP-MNR', 'CAP-MO', 'CAP-NK', 'CAP-NMC', 'CAP-OA', 'CAP-PAR', 'CAP-PD', 'CAP-PNC', 'CAP-RE', 'CAP-SB', 'CARD', 'CARD-AMS', 'CARD-APP', 'CARD-CJ', 'CARD-HD', 'CARD-MNR', 'CARD-MO', 'CARD-NK', 'CARD-NMC',
  320. 'CARD-OA', 'CARD-OP', 'CARD-PAR', 'CARD-PD', 'CARD-PNC', 'CARD-SB', 'CARD-UC', 'CAVP', 'CAVP-AC', 'CAVP-CJ', 'CAVP-HD', 'CAVP-MNR', 'CAVP-MO', 'CAVP-NG', 'CAVP-NK', 'CAVP-PAR', 'CAVP-PD', 'CAVP-SVP', 'CCP-CP', 'CH', 'CH-APP', 'CH-CJ', 'CH-CM', 'CH-DM', 'CH-HD', 'CH-MNR', 'CH-MO', 'CH-NK', 'CH-OA',
  321. 'CH-OC', 'CH-PAR', 'CH-PD', 'CH-PNC', 'CH-RE', 'CH-RS', 'CH-SB', 'CNP', 'CNP-AG', 'CNP-AMS', 'CNP-APP', 'CNP-CC', 'CNP-CJ', 'CNP-DA', 'CNP-DH', 'CNP-HD', 'CNP-MNR', 'CNP-MO', 'CNP-NK', 'CNP-NMC', 'CNP-OA', 'CNP-OC', 'CNP-OG', 'CNP-PAR', 'CNP-PD', 'CNP-PH', 'CNP-PNC', 'CNP-RE', 'CNP-RS', 'CNP-SB',
  322. 'CNP-VO', 'CO', 'CO-AG', 'CO-APP', 'CO-CD', 'CO-CJ', 'CO-CP', 'CO-HD', 'CO-MNR', 'CO-MO', 'CO-NK', 'CO-OA', 'CO-OC', 'CO-PAR', 'CO-PD', 'CO-PG', 'CO-PNC', 'CO-RE', 'CO-RS', 'CO-SB', 'CPP', 'CPP-APP', 'CPP-CC', 'CPP-CJ', 'CPP-CVC', 'CPP-MNR', 'CPP-MO', 'CPP-NK', 'CPP-OP', 'CPP-PAR', 'CPP-PG', 'CPP-PNC',
  323. 'CPP-RE', 'CPP-SBP', 'CS', 'CS-APP', 'CS-CC', 'CS-CJ', 'CS-DH', 'CS-HD', 'CS-MNR', 'CS-MO', 'CS-NK', 'CS-OC', 'CS-PAR', 'CS-PD', 'CS-PNC', 'CS-RC', 'CS-RE', 'CS-RS', 'CS-SB', 'CVP', 'CVP-APP', 'CVP-CC', 'CVP-CJ', 'CVP-HD', 'CVP-MNR', 'CVP-MO', 'CVP-NK', 'CVP-OC', 'CVP-PAR', 'CVP-PD', 'CVP-RE', 'CVP-SB',
  324. 'CVZ-HD', 'CVZ-OA', 'CVZ-OC', 'DL', 'DL-CJ', 'DL-MO', 'DL-PAR', 'FM-AC', 'FM-ADC', 'FM-APP', 'FM-AVC', 'FM-CJ', 'FM-MO', 'FM-NK', 'FM-OA', 'FM-OC', 'FM-OP', 'FM-PD', 'FM-PNC', 'FM-SB', 'FM-UC', 'ISU-CJ', 'ISU-MO', 'ISU-NK', 'ITJ', 'ITJ-DM', 'ITJ-MO', 'ITJ-NK', 'ITJ-OA', 'ITJ-UC', 'ITJ-VO', 'KOKOM-AC',
  325. 'KOKOM-CC', 'KOKOM-CD', 'KOKOM-CM', 'KOKOM-CP', 'KOKOM-MO', 'KOKOM-UC', 'KON', 'KON-AVC', 'KON-CD', 'KON-CJ', 'KON-DH', 'KON-HD', 'KON-JU', 'KON-MO', 'KON-PNC', 'KON-UC', 'KOUI-AC', 'KOUI-AVC', 'KOUI-CP', 'KOUS-AC', 'KOUS-AVC', 'KOUS-CJ', 'KOUS-CP', 'KOUS-HD', 'KOUS-MO', 'KOUS-PH', 'KOUS-PNC', '$*LRB*',
  326. '$*LRB*-PNC', 'MTA-CJ', 'MTA-NK', 'NE', 'NE-AC', 'NE-ADC', 'NE-AG', 'NE-APP', 'NE-CJ', 'NE-DA', 'NE-DH', 'NE-DM', 'NE-MNR', 'NE-MO', 'NE-NK', 'NE-OA', 'NE-OC', 'NE-OP', 'NE-PAR', 'NE-PD', 'NE-PNC', 'NE-RE', 'NE-RS', 'NE-SB', 'NE-UC', 'NE-VO', 'NM', 'NM-AMS', 'NM-APP', 'NM-CC', 'NM-CJ', 'NM-HD', 'NM-MNR',
  327. 'NM-MO', 'NM-NK', 'NM-OA', 'NM-PAR', 'NM-PD', 'NM-SB', 'NN', 'NN-ADC', 'NN-AG', 'NN-AMS', 'NN-APP', 'NN-CJ', 'NN-DA', 'NN-DH', 'NN-DM', 'NNE-NK', 'NN-HD', 'NN-MNR', 'NN-MO', 'NN-NK', 'NN-NMC', 'NN-OA', 'NN-OA2', 'NN-OC', 'NN-OG', 'NN-OP', 'NN-PAR', 'NN-PD', 'NN-PNC', 'NN-RE', 'NN-SB', 'NN-UC', 'NN-VO',
  328. 'NP', 'NP-AG', 'NP-AMS', 'NP-APP', 'NP-CC', 'NP-CJ', 'NP-CVC', 'NP-DA', 'NP-DH', 'NP-MNR', 'NP-MO', 'NP-NK', 'NP-NMC', 'NP-OA', 'NP-OA2', 'NP-OC', 'NP-OG', 'NP-OP', 'NP-PAR', 'NP-PD', 'NP-PG', 'NP-PH', 'NP-PNC', 'NP-RE', 'NP-RS', 'NP-SB', 'NP-SP', 'NP-UC', 'NP-VO', 'PDAT-AG', 'PDAT-CJ', 'PDAT-DA', 'PDAT-HD',
  329. 'PDAT-NK', 'PDAT-SB', 'PDS-AG', 'PDS-APP', 'PDS-CJ', 'PDS-DA', 'PDS-NK', 'PDS-OA', 'PDS-OA2', 'PDS-OG', 'PDS-PD', 'PDS-PH', 'PDS-SB', 'PDS-SP', 'PIAT-AG', 'PIAT-CJ', 'PIAT-HD', 'PIAT-MO', 'PIAT-NK', 'PIAT-NMC', 'PIAT-PNC', 'PIAT-SB', 'PIAT-UC', 'PIS-AG', 'PIS-APP', 'PIS-CJ', 'PIS-DA', 'PIS-HD', 'PIS-MNR',
  330. 'PIS-MO', 'PIS-NK', 'PIS-OA', 'PIS-OA2', 'PIS-OG', 'PIS-PD', 'PIS-PH', 'PIS-RE', 'PIS-SB', 'PIS-UC', 'PN', 'PN-AG', 'PN-APP', 'PN-CJ', 'PN-DA', 'PN-DH', 'PN-MO', 'PN-NK', 'PN-OA', 'PN-OC', 'PN-PAR', 'PN-PD', 'PN-PNC', 'PN-RE', 'PN-RS', 'PN-SB', 'PN-UC', 'PP', 'PP-AC', 'PP-AG', 'PP-AMS', 'PP-APP', 'PP-CC',
  331. 'PP-CJ', 'PP-CVC', 'PP-DH', 'PPER', 'PPER-APP', 'PPER-CJ', 'PPER-DA', 'PPER-EP', 'PPER-NK', 'PPER-OA', 'PPER-OA2', 'PPER-OG', 'PPER-OP', 'PPER-PAR', 'PPER-PD', 'PPER-PH', 'PPER-SB', 'PPER-UC', 'PPER-VO', 'PP-HD', 'PP-MNR', 'PP-MO', 'PP-NK', 'PP-OC', 'PP-OP', 'PPOSAT-DA', 'PPOSAT-NK', 'PPOSS-NK', 'PPOSS-SB',
  332. 'PP-PAR', 'PP-PD', 'PP-PG', 'PP-PNC', 'PP-RE', 'PP-RS', 'PP-SB', 'PP-SBP', 'PP-SVP', 'PP-UC', 'PRELAT-AG', 'PRELAT-NK', 'PRELS-DA', 'PRELS-MO', 'PRELS-NK', 'PRELS-OA', 'PRELS-OG', 'PRELS-PD', 'PRELS-PH', 'PRELS-SB', 'PRF-CJ', 'PRF-DA', 'PRF-MO', 'PRF-NK', 'PRF-OA', 'PRF-SB', 'PROAV-AC', 'PROAV-CD', 'PROAV-CJ',
  333. 'PROAV-CVC', 'PROAV-HD', 'PROAV-MNR', 'PROAV-MO', 'PROAV-NK', 'PROAV-OP', 'PROAV-PAR', 'PROAV-PD', 'PROAV-PG', 'PROAV-PH', 'PROAV-SBP', 'PTKA-AC', 'PTKA-HD', 'PTKA-MO', 'PTKANT', 'PTKANT-CJ', 'PTKANT-DM', 'PTKANT-HD', 'PTKANT-MO', 'PTKANT-NK', 'PTKANT-OA', 'PTKANT-OC', 'PTKANT-RS', 'PTKA-PM', 'PTKNEG-CJ',
  334. 'PTKNEG-HD', 'PTKNEG-MO', 'PTKNEG-NG', 'PTKVZ', 'PTKVZ-CJ', 'PTKVZ-MNR', 'PTKVZ-MO', 'PTKVZ-OA', 'PTKVZ-SVP', 'PTKZU', 'PTKZU-MO', 'PTKZU-PM', 'PTKZU-SVP', 'PWAT-AG', 'PWAT-CJ', 'PWAT-HD', 'PWAT-MO', 'PWAT-NK', 'PWAV', 'PWAV-CJ', 'PWAV-CM', 'PWAV-CP', 'PWAV-CVC', 'PWAV-HD', 'PWAV-MO', 'PWAV-NK', 'PWAV-OA',
  335. 'PWAV-OP', 'PWAV-PD', 'PWAV-RC', 'PWAV-RE', 'PWAV-SBP', 'PWS', 'PWS-CJ', 'PWS-DA', 'PWS-MO', 'PWS-NK', 'PWS-OA', 'PWS-OA2', 'PWS-PD', 'PWS-PH', 'PWS-SB', 'PWS-UC', 'S', 'S-APP', 'S-CC', 'S-CJ', 'S-DH', 'S-MNR', 'S-MO', 'S-NK', 'S-OA', 'S-OC', 'S-PAR', 'S-PD', 'S-PNC', 'S-RC', 'S-RE', 'S-RS', 'S-SB', 'S-SP',
  336. 'TOP', 'TRUNC-ADC', 'TRUNC-CJ', 'TRUNC-HD', 'TRUNC-MO', 'TRUNC-NK', 'TRUNC-OC', 'TRUNC-PD', 'TRUNC-PNC', '$.-UC', 'VAFIN', 'VAFIN-HD', 'VAFIN-UC', 'VAIMP-HD', 'VAINF-HD', 'VAINF-OC', 'VAPP-HD', 'VAPP-OC', 'VMFIN', 'VMFIN-HD', 'VMINF-HD', 'VMPP-HD', 'VMPP-PD', 'VP', 'VP-APP', 'VP-CC', 'VP-CJ', 'VP-HD', 'VP-MNR',
  337. 'VP-MO', 'VP-NK', 'VP-OA', 'VP-OC', 'VP-PAR', 'VP-PD', 'VP-PNC', 'VP-RC', 'VP-RE', 'VP-RS', 'VP-SB', 'VVFIN', 'VVFIN-CJ', 'VVFIN-HD', 'VVFIN-NK', 'VVFIN-OC', 'VVFIN-UC', 'VVIMP-CJ', 'VVIMP-HD', 'VVIMP-MO', 'VVINF', 'VVINF-CJ', 'VVINF-CVC', 'VVINF-HD', 'VVINF-NK', 'VVINF-OC', 'VVINF-PNC', 'VVINF-RE', 'VVINF-SB',
  338. 'VVINF-UC', 'VVIZU-CC', 'VVIZU-CJ', 'VVIZU-HD', 'VVIZU-OC', 'VVIZU-RE', 'VVPP', 'VVPP-CJ', 'VVPP-HD', 'VVPP-MO', 'VVPP-NK', 'VVPP-OC', 'VVPP-PAR', 'VVPP-PD', 'VVPP-PNC', 'VVPP-SB', 'VZ-CJ', 'VZ-HD', 'VZ-MNR', 'VZ-MO', 'VZ-NK', 'VZ-OC', 'VZ-PD', 'VZ-RE', 'XY', 'XY-APP', 'XY-CJ', 'XY-MNR', 'XY-MO', 'XY-NK', 'XY-OP',
  339. 'XY-PAR', 'XY-PNC', 'XY-UC', 'X'])]
  340. # fetching the syntactic tag set based on user option
  341. vSynTagList = [tagset[1] for tagset in cSynTagList if tagset[0] == opts.tagSet.upper()][0]
  342. ## if any feature related to raw text of parser training data is requested,
  343. ## load the data into memory first
  344. ## remember to add new such features hear
  345. vsetParserTrainingYields = set([])
  346. if opts.extUnseenNum:
  347. for line in vfParserTrainText.read().splitlines():
  348. vsetLine = set(line.split())
  349. vsetParserTrainingYields = vsetParserTrainingYields.union(vsetLine)
  350. # extracting top-IG words
  351. vIGWordList = []
  352. if opts.extIGLexCount:
  353. # checking top-IG words file
  354. if opts.topIGWordsFileName == None:
  355. sys.exit('Top IG-based ranked words file is not provided!')
  356. else:
  357. try:
  358. vfTopIGWords = open(opts.topIGWordsFileName, 'r')
  359. except IOError:
  360. sys.exit('Can\'t open top IG-based ranked words file: ' + opts.topIGWordsFileName)
  361. # creating to IG words list from file
  362. for word in vfTopIGWords.read().splitlines():
  363. vIGWordList.append(word)
  364. vlFRef = []
  365. if opts.extFRef:
  366. # checking reference parser evalb file
  367. if opts.refEvalFileName == None:
  368. sys.exit('Evalb score file of reference parser is not provided!');
  369. else:
  370. vRefScores = evalbproc.extractSentenceScores(opts.refEvalFileName, False, False, False, False)
  371. for vScore in vRefScores:
  372. vlFRef.append(vScore.fscore)
  373. ## writing output file header
  374. ## remember to add new feature here
  375. writeHeader(vfHeader, vfOutput, opts.extLength, opts.extUnseenNum, opts.extPPL, opts.extRootTag,
  376. vRootTagList, opts.extSynTagNum, vSynTagList, opts.extIGLexCount, vIGWordList, opts.extFRef,
  377. opts.extTreeDepth, opts.extConstituentCnt)
  378. ## extracting and writing features
  379. ## remember to add new feature here
  380. extractFeatures(vlScore, vfOutput, vfTextInput, vfParse, vfInputPPL, vsetParserTrainingYields,
  381. opts.extLength, opts.extUnseenNum, opts.extPPL, opts.extRootTag, vRootTagList,
  382. opts.extSynTagNum, vSynTagList, opts.extIGLexCount, vIGWordList, opts.extFRef, vlFRef,
  383. opts.extTreeDepth, opts.extConstituentCnt)
  384. vfHeader.close()
  385. vfOutput.close()
  386. if vfTextInput != None:
  387. vfTextInput.close()
  388. if vfParse != None:
  389. vfParse.close()
  390. if vfParserTrainText != None:
  391. vfParserTrainText.close()
  392. if vfInputPPL != None:
  393. vfInputPPL.close()
  394. if vfTopIGWords != None:
  395. vfTopIGWords.close()
  396. ##======================================================================
  397. ## calling main
  398. if __name__ == "__main__":
  399. sys.exit(main())