#! /usr/bin/python
## Source location: rszk/scripts

## This script extracts a set of features for parser accuracy prediction
## and writes them to an ARFF file to be used in the Weka machine learning
## toolkit.
##
## To extract all implemented features, option --all (-a) must be used. 
## To select a subset of features, the corresponding option to each 
## feature should be used.
##
## Option --all overrides individual feature options. If no individual 
## feature option is provided, all features will be extracted.
##
## Current version: 1.4
## - The root and syntactic tag sets for the Tiger Treebank are updated.
## - Bug fixed for tags containing comma.
##
## Version: 1.3
## - extractTreeDepth() and extractConstituentCnt() were moved to parse.py module.
##
## Version: 1.2
## - Root and syntactic tags are customized for Treebanks. 
## - The tag sets for the French Treebank and Tiger are added in addition to
##   the existing WSJ tag set.
## - Accordingly, an option is added to specify the tag set to be used.
## 
## Version: 1.1
## - UNSCORABLE and UNRERANKED tags are added to root tag set

import sys, optparse, weka, evalbproc, parse

##----------------------------------------------------------------------
## extracts input sentence length feature
def extractLength(pSentence):
    """Return the number of whitespace-separated tokens in pSentence.

    The sentence is split on any run of whitespace, the same way
    extractUnseenNum() and extractIGLexCount() tokenize it. Repeated,
    leading or trailing blanks therefore do not inflate the length, and an
    empty sentence has length 0.
    """
    return len(pSentence.split())


##----------------------------------------------------------------------
## extracts number of words in parser test data unseen in parser training data
def extractUnseenNum(pSentence, pParserTrainingYields):
    """Count the tokens of pSentence that are not in pParserTrainingYields.

    pSentence             -- sentence text; tokens are separated by whitespace
    pParserTrainingYields -- container of known words, tested with `in`

    Every occurrence of an unknown token is counted, so a word that appears
    twice and is unknown contributes 2.
    """
    return sum(1 for vToken in pSentence.split()
               if vToken not in pParserTrainingYields)


##----------------------------------------------------------------------
## extract the root tag of parse tree
def extractRootTag(pParseTree, pRootTagList):
    """Return the root label of pParseTree, checked against pRootTagList.

    pParseTree   -- bracketed parse; the label is read starting at offset 6,
                    i.e. right after a leading "(TOP ("
    pRootTagList -- accepted root labels

    A label that contains a comma is wrapped in double quotes before it is
    looked up and returned. The script is terminated through sys.exit() when
    the (possibly quoted) label is not in pRootTagList.
    """
    # everything from offset 6 up to the first space is the root label
    vLabel = pParseTree[6:].split(' ', 1)[0]
    vTag = '"' + vLabel + '"' if ',' in vLabel else vLabel

    if vTag in pRootTagList:
        return vTag

    sys.exit("Unknown root tag: " + vTag)


##----------------------------------------------------------------------
## extracts the number of syntactic tag seen in the parse tree
def extractSynTagNum(pParseTree, pSynTagList):
    """Return the per-tag node counts of pParseTree as a comma-terminated string.

    pParseTree  -- bracketed parse tree
    pSynTagList -- tags to count, in the order the counts are emitted

    The result holds one count per tag, each followed by a comma (so it ends
    with a comma and is empty for an empty tag list).
    """
    ## Following Ravi et al. (2008), only INTERNAL nodes are counted, taken
    ## here to mean every node except the root: the text up to and including
    ## the second space (the first two items of the tree) is skipped.
    vFirstBlank = pParseTree.find(' ')
    vNextBlank = pParseTree[vFirstBlank + 1:].find(' ')
    vBody = pParseTree[vFirstBlank + vNextBlank + 2:]

    vCounts = []
    for vLabel in pSynTagList:
        # a tag containing a comma is searched for in its single-quoted form
        vQuoted = "'" + vLabel + "'" if ',' in vLabel else vLabel
        ## The leading '(' restricts matches to node labels (not words of the
        ## text that happen to be spelled like a tag); the trailing space
        ## keeps e.g. NN from also matching NNP.
        vCounts.append(str(vBody.count('(' + vQuoted + ' ')) + ',')

    return ''.join(vCounts)


##----------------------------------------------------------------------
## extracts the number of top-IG ranked words in the input sentence
def extractIGLexCount(pSentence, pIGWordList):
    """Return how often each word of pIGWordList occurs in pSentence.

    pSentence   -- sentence text; tokens are separated by whitespace
    pIGWordList -- words to count, in the order the counts are emitted

    The result holds one count per word, each followed by a comma (so it ends
    with a comma and is empty for an empty word list).
    """
    # tokenize once; the token list is the same for every word
    vInputTokens = pSentence.split()

    return ''.join([str(vInputTokens.count(vWord)) + ','
                    for vWord in pIGWordList])


##----------------------------------------------------------------------
## extracts features and writes into output ARFF file
def extractFeatures(pScores, pOutFile, pTxtInputFile, pParseFile, pInputPPLFile,
                    pParserTrainingYields, pLength, pUnseenNum, pPPL, pRootTag, pRootTagList,
                    pSynTagNum, pSynTagList, pIGLexCount, pIGWordList, pRef, pRefScores,
                    pTreeDepth, pConstituentCnt):
    """Write one comma-separated ARFF data row per entry of pScores.

    pScores               -- outcome value of every instance (last column)
    pOutFile              -- writable file object receiving the rows
    pTxtInputFile         -- sentences, one per line (length, unseen-word and
                             top-IG features)
    pParseFile            -- parse trees, one per line (root tag, syntactic tag,
                             tree depth and constituent count features)
    pInputPPLFile         -- perplexity scores, one per line
    pParserTrainingYields -- words seen in the parser training data
    pRootTagList, pSynTagList, pIGWordList
                          -- value lists used by the corresponding features
    pRefScores            -- reference parser scores, indexed by instance
    pLength, pUnseenNum, pPPL, pRootTag, pSynTagNum, pIGLexCount, pRef,
    pTreeDepth, pConstituentCnt
                          -- flags selecting the features to write

    Columns are written in this order: length, unseen words, perplexity, root
    tag, syntactic tag counts, top-IG word counts, reference score, tree depth,
    constituent count, score. A blank line is written after the last row.

    The script is terminated through sys.exit() when one of the per-instance
    inputs has fewer entries than pScores.
    """
    vNeedsSentence = pLength or pUnseenNum or pIGLexCount
    # tree depth and constituent count are computed from the parse tree too,
    # so the tree has to be read for them as well
    vNeedsParse = pRootTag or pSynTagNum or pTreeDepth or pConstituentCnt

    for vLoopCntr, vScore in enumerate(pScores):
        vLineNo = str(vLoopCntr + 1)

        if vNeedsSentence:
            vLine = pTxtInputFile.readline()
            if not vLine:
                sys.exit("Text input file does not match input file in line number " + vLineNo)
            vSentence = vLine.splitlines()[0]

        if pLength:
            pOutFile.write(str(extractLength(vSentence)) + ',')

        if pUnseenNum:
            pOutFile.write(str(extractUnseenNum(vSentence, pParserTrainingYields)) + ',')

        if pPPL:
            ppl = pInputPPLFile.readline()
            if not ppl:
                sys.exit("Perplexity score file does not match input file in line number " + vLineNo)
            pOutFile.write(ppl.splitlines()[0] + ',')

        if vNeedsParse:
            parseTree = pParseFile.readline()
            if not parseTree:
                sys.exit("Parse file does not match input file in line number " + vLineNo)

        if pRootTag:
            pOutFile.write(extractRootTag(parseTree, pRootTagList) + ',')

        if pSynTagNum:
            pOutFile.write(extractSynTagNum(parseTree, pSynTagList))

        if pIGLexCount:
            pOutFile.write(extractIGLexCount(vSentence, pIGWordList))

        if pRef:
            if vLoopCntr >= len(pRefScores):
                sys.exit("Reference score file does not match input file in line number " + vLineNo)
            pOutFile.write(str(pRefScores[vLoopCntr]) + ',')

        # new features

        if pTreeDepth:
            pOutFile.write(str(parse.extractTreeDepth(parseTree)) + ',')

        if pConstituentCnt:
            pOutFile.write(str(parse.extractConstituentCnt(parseTree)) + ',')

        # writing outcome (score) feature value
        pOutFile.write(str(vScore) + '\n')

    pOutFile.write('\n')
    return


##----------------------------------------------------------------------
## writes the header of the output file according to the header file provided
## ignores any attribute provided in the header file
## also writes the @DATA at the end
def writeHeader(pHeaderFile, pOutFile, pLength, pUnseenNum, pPPL, pRootTag, pRootTagList,
                pSynTagNum, pSynTagList, pIGLexCount, pIGWordList, pFRef,
                pTreeDepth, pConstituentCnt):
    """Write the ARFF header (relation, attribute declarations, @DATA).

    pHeaderFile -- iterable of header lines; lines are copied up to and
                   including the first one starting with "@RELATION" (case
                   insensitive), anything after it is not read
    pOutFile    -- writable file object receiving the header
    pRootTagList, pSynTagList, pIGWordList
                -- values used to declare the root tag, syntactic tag count and
                   top-IG word count attributes
    pLength, pUnseenNum, pPPL, pRootTag, pSynTagNum, pIGLexCount, pFRef,
    pTreeDepth, pConstituentCnt
                -- flags selecting which attributes are declared

    The outcome attribute "fscore" is always declared last, followed by a blank
    line and the @DATA marker.
    """
    vWrite = pOutFile.write

    # copying the header file up to the relation declaration
    for vLine in pHeaderFile:
        vWrite(vLine)
        if vLine.upper().startswith("@RELATION"):
            break
    vWrite('\n')

    # attribute declarations of the sentence level features
    for vEnabled, vDeclaration in ((pLength, "@ATTRIBUTE\tlength\t\t\tinteger\n"),
                                   (pUnseenNum, "@ATTRIBUTE\tunseennum\t\tinteger\n"),
                                   (pPPL, "@ATTRIBUTE\tperplexity\t\treal\n")):
        if vEnabled:
            vWrite(vDeclaration)

    # the root tag is a nominal attribute listing every accepted tag
    if pRootTag:
        vWrite("@ATTRIBUTE\troottag\t\t\t{" + ','.join(pRootTagList) + "}\n")

    # one integer attribute per syntactic tag
    if pSynTagNum:
        for vTag in pSynTagList:
            vWrite('@ATTRIBUTE\t"syntag' + vTag + '"\t\tinteger\n')

    # one integer attribute per top-IG word; the name is single-quoted when
    # the word itself contains a double quote
    if pIGLexCount:
        for vWord in pIGWordList:
            vQuote = "'" if '"' in vWord else '"'
            vWrite("@ATTRIBUTE\t" + vQuote + "iglex" + vWord + vQuote + "\t\tinteger\n")

    # reference score and the newer tree based features
    for vEnabled, vDeclaration in ((pFRef, "@ATTRIBUTE\tfref\t\treal\n"),
                                   (pTreeDepth, "@ATTRIBUTE\ttreedepth\t\tinteger\n"),
                                   (pConstituentCnt, "@ATTRIBUTE\tconstcnt\t\tinteger\n")):
        if vEnabled:
            vWrite(vDeclaration)

    # attribute declaration of the outcome (score), then the data marker
    vWrite("@ATTRIBUTE\tfscore\t\treal\n")
    vWrite('\n')
    vWrite("@DATA\n")


##======================================================================
## main
def main(argv=None):
    """Parse the command line, then write the ARFF header and the feature rows.

    argv -- full argument vector including the program name; sys.argv is used
            when it is omitted

    Two positional arguments are required: the ARFF header file and the output
    file name, to which ".arff" is appended. Either a parser evalb score file
    (-s) or a sample size (-n) must be given. When no individual feature option
    is given, all features are extracted, exactly as with --all.

    The script is terminated through parser.error()/sys.exit() with a message
    when a required argument or file is missing, a file cannot be opened, or
    the tag set name is unknown. Returns None on success.
    """
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="%prog <ARFF FILE HEADER> <OUTPUT FILE NAME> [OPTIONS]" +
                                         "\nExtracts a set of features for parser accuracy prediction and output it into an ARFF file to be used in Weka machine learning toolkit.", version="%prog 1.4")

    parser.add_option("-s", "--scores", help="parser evalb score file", metavar="PARSER EVALB SCORE FILE", dest="scoreFileName", action="store")
    parser.add_option("-n", "--samplesize", help="number of instances if score file is not specified (for unlabeled instances)", metavar="SAMPLE SIZE", dest="sampleSize", action="store")
    parser.add_option("-a", "--all", help="extract all implemented features (overrides all other individual feature options)", dest="extAll", action="store_true")
    parser.add_option("--length", help="extract input sentence word length as feature", dest="extLength", action="store_true")
    parser.add_option("--unseennum", help="extract number of words in the input data unseen in parser training data as feature", dest="extUnseenNum", action="store_true")
    parser.add_option("-t", "--textinput", help="tokenized raw text input data set", metavar="TOKENIZED RAW TEXT INPUT DATA", dest="txtInputFileName", action="store")
    parser.add_option("--tagset", help="tag set of input parses: wsj, ftb, tiger", metavar="PARSER TAG SET", default="wsj", dest="tagSet", action="store")
    parser.add_option("--roottag", help="extract the root tag of parse tree as feature", dest="extRootTag", action="store_true")
    parser.add_option("--syntagnum", help="extract the number of syntactic tags seen in the parse tree as feature", dest="extSynTagNum", action="store_true")
    parser.add_option("-p", "--parse", help="parsed input data set", metavar="PARSED INPUT DATA", dest="parseFileName", action="store")
    parser.add_option("--perplexity", help="extract the perplexity of each sentence with respect to the parser training data (--ptrain) as feature", dest="extPPL", action="store_true")
    parser.add_option("-P", "--pplscores", help="perplexity scores for the input sentences", metavar="PERPLEXITY SCORES", dest="inputPPLFileName", action="store")
    parser.add_option("-T", "--ptraintext", help="tokenized raw training data set of the parser", metavar="TOKENIZED RAW TEXT PARSER TRAINING DATA", dest="txtParserTrainFileName", action="store")
    parser.add_option("--iglexcount", help="extract the number of top-ig words seen in the parse tree as feature", dest="extIGLexCount", action="store_true")
    parser.add_option("-g", "--igwords", help="top-ig words list file", metavar="TOP IG WORDS FILE", dest="topIGWordsFileName", action="store")
    parser.add_option("--fref", help="extract f-score of the input parse with respect to a reference parser output as feature", dest="extFRef", action="store_true")
    parser.add_option("-r", "--refscores", help="evalb score file of reference parser", metavar="REFERENCE PARSER EVALB SCORE FILE", dest="refEvalFileName", action="store")
    parser.add_option("--treedepth", help="tree depth", metavar="TREE DEPTH", dest="extTreeDepth", action="store_true")
    parser.add_option("--constcount", help="number of syntactic constituents", metavar="CONSTITUENT COUNT", dest="extConstituentCnt", action="store_true")

    # parsing the vector that was passed in (not unconditionally sys.argv),
    # so that main() can be called with its own arguments
    (opts, posArgs) = parser.parse_args(argv[1:])


    # processing arguments

    if len(posArgs) < 2:
        parser.error("At least 2 arguments are required!")


    # opening header file
    try:
        vfHeader = open(posArgs[0], 'r')
    except IOError:
        sys.exit('Can\'t open header file: ' + posArgs[0])

    # creating output file
    vOutputFileName = posArgs[1] + '.arff'
    try:
        vfOutput = open(vOutputFileName, 'w')
    except IOError:
        sys.exit('Can\'t create output file: ' + vOutputFileName)


    # processing options

    ## loading parse scores (outcome) into memory
    ## Note that for unlabeled test data, this is not given. Instead, the
    ## sample set size (-n or --samplesize option) must be provided. In this
    ## case, -1 is an outcome template for each instance.
    vlScore = []
    if opts.scoreFileName is not None:
        vlScores = evalbproc.extractSentenceScores(opts.scoreFileName, False, False, False, False)
        vlScore = [vScore.fscore for vScore in vlScores]
    elif opts.sampleSize is not None:
        try:
            vSampleSize = int(opts.sampleSize)
        except ValueError:
            sys.exit('Sample size is not an integer: ' + opts.sampleSize)
        vlScore = [-1] * vSampleSize
    else:
        sys.exit('Either sample size or parse score file should be provided!')


    ## processing feature options (if no option, --all will be set)
    ## remember to add new feature options here
    ## (every option must be joined with `or`; a comma would build a tuple,
    ## which is always true and would disable this default)
    if not (opts.extAll or opts.extLength or opts.extUnseenNum or opts.extPPL or
            opts.extRootTag or opts.extSynTagNum or opts.extIGLexCount or opts.extFRef or
            opts.extTreeDepth or opts.extConstituentCnt):
        opts.extAll = True

    ## overriding individual feature options if --all is set (i.e. all
    ## individual feature options are considered set to be extracted)
    ## remember to add new feature options here
    if opts.extAll:
        opts.extLength = True
        opts.extUnseenNum = True
        opts.extPPL = True
        opts.extRootTag = True
        opts.extSynTagNum = True
        opts.extIGLexCount = True
        opts.extFRef = True
        opts.extTreeDepth = True
        opts.extConstituentCnt = True

    ## opening raw text file, if any related feature is requested
    ## remember to add new such features here
    vfTextInput = None
    if opts.extLength or opts.extUnseenNum or opts.extIGLexCount:
        if opts.txtInputFileName is None:
            sys.exit('Text input file is not provided!')
        try:
            vfTextInput = open(opts.txtInputFileName, 'r')
        except IOError:
            sys.exit('Can\'t open text input file: ' + opts.txtInputFileName)

    ## opening parse file, if any related feature is requested
    ## remember to add new such features here
    vfParse = None
    if opts.extRootTag or opts.extSynTagNum or opts.extTreeDepth or opts.extConstituentCnt:
        if opts.parseFileName is None:
            sys.exit('Parse file is not provided!')
        try:
            vfParse = open(opts.parseFileName, 'r')
        except IOError:
            sys.exit('Can\'t open parse file: ' + opts.parseFileName)


    ## opening parser training raw text file, if any related feature is requested
    ## remember to add new such features here
    vfParserTrainText = None
    if opts.extUnseenNum:
        if opts.txtParserTrainFileName is None:
            sys.exit('Parser training text file is not provided!')
        try:
            vfParserTrainText = open(opts.txtParserTrainFileName, 'r')
        except IOError:
            sys.exit('Can\'t open parser training text file: ' + opts.txtParserTrainFileName)

    ## opening perplexity scores file for the input sentences, if any related
    ## feature is requested
    vfInputPPL = None
    if opts.extPPL:
        if opts.inputPPLFileName is None:
            sys.exit('Perplexity scores file for the input sentences is not provided!')
        try:
            vfInputPPL = open(opts.inputPPLFileName, 'r')
        except IOError:
            sys.exit('Can\'t open perplexity scores file for the input sentences: ' + opts.inputPPLFileName)


    # the tag set names are matched case-insensitively
    vTagSetName = opts.tagSet.upper()

    ## WSJ: The set is extracted from WSJ by Jennifer plus 1 UNPARSED tag for
    ##      those sentences which could not be parsed, 1 UNSCORABLE tag for those
    ##      sentences which could not be scored, and 1 UNRERANKED tag for those
    ##      sentences which could not be re-ranked:
    ## FTB: The set is extracted from FTB plus the above additional tags and X,
    ##      plus some found in the Berkeley output (trained on FTB).
    ##      The only root tag found in FTB (train+dev+test) was SENT.
    ## TIGER: The set is extracted from Tiger plus the above additional tags,
    ##        those found in Tiger.penn, and X.
    cRootTagList = [('WSJ', ['UNPARSED', 'UNSCORABLE', 'UNRERANKED', 'ADJP', 'ADVP', 'CONJP', 'FRAG', 'INTJ', 'LS', 'LST', 'NAC', 'NP', 'NX', 'PP', 'PRN', 'PRT', 'QP', 'RRC', 'S', 'SBAR', 'SBARQ', 'SINV', 'SQ', 'UCP', 'VP', 'WHADJP', 'WHADVP', 'WHNP', 'WHPP', 'X']),
                    ('FTB', ['UNPARSED', 'UNSCORABLE', 'UNRERANKED', 'SENT', 'NP', 'PP', 'COORD', 'VPinf', 'X']),
                    ('TIGER', ['UNPARSED', 'UNSCORABLE', 'UNRERANKED', '$', '$.', '"$,"', 'ADJA', 'ADJD', 'ADV', 'AP', 'ART', 'AVP', 'CAP', 'CARD', 'CAVP', 'CH', 'CNP', 'CO', 'CPP', 'CS', 'CVP', 'DL', 'ITJ', 'KON', '$*LRB*', 'NE', 'NN', 'NP', 'PN', 'PP', 'PTKANT', 'PWAV', 'PWS', 'S', 'VMFIN', 'VP', 'VVPP', 'XY', 'X'])]

    # fetching the root tag set based on user option
    vRootTagLists = [tagset[1] for tagset in cRootTagList if tagset[0] == vTagSetName]
    if not vRootTagLists:
        sys.exit('Unknown tag set: ' + opts.tagSet)
    vRootTagList = vRootTagLists[0]


    ## WSJ: The set is extracted from WSJ by Jennifer plus AUX and AUXJ used by
    ##      Brown.
    ## FTB: The set is extracted from FTB plus 'X'.
    ## TIGER: The set is extracted from Tiger.penn (727!) plus 'X'.
    cSynTagList = [('WSJ', ['#', '$', "''", ',', '.', ':', 'ADJP', 'ADVP', 'AUX', 'AUXJ', 'CC', 'CD', 'CONJP', 'DT', 'EX', 'FRAG', 'FW', 'IN', 'INTJ', 'JJ', 'JJR', 'JJS', 'LS', 'LST', 'MD', 'NAC', 'NN', 'NNP', 'NNPS', 'NNS', 'NP', 'NX', 'PDT', 'POS', 'PP', 'PRN', 'PRP', 'PRP$', 'PRT', 'PRT|ADVP', 'QP', 'RB', 'RBR', 'RBS', 'RP', 'RRC', 'S', 'SBAR', 'SBARQ', 'SINV', 'SQ', 'SYM', 'TO', 'UCP', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'VP', 'WDT', 'WHADJP', 'WHADVP', 'WHNP', 'WHPP', 'WP', 'WP$', 'WRB', 'X', '``']),
                   ('FTB', ['ADJ', 'ADJWH', 'AdP', 'ADV', 'ADVWH', 'AP', 'CC', 'CLO', 'CLR', 'CLS', 'COORD', 'CS', 'DET', 'DETWH', 'ET', 'I', 'NC', 'NP', 'NPP', 'P', 'P+D', 'PONCT', 'PP', 'P+PRO', 'PREF', 'PRO', 'PROREL', 'PROWH', 'SENT', 'Sint', 'Srel', 'Ssub', 'V', 'VIMP', 'VINF', 'VN', 'VPinf', 'VPP', 'VPpart', 'VPR', 'VS', 'X']),
                   ('TIGER', ['$,', "'$.'", 'AA-CJ', 'AA-HD', 'AA-MO', 'AA-NK', 'AA-PD', 'ADJA', 'ADJA-ADC', 'ADJA-CJ', 'ADJA-HD', 'ADJA-MO', 'ADJA-NK', 'ADJA-NMC', 'ADJA-OA', 'ADJA-PD', 'ADJA-PNC', 'ADJA-SB', 'ADJD', 'ADJD-APP', 'ADJD-CJ', 'ADJD-CVC', 'ADJD-HD', 'ADJD-MNR', 'ADJD-MO', 'ADJD-NK', 'ADJD-OA', 'ADJD-PAR', 'ADJD-PD',
                              'ADJD-PH', 'ADJD-PNC', 'ADJD-SB', 'ADJD-UC', 'ADV', 'ADV-AC', 'ADV-APP', 'ADV-AVC', 'ADV-CD', 'ADV-CJ', 'ADV-DH', 'ADV-DM', 'ADV-HD', 'ADV-JU', 'ADV-MNR', 'ADV-MO', 'ADV-NG', 'ADV-NK', 'ADV-OA', 'ADV-OP', 'ADV-PAR', 'ADV-PH', 'ADV-PNC', 'ADV-SVP', 'ADV-UC', 'AP', 'AP-AG', 'AP-AMS', 'AP-APP', 'AP-CC',
                              'AP-CJ', 'AP-HD', 'AP-MNR', 'AP-MO', 'AP-NG', 'AP-NK', 'AP-NMC', 'AP-OA', 'AP-OC', 'AP-PAR', 'AP-PD', 'AP-PNC', 'APPO-AC', 'APPR', 'APPR-AC', 'APPRART', 'APPRART-AC', 'APPRART-CJ', 'APPRART-PNC', 'APPR-AVC', 'APPR-CD', 'APPR-CJ', 'APPR-CP', 'APPR-HD', 'APPR-MO', 'APPR-PNC', 'APPR-UC', 'AP-RE', 'AP-RS',
                              'AP-SB', 'APZR-AC', 'APZR-MO', 'ART', 'ART-CJ', 'ART-HD', 'ART-MO', 'ART-NK', 'ART-NMC', 'ART-PNC', 'AVP', 'AVP-APP', 'AVP-CC', 'AVP-CD', 'AVP-CJ', 'AVP-CP', 'AVP-CVC', 'AVP-DM', 'AVP-HD', 'AVP-JU', 'AVP-MNR', 'AVP-MO', 'AVP-NG', 'AVP-NK', 'AVP-OA', 'AVP-OC', 'AVP-OP', 'AVP-PAR', 'AVP-PD', 'AVP-PH',
                              'AVP-PNC', 'AVP-RE', 'AVP-RS', 'AVP-SB', 'CAC-AC', 'CAC-MO', 'CAP', 'CAP-APP', 'CAP-CJ', 'CAP-HD', 'CAP-MNR', 'CAP-MO', 'CAP-NK', 'CAP-NMC', 'CAP-OA', 'CAP-PAR', 'CAP-PD', 'CAP-PNC', 'CAP-RE', 'CAP-SB', 'CARD', 'CARD-AMS', 'CARD-APP', 'CARD-CJ', 'CARD-HD', 'CARD-MNR', 'CARD-MO', 'CARD-NK', 'CARD-NMC',
                              'CARD-OA', 'CARD-OP', 'CARD-PAR', 'CARD-PD', 'CARD-PNC', 'CARD-SB', 'CARD-UC', 'CAVP', 'CAVP-AC', 'CAVP-CJ', 'CAVP-HD', 'CAVP-MNR', 'CAVP-MO', 'CAVP-NG', 'CAVP-NK', 'CAVP-PAR', 'CAVP-PD', 'CAVP-SVP', 'CCP-CP', 'CH', 'CH-APP', 'CH-CJ', 'CH-CM', 'CH-DM', 'CH-HD', 'CH-MNR', 'CH-MO', 'CH-NK', 'CH-OA',
                              'CH-OC', 'CH-PAR', 'CH-PD', 'CH-PNC', 'CH-RE', 'CH-RS', 'CH-SB', 'CNP', 'CNP-AG', 'CNP-AMS', 'CNP-APP', 'CNP-CC', 'CNP-CJ', 'CNP-DA', 'CNP-DH', 'CNP-HD', 'CNP-MNR', 'CNP-MO', 'CNP-NK', 'CNP-NMC', 'CNP-OA', 'CNP-OC', 'CNP-OG', 'CNP-PAR', 'CNP-PD', 'CNP-PH', 'CNP-PNC', 'CNP-RE', 'CNP-RS', 'CNP-SB',
                              'CNP-VO', 'CO', 'CO-AG', 'CO-APP', 'CO-CD', 'CO-CJ', 'CO-CP', 'CO-HD', 'CO-MNR', 'CO-MO', 'CO-NK', 'CO-OA', 'CO-OC', 'CO-PAR', 'CO-PD', 'CO-PG', 'CO-PNC', 'CO-RE', 'CO-RS', 'CO-SB', 'CPP', 'CPP-APP', 'CPP-CC', 'CPP-CJ', 'CPP-CVC', 'CPP-MNR', 'CPP-MO', 'CPP-NK', 'CPP-OP', 'CPP-PAR', 'CPP-PG', 'CPP-PNC',
                              'CPP-RE', 'CPP-SBP', 'CS', 'CS-APP', 'CS-CC', 'CS-CJ', 'CS-DH', 'CS-HD', 'CS-MNR', 'CS-MO', 'CS-NK', 'CS-OC', 'CS-PAR', 'CS-PD', 'CS-PNC', 'CS-RC', 'CS-RE', 'CS-RS', 'CS-SB', 'CVP', 'CVP-APP', 'CVP-CC', 'CVP-CJ', 'CVP-HD', 'CVP-MNR', 'CVP-MO', 'CVP-NK', 'CVP-OC', 'CVP-PAR', 'CVP-PD', 'CVP-RE', 'CVP-SB',
                              'CVZ-HD', 'CVZ-OA', 'CVZ-OC', 'DL', 'DL-CJ', 'DL-MO', 'DL-PAR', 'FM-AC', 'FM-ADC', 'FM-APP', 'FM-AVC', 'FM-CJ', 'FM-MO', 'FM-NK', 'FM-OA', 'FM-OC', 'FM-OP', 'FM-PD', 'FM-PNC', 'FM-SB', 'FM-UC', 'ISU-CJ', 'ISU-MO', 'ISU-NK', 'ITJ', 'ITJ-DM', 'ITJ-MO', 'ITJ-NK', 'ITJ-OA', 'ITJ-UC', 'ITJ-VO', 'KOKOM-AC',
                              'KOKOM-CC', 'KOKOM-CD', 'KOKOM-CM', 'KOKOM-CP', 'KOKOM-MO', 'KOKOM-UC', 'KON', 'KON-AVC', 'KON-CD', 'KON-CJ', 'KON-DH', 'KON-HD', 'KON-JU', 'KON-MO', 'KON-PNC', 'KON-UC', 'KOUI-AC', 'KOUI-AVC', 'KOUI-CP', 'KOUS-AC', 'KOUS-AVC', 'KOUS-CJ', 'KOUS-CP', 'KOUS-HD', 'KOUS-MO', 'KOUS-PH', 'KOUS-PNC', '$*LRB*',
                              '$*LRB*-PNC', 'MTA-CJ', 'MTA-NK', 'NE', 'NE-AC', 'NE-ADC', 'NE-AG', 'NE-AP', 'NE-CJ', 'NE-DA', 'NE-DH', 'NE-DM', 'NE-MNR', 'NE-MO', 'NE-NK', 'NE-OA', 'NE-OC', 'NE-OP', 'NE-PAR', 'NE-PD', 'NE-PNC', 'NE-RE', 'NE-RS', 'NE-SB', 'NE-UC', 'NE-VO', 'NM', 'NM-AMS', 'NM-APP', 'NM-CC', 'NM-CJ', 'NM-HD', 'NM-MNR',
                              'NM-MO', 'NM-NK', 'NM-OA', 'NM-PAR', 'NM-PD', 'NM-SB', 'NN', 'NN-ADC', 'NN-AG', 'NN-AMS', 'NN-APP', 'NN-CJ', 'NN-DA', 'NN-DH', 'NN-DM', 'NNE-NK', 'NN-HD', 'NN-MNR', 'NN-MO', 'NN-NK', 'NN-NMC', 'NN-OA', 'NN-OA2', 'NN-OC', 'NN-OG', 'NN-OP', 'NN-PAR', 'NN-PD', 'NN-PNC', 'NN-RE', 'NN-SB', 'NN-UC', 'NN-VO',
                              'NP', 'NP-AG', 'NP-AMS', 'NP-APP', 'NP-CC', 'NP-CJ', 'NP-CVC', 'NP-DA', 'NP-DH', 'NP-MNR', 'NP-MO', 'NP-NK', 'NP-NMC', 'NP-OA', 'NP-OA2', 'NP-OC', 'NP-OG', 'NP-OP', 'NP-PAR', 'NP-PD', 'NP-PG', 'NP-PH', 'NP-PNC', 'NP-RE', 'NP-RS', 'NP-SB', 'NP-SP', 'NP-UC', 'NP-VO', 'PDAT-AG', 'PDAT-CJ', 'PDAT-DA', 'PDAT-HD',
                              'PDAT-NK', 'PDAT-SB', 'PDS-AG', 'PDS-APP', 'PDS-CJ', 'PDS-DA', 'PDS-NK', 'PDS-OA', 'PDS-OA2', 'PDS-OG', 'PDS-PD', 'PDS-PH', 'PDS-SB', 'PDS-SP', 'PIAT-AG', 'PIAT-CJ', 'PIAT-HD', 'PIAT-MO', 'PIAT-NK', 'PIAT-NMC', 'PIAT-PNC', 'PIAT-SB', 'PIAT-UC', 'PIS-AG', 'PIS-APP', 'PIS-CJ', 'PIS-DA', 'PIS-HD', 'PIS-MNR',
                              'PIS-MO', 'PIS-NK', 'PIS-OA', 'PIS-OA2', 'PIS-OG', 'PIS-PD', 'PIS-PH', 'PIS-RE', 'PIS-SB', 'PIS-UC', 'PN', 'PN-AG', 'PN-APP', 'PN-CJ', 'PN-DA', 'PN-DH', 'PN-MO', 'PN-NK', 'PN-OA', 'PN-OC', 'PN-PAR', 'PN-PD', 'PN-PNC', 'PN-RE', 'PN-RS', 'PN-SB', 'PN-UC', 'PP', 'PP-AC', 'PP-AG', 'PP-AMS', 'PP-APP', 'PP-CC',
                              'PP-CJ', 'PP-CVC', 'PP-DH', 'PPER', 'PPER-APP', 'PPER-CJ', 'PPER-DA', 'PPER-EP', 'PPER-NK', 'PPER-OA', 'PPER-OA2', 'PPER-OG', 'PPER-OP', 'PPER-PAR', 'PPER-PD', 'PPER-PH', 'PPER-SB', 'PPER-UC', 'PPER-VO', 'PP-HD', 'PP-MNR', 'PP-MO', 'PP-NK', 'PP-OC', 'PP-OP', 'PPOSAT-DA', 'PPOSAT-NK', 'PPOSS-NK', 'PPOSS-SB',
                              'PP-PAR', 'PP-PD', 'PP-PG', 'PP-PNC', 'PP-RE', 'PP-RS', 'PP-SB', 'PP-SBP', 'PP-SVP', 'PP-UC', 'PRELAT-AG', 'PRELAT-NK', 'PRELS-DA', 'PRELS-MO', 'PRELS-NK', 'PRELS-OA', 'PRELS-OG', 'PRELS-PD', 'PRELS-PH', 'PRELS-SB', 'PRF-CJ', 'PRF-DA', 'PRF-MO', 'PRF-NK', 'PRF-OA', 'PRF-SB', 'PROAV-AC', 'PROAV-CD', 'PROAV-CJ',
                              'PROAV-CVC', 'PROAV-HD', 'PROAV-MNR', 'PROAV-MO', 'PROAV-NK', 'PROAV-OP', 'PROAV-PAR', 'PROAV-PD', 'PROAV-PG', 'PROAV-PH', 'PROAV-SBP', 'PTKA-AC', 'PTKA-HD', 'PTKA-MO', 'PTKANT', 'PTKANT-CJ', 'PTKANT-DM', 'PTKANT-HD', 'PTKANT-MO', 'PTKANT-NK', 'PTKANT-OA', 'PTKANT-OC', 'PTKANT-RS', 'PTKA-PM', 'PTKNEG-CJ',
                              'PTKNEG-HD', 'PTKNEG-MO', 'PTKNEG-NG', 'PTKVZ', 'PTKVZ-CJ', 'PTKVZ-MNR', 'PTKVZ-MO', 'PTKVZ-OA', 'PTKVZ-SVP', 'PTKZU', 'PTKZU-MO', 'PTKZU-PM', 'PTKZU-SVP', 'PWAT-AG', 'PWAT-CJ', 'PWAT-HD', 'PWAT-MO', 'PWAT-NK', 'PWAV', 'PWAV-CJ', 'PWAV-CM', 'PWAV-CP', 'PWAV-CVC', 'PWAV-HD', 'PWAV-MO', 'PWAV-NK', 'PWAV-OA',
                              'PWAV-OP', 'PWAV-PD', 'PWAV-RC', 'PWAV-RE', 'PWAV-SBP', 'PWS', 'PWS-CJ', 'PWS-DA', 'PWS-MO', 'PWS-NK', 'PWS-OA', 'PWS-OA2', 'PWS-PD', 'PWS-PH', 'PWS-SB', 'PWS-UC', 'S', 'S-APP', 'S-CC', 'S-CJ', 'S-DH', 'S-MNR', 'S-MO', 'S-NK', 'S-OA', 'S-OC', 'S-PAR', 'S-PD', 'S-PNC', 'S-RC', 'S-RE', 'S-RS', 'S-SB', 'S-SP',
                              'TOP', 'TRUNC-ADC', 'TRUNC-CJ', 'TRUNC-HD', 'TRUNC-MO', 'TRUNC-NK', 'TRUNC-OC', 'TRUNC-PD', 'TRUNC-PNC', '$.-UC', 'VAFIN', 'VAFIN-HD', 'VAFIN-UC', 'VAIMP-HD', 'VAINF-HD', 'VAINF-OC', 'VAPP-HD', 'VAPP-OC', 'VMFIN', 'VMFIN-HD', 'VMINF-HD', 'VMPP-HD', 'VMPP-PD', 'VP', 'VP-APP', 'VP-CC', 'VP-CJ', 'VP-HD', 'VP-MNR',
                              'VP-MO', 'VP-NK', 'VP-OA', 'VP-OC', 'VP-PAR', 'VP-PD', 'VP-PNC', 'VP-RC', 'VP-RE', 'VP-RS', 'VP-SB', 'VVFIN', 'VVFIN-CJ', 'VVFIN-HD', 'VVFIN-NK', 'VVFIN-OC', 'VVFIN-UC', 'VVIMP-CJ', 'VVIMP-HD', 'VVIMP-MO', 'VVINF', 'VVINF-CJ', 'VVINF-CVC', 'VVINF-HD', 'VVINF-NK', 'VVINF-OC', 'VVINF-PNC', 'VVINF-RE', 'VVINF-SB',
                              'VVINF-UC', 'VVIZU-CC', 'VVIZU-CJ', 'VVIZU-HD', 'VVIZU-OC', 'VVIZU-RE', 'VVPP', 'VVPP-CJ', 'VVPP-HD', 'VVPP-MO', 'VVPP-NK', 'VVPP-OC', 'VVPP-PAR', 'VVPP-PD', 'VVPP-PNC', 'VVPP-SB', 'VZ-CJ', 'VZ-HD', 'VZ-MNR', 'VZ-MO', 'VZ-NK', 'VZ-OC', 'VZ-PD', 'VZ-RE', 'XY', 'XY-APP', 'XY-CJ', 'XY-MNR', 'XY-MO', 'XY-NK', 'XY-OP',
                              'XY-PAR', 'XY-PNC', 'XY-UC', 'X'])]

    # fetching the syntactic tag set based on user option
    vSynTagLists = [tagset[1] for tagset in cSynTagList if tagset[0] == vTagSetName]
    if not vSynTagLists:
        sys.exit('Unknown tag set: ' + opts.tagSet)
    vSynTagList = vSynTagLists[0]


    ## if any feature related to raw text of parser training data is requested,
    ## load the data into memory first
    ## remember to add new such features here
    vsetParserTrainingYields = set()
    if opts.extUnseenNum:
        # growing one set in place avoids copying the vocabulary for every line
        for line in vfParserTrainText:
            vsetParserTrainingYields.update(line.split())


    # extracting top-IG words
    # (the file handle is initialised here so that the cleanup below can test it
    # even when the feature is not requested)
    vfTopIGWords = None
    vIGWordList = []
    if opts.extIGLexCount:
        # checking top-IG words file
        if opts.topIGWordsFileName is None:
            sys.exit('Top IG-based ranked words file is not provided!')
        try:
            vfTopIGWords = open(opts.topIGWordsFileName, 'r')
        except IOError:
            sys.exit('Can\'t open top IG-based ranked words file: ' + opts.topIGWordsFileName)

        # creating top IG words list from file, one word per line
        vIGWordList = vfTopIGWords.read().splitlines()


    vlFRef = []
    if opts.extFRef:
        # checking reference parser evalb file
        if opts.refEvalFileName is None:
            sys.exit('Evalb score file of reference parser is not provided!')

        vRefScores = evalbproc.extractSentenceScores(opts.refEvalFileName, False, False, False, False)
        vlFRef = [vScore.fscore for vScore in vRefScores]


    ## writing output file header
    ## remember to add new feature here
    writeHeader(vfHeader, vfOutput, opts.extLength, opts.extUnseenNum, opts.extPPL, opts.extRootTag,
                vRootTagList, opts.extSynTagNum, vSynTagList, opts.extIGLexCount, vIGWordList, opts.extFRef,
                opts.extTreeDepth, opts.extConstituentCnt)

    ## extracting and writing features
    ## remember to add new feature here
    extractFeatures(vlScore, vfOutput, vfTextInput, vfParse, vfInputPPL, vsetParserTrainingYields,
                    opts.extLength, opts.extUnseenNum, opts.extPPL, opts.extRootTag, vRootTagList,
                    opts.extSynTagNum, vSynTagList, opts.extIGLexCount, vIGWordList, opts.extFRef, vlFRef,
                    opts.extTreeDepth, opts.extConstituentCnt)


    # closing every file that was actually opened
    for vFile in (vfHeader, vfOutput, vfTextInput, vfParse, vfParserTrainText,
                  vfInputPPL, vfTopIGWords):
        if vFile is not None:
            vFile.close()

 
##======================================================================
## calling main
if __name__ == "__main__":
    # run only when executed as a script; the value returned by main() becomes
    # the exit status of the process
    vExitStatus = main()
    sys.exit(vExitStatus)