123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499 |
- #! /usr/bin/python
- ## This script extracts a set of features for parser accuracy prediction
- ## and output it into an ARFF file to be used in Weka machine learning
- ## toolkit.
- ##
- ## To extract all implemented features, option --all (-a) must be used.
- ## To select a subset of features, the corresponding option to each
- ## feature should be used.
- ##
- ## Option --all overrides individual feature options. If no individual
- ## feature option is provided, all features will be extracted.
- ##
- ## Current version: 1.4
- ## - The root and syntactic tag sets for the Tiger Treebank are updated.
- ## - Bug fixed for tags containing comma.
- ##
- ## Version: 1.3
- ## - extractTreeDepth() and extractConstituentCnt() were moved to parse.py module.
- ##
- ## Version: 1.2
- ## - Root and syntactic tags are customized for Treebanks.
- ## - The tag sets for the French Treebank and Tiger are added in addition to
- ## the existing WSJ tag set.
- ## - Accordingly, an option is added to specify the tag set to be used.
- ##
- ## Version: 1.1
- ## - UNSCORABLE and UNRERANKED tags are added to root tag set
- import sys, optparse, weka, evalbproc, parse
- ##----------------------------------------------------------------------
- ## extracts input sentence length feature
def extractLength(pSentence):
    """Return the number of whitespace-separated tokens in pSentence.

    Tokenisation uses str.split() with no argument, the same way
    extractUnseenNum() and extractIGLexCount() tokenise the sentence, so
    repeated, leading or trailing blanks do not produce phantom empty
    tokens and an empty sentence has length 0.
    """
    return len(pSentence.split())
- ##----------------------------------------------------------------------
- ## extracts number of words in parser test data unseen in parser training data
def extractUnseenNum(pSentence, pParserTrainingYields):
    """Count the tokens of pSentence that are absent from pParserTrainingYields.

    pSentence is split on whitespace; every occurrence of an unseen token is
    counted (repeated unseen words count repeatedly). pParserTrainingYields
    only needs to support the `in` operator.
    """
    return sum(1 for vToken in pSentence.split()
               if vToken not in pParserTrainingYields)
- ##----------------------------------------------------------------------
- ## extract the root tag of parse tree
def extractRootTag(pParseTree, pRootTagList):
    """Return the label of the node directly below TOP in a bracketed parse.

    The label is taken to start at character index 6, i.e. right after the
    literal prefix "(TOP (", and to run up to the next blank. A label that
    contains a comma is wrapped in double quotes before it is looked up and
    returned. The process is terminated through sys.exit() when the
    (possibly quoted) label is not a member of pRootTagList.
    """
    # everything up to the first blank after the "(TOP (" prefix is the label
    vLabel = pParseTree[6:].split(' ', 1)[0]
    if ',' in vLabel:
        vLabel = '"' + vLabel + '"'

    if vLabel in pRootTagList:
        return vLabel
    sys.exit("Unknown root tag: " + vLabel)
- ##----------------------------------------------------------------------
- ## extracts the number of syntactic tag seen in the parse tree
def extractSynTagNum(pParseTree, pSynTagList):
    """Return per-tag occurrence counts for the internal nodes of a parse tree.

    The result is a string holding one count per entry of pSynTagList, in
    list order, each followed by a comma (so the string ends with a comma
    and is empty for an empty tag list).

    Following Ravi et al. (2008), only INTERNAL node labels are counted,
    where internal means anything but the root: the text up to and including
    the second blank of pParseTree (the first two items of the split tree)
    is skipped before counting.
    """
    vFirstBlank = pParseTree.find(' ')
    vSecondBlankOffset = pParseTree[vFirstBlank + 1:].find(' ')
    vInternalTree = pParseTree[vFirstBlank + vSecondBlankOffset + 2:]

    vCounts = []
    for vTag in pSynTagList:
        # NOTE(review): a tag containing a comma is wrapped in single quotes
        # before being searched for, so it only matches trees that spell the
        # label with those quotes -- confirm against the parse file format.
        vSearchTag = "'" + vTag + "'" if ',' in vTag else vTag
        # The leading '(' restricts matches to node labels (not words of the
        # text that happen to look like a tag); the trailing blank prevents
        # e.g. NN from also matching NNP.
        vCounts.append(str(vInternalTree.count('(' + vSearchTag + ' ')) + ',')
    return ''.join(vCounts)
- ##----------------------------------------------------------------------
- ## extracts the number of occurrences of each top-IG ranked word in the input sentence
def extractIGLexCount(pSentence, pIGWordList):
    """Return how often each top-IG word occurs in the input sentence.

    The result is a string holding one count per entry of pIGWordList, in
    list order, each followed by a comma (so the string ends with a comma
    and is empty for an empty word list). Counting is done on the
    whitespace-separated tokens of pSentence, so only whole tokens match.
    """
    # tokenise once: the sentence does not change while looping over the words
    vInputTokens = pSentence.split()
    return ''.join(str(vInputTokens.count(vWord)) + ',' for vWord in pIGWordList)
- ##----------------------------------------------------------------------
- ## extracts features and writes into output ARFF file
def extractFeatures(pScores, pOutFile, pTxtInputFile, pParseFile, pInputPPLFile,
                    pParserTrainingYields, pLength, pUnseenNum, pPPL, pRootTag, pRootTagList,
                    pSynTagNum, pSynTagList, pIGLexCount, pIGWordList, pRef, pRefScores,
                    pTreeDepth, pConstituentCnt):
    """Write one comma-separated data row per score to pOutFile.

    For the i-th entry of pScores one line is read from each input file that
    a requested feature needs, and the row is written in this column order:
    length, unseen-word count, perplexity, root tag, syntactic tag counts,
    top-IG word counts, reference f-score, tree depth, constituent count and
    finally the score itself. A single empty line is written after the last
    row.

    Parameters:
      pScores               -- outcome values, one per instance
      pOutFile              -- writable file object for the data rows
      pTxtInputFile         -- tokenized sentences, one per line (read when
                               pLength, pUnseenNum or pIGLexCount is set)
      pParseFile            -- parse trees, one per line (read when pRootTag,
                               pSynTagNum, pTreeDepth or pConstituentCnt is set)
      pInputPPLFile         -- perplexity scores, one per line (read when
                               pPPL is set)
      pParserTrainingYields -- collection of words seen in parser training
      pRootTagList, pSynTagList, pIGWordList -- value sets of the
                               corresponding features
      pRefScores            -- reference f-scores, indexed by instance
      pLength ... pConstituentCnt -- flags selecting the features to extract

    The process is terminated through sys.exit() when one of the required
    inputs has fewer entries than pScores.
    """
    vNeedSentence = pLength or pUnseenNum or pIGLexCount
    # tree depth and constituent count need the parse tree as well; reading
    # it only for the root/syntactic tag features left it undefined for them
    vNeedParse = pRootTag or pSynTagNum or pTreeDepth or pConstituentCnt

    for vLoopCntr, vScore in enumerate(pScores):
        vLineNo = str(vLoopCntr + 1)

        if vNeedSentence:
            vLine = pTxtInputFile.readline()
            if not vLine:
                sys.exit("Text input file does not match input file in line number " + vLineNo)
            vSentence = vLine.splitlines()[0]

        if pLength:
            pOutFile.write(str(extractLength(vSentence)) + ',')

        if pUnseenNum:
            pOutFile.write(str(extractUnseenNum(vSentence, pParserTrainingYields)) + ',')

        if pPPL:
            vPPL = pInputPPLFile.readline()
            if not vPPL:
                sys.exit("Perplexity score file does not match input file in line number " + vLineNo)
            pOutFile.write(vPPL.splitlines()[0] + ',')

        if vNeedParse:
            vParseTree = pParseFile.readline()
            if not vParseTree:
                sys.exit("Parse file does not match input file in line number " + vLineNo)

        if pRootTag:
            pOutFile.write(extractRootTag(vParseTree, pRootTagList) + ',')

        # the tag and word count strings already end with a comma
        if pSynTagNum:
            pOutFile.write(extractSynTagNum(vParseTree, pSynTagList))

        if pIGLexCount:
            pOutFile.write(extractIGLexCount(vSentence, pIGWordList))

        if pRef:
            if vLoopCntr >= len(pRefScores):
                sys.exit("Reference score file does not match input file in line number " + vLineNo)
            pOutFile.write(str(pRefScores[vLoopCntr]) + ',')

        # new features

        if pTreeDepth:
            pOutFile.write(str(parse.extractTreeDepth(vParseTree)) + ',')

        if pConstituentCnt:
            pOutFile.write(str(parse.extractConstituentCnt(vParseTree)) + ',')

        # writing outcome (score) feature value
        pOutFile.write(str(vScore) + '\n')

    pOutFile.write('\n')
    return
- ##----------------------------------------------------------------------
- ## writes the header of the output file according to the header file provided
- ## ignores any attribute provided in the header file
- ## also writes the @DATA at the end
def writeHeader(pHeaderFile, pOutFile, pLength, pUnseenNum, pPPL, pRootTag, pRootTagList,
                pSynTagNum, pSynTagList, pIGLexCount, pIGWordList, pFRef,
                pTreeDepth, pConstituentCnt):
    """Write the ARFF header (relation, attribute declarations, @DATA) to pOutFile.

    Lines of pHeaderFile are copied verbatim up to and including the first
    line starting with @RELATION (case-insensitive); anything after it,
    e.g. attribute declarations, is ignored. One @ATTRIBUTE declaration is
    then written per requested feature, in the same order in which the data
    columns are produced, followed by the outcome attribute `fscore`, an
    empty line and the @DATA marker.

    Parameters:
      pHeaderFile -- iterable of header lines
      pOutFile    -- writable file object
      pRootTagList, pSynTagList, pIGWordList -- value sets used to declare
                     the root tag, syntactic tag and top-IG word attributes
      pLength ... pConstituentCnt -- flags selecting the declared features
    """
    for vHeaderLine in pHeaderFile:
        pOutFile.write(vHeaderLine)
        if vHeaderLine.upper().startswith("@RELATION"):
            break
    pOutFile.write('\n')

    # adding attribute declarations for features
    vDeclarations = []

    if pLength:
        vDeclarations.append("@ATTRIBUTE\tlength\t\t\tinteger\n")

    if pUnseenNum:
        vDeclarations.append("@ATTRIBUTE\tunseennum\t\tinteger\n")

    if pPPL:
        vDeclarations.append("@ATTRIBUTE\tperplexity\t\treal\n")

    if pRootTag:
        # nominal attribute: the allowed values are listed between braces
        vDeclarations.append("@ATTRIBUTE\troottag\t\t\t{")
        vDeclarations.append(','.join(pRootTagList) + "}\n")

    if pSynTagNum:
        for vTag in pSynTagList:
            vDeclarations.append("@ATTRIBUTE\t\"syntag" + vTag + "\"\t\tinteger\n")

    if pIGLexCount:
        for vWord in pIGWordList:
            # a word containing a double quote is quoted with single quotes
            if '"' in vWord:
                vDeclarations.append("@ATTRIBUTE\t'iglex" + vWord + "'\t\tinteger\n")
            else:
                vDeclarations.append("@ATTRIBUTE\t\"iglex" + vWord + "\"\t\tinteger\n")

    if pFRef:
        vDeclarations.append("@ATTRIBUTE\tfref\t\treal\n")

    # new features

    if pTreeDepth:
        vDeclarations.append("@ATTRIBUTE\ttreedepth\t\tinteger\n")

    if pConstituentCnt:
        vDeclarations.append("@ATTRIBUTE\tconstcnt\t\tinteger\n")

    # adding attribute declarations for outcome (score)
    vDeclarations.append("@ATTRIBUTE\tfscore\t\treal\n")

    vDeclarations.append('\n')
    vDeclarations.append("@DATA\n")
    pOutFile.writelines(vDeclarations)
- ##======================================================================
- ## main
def main(argv=None):
    """Command-line entry point: extract the requested features into an ARFF file.

    Parameters:
      argv -- full argument vector including the program name at index 0;
              defaults to sys.argv.

    Two positional arguments are required: the ARFF header file and the
    output file name (".arff" is appended to it). The outcome values come
    either from an evalb score file (-s) or, for unlabeled data, from a
    sample size (-n), in which case every instance gets the placeholder
    outcome -1. If neither --all nor any individual feature option is given,
    all features are extracted.

    Usage errors are reported through the option parser; missing or
    unreadable input files terminate the process through sys.exit(). Every
    file opened here is closed again before the function is left, including
    on those error paths. Returns None.
    """
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="%prog <ARFF FILE HEADER> <OUTPUT FILE NAME> [OPTIONS]" +
        "\nExtracts a set of features for parser accuracy prediction and output it into an ARFF file to be used in Weka machine learning toolkit.", version="%prog 1.4")
    parser.add_option("-s", "--scores", help="parser evalb score file", metavar="PARSER EVALB SCORE FILE", dest="scoreFileName", action="store")
    # type="int" lets the option parser reject a non-numeric sample size
    parser.add_option("-n", "--samplesize", help="number of instances if score file is not specified (for unlabeled instances)", metavar="SAMPLE SIZE", dest="sampleSize", action="store", type="int")
    parser.add_option("-a", "--all", help="extract all implemented features (overrides all other individual feature options)", dest="extAll", action="store_true")
    parser.add_option("--length", help="extract input sentence word length as feature", dest="extLength", action="store_true")
    parser.add_option("--unseennum", help="extract number of words in the input data unseen in parser training data as feature", dest="extUnseenNum", action="store_true")
    parser.add_option("-t", "--textinput", help="tokenized raw text input data set", metavar="TOKENIZED RAW TEXT INPUT DATA", dest="txtInputFileName", action="store")
    parser.add_option("--tagset", help="tag set of input parses: wsj, ftb, tiger", metavar="PARSER TAG SET", default="wsj", dest="tagSet", action="store")
    parser.add_option("--roottag", help="extract the root tag of parse tree as feature", dest="extRootTag", action="store_true")
    parser.add_option("--syntagnum", help="extract the number of syntactic tags seen in the parse tree as feature", dest="extSynTagNum", action="store_true")
    parser.add_option("-p", "--parse", help="parsed input data set", metavar="PARSED INPUT DATA", dest="parseFileName", action="store")
    parser.add_option("--perplexity", help="extract the perplexity of each sentence with respect to the parser training data (--ptrain) as feature", dest="extPPL", action="store_true")
    parser.add_option("-P", "--pplscores", help="perplexity scores for the input sentences", metavar="PERPLEXITY SCORES", dest="inputPPLFileName", action="store")
    parser.add_option("-T", "--ptraintext", help="tokenized raw training data set of the parser", metavar="TOKENIZED RAW TEXT PARSER TRAINING DATA", dest="txtParserTrainFileName", action="store")
    parser.add_option("--iglexcount", help="extract the number of top-ig words seen in the parse tree as feature", dest="extIGLexCount", action="store_true")
    parser.add_option("-g", "--igwords", help="top-ig words list file", metavar="TOP IG WORDS FILE", dest="topIGWordsFileName", action="store")
    parser.add_option("--fref", help="extract f-score of the input parse with respect to a reference parser output as feature", dest="extFRef", action="store_true")
    parser.add_option("-r", "--refscores", help="evalb score file of reference parser", metavar="REFERENCE PARSER EVALB SCORE FILE", dest="refEvalFileName", action="store")
    parser.add_option("--treedepth", help="tree depth", metavar="TREE DEPTH", dest="extTreeDepth", action="store_true")
    parser.add_option("--constcount", help="number of syntactic constituents", metavar="CONSTITUENT COUNT", dest="extConstituentCnt", action="store_true")
    # parse the vector that was handed in, not unconditionally sys.argv
    (opts, posArgs) = parser.parse_args(argv[1:])

    # processing arguments

    if len(posArgs) < 2:
        parser.error("At least 2 arguments are required!")

    ## WSJ: The set is extracted from WSJ by Jennifer plus 1 UNPARSED tag for
    ##      those sentences which could not be parsed, 1 UNSCORABLE tag for those
    ##      sentences which could not be scored, and 1 UNRERANKED tag for those
    ##      sentences which could not be re-ranked.
    ## FTB: The set is extracted from FTB plus the above additional tags and X,
    ##      plus some found in the Berkeley output (trained on FTB).
    ##      The only root tag found in FTB (train+dev+test) was SENT.
    ## TIGER: The set is extracted from Tiger plus the above additional tags,
    ##      those found in Tiger.penn, and X.
    cRootTagList = [('WSJ', ['UNPARSED', 'UNSCORABLE', 'UNRERANKED', 'ADJP', 'ADVP', 'CONJP', 'FRAG', 'INTJ', 'LS', 'LST', 'NAC', 'NP', 'NX', 'PP', 'PRN', 'PRT', 'QP', 'RRC', 'S', 'SBAR', 'SBARQ', 'SINV', 'SQ', 'UCP', 'VP', 'WHADJP', 'WHADVP', 'WHNP', 'WHPP', 'X']),
                    ('FTB', ['UNPARSED', 'UNSCORABLE', 'UNRERANKED', 'SENT', 'NP', 'PP', 'COORD', 'VPinf', 'X']),
                    ('TIGER', ['UNPARSED', 'UNSCORABLE', 'UNRERANKED', '$', '$.', '"$,"', 'ADJA', 'ADJD', 'ADV', 'AP', 'ART', 'AVP', 'CAP', 'CARD', 'CAVP', 'CH', 'CNP', 'CO', 'CPP', 'CS', 'CVP', 'DL', 'ITJ', 'KON', '$*LRB*', 'NE', 'NN', 'NP', 'PN', 'PP', 'PTKANT', 'PWAV', 'PWS', 'S', 'VMFIN', 'VP', 'VVPP', 'XY', 'X'])]

    ## WSJ: The set is extracted from WSJ by Jennifer plus AUX and AUXJ used by
    ##      Brown.
    ## FTB: The set is extracted from FTB plus 'X'.
    ## TIGER: The set is extracted from Tiger.penn (727!) plus 'X'.
    cSynTagList = [('WSJ', ['#', '$', "''", ',', '.', ':', 'ADJP', 'ADVP', 'AUX', 'AUXJ', 'CC', 'CD', 'CONJP', 'DT', 'EX', 'FRAG', 'FW', 'IN', 'INTJ', 'JJ', 'JJR', 'JJS', 'LS', 'LST', 'MD', 'NAC', 'NN', 'NNP', 'NNPS', 'NNS', 'NP', 'NX', 'PDT', 'POS', 'PP', 'PRN', 'PRP', 'PRP$', 'PRT', 'PRT|ADVP', 'QP', 'RB', 'RBR', 'RBS', 'RP', 'RRC', 'S', 'SBAR', 'SBARQ', 'SINV', 'SQ', 'SYM', 'TO', 'UCP', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'VP', 'WDT', 'WHADJP', 'WHADVP', 'WHNP', 'WHPP', 'WP', 'WP$', 'WRB', 'X', '``']),
                   ('FTB', ['ADJ', 'ADJWH', 'AdP', 'ADV', 'ADVWH', 'AP', 'CC', 'CLO', 'CLR', 'CLS', 'COORD', 'CS', 'DET', 'DETWH', 'ET', 'I', 'NC', 'NP', 'NPP', 'P', 'P+D', 'PONCT', 'PP', 'P+PRO', 'PREF', 'PRO', 'PROREL', 'PROWH', 'SENT', 'Sint', 'Srel', 'Ssub', 'V', 'VIMP', 'VINF', 'VN', 'VPinf', 'VPP', 'VPpart', 'VPR', 'VS', 'X']),
                   ('TIGER', ['$,', "'$.'", 'AA-CJ', 'AA-HD', 'AA-MO', 'AA-NK', 'AA-PD', 'ADJA', 'ADJA-ADC', 'ADJA-CJ', 'ADJA-HD', 'ADJA-MO', 'ADJA-NK', 'ADJA-NMC', 'ADJA-OA', 'ADJA-PD', 'ADJA-PNC', 'ADJA-SB', 'ADJD', 'ADJD-APP', 'ADJD-CJ', 'ADJD-CVC', 'ADJD-HD', 'ADJD-MNR', 'ADJD-MO', 'ADJD-NK', 'ADJD-OA', 'ADJD-PAR', 'ADJD-PD',
                        'ADJD-PH', 'ADJD-PNC', 'ADJD-SB', 'ADJD-UC', 'ADV', 'ADV-AC', 'ADV-APP', 'ADV-AVC', 'ADV-CD', 'ADV-CJ', 'ADV-DH', 'ADV-DM', 'ADV-HD', 'ADV-JU', 'ADV-MNR', 'ADV-MO', 'ADV-NG', 'ADV-NK', 'ADV-OA', 'ADV-OP', 'ADV-PAR', 'ADV-PH', 'ADV-PNC', 'ADV-SVP', 'ADV-UC', 'AP', 'AP-AG', 'AP-AMS', 'AP-APP', 'AP-CC',
                        'AP-CJ', 'AP-HD', 'AP-MNR', 'AP-MO', 'AP-NG', 'AP-NK', 'AP-NMC', 'AP-OA', 'AP-OC', 'AP-PAR', 'AP-PD', 'AP-PNC', 'APPO-AC', 'APPR', 'APPR-AC', 'APPRART', 'APPRART-AC', 'APPRART-CJ', 'APPRART-PNC', 'APPR-AVC', 'APPR-CD', 'APPR-CJ', 'APPR-CP', 'APPR-HD', 'APPR-MO', 'APPR-PNC', 'APPR-UC', 'AP-RE', 'AP-RS',
                        'AP-SB', 'APZR-AC', 'APZR-MO', 'ART', 'ART-CJ', 'ART-HD', 'ART-MO', 'ART-NK', 'ART-NMC', 'ART-PNC', 'AVP', 'AVP-APP', 'AVP-CC', 'AVP-CD', 'AVP-CJ', 'AVP-CP', 'AVP-CVC', 'AVP-DM', 'AVP-HD', 'AVP-JU', 'AVP-MNR', 'AVP-MO', 'AVP-NG', 'AVP-NK', 'AVP-OA', 'AVP-OC', 'AVP-OP', 'AVP-PAR', 'AVP-PD', 'AVP-PH',
                        'AVP-PNC', 'AVP-RE', 'AVP-RS', 'AVP-SB', 'CAC-AC', 'CAC-MO', 'CAP', 'CAP-APP', 'CAP-CJ', 'CAP-HD', 'CAP-MNR', 'CAP-MO', 'CAP-NK', 'CAP-NMC', 'CAP-OA', 'CAP-PAR', 'CAP-PD', 'CAP-PNC', 'CAP-RE', 'CAP-SB', 'CARD', 'CARD-AMS', 'CARD-APP', 'CARD-CJ', 'CARD-HD', 'CARD-MNR', 'CARD-MO', 'CARD-NK', 'CARD-NMC',
                        'CARD-OA', 'CARD-OP', 'CARD-PAR', 'CARD-PD', 'CARD-PNC', 'CARD-SB', 'CARD-UC', 'CAVP', 'CAVP-AC', 'CAVP-CJ', 'CAVP-HD', 'CAVP-MNR', 'CAVP-MO', 'CAVP-NG', 'CAVP-NK', 'CAVP-PAR', 'CAVP-PD', 'CAVP-SVP', 'CCP-CP', 'CH', 'CH-APP', 'CH-CJ', 'CH-CM', 'CH-DM', 'CH-HD', 'CH-MNR', 'CH-MO', 'CH-NK', 'CH-OA',
                        'CH-OC', 'CH-PAR', 'CH-PD', 'CH-PNC', 'CH-RE', 'CH-RS', 'CH-SB', 'CNP', 'CNP-AG', 'CNP-AMS', 'CNP-APP', 'CNP-CC', 'CNP-CJ', 'CNP-DA', 'CNP-DH', 'CNP-HD', 'CNP-MNR', 'CNP-MO', 'CNP-NK', 'CNP-NMC', 'CNP-OA', 'CNP-OC', 'CNP-OG', 'CNP-PAR', 'CNP-PD', 'CNP-PH', 'CNP-PNC', 'CNP-RE', 'CNP-RS', 'CNP-SB',
                        'CNP-VO', 'CO', 'CO-AG', 'CO-APP', 'CO-CD', 'CO-CJ', 'CO-CP', 'CO-HD', 'CO-MNR', 'CO-MO', 'CO-NK', 'CO-OA', 'CO-OC', 'CO-PAR', 'CO-PD', 'CO-PG', 'CO-PNC', 'CO-RE', 'CO-RS', 'CO-SB', 'CPP', 'CPP-APP', 'CPP-CC', 'CPP-CJ', 'CPP-CVC', 'CPP-MNR', 'CPP-MO', 'CPP-NK', 'CPP-OP', 'CPP-PAR', 'CPP-PG', 'CPP-PNC',
                        'CPP-RE', 'CPP-SBP', 'CS', 'CS-APP', 'CS-CC', 'CS-CJ', 'CS-DH', 'CS-HD', 'CS-MNR', 'CS-MO', 'CS-NK', 'CS-OC', 'CS-PAR', 'CS-PD', 'CS-PNC', 'CS-RC', 'CS-RE', 'CS-RS', 'CS-SB', 'CVP', 'CVP-APP', 'CVP-CC', 'CVP-CJ', 'CVP-HD', 'CVP-MNR', 'CVP-MO', 'CVP-NK', 'CVP-OC', 'CVP-PAR', 'CVP-PD', 'CVP-RE', 'CVP-SB',
                        'CVZ-HD', 'CVZ-OA', 'CVZ-OC', 'DL', 'DL-CJ', 'DL-MO', 'DL-PAR', 'FM-AC', 'FM-ADC', 'FM-APP', 'FM-AVC', 'FM-CJ', 'FM-MO', 'FM-NK', 'FM-OA', 'FM-OC', 'FM-OP', 'FM-PD', 'FM-PNC', 'FM-SB', 'FM-UC', 'ISU-CJ', 'ISU-MO', 'ISU-NK', 'ITJ', 'ITJ-DM', 'ITJ-MO', 'ITJ-NK', 'ITJ-OA', 'ITJ-UC', 'ITJ-VO', 'KOKOM-AC',
                        'KOKOM-CC', 'KOKOM-CD', 'KOKOM-CM', 'KOKOM-CP', 'KOKOM-MO', 'KOKOM-UC', 'KON', 'KON-AVC', 'KON-CD', 'KON-CJ', 'KON-DH', 'KON-HD', 'KON-JU', 'KON-MO', 'KON-PNC', 'KON-UC', 'KOUI-AC', 'KOUI-AVC', 'KOUI-CP', 'KOUS-AC', 'KOUS-AVC', 'KOUS-CJ', 'KOUS-CP', 'KOUS-HD', 'KOUS-MO', 'KOUS-PH', 'KOUS-PNC', '$*LRB*',
                        '$*LRB*-PNC', 'MTA-CJ', 'MTA-NK', 'NE', 'NE-AC', 'NE-ADC', 'NE-AG', 'NE-APP', 'NE-CJ', 'NE-DA', 'NE-DH', 'NE-DM', 'NE-MNR', 'NE-MO', 'NE-NK', 'NE-OA', 'NE-OC', 'NE-OP', 'NE-PAR', 'NE-PD', 'NE-PNC', 'NE-RE', 'NE-RS', 'NE-SB', 'NE-UC', 'NE-VO', 'NM', 'NM-AMS', 'NM-APP', 'NM-CC', 'NM-CJ', 'NM-HD', 'NM-MNR',
                        'NM-MO', 'NM-NK', 'NM-OA', 'NM-PAR', 'NM-PD', 'NM-SB', 'NN', 'NN-ADC', 'NN-AG', 'NN-AMS', 'NN-APP', 'NN-CJ', 'NN-DA', 'NN-DH', 'NN-DM', 'NNE-NK', 'NN-HD', 'NN-MNR', 'NN-MO', 'NN-NK', 'NN-NMC', 'NN-OA', 'NN-OA2', 'NN-OC', 'NN-OG', 'NN-OP', 'NN-PAR', 'NN-PD', 'NN-PNC', 'NN-RE', 'NN-SB', 'NN-UC', 'NN-VO',
                        'NP', 'NP-AG', 'NP-AMS', 'NP-APP', 'NP-CC', 'NP-CJ', 'NP-CVC', 'NP-DA', 'NP-DH', 'NP-MNR', 'NP-MO', 'NP-NK', 'NP-NMC', 'NP-OA', 'NP-OA2', 'NP-OC', 'NP-OG', 'NP-OP', 'NP-PAR', 'NP-PD', 'NP-PG', 'NP-PH', 'NP-PNC', 'NP-RE', 'NP-RS', 'NP-SB', 'NP-SP', 'NP-UC', 'NP-VO', 'PDAT-AG', 'PDAT-CJ', 'PDAT-DA', 'PDAT-HD',
                        'PDAT-NK', 'PDAT-SB', 'PDS-AG', 'PDS-APP', 'PDS-CJ', 'PDS-DA', 'PDS-NK', 'PDS-OA', 'PDS-OA2', 'PDS-OG', 'PDS-PD', 'PDS-PH', 'PDS-SB', 'PDS-SP', 'PIAT-AG', 'PIAT-CJ', 'PIAT-HD', 'PIAT-MO', 'PIAT-NK', 'PIAT-NMC', 'PIAT-PNC', 'PIAT-SB', 'PIAT-UC', 'PIS-AG', 'PIS-APP', 'PIS-CJ', 'PIS-DA', 'PIS-HD', 'PIS-MNR',
                        'PIS-MO', 'PIS-NK', 'PIS-OA', 'PIS-OA2', 'PIS-OG', 'PIS-PD', 'PIS-PH', 'PIS-RE', 'PIS-SB', 'PIS-UC', 'PN', 'PN-AG', 'PN-APP', 'PN-CJ', 'PN-DA', 'PN-DH', 'PN-MO', 'PN-NK', 'PN-OA', 'PN-OC', 'PN-PAR', 'PN-PD', 'PN-PNC', 'PN-RE', 'PN-RS', 'PN-SB', 'PN-UC', 'PP', 'PP-AC', 'PP-AG', 'PP-AMS', 'PP-APP', 'PP-CC',
                        'PP-CJ', 'PP-CVC', 'PP-DH', 'PPER', 'PPER-APP', 'PPER-CJ', 'PPER-DA', 'PPER-EP', 'PPER-NK', 'PPER-OA', 'PPER-OA2', 'PPER-OG', 'PPER-OP', 'PPER-PAR', 'PPER-PD', 'PPER-PH', 'PPER-SB', 'PPER-UC', 'PPER-VO', 'PP-HD', 'PP-MNR', 'PP-MO', 'PP-NK', 'PP-OC', 'PP-OP', 'PPOSAT-DA', 'PPOSAT-NK', 'PPOSS-NK', 'PPOSS-SB',
                        'PP-PAR', 'PP-PD', 'PP-PG', 'PP-PNC', 'PP-RE', 'PP-RS', 'PP-SB', 'PP-SBP', 'PP-SVP', 'PP-UC', 'PRELAT-AG', 'PRELAT-NK', 'PRELS-DA', 'PRELS-MO', 'PRELS-NK', 'PRELS-OA', 'PRELS-OG', 'PRELS-PD', 'PRELS-PH', 'PRELS-SB', 'PRF-CJ', 'PRF-DA', 'PRF-MO', 'PRF-NK', 'PRF-OA', 'PRF-SB', 'PROAV-AC', 'PROAV-CD', 'PROAV-CJ',
                        'PROAV-CVC', 'PROAV-HD', 'PROAV-MNR', 'PROAV-MO', 'PROAV-NK', 'PROAV-OP', 'PROAV-PAR', 'PROAV-PD', 'PROAV-PG', 'PROAV-PH', 'PROAV-SBP', 'PTKA-AC', 'PTKA-HD', 'PTKA-MO', 'PTKANT', 'PTKANT-CJ', 'PTKANT-DM', 'PTKANT-HD', 'PTKANT-MO', 'PTKANT-NK', 'PTKANT-OA', 'PTKANT-OC', 'PTKANT-RS', 'PTKA-PM', 'PTKNEG-CJ',
                        'PTKNEG-HD', 'PTKNEG-MO', 'PTKNEG-NG', 'PTKVZ', 'PTKVZ-CJ', 'PTKVZ-MNR', 'PTKVZ-MO', 'PTKVZ-OA', 'PTKVZ-SVP', 'PTKZU', 'PTKZU-MO', 'PTKZU-PM', 'PTKZU-SVP', 'PWAT-AG', 'PWAT-CJ', 'PWAT-HD', 'PWAT-MO', 'PWAT-NK', 'PWAV', 'PWAV-CJ', 'PWAV-CM', 'PWAV-CP', 'PWAV-CVC', 'PWAV-HD', 'PWAV-MO', 'PWAV-NK', 'PWAV-OA',
                        'PWAV-OP', 'PWAV-PD', 'PWAV-RC', 'PWAV-RE', 'PWAV-SBP', 'PWS', 'PWS-CJ', 'PWS-DA', 'PWS-MO', 'PWS-NK', 'PWS-OA', 'PWS-OA2', 'PWS-PD', 'PWS-PH', 'PWS-SB', 'PWS-UC', 'S', 'S-APP', 'S-CC', 'S-CJ', 'S-DH', 'S-MNR', 'S-MO', 'S-NK', 'S-OA', 'S-OC', 'S-PAR', 'S-PD', 'S-PNC', 'S-RC', 'S-RE', 'S-RS', 'S-SB', 'S-SP',
                        'TOP', 'TRUNC-ADC', 'TRUNC-CJ', 'TRUNC-HD', 'TRUNC-MO', 'TRUNC-NK', 'TRUNC-OC', 'TRUNC-PD', 'TRUNC-PNC', '$.-UC', 'VAFIN', 'VAFIN-HD', 'VAFIN-UC', 'VAIMP-HD', 'VAINF-HD', 'VAINF-OC', 'VAPP-HD', 'VAPP-OC', 'VMFIN', 'VMFIN-HD', 'VMINF-HD', 'VMPP-HD', 'VMPP-PD', 'VP', 'VP-APP', 'VP-CC', 'VP-CJ', 'VP-HD', 'VP-MNR',
                        'VP-MO', 'VP-NK', 'VP-OA', 'VP-OC', 'VP-PAR', 'VP-PD', 'VP-PNC', 'VP-RC', 'VP-RE', 'VP-RS', 'VP-SB', 'VVFIN', 'VVFIN-CJ', 'VVFIN-HD', 'VVFIN-NK', 'VVFIN-OC', 'VVFIN-UC', 'VVIMP-CJ', 'VVIMP-HD', 'VVIMP-MO', 'VVINF', 'VVINF-CJ', 'VVINF-CVC', 'VVINF-HD', 'VVINF-NK', 'VVINF-OC', 'VVINF-PNC', 'VVINF-RE', 'VVINF-SB',
                        'VVINF-UC', 'VVIZU-CC', 'VVIZU-CJ', 'VVIZU-HD', 'VVIZU-OC', 'VVIZU-RE', 'VVPP', 'VVPP-CJ', 'VVPP-HD', 'VVPP-MO', 'VVPP-NK', 'VVPP-OC', 'VVPP-PAR', 'VVPP-PD', 'VVPP-PNC', 'VVPP-SB', 'VZ-CJ', 'VZ-HD', 'VZ-MNR', 'VZ-MO', 'VZ-NK', 'VZ-OC', 'VZ-PD', 'VZ-RE', 'XY', 'XY-APP', 'XY-CJ', 'XY-MNR', 'XY-MO', 'XY-NK', 'XY-OP',
                        'XY-PAR', 'XY-PNC', 'XY-UC', 'X'])]

    # fetching the root and syntactic tag sets based on user option; an
    # unknown name is a usage error rather than an IndexError
    vTagSetName = opts.tagSet.upper()
    vdRootTagLists = dict(cRootTagList)
    vdSynTagLists = dict(cSynTagList)
    if vTagSetName not in vdRootTagLists or vTagSetName not in vdSynTagLists:
        parser.error("Unknown tag set: " + opts.tagSet)
    vRootTagList = vdRootTagLists[vTagSetName]
    vSynTagList = vdSynTagLists[vTagSetName]

    # every successfully opened file is registered here so that all of them
    # are closed on the way out, whichever path leaves the function
    vlOpenFiles = []

    def openOrExit(pFileName, pMode, pErrorPrefix):
        """Open pFileName and register it for closing; exit with pErrorPrefix + name on failure."""
        try:
            vFile = open(pFileName, pMode)
        except IOError:
            sys.exit(pErrorPrefix + pFileName)
        vlOpenFiles.append(vFile)
        return vFile

    try:
        # opening header file
        vfHeader = openOrExit(posArgs[0], 'r', 'Can\'t open header file: ')
        # creating output file
        vfOutput = openOrExit(posArgs[1] + '.arff', 'w', 'Can\'t create output file: ')

        # processing options

        ## loading parse scores (outcome) into memory
        ## Note that for unlabeled test data, this is not given. Instead, the
        ## sample set size (-n or --samplesize option) must be provided. In this
        ## case, -1 is an outcome template for each instance.
        if opts.scoreFileName is not None:
            vlScores = evalbproc.extractSentenceScores(opts.scoreFileName, False, False, False, False)
            vlScore = [vScore.fscore for vScore in vlScores]
        elif opts.sampleSize is not None:
            vlScore = [-1] * opts.sampleSize
        else:
            sys.exit('Either sample size or parse score file should be provided!')

        ## processing feature options (if no option, --all will be set)
        ## remember to add new feature options here
        if not (opts.extAll or opts.extLength or opts.extUnseenNum or opts.extPPL or
                opts.extRootTag or opts.extSynTagNum or opts.extIGLexCount or opts.extFRef or
                opts.extTreeDepth or opts.extConstituentCnt):
            opts.extAll = True

        ## overriding individual feature options if --all is set (i.e. all
        ## individual feature options are considered set to be extracted)
        ## remember to add new feature options here
        if opts.extAll:
            opts.extLength = True
            opts.extUnseenNum = True
            opts.extPPL = True
            opts.extRootTag = True
            opts.extSynTagNum = True
            opts.extIGLexCount = True
            opts.extFRef = True
            opts.extTreeDepth = True
            opts.extConstituentCnt = True

        ## opening raw text file, if any related feature is requested
        ## remember to add new such features here
        vfTextInput = None
        if opts.extLength or opts.extUnseenNum or opts.extIGLexCount:
            if opts.txtInputFileName is None:
                sys.exit('Text input file is not provided!')
            vfTextInput = openOrExit(opts.txtInputFileName, 'r', 'Can\'t open text input file: ')

        ## opening parse file, if any related feature is requested
        ## remember to add new such features here
        vfParse = None
        if opts.extRootTag or opts.extSynTagNum or opts.extTreeDepth or opts.extConstituentCnt:
            if opts.parseFileName is None:
                sys.exit('Parse file is not provided!')
            vfParse = openOrExit(opts.parseFileName, 'r', 'Can\'t open parse file: ')

        ## if any feature related to raw text of parser training data is
        ## requested, load the set of its words into memory first
        ## remember to add new such features here
        vsetParserTrainingYields = set()
        if opts.extUnseenNum:
            if opts.txtParserTrainFileName is None:
                sys.exit('Parser training text file is not provided!')
            vfParserTrainText = openOrExit(opts.txtParserTrainFileName, 'r',
                                           'Can\'t open parser training text file: ')
            # line boundaries are whitespace too, so one split yields every word
            vsetParserTrainingYields.update(vfParserTrainText.read().split())

        ## opening perplexity scores file for the input sentences, if any
        ## related feature is requested
        vfInputPPL = None
        if opts.extPPL:
            if opts.inputPPLFileName is None:
                sys.exit('Perplexity scores file for the input sentences is not provided!')
            vfInputPPL = openOrExit(opts.inputPPLFileName, 'r',
                                    'Can\'t open perplexity scores file for the input sentences: ')

        # extracting top-IG words (one word per line of the file)
        vIGWordList = []
        if opts.extIGLexCount:
            if opts.topIGWordsFileName is None:
                sys.exit('Top IG-based ranked words file is not provided!')
            vfTopIGWords = openOrExit(opts.topIGWordsFileName, 'r',
                                      'Can\'t open top IG-based ranked words file: ')
            vIGWordList = vfTopIGWords.read().splitlines()

        # loading the f-scores of the reference parser
        vlFRef = []
        if opts.extFRef:
            if opts.refEvalFileName is None:
                sys.exit('Evalb score file of reference parser is not provided!')
            vRefScores = evalbproc.extractSentenceScores(opts.refEvalFileName, False, False, False, False)
            vlFRef = [vScore.fscore for vScore in vRefScores]

        ## writing output file header
        ## remember to add new feature here
        writeHeader(vfHeader, vfOutput, opts.extLength, opts.extUnseenNum, opts.extPPL, opts.extRootTag,
                    vRootTagList, opts.extSynTagNum, vSynTagList, opts.extIGLexCount, vIGWordList, opts.extFRef,
                    opts.extTreeDepth, opts.extConstituentCnt)

        ## extracting and writing features
        ## remember to add new feature here
        extractFeatures(vlScore, vfOutput, vfTextInput, vfParse, vfInputPPL, vsetParserTrainingYields,
                        opts.extLength, opts.extUnseenNum, opts.extPPL, opts.extRootTag, vRootTagList,
                        opts.extSynTagNum, vSynTagList, opts.extIGLexCount, vIGWordList, opts.extFRef, vlFRef,
                        opts.extTreeDepth, opts.extConstituentCnt)
    finally:
        for vFile in vlOpenFiles:
            vFile.close()
-
- ##======================================================================
- ## calling main
# Run the extractor only when this file is executed as a script; whatever
# main() returns is handed to sys.exit() as the process exit status.
if __name__ == "__main__":
    vExitStatus = main()
    sys.exit(vExitStatus)
|