123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601 |
- #! /usr/bin/python
- # -*- coding: utf-8 -*-
- """
- This module defines classes related to constituency parsing.
-
- Version 1.5 (06-Jul-2016)
- - lowerTerminals() is added to ConstTree and ConstNode.
-
- Version 1.4 (04-May-2016)
- - A bug in stratifyRules() of ConstNode is fixed related to inconsistent
- naming changes. Also creating a new node in stratifyPartialRule()
- is now done using _getNewNode() method.
-
- Version 1.3 (19-Feb-2016)
- - getPathTo() is added to ConstNode.
-
- Version 1.2 (19-Nov-2015 to 24-Nov-2015)
- - deepCopy() is added to ConstTree.
- - deepCopy() ans shallowCopy() of ConstNode are edited to enable inherited
- classes to create new nodes of their own type (by using _getNewNode()).
- - SRL attribute is added to the constructor of ConstTree.
-
- Version 1.1 (28-May-2015 to 10-Jun-2015)
- - removeSuffix() and hasSuffixedLabel() are moved from ConstTree and
- ConstNode to ForeeConstTree and ForeeConstNode in foreebank.py.
- - getRightSibling() and getLeftSibling() are added to ConstNode.
- - addChild() and replaceChild() of ConstNode are changed to set the
- parent of the added child regardless of it being done by the calling
- code.
- - delChild() is changed to set the parent of the deleted child to None.
- - removeFunctionTags() is added to ConstTree and ConstNode.
- - ConstParseLoader is moved here from dloader.py.
- - _createNewTree() is added to ConstParseLoader() and the changes are
- made accordingy to allow the child classes create their corresponding
- tree class instances.
- - getRoot(), getRootLabel(), getPreTerminals() are added to ConstTree.
- - getSynLabels() is added to ConstTree and ConstNode.
-
- Version 1.0 (25-May-2015)
- - ConstParses, ConstParse, ConstTree, ConstNode, BConstParser are moved
- here from dloader.py.
- - getTerminalNodes() is added to ConstTree and ConstNode.
- """
- import re, copy
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # ConstParses
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- class ConstParses:
- '''
- N-best constituency parses of a sentences
-
- A ConstParses contains the list of n-best parses, each an instance of
- ConstParse, together with other derails such as parsing time.
- '''
-
-
- def __init__(self):
- '''
- Constructor
- '''
-
- self.parses = []
- self.time = None
-
-
-
- @property
- def tree(self):
- '''
- Returns the topmost parse tree in the n-best list
- '''
-
- return self.parses[0].tree
-
-
-
- @property
- def logProb(self):
- '''
- Returns the log probability of the topmost parse in the n-best list
- '''
-
- return self.parses[0].logProb
-
-
-
- @property
- def prob(self):
- '''
- Returns the probability of the topmost parse in the n-best list
- '''
-
- if self.parses[0].logProb == None:
- return None
- else:
- return math.exp(self.parses[0].logProb)
-
-
-
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # ConstParse
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- class ConstParse:
- '''
- Constituency parse
-
- A ConstParse contains a constituency parse tree of type ConstTree,
- together with their log probability.
- '''
-
-
- def __init__(self, pConstTree, pParseLogProb = None):
- '''
- Constructor
- '''
-
- self.tree = pConstTree
- self.logProb = pParseLogProb
-
-
-
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # ConstTree
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- class ConstTree:
- '''
- Constituency tree
-
- A ConstTree is the constituency parse tree of a sentence where each
- constituent is represented by a ConstNode
-
- Pre-terminals (POS tags) and terminal (tokens) can optionally be in
- in one node or in separate nodes, where the pre-terminal is the parent
- of terminal.
-
- Note that the constituency tree is implemented independent of any format
- like CoNLLSentence to be used more generally.
-
- Note that as of version 1.6 of this module, ConstTree is implemented
- differently from DepTree. A DepTree is a list of all its nodes, while
- a ConstTree is only root node storing the direct children only.
- '''
-
-
- def __init__(self):
- '''
- Constructor
- '''
-
- self.root = None
- ## set to true if the terminal is expanded into pre-terminal and
- ## terminal nodes
- self.expandedTerminal = False
-
- self.srl = None
-
-
-
- def deepCopy(self):
- '''
- NOTE: it seems python deepcopy() works better. Try the idea before
- using this method.
-
- Created a deep copy of the tree.
- '''
-
- vCopyTree = self._createNewTree()
- vCopyTree.root = self.root.deepCopy(pflgCopyTree = True)
-
- vCopyTree.srl = self.srl
-
- return vCopyTree
-
-
-
- def _createNewTree(self):
- '''
- Creates and returns a new ConstTree
-
- This method is useful in class inheriting.
- '''
-
- return ConstTree()
-
-
-
- def __str__(self):
- '''
- String representation of the class
- '''
-
- return self.getPTBFormat()
-
-
-
- def loadPTBTree(self, pPTBTree, pflgExpandTerminal = False):
- '''
- Creates the tree by loading from Penn Treebank style tree.
-
- Note that TOP, ROOT and empty are considered tree dummy root node
- labels and will not be added as nodes.
-
- If pflgExpandTerminal is set to true, the terminal node will be
- expanded into a pre-terminal (POS tag) and terminal (token) nodes.
- '''
-
- # taking out the main tree
- if pPTBTree.startswith("( ("):
- vPTBTree = pPTBTree.strip()[2:-1]
- elif pPTBTree.upper().startswith("(TOP ("):
- vPTBTree = pPTBTree.strip()[5:-1]
- elif pPTBTree.upper().startswith("(ROOT ("):
- vPTBTree = pPTBTree.strip()[6:-1]
- elif pPTBTree.startswith('('):
- vPTBTree = pPTBTree.strip()
- else:
- sys.exit("Bad tree! It doesn't start with a bracket.")
-
-
- self.root = self._createRoot()
- self.expandedTerminal = pflgExpandTerminal
- self.root.loadConstituent(vPTBTree, None, pConstFormat = 'ptb', pflgExpandTerminal = pflgExpandTerminal)
-
-
-
- def _createRoot(self):
- '''
- Creates and returns the root node
- '''
-
- return ConstNode()
-
-
-
- def loadSRL(self, pSRL, pConversionMethod = "elevate"):
- '''
- Loads semantic role labeling from SRL object
-
- It checks for the type of SRL and converts it to constituency if
- it's in other formalisms like dependnecy.
-
- It both loads the whole annotation into the srl attribute (of type
- SRL) and the token annotations into nodes.
-
- - pConversionMethod: if pSRL is not in constituency format, this
- specifies the conversion method. Follow srl.convertToConst() for
- details.
- '''
-
- if not pSRL.isConstituency():
- self.srl = pSRL.convertToConst(self, pConversionMethod, pflgLoadIntoTree = True)
- else:
- self.srl = pSRL
-
- # loading node annotations
- #for vProp in self.srl.propositions:
- # predicate
- # vPredNode = vProp.predicate.node
- # vPredNode.predRoles.append((None, vProp.predicate.frameset))
-
- # arguments
- # for vArg in vProp.arguments:
- # vArgNode = vArg.node
- ## preventing duplicates (They may happen if the original
- ## SRL is dependency-based and a role is assigned to multiple
- ## token which are elevated to the same constituency node.)
- # if (vPredNode, vArg.label) not in vArgNode.predRoles:
- # vArgNode.predRoles.append((vPredNode, vArg.label))
-
-
-
- def getPTBFormat(self):
- '''
- Returns the tree in PTB brackeing format
- '''
-
- return self.root.getPTBFormat()
-
-
-
- def getTerminals(self):
- '''
- Returns the list of terminals (tokens) of the tree in their original
- order.
- '''
-
- return self.root.getTerminals()
-
-
-
- def getPreTerminals(self):
- '''
- Returns the preterminal labels of the tree in a list
- '''
-
- return self.root.getPreTerminals()
-
-
-
- @property
- def surface(self):
- '''
- Surface form of the sentence
- '''
-
- return self.root.surface
-
-
-
- def getTerminalNodes(self):
- '''
- Returns the list of terminal nodes of the tree in their original
- order.
- '''
-
- return self.root.getTerminalNodes()
-
-
-
- def getPreTerminalNodes(self):
- '''
- Returns pre-teriminal nodes of the tree
- '''
-
- return self.root.getPreTerminalNodes()
-
-
-
- def getPreTerminalNodeAt(self, pPosition):
- '''
- Returns the pre-terminal node of token at given position if the
- tree format expandes terminals or terminal node if it does not
- '''
-
- return self.root.getPreTerminalNodeAt(pPosition)
-
-
-
- def getHeight(self):
- '''
- Returns the height of the tree which is the number of levels from
- the root to the deepest (farthest) terminal.
-
- The height of a tree with only one node is 0.
- '''
-
- return self.root.getHeight()
-
-
-
- def getSize(self):
- '''
- Returns the size of the tree which is the number of its nodes.
- '''
-
- return self.root.getSize()
-
-
-
- def getPOSs(self):
- '''
- Returns POS tags (pre-trminals) of the terminals in their original
- order in a list.
- '''
-
- return self.root.getPreTerminals()
-
-
-
- def getUniversalPOSs(self, pUniversalTagMap):
- '''
- Returns POS tags mapped to the universal POS tags of Petrov et al.
- (2012).
-
- The map needs to be provided in a tag map file format.
- '''
-
- try:
- vdTagMap = loadTagMap(open(pUniversalTagMap).read(), pflgSelfMapIn = True)
- except IOError:
- raise Exception("Cannot open tag map file: %s" % pUniversalTagMap)
-
- vlUPOSTags = []
-
- for vTag in self.getPOSs():
- try:
- vlUPOSTags.append(vdTagMap[vTag])
- except KeyError:
- if "_catchall" in vdTagMap:
- vlUPOSTags.append(vdTagMap["_catchall"])
- else:
- vlUPOSTags.append(vTag)
-
- return vlUPOSTags
-
-
-
- def mapTags(self, pdMap):
- '''
- Maps syntactic tags to a new tag set using map in pdMap.
- '''
-
- self.root.mapTags(pdMap)
-
-
-
- def getCoarseChunks(self):
- '''
- Extracts and returns coarse-grained chunks which are children
- of root node.
- '''
-
- return self.root.getChildrenTags()
-
-
-
- def getHighestNodeOfSpan(self, pSpan):
- '''
- Returns the highest (closest to root) node spanning the given token
- span
-
- An exception is raised if pSpan is not in the range or it crosses
- the left or right boundary of a node in the tree.
- '''
-
- vHSNode = self.root.getHighestNodeOfSpan(pSpan)
-
- if vHSNode == None:
- vTokenSpan = self.root.getTokenSpan()
- raise Exception("Invalid range: (%s %s) is not in (%s %s)" % (pSpan[0], pSpan[1], vTokenSpan[0], vTokenSpan[1]))
- else:
- return vHSNode
-
-
-
- def getSentLength(self):
- '''
- Returns the length of the sentence of the tree
- '''
-
- return self.root.getSpanSize()
-
-
-
- def getTagset(self):
- '''
- Return the syntactic tags of the tree nodes and their counts.
- '''
-
- return self.root.getTagset()
-
-
-
- def getSynLabels(self):
- '''
- Return the syntactic labes of the tree nodes including phrase labels
- and POS tags in a list
- '''
-
- return self.root.getSynLabels()
-
-
-
- def extractCFGRules(self, pflgNoLexRule = False):
- '''
- Extracts and returns the CFG production rules expanding the nodes
- of the tree.
-
- The rules are returned in a list. Each rule is a dictionary of
- the left and right hand sides of the rules, the former being a
- phrase tag and the latter a list of tags.
-
- If pflgNoLexRule is true, lexical rules will not be extracted.
- '''
-
- return self.root.extractCFGRules(pflgNoLexRule)
-
-
-
- def extractLexRules(self):
- '''
- Extracts and returns the lexical CFG rules.
-
- The rules are returned in a list. Each rule is a dictionary of
- the left and right hand sides of the rules, the former being a
- POS tag and the latter a terminal token.
- '''
-
- return self.root.extractLexRules()
-
-
-
- def extractSubsetTrees(self):
- '''
- Extracts and returns list of all subset trees of the tree.
-
- Subset trees are used in tree kernels (c.f. Moschitti 2006)
- '''
-
- vlFragments = self.root.extractSubsetTrees()
-
- vlSsTrees = []
- for vFrag in vlFragments:
- vTree = ConstTree()
- vTree.root = vFrag
- vlSsTrees.append(vTree)
-
- return vlSsTrees
-
-
-
- def getPST(self, pPredNode):
- '''
- Extracts proposition subtree (PST) the predicate node of which is
- given
-
- PST is a subtree of the constituency tree spanning only the predicate
- and argument nodes of that proposition.
- '''
-
- vPST = ConstTree()
- vPST.expandedTerminal = self.expandedTerminal
-
- vPSTRoot, vPSTPredNode = self.root.getRawPST(pPredNode, None)
-
-
- def _postProcPST(pPSTNode, pOldPredNode, pNewPredNode):
- '''
- Post-processes PST to set the SRL based on the new predicate
- node
-
- Only the role of argument of the given predicate and the frameset
- of this predicate are kept.
- '''
-
- if pPSTNode.isArgumentOf(pOldPredNode):
- ## In Theory, no argument can take two roles for a single
- ## predicate. However, due to errors (e.g. conversion from
- ## dependency SRl to constituency SRL, this may happen.
- ## The uncommented code takes care of the situation.
- #pPSTNode.setPredRoles([(pNewPredNode, vPSTNode.getArgRoleOf(pOldPredNode))])
- pPSTNode.setPredRoles([(pNewPredNode, r) for r in pPSTNode.getArgRoleOf(pOldPredNode)])
- elif pPSTNode == pNewPredNode:
- pPSTNode.setPredRoles([(None, pPSTNode.getPredicateFrameset())])
- else:
- pPSTNode.removePredRoles()
-
- for vChild in pPSTNode.getChildren():
- _postProcPST(vChild, pOldPredNode, pNewPredNode)
-
-
-
- # post-processing SRL in the PST as the predicate node is new
- _postProcPST(vPSTRoot, pPredNode, vPSTPredNode)
-
- ## attempt to remove the trailing 1-child nodes from the
- ## top of the subtree, e.g. (S (S (VP (VP (ADJP (...
- while True:
- if vPSTRoot.getChildrenCount() == 1 and not vPSTRoot.getChildren()[0].isArgument() and vPSTRoot.getChildren()[0] != vPSTPredNode:
- vPSTRoot = vPSTRoot.getChildren()[0]
- else:
- break
-
- vPST.root = vPSTRoot
-
- return vPST, vPSTPredNode
-
-
-
- def addVPToFTBTree(self, pPostVerbalNodeCount = 2):
- '''
- Adds a VP node above all VNs in the tree and their post-verbal NP
- and PPs siblings.
-
- pPostVerbalNodeCount specifies the number of post-verbal sibling
- to go under the VP node.
- '''
-
- self.root.addVPToFTBTree(pPostVerbalNodeCount)
-
-
-
- def stratifyRules(self):
- '''
- Stratifies production rules.
-
- See ConsNode.stratifyFTBTree()
- '''
-
- self.root.stratifyRules()
-
-
-
- def fixFTBCoord(self):
- '''
- Fixes the FTB coordination in the tree.
-
- See ConsNode.fixFTBCoord()
- '''
-
- self.root.fixFTBCoord()
-
-
-
- def delNodes(self, pLabelPattern):
- '''
- Deletes nodes with specified regular expression pattern
-
- A root node cannot be deleted. When a node is removed, the entire
- subtree rooted at it is removed.
- '''
-
- self.root.delNodes(pLabelPattern = pLabelPattern)
-
-
-
- def delPRO(self):
- '''
- Deletes pro-drop nodes (*PRO* in PTB)
- '''
-
- self.root.delPRO()
-
-
-
- def removeFunctionTags(self):
- '''
- Removes function tags from the labels
-
- Function tags are supposed to be glued to the syntactic labels by
- a dash (e.g. NP-OBJ)
- '''
-
- self.root.removeFunctionTags(pflgAllSubtree = True)
-
-
-
- def getRoot(self):
- '''
- Returns the root node
- '''
-
- return self.root
-
-
-
- def getRootLabel(self):
- '''
- Returns the syntactic label of the root node
- '''
-
- return self.root.getLabel()
-
-
-
- def lowerTerminals(self):
- '''
- Converts terminals to lowercase
- '''
-
- self.root.lowerTerminals()
-
-
-
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # ConstNode
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- class ConstNode:
- '''
- Constituency tree node
-
- A ConstNode represents a constituent in constituency parse tree. It
- has at most one parent node of the same type and arbitrary number of
- children nodes of the same type including none.
-
- A ConstNode is in fact a constituency subtree because all its children
- down to the terminals are recursively accessible.
-
- Note that the root node has no parent. It is accessed by/via ConstTree
- where.
-
- Pre-terminals (POS tags) and terminal (tokens) can optionally be in
- in one node or in separate nodes, where the pre-terminal is the parent
- of terminal.
- '''
-
-
- def __init__(self):
- '''
- Constructor
- '''
-
- self.parent = None
- self.children = []
- self.synTag = ''
- # terminal (word) if the node is leaf: e.g. (DT the)
- self.terminal = ''
- ## set to true if the terminal is expanded into pre-terminal and
- ## terminal nodes
- self.expandedTerminal = False
- ## list of semantic role and predicate tuples
- ## NOTE: a predicate node itself is assigned (None, frameset)
- self.predRoles = []
-
-
-
- def deepCopy(self, pflgCopyTree = False):
- '''
- NOTE: it seems in some cases python deepcopy() works better. Try
- both and choose the one which suites the purpose.
-
- Creates and returns a deep copy of the node optionally including
- the sub tree under it
-
- NOTE: This deep copy does not deep copy the predicate node in the
- self.predRoles tuples. The necessary action should be taken after
- deep copy if needed.
- '''
-
- # copying the node
- vNodeCopy = self._getNewNode()
- vNodeCopy.parent = self.parent
- vNodeCopy.children = self.children
- vNodeCopy.synTag = self.synTag
- vNodeCopy.terminal = self.terminal
- vNodeCopy.expandedTerminal = self.expandedTerminal
- # Note the method comments
- vNodeCopy.predRoles = self.predRoles
-
- # copying subtree rooted at node
- if pflgCopyTree:
- vNodeCopy.children = []
- for vChild in self.children:
- vNodeCopy.children.append(vChild.deepCopy(True))
- vNodeCopy.children[-1].parent = vNodeCopy
-
- return vNodeCopy
-
-
-
- def shallowCopy(self, pflgCopyTree = False):
- '''
- Creates and returns a shallow copy of the node
-
- The shallow copy does not have a parent and children.
- '''
-
- vNodeCopy = self._getNewNode()
- vNodeCopy.synTag = self.synTag
- vNodeCopy.terminal = self.terminal
- vNodeCopy.expandedTerminal = self.expandedTerminal
- vNodeCopy.predRoles = self.predRoles
-
- return vNodeCopy
-
-
-
- def __str__(self):
- '''
- String representation of the class
- '''
-
- return self.getPTBFormat()
-
-
-
- def setLabel(self, pLabel):
- '''
- Sets the value of the node label, i.e. its syntactic tag
- '''
-
- self.synTag = pLabel
-
-
-
- def getLabel(self):
- '''
- Gets the node label, i.e. its syntactic tag
- '''
-
- return self.synTag
-
-
-
- def getSynTag(self):
- '''
- Returns the syntactic tag of the node
- '''
-
- return self.getLabel()
-
-
-
- def addChild(self, pChildNode):
- '''
- Addes a new child to the children of this node
- '''
-
-
- pChildNode.setParent(self)
- self.children.append(pChildNode)
-
-
-
- def insertChild(self, pLabel, pPosition):
- '''
- Inserts a child node with the given label at the given child number
- position
-
- It also returns the inserted child for further use.
-
- The child number positions are counted from the leftmost child.
- '''
-
- # creating the new node
-
- vNewNode = self._getNewNode()
-
- # setting the attributes
-
- vNewNode.setLabel(pLabel)
- vNewNode.setParent(self)
- vNewNode.setExpandedTerminal(self.getExpandedTerminal())
-
- # inserting at the right position
-
- self.children.insert(pPosition - 1, vNewNode)
-
- return vNewNode
-
-
-
- def insertIntermChild(self, pLabel, ptDomChildSpan):
- '''
- Inserts an intermediate child node with the given label dominating
- existing children at the given child number span.
-
- It also returns the inserted child for further use.
-
- The child number positions are counted from the leftmost child.
- '''
-
- # creating the new node
-
- vNewNode = self._getNewNode()
-
- # setting the attributes
-
- vNewNode.setLabel(pLabel)
- vNewNode.setParent(self)
- vNewNode.setExpandedTerminal(self.getExpandedTerminal())
-
- # identifying and moving the children under the new node
-
- vlDomChildren = self.getChildrenAt(ptDomChildSpan)
-
- self.delChildrenAt(ptDomChildSpan)
-
- for vChild in vlDomChildren:
- vChild.setParent(vNewNode)
- vNewNode.addChild(vChild)
-
- # inserting at the right position
-
- self.children.insert(ptDomChildSpan[0] - 1, vNewNode)
-
- return vNewNode
-
-
-
- def replaceChild(self, pPosition, pChild):
- '''
- Replaces the child node at the given position with new child
- '''
-
- pChild.setParent(self)
- self.children[pPosition - 1] = pChild
-
-
-
- def insertParent(self, pLabel):
- '''
- Inserts a node as the parent node of current node with the given
- label and performs the necessary changes.
- '''
-
- # creating the new node
-
- vNewNode = self._getNewNode()
-
- # setting the attributes
-
- vNewNode.setLabel(pLabel)
- vNewNode.setParent(self.getParent())
- vNewNode.setExpandedTerminal(self.getExpandedTerminal())
-
- # setting parent/child relations
-
- vNewNode.getParent().replaceChild(pPosition = vNewNode.getParent().getChildNo(self), pChild = vNewNode)
-
- vNewNode.addChild(self)
- self.setParent(vNewNode)
-
-
-
- def loadConstituent(self, pConst, pParent, pConstFormat = 'ptb', pflgExpandTerminal = False):
- '''
- Creates node for the input constituent
-
- Creating a node includes creating all its children recursively
- down to the terminals.
-
- Constituent formats supported are:
- - ptb: Penn Treebank (e.g. (NP (DT The) (NNS senators)) )
-
- If pflgExpandTerminal is set to true, the terminal node will be
- expanded into a pre-terminal (POS tag) and terminal (token) nodes.
- '''
-
- self.expandedTerminal = pflgExpandTerminal
-
- if pConstFormat.lower() == "ptb" or pConstFormat.lower().startswith("penn"):
- return self._createPTBNode(pConst, pParent, pflgExpandTerminal = pflgExpandTerminal)
-
-
-
- def _createPTBNode(self, pConst, pParent, pflgExpandTerminal = False):
- '''
- Creates node for input constituent which is in PTB format
-
- Creating a node includes creating all its children recursively
- down to the terminals.
-
- If pflgExpandTerminal is set to true, the terminal node will be
- expanded into a pre-terminal (POS tag) and terminal (token) nodes.
- '''
-
- self.parent = pParent
- self.children = []
- self.expandedTerminal = pflgExpandTerminal
-
- vPTBConst = pConst.strip()
-
- vChildrenStartIdx = vPTBConst.find(' ') + 1
-
- # extracting tag
- self.synTag = vPTBConst[1 : vChildrenStartIdx - 1]
-
- # extracting terminal or children constituents
- vWhat = vPTBConst[vChildrenStartIdx : ].strip()
- ## checking if this is bracketing parenthesis not a sentence
- ## token which is not masked by -LRB-)
- if vWhat.startswith('(') and vWhat[1] != ')':
- vChildren = vWhat[ : -1]
- else:
- if pflgExpandTerminal:
- vChildTerminalNode = self._getNewNode()
- vChildTerminalNode.parent = self
- vChildTerminalNode.terminal = vWhat[ : -1]
- vChildTerminalNode.expandedTerminal = True
- self.children = [vChildTerminalNode]
- else:
- self.terminal = vWhat[ : -1]
-
- return
-
- # recursively creating children nodes
- vOpenBCount = 0
- vChildStartIdx = 0
- for i in range(len(vChildren)):
- if vChildren[i] == '(':
- ## checking if this is bracketing parenthesis not a sentence
- ## token which is not masked by -LRB-)
- if vChildren[i+1] != ')':
- vOpenBCount += 1
- elif vChildren[i] == ')':
- ## checking if this is bracketing parenthesis not a sentence
- ## token which is not masked by -RRB-)
- if vChildren[i-1] != ' ':
- vOpenBCount -= 1
- else:
- j = i - 2
- while j >= 0 and vChildren[j] == ' ':
- j -= 1
- if vChildren[j] == ')':
- vOpenBCount -= 1
-
- if vOpenBCount == 0:
- vChild = vChildren[vChildStartIdx : i+1].strip()
- if vChild != '':
- vChildNode = self._getNewNode()
- vChildNode._createPTBNode(vChild, self, pflgExpandTerminal = pflgExpandTerminal)
- self.children.append(vChildNode)
- vChildStartIdx = i + 1
-
-
-
- def _getNewNode(self):
- '''
- Creates and returns a node
- '''
-
- return ConstNode()
-
-
-
- def isTerminal(self):
- '''
- Returns true if the node is a terminal node.
- '''
-
- return len(self.children) == 0
-
-
-
- def isRoot(self):
- '''
- Returns true is this node is the root
- '''
-
- return self.parent == None
-
-
-
- def isPreTerminal(self):
- '''
- Returns true if the node is a pre-terminal node.
-
- Note pre-terminals exist only if the constituent is loaded into a
- node with pflgExpandTerminal option. Otherwise, the pre-terminal
- and terminal are integrated in one node which is called terminal.
- '''
-
- if self.expandedTerminal and len(self.children) == 1 and self.children[0].isTerminal():
- return True
- else:
- return False
-
-
-
- def setExpandedTerminal(self, pValue):
- '''
- Sets the value of expandedTerminal attribute
- '''
-
- self.expandedTerminal = pValue
-
-
-
- def getExpandedTerminal(self):
- '''
- Returns the value of expandedTerminal attribute
-
- Not to be confused with isExpandedTerminal().
- '''
-
- return self.expandedTerminal
-
-
-
- def isExpandedTerminal(self):
- '''
- Returns true if the node is an expanded terminal node.
-
- A terminal is expanded if the pre-terminal and terminals are two
- separate nodes.
- '''
- if self.parent == None:
- return False
- elif self.parent.isPreTerminal():
- return True
- else:
- return False
-
-
-
- def isCollapsedTerminal(self):
- '''
- Returns true if the node is a collapsed terminal node.
-
- A terminal is collapsed if the pre-terminal and terminals are
- merged into one node, i.e. expandedTerminal is false.
- '''
-
- if not self.expandedTerminal and self.isTerminal():
- return True
- else:
- return False
-
-
-
- def getPTBFormat(self):
- '''
- Returns the constituent in PTB brackeing format.
- '''
-
- vPTBConst = '(%s ' % self.synTag
-
- if self.isPreTerminal():
- vPTBConst += self.children[0].terminal + ')'
- else:
- if self.terminal != '':
- vPTBConst += self.terminal + ')'
- else:
- vPTBChildren = []
- for vChild in self.children:
- vPTBChildren.append(vChild.getPTBFormat())
-
- vPTBConst += ' '.join(vPTBChildren) + ')'
-
- return vPTBConst
-
-
-
- def getTerminals(self):
- '''
- Extracts and returns the list of terminals (tokens) the node spans.
- '''
-
- vlTerminals = []
-
- if self.isTerminal():
- vlTerminals.append(self.terminal)
- else:
- for vChild in self.children:
- vlTerminals += vChild.getTerminals()
-
- return vlTerminals
-
-
-
- def setTerminalForm(self, pForm):
- '''
- Sets the value of the surface form of the terminal node
- '''
-
- if self.isTerminal():
- self.terminal = pForm
- else:
- raise Exception("Not a terminal node!")
-
-
-
- @property
- def surface(self):
- '''
- Surface form of the sentence
- '''
-
- return ' '.join(self.getTerminals())
-
-
-
- def getTerminalNodes(self):
- '''
- Extracts and returns the list of terminal nodes the node spans.
- '''
-
- vlTNodes = []
-
- if self.isTerminal():
- vlTNodes.append(self)
- else:
- for vChild in self.children:
- vlTNodes += vChild.getTerminalNodes()
-
- return vlTNodes
-
-
-
- def getPreTerminals(self):
- '''
- Extracts and returns the list of pre-terminals (POS tags) the node spans
- '''
-
- vlPreTerminals = []
-
- if self.isPreTerminal() or self.isTerminal():
- vlPreTerminals.append(self.synTag)
- else:
- for vChild in self.children:
- vlPreTerminals += vChild.getPreTerminals()
-
- return vlPreTerminals
-
-
-
- def getPreTerminalNodes(self):
- '''
- Extracts and returns the list of pre-terminal nodes the node spans
- '''
-
- vlPreTerminalNodes = []
-
- if self.isPreTerminal():
- vlPreTerminalNodes.append(self)
- elif self.isTerminal():
- vlPreTerminalNodes.append(self.parent)
- else:
- for vChild in self.children:
- vlPreTerminalNodes += vChild.getPreTerminalNodes()
-
- return vlPreTerminalNodes
-
-
-
- def getPreTerminalNodeAt(self, pPosition):
- '''
- Returns the pre-terminal node at a given token position if the
- tree format expands terminals, or terminal node if it does not
- '''
-
- if self.isPreTerminal() or self.isCollapsedTerminal():
- return self
- else:
- vHSNode = self.getHighestNodeOfSpan((pPosition, pPosition))
- if vHSNode != None:
- vPTNode = vHSNode
- while not (vPTNode.isPreTerminal() or vPTNode.isCollapsedTerminal()):
- vPTNode = vPTNode.children[0]
- return vPTNode
- else:
- return None
-
-
-
- def setParent(self, pParent):
- '''
- Sets the parent node of the node
- '''
-
- self.parent = pParent
-
-
-
- def getParent(self):
- '''
- Returns the parent node
- '''
-
- return self.parent
-
-
-
- def getChildren(self):
- '''
- Returns the children nodes in a list
-
- The methods used to return the list itself before v4.7, but it returns
- a copy since this version.
- '''
-
- return self.children[:]
-
-
-
- def getChildrenAt(self, pSpan):
- '''
- Returns children nodes at the given position span from left
- '''
-
- return self.children[pSpan[0] - 1 : pSpan[1]]
-
-
-
- def getSibling(self):
- '''
- Returns list of sibling nodes of this node
- '''
-
- return [vChild for vChild in self.getParent().getChildren() if vChild != self]
-
-
-
- def getLeftSibling(self):
- '''
- Returns the sibling nodes in the left hand side of the node
- '''
-
- vlLSibling = []
-
- for vSibling in self.getParent().getChildren():
- if vSibling == self:
- break
- else:
- vlLSibling.append(vSibling)
-
- return vlLSibling
-
-
-
- def getRightSibling(self):
- '''
- Returns the sibling nodes in the right hand side of the node
- '''
-
- vlRSibling = []
-
- for vSibling in reversed(self.getParent().getChildren()):
- if vSibling == self:
- break
- else:
- vlRSibling.append(vSibling)
-
- return reversed(vlRSibling)
-
-
-
- def ifDominates(self, pNode):
- '''
- Checks if this node dominates a given node
- '''
-
- ## Since parse trees tend to be wide rather than deep, a BFS may be
- ## better. However, I use an algorithm which sounds like a mixture:
- ## it first looks at the children of a node, if no success, recursively
- ## traverses each child. I did a quick search and didn't find if
- ## this is already an algorithm in use or critisized. I don't see
- ## a reason why it should not work.
-
- if pNode in self.getChildren():
- return True
- else:
- for vChild in self.getChildren():
- vFound = vChild.ifDominates(pNode)
- if vFound:
- return True
-
- return False
-
-
-
- def getTerminal(self):
- '''
- Return the terminal sequence of the node
- '''
-
- return ' '.join(self.getTerminals())
-
-
-
- def getGrandParentUnderRoot(self):
- '''
- Returns the grandparent of this node which is a child of root
- '''
-
- if self.getParent().isRoot():
- return self
- else:
- return self.getParent().getGrandParentUnderRoot()
-
-
-
- def getRoot(self):
- '''
- Returns the grandparent of this node which is a child of root
- '''
-
- if self.isRoot():
- return self
- else:
- return self.getGrandParentUnderRoot().getParent()
-
-
-
- def getChildrenTags(self):
- '''
- Returns the sytactic tags of children of the node.
- '''
-
- vlChildrenTags = []
-
- if not self.isPreTerminal():
- for vChild in self.children:
- vlChildrenTags.append(vChild.synTag)
-
- return vlChildrenTags
-
-
-
- def getTagset(self):
- '''
- Extracts and returns the syntactic tags of the tree nodes and
- their counts.
- '''
-
- vdTagset = {self.synTag: 1}
-
- if not self.isPreTerminal() and not self.isTerminal():
- for vChild in self.children:
- for vTag, vCount in vChild.getTagset().iteritems():
- if vTag in vdTagset:
- vdTagset[vTag] += vCount
- else:
- vdTagset[vTag] = vCount
-
- return vdTagset
-
-
-
- def getSynLabels(self):
- '''
- Extracts and returns the syntactic labels of the subtree nodes
- including phrase labels and POS tags
- '''
-
- vlLabels = []
-
- if not self.isTerminal():
- vlLabels = [self.getLabel()]
-
- for vChild in self.children:
- vlLabels += vChild.getSynLabels()
-
- return vlLabels
-
-
-
- def getSpanSize(self):
- '''
- Extracts and returns the number of tokens the node spans.
- '''
-
- vSize = 0
- if self.isTerminal():
- vSize += 1
- else:
- for vChild in self.children:
- vSize += vChild.getSpanSize()
-
- return vSize
-
-
-
- def getTokenSpan(self):
- '''
- Extracts and returns the range of token indexes the node spans.
-
- For example if a node spans tokens at positions 3 to 7 in the sentence,
- it returns (3,7)
- '''
-
- if self.isRoot():
- ## If root node, sum of span sizes of children is used to compute
- ## the end of the span.
- vStart = 1
- vEnd = 0
- for vChild in self.children:
- vEnd += vChild.getSpanSize()
- else:
- ## If not root node, the start of the span of its parent is
- ## first computed and then summed with span size of its left
- ## sibling nodes.
-
- # 1. parent recursive part
- vStart = self.getParent().getTokenSpan()[0]
-
- # 2. left sibling part
- for vSibling in self.getParent().children:
- if vSibling == self:
- break
- else:
- vStart += vSibling.getSpanSize()
-
- vEnd = vStart + self.getSpanSize() - 1
-
- return vStart, vEnd
-
-
-
- def getTokenSpanRelation(self, pSpan):
- '''
- Returns the relationship of the token span of this node to the
- given span
-
- 0: equal
- 1: inclusive (token span contains pSpan)
- -1: contained (pSpan contains token span)
- -2: left-cross (pSpan crosses the left boundary of token span but its right boundary is inside token span)
- -3: right-cross (pSpan crosses the right boundary of token span but its left boundary is inside token span)
- -4: disjoint
- '''
-
- vTokenSpan = self.getTokenSpan()
-
- if vTokenSpan == pSpan:
- return 0
- ## The below two conditions include vTokenSpan == pSpan but never
- ## happen because of the above return.
- elif vTokenSpan[0] <= pSpan[0] and pSpan[1] <= vTokenSpan[1]:
- return 1
- elif pSpan[0] <= vTokenSpan[0] and vTokenSpan[1] <= pSpan[1]:
- return -1
- elif pSpan[0] < vTokenSpan[0] and vTokenSpan[0] <= pSpan[1] and pSpan[1] <= vTokenSpan[1]:
- return -2
- elif vTokenSpan[0] <= pSpan[0] and pSpan[0] <= vTokenSpan[1] and vTokenSpan[1] < pSpan[1]:
- return -3
- else:
- return -4
-
-
-
- def getHighestNodeOfSpan(self, pSpan):
- '''
- Returns the highest (closest to root) node spanning the given token
- span if it's in this subtree
-
- It returns None if the span is not in this subtree. It raises an
- exception if it the given span crosses any node boundary.
- '''
-
- vSpanRel = self.getTokenSpanRelation(pSpan)
-
- vTokenSpan = self.getTokenSpan()
-
- if vSpanRel == 0:
- vHSNode = self
- elif vSpanRel == 1:
- for vChild in self.children:
- vHSNode = vChild.getHighestNodeOfSpan(pSpan)
- if vHSNode != None:
- break
- elif vSpanRel == -1:
- vHSNode = None
- elif vSpanRel == -2:
- raise Exception("Left-cross: (%s %s) vs. (%s %s)" % (vTokenSpan[0], vTokenSpan[1], pSpan[0], pSpan[1]))
- elif vSpanRel == -3:
- raise Exception("Right-cross: (%s %s) vs. (%s %s)" % (vTokenSpan[0], vTokenSpan[1], pSpan[0], pSpan[1]))
- else:
- vHSNode = None
-
- return vHSNode
-
-
-
- def getHeight(self):
- '''
- Computes and returns the height of the node which is the number
- of levels from the node to the deepest (farthest) terminal under
- it.
-
- It is the tree depth minus the depth of the node. Depth is calculated
- from the root, while height from the deepest terminal. Therefore,
- tree depth is equal to tree height.
-
- The height of a terminal node is 0.
- '''
-
- ## The height of all children are recursively computed. Their
- ## maximum is then the height of the node.
-
- if self.isTerminal():
- return 0
- else:
- vMaxChildHeight = 0
- for vChild in self.children:
- vHeight = vChild.getHeight()
- if vHeight > vMaxChildHeight:
- vMaxChildHeight = vHeight
-
- return vMaxChildHeight + 1
-
-
-
- def getSize(self):
- '''
- Returns the size of the node which is the number of nodes in the
- subtree under the node plus 1.
- '''
-
- if self.isTerminal():
- return 1
- else:
- vChildrenSizeSum = 0
- for vChild in self.children:
- vChildrenSizeSum += vChild.getSize()
-
- return vChildrenSizeSum + 1
-
-
-
- def mapTags(self, pdMap):
- '''
- Maps syntactic tags in the subtree under the node to a new tag set
- using map in pdMap.
-
- If a tag is not specified, a catch-all tag will be sought to map
- to. If it does not exist, it will be mapped into itself.
- '''
-
- if self.isExpandedTerminal():
- return
-
- try:
- self.synTag = pdMap[self.synTag]
- except KeyError:
- if "_catchall" in pdMap:
- self.synTag = pdMap["_catchall"]
-
- for vChild in self.children:
- vChild.mapTags(pdMap)
-
-
-
- def getChildrenCount(self):
- '''
- Returns the number of children
- '''
-
- return len(self.children)
-
-
-
- def getCFGRule(self, pflgNoLexRule = False):
- '''
- Returns the CFG production rule expanding the node if it is not
- a leaf node.
-
- A rule is represented using a dictionary with two items: lhs for
- the left hand side of the rules and rhs for the its right hand
- side which is a list of tags.
-
- If pflgNoLexRule is true, lexical rules will not be extracted.
- '''
-
- if pflgNoLexRule:
- if self.isPreTerminal() or self.isTerminal():
- return {}
- else:
- return {"lhs": self.synTag, "rhs": self.getChildrenTags()}
- else:
- if self.isPreTerminal():
- return {"lhs": self.synTag, "rhs": self.getTerminals()}
- elif self.isTerminal():
- if self.expandedTerminal:
- return {}
- else:
- return {"lhs": self.synTag, "rhs": [self.terminal]}
- else:
- return {"lhs": self.synTag, "rhs": self.getChildrenTags()}
-
-
-
- def extractCFGRules(self, pflgNoLexRule = False):
- '''
- Returns the CFG production rules expanding the node and all the
- nodes of the subtree under it.
-
- The rules are returned in a list.
-
- If pflgNoLexRule is true, lexical rules will not be extracted.
- '''
-
- # rule of the node itself
- vlRules = [self.getCFGRule(pflgNoLexRule)]
-
- # rules of the nodes in the subtree
- for vChild in self.children:
- vChildRules = vChild.extractCFGRules(pflgNoLexRule)
- if vChildRules != [{}]:
- vlRules += vChildRules
-
- return vlRules
-
-
-
- def extractLexRules(self):
- '''
- Returns the lexical rules in the subtree under the node.
-
- The rules are returned in a list.
- '''
-
- vlRules = []
-
- if self.isPreTerminal():
- vlRules = [{"lhs": self.synTag, "rhs": self.getTerminals()}]
- elif self.isTerminal():
- if not self.expandedTerminal:
- vlRules = [{"lhs": self.synTag, "rhs": [self.terminal]}]
- else:
- for vChild in self.children:
- vChildRules = vChild.extractLexRules()
- if vChildRules != [{}]:
- vlRules += vChildRules
-
- return vlRules
-
-
-
- def extractSubsetTrees(self):
- '''
- NOTE: THE ALGORITHM IS BUGGY BECAUSE OF MISUNDERSTANDING SUBSET
- TREE KERNEL DEFINITION.
-
- Extracts and returns list of all subset trees of the subtree under
- the node.
-
- Note that what is returned is the top node of the trees and ConstTree
- objects are not created.
-
- Subset trees are used in tree kernels (c.f. Moschitti 2006)
- '''
-
- raise Exception("NOTE: THE ALGORITHM IS BUGGY BECAUSE OF MISUNDERSTANDING SUBSET TREE KERNEL DEFINITION.")
-
- if self.isTerminal():
- return []
- else:
- vlSsTrees = []
-
- ## extracting subtrees of children while adding children
- ## themselves to build subtrees with this node as root later
- vlChildrenSsTrees = []
- for vChild in self.children:
- # adding child itself for later use
- vChildCopy = vChild.deepCopy()
- vChildCopy.children = []
- vlChildrenSsTrees.append([vChildCopy])
-
- # extracting subtrees of children
- vlChildSsTrees = vChild.extractSubsetTrees()
- if len(vlChildSsTrees) != 0:
- ## collecting children's subtrees
- for i in range(len(vlChildSsTrees)):
- ## saving copy of child's subtrees if rooted in
- ## the child for later building subtrees with this
- ## node
- if vlChildSsTrees[i].parent == self:
- vlChildrenSsTrees[-1].append(vlChildSsTrees[i].deepCopy())
-
- # adding to final set
- vlSsTrees.append(vlChildSsTrees[i])
- ## setting the parent to None because this is the
- ## top node of subtree
- vlSsTrees[-1].parent = None
-
- ## building subtrees rooted in this node with all subtrees of
- ## its children rooted in the child itself plus children nodes
- ## themselves
- for vNewChildren in itertools.product(*vlChildrenSsTrees):
- # building subtrees with this node as root
- vThisNode = self.deepCopy()
- vThisNode.children = list(vNewChildren)
- # setting parents of children
- for i in range(len(vThisNode.children)):
- vThisNode.children[i].parent = vThisNode
- vlSsTrees.append(vThisNode)
-
- return vlSsTrees
-
-
-
- def addVPToFTBTree(self, pPostVerbalNodeCount = 2):
- '''
- Adds a VP node above all VNs in the subtree under the node and
- their post-verbal NP and PPs siblings.
-
- pPostVerbalNodeCount specifies the number of post-verbal sibling
- to go under the VP node.
- '''
-
- if self.synTag == "VN":
- # slicing children of parent on this node
- for i in range(len(self.parent.children)):
- if self.parent.children[i] == self:
- vlRSibling = self.parent.children[i:]
- self.parent.children = self.parent.children[:i]
- break
-
- # adding VP node
- vVPNode = ConstNode()
- vVPNode.synTag = "VP"
- vVPNode.parent = self.parent
- vVPNode.expandedTerminal = self.expandedTerminal
- self.parent.children.append(vVPNode)
-
- # adding this node and sibling on the right under new VP node
- vAddedPVNodeCount = 0
- for i, vNode in enumerate(vlRSibling):
- if vNode == self or vNode.synTag.startswith("NP") or vNode.synTag.startswith("PP") :
- vVPNode.children.append(vNode)
- vNode.parent = vVPNode
- vAddedPVNodeCount += 1
- if vAddedPVNodeCount == pPostVerbalNodeCount + 1:
- vVPNode.parent.children += vlRSibling[i + 1 : ]
- break
- else:
- vVPNode.parent.children += vlRSibling[i : ]
- break
- else:
- # doing it for children
- for vChild in self.children:
- ## checking if this is still a child, because it may have
- ## gone under VP via previous child
- if vChild.parent == self:
- vChild.addVPToFTBTree(pPostVerbalNodeCount)
-
-
-
- def stratifyRules(self):
- '''
- Stratifies production rules in the subtree under the node by
- grouping together every two equal adjecent POS tags under a new
- node with a tag made of the POS tag suffixed with _St.
-
- For example,
-
- (AdP (ADV x) (ADV y) (ADV z) (ADV w))
- to
- (AdP (ADV_St (ADV x) (ADV y)) (ADV_St (ADV z) (ADV w)))
- '''
-
- vChildCount = self.getChildrenCount()
-
- if vChildCount >= 3:
- # grouping children into groups of two from right
- vRightSiblingIdx = vChildCount - 1
- vLeftSiblingIdx = vRightSiblingIdx -1
- while vLeftSiblingIdx >= 0:
- if self.stratifyPartialRule(self, vLeftSiblingIdx, vRightSiblingIdx):
- vRightSiblingIdx -= 2
- vLeftSiblingIdx -= 2
- else:
- vRightSiblingIdx -= 1
- vLeftSiblingIdx -= 1
-
- # recursively stratifying for children (if any left)
- for vChild in self.children:
- if not (vChild.synTag.endswith("_St") or vChild.isPreTerminal() or vChild.isCollapsedTerminal()):
- vChild.stratifyRules()
-
-
-
- def stratifyPartialRule(self, pLHSNode, pRHSNodeLeftIdx, pRHSNodeRightIdx):
- '''
- Stratifies the partial rule.
-
- A partial rule consists of the left hand side node and indexes of
- two adjacent nodes on its right hand side (children). A new node
- will be added in the middle so that it is the in the right hand
- side of pLHSNode at that position and left hand side of pRHSNodeLeft
- and pRHSNodeRight.
- '''
-
- if (pLHSNode.children[pRHSNodeLeftIdx].isPreTerminal() or pLHSNode.children[pRHSNodeLeftIdx].isCollapsedTerminal()) and \
- (pLHSNode.children[pRHSNodeRightIdx].isPreTerminal() or pLHSNode.children[pRHSNodeRightIdx].isCollapsedTerminal()):
- if pLHSNode.children[pRHSNodeLeftIdx].synTag == pLHSNode.children[pRHSNodeRightIdx].synTag:
- vNewNode = ConstNode()
- vNewNode.synTag = pLHSNode.children[pRHSNodeLeftIdx].synTag + "_St"
- vNewNode.parent = pLHSNode
- vNewNode.children = [pLHSNode.children[pRHSNodeLeftIdx], pLHSNode.children[pRHSNodeRightIdx]]
- vNewNode.expandedTerminal = self.expandedTerminal
- pLHSNode.children[pRHSNodeLeftIdx].parent = vNewNode
- pLHSNode.children[pRHSNodeRightIdx].parent = vNewNode
- pLHSNode.children = pLHSNode.children[ : pRHSNodeLeftIdx] + [vNewNode] + pLHSNode.children[pRHSNodeRightIdx + 1 : ]
-
-
-
- def fixFTBCoord(self):
- '''
- Fixes the FTB coordination all over the subtree under the node.
-
- To fix, it moves the coordinated node (the immediate left sibling
- of the COORD node) under the COORD node.
-
- For example,
-
- (S (NP (NC x)) (COORD (CC et) (NP (NC y))))
- to
- (S (COORD (NP (NC x)) (CC et) (NP (NC y))))
- '''
-
- vIdx = 1
- while vIdx < self.getChildrenCount():
- if self.children[vIdx].synTag == "COORD":
- self.children[vIdx-1].parent = self.children[vIdx]
- self.children[vIdx].children = [self.children[vIdx-1]] + self.children[vIdx].children
- self.children = self.children[ : vIdx-1] + self.children[vIdx : ]
- else:
- vIdx += 1
-
- # for i in range(self.getChildrenCount()):
- # if i > 0 and self.children[i].synTag == "COORD":
- # self.children[i-1].parent = self.children[i]
- # self.children[i].children = [self.children[i-1]] + self.children[i].children
- # self.children = self.children[ : i-1] + self.children[i : ]
-
- # recursively doing for children as well
- for vChild in self.children:
- vChild.fixFTBCoord()
-
-
-
- def isPredicate(self):
- '''
- Returns true if the node is a predicate
- '''
-
- return len([p for p, r in self.predRoles if p == None]) > 0
-
-
-
- def isArgument(self):
- '''
- Returns true if the node is a an argument for any predicate in the
- SRL of the sentence.
- '''
-
- return len([r for p, r in self.predRoles if p != None]) > 0
-
-
-
- def isArgumentOf(self, pPredNode):
- '''
- Returns true if the node is a an argument of the predicate the node
- of which is given
- '''
-
- return len([r for p, r in self.predRoles if p == pPredNode]) > 0
-
-
-
- def setPredRoles(self, plPredRoles):
- '''
- Sets the value of predicate/role attribute
- '''
-
- self.predRoles = plPredRoles
-
-
-
- def removePredRoles(self):
- '''
- Removes all the predicate/roles
- '''
-
- self.predRoles = []
-
-
-
- def hasPredRole(self, pPredRole):
- '''
- Returns true if given predicate-role exists in node's annotation
- '''
-
- return pPredRole in self.predRoles
-
-
-
- def addPredRole(self, pPredRole):
- '''
- Addes given predicate-role to node's annotation if it does not
- already exist
- '''
-
- if not self.hasPredRole(pPredRole):
- self.predRoles.append(pPredRole)
-
-
-
- def removePredRole(self, pPredRole):
- '''
- Removes given predicate-role from node's annotation if it exist
- '''
-
- if self.hasPredRole(pPredRole):
- self.predRoles.remove(pPredRole)
-
-
-
- def getPredicateFrameset(self):
- '''
- Returns the predicate frameset from the semantic role labeling
- of the node
- '''
-
- if self.isPredicate():
- return [r for p, r in self.predRoles if p == None][0]
- else:
- return ''
-
-
-
- def getArgRoles(self):
- '''
- Returns the list of argument role labels from the semantic role
- labeling of the node
- '''
-
- return [r for p, r in self.predRoles if p != None]
-
-
-
- def getArgRoleOf(self, vPredNode):
- '''
- Returns the role label if the node is an argument of the predicate
- at the given predicate
- '''
-
- ## NOTE: The following commented code takes care of taking multiple
- ## role of a predicate by one node. Technically, this is not valid.
- ## It however is possible when the original SRL is in dependency
- ## formalism and converted by elevating the role in the constituency
- ## tree. The reasons could be SRL error, parsing error or conversion
- ## error. At the moment, we merge the role labels when needed.
-
- # vRole = [r for p, r in self.predRoles if p == vPredNode]
- #if len(vRole) == 0:
- # return ''
- #elif len(vRole) > 1:
- # raise Exception("Strange! A node (%s) cannot take more than one argument role (%s) for a given prediate (%s)!" % (self.getPTBFormat(), ' '.join(vRole), vPredNode.getPTBFormat()))
- #else:
- # return vRole[0]
-
- return [r for p, r in self.predRoles if p == vPredNode]
-
-
-
- def getRawPST(self, pOldPredNode, pNewPredNode):
- '''
- Extracts raw proposition subtree (PST) under the node the predicate
- node of which is given
-
- It's raw because the SRL annotation on it needs to be post-processed
- at tree level which cannot be done at node level.
-
- Since new nodes are created, references to old predicate node in
- predRoles attribute are no longer valid. The newly created predicate
- node is returned/passed for keeping track of it until returning
- to the tree level (ConstTree.getPST()) instead of having to look
- for it again at the tree level.
-
- PST is a subtree of the constituency tree spanning only the predicate
- and argument nodes of that proposition including all full subtrees
- of argument nodes.
- '''
-
- if self.isArgumentOf(pOldPredNode):
- #vPSTNode = copy.deepcopy(self)
- # deepCopy() works better than python deepcopy in this case
- vPSTNode = self.deepCopy(pflgCopyTree = True)
- elif self == pOldPredNode:
- #vPSTNode = copy.deepcopy(self)
- # deepCopy() works better than python deepcopy in this case
- vPSTNode = self.deepCopy(pflgCopyTree = True)
- pNewPredNode = vPSTNode
- elif self.isPreTerminal() or self.isCollapsedTerminal():
- vPSTNode = None
- else:
- vlChildPSTs = []
- for vChild in self.getChildren():
- vChildPST, pNewPredNode = vChild.getRawPST(pOldPredNode, pNewPredNode)
- if vChildPST:
- vlChildPSTs.append(vChildPST)
- if len(vlChildPSTs) == 0:
- vPSTNode = None
- else:
- vPSTNode = self.shallowCopy()
- for vChildPST in vlChildPSTs:
- vPSTNode.addChild(vChildPST)
-
- return vPSTNode, pNewPredNode
-
-
-
- def delNodes(self, pLabelPattern):
- '''
- Deletes nodes with specified regular expression pattern in the
- subtree rooted at this node (except the node itself)
-
- When a node is removed, the entire subtree rooted at it is removed.
- '''
-
- vlMarkedChildren = []
-
- for vChild in self.getChildren():
- if re.match(pLabelPattern, vChild.getSynTag()):
- vlMarkedChildren.append(vChild)
- else:
- vChild.delNodes(pLabelPattern = pLabelPattern)
-
- for vMarkedChild in vlMarkedChildren:
- self.delChild(vMarkedChild)
-
-
-
- def delPRO(self):
- '''
- Deletes pro-drop nodes (*PRO* in PTB) in the subtree rooted at
- this node (except the node itself)
- '''
-
- vlPROChildren = []
-
- for vChild in self.getChildren():
- if vChild.isPreTerminal():
- if vChild.getTerminal() == "*PRO*":
- vlPROChildren.append(vChild)
- elif vChild.isTerminal():
- if vChild.surface == "*PRO*":
- vlPROChildren.append(self)
- else:
- vChild.delPRO()
-
- for vPROChild in vlPROChildren:
- self.delChild(vPROChild)
-
-
-
- def delChild(self, pChildNode):
- '''
- Deletes the given child node of the current node
- '''
-
- for i, vChild in enumerate(self.children):
- if vChild == pChildNode:
- del self.children[i]
- vChild.setParent(None)
-
- # node is deleted if left without child
- if self.getChildrenCount() == 0:
- self.parent.delChild(self)
-
-
-
- def delChildrenAt(self, pSpan):
- '''
- Deletes the children at the given position span from left
- '''
-
- self.children = self.children[ : pSpan[0] - 1] + self.children[pSpan[1] : ]
-
-
-
- def getChildNo(self, pChildNode):
- '''
- Returns the position of the given child node of the current node
- in its children list (starting from left)
- '''
-
- for i, vChild in enumerate(self.children, start = 1):
- if vChild == pChildNode:
- return i
-
- # if the child not found
- return -1
-
-
-
- def removeFunctionTags(self, pflgAllSubtree):
- '''
- Removes function tags from the labels
-
- Function tags are supposed to be attached to the syntactic labels by
- a dash (e.g. NP-OBJ).
-
- The removal can be done to only the current node or the entire subtree
- under the node.
- '''
-
- vGluePos = self.getLabel().find('-')
-
- if vGluePos != -1:
- self.synTag = self.getLabel()[ : vGluePos]
-
- if pflgAllSubtree:
- for vChild in self.getChildren():
- vChild.removeFunctionTags(pflgAllSubtree = pflgAllSubtree)
-
-
-
- def getPathTo(self, pNode):
- '''
- Extracts and returns the path from this node to the given node
-
- NOTE: this method is not recursive. It has been designed for efficiency.
-
- The path is a string consisting of the syntactic label of the node and the directions represented by /, \, > and
- < (up, down, right, left in order). Left and right are used when descending from the lowest common ancestor down
- to the destination node. If the path turns right, \> is used and if left, \<.
- '''
-
- if self == None:
- return ''
- else:
- # 1. stacking this node and all its parents up until root
- vlNodeParents = [self]
- while not vlNodeParents[-1].isRoot():
- vlNodeParents.append(vlNodeParents[-1].getParent())
-
- # 2. stacking destination node and all its parents up until root
- vlDestParents = [pNode]
- while not vlDestParents[-1].isRoot():
- vlDestParents.append(vlDestParents[-1].getParent())
-
- # 3. finding lowest common ancestor (the u-turn point)
-
- # if there is only one node in either list, LCA is the last one
- if len(vlNodeParents) == 1 or len(vlDestParents) == 1:
- vLCA = vlNodeParents[-1]
- # otherwise, matching the stacks
- else:
- for vNPIdx, vDPIdx in zip(reversed(range(len(vlNodeParents))), reversed(range(len(vlDestParents)))):
- # if the nodes before these two are the same, this node (at the current index) cannot be LCA
- if vlNodeParents[vNPIdx - 1] == vlDestParents[vDPIdx - 1]:
- del vlNodeParents[vNPIdx]
- del vlDestParents[vDPIdx]
- # if the nodes before these two are not the same, this node (at the current index) is LCA
- else:
- vLCA = vlNodeParents[vNPIdx]
- break
-
- # 4. extracting the path
-
- # 4.a. this node to LCA
- vPath = '/'.join([n.getSynTag() for n in vlNodeParents])
- # 4.b. u-turn (if vLCA is not the start or the end)
- if len(vlNodeParents) > 1 and len(vlDestParents) > 1:
- if vLCA.getChildNo(vlNodeParents[-2]) < vLCA.getChildNo(vlDestParents[-2]):
- vPath += '\>'
- else:
- vPath += '\<'
- # 4.c. LCA to destination node
- vPath += '\\'.join([n.getSynTag() for n in reversed(vlDestParents[:-1])])
-
- return vPath
-
-
-
- def lowerTerminals(self):
- '''
- Converts terminals in the subtree to lowercase
- '''
-
- for vTNode in self.getTerminalNodes():
- vTNode.setTerminalForm(vTNode.getTerminal().lower())
-
-
-
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # BConstParser
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- class BConstParser:
- '''
- A class for baseline constituency parsing
-
- It implements the following baseline constituency parsing methods:
- - random parsing
- '''
-
- def __init__(self):
- '''
- Constructor
- '''
-
- # default tag sets
-
- self.enPOSTagSet = ["#", ".", ":", ",", "''", "``", "$", "-LRB-", "-RRB-", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP$", "WP", "WRB"]
- self.enPhraseTagSet = ["ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", "NAC", "NP", "NX", "PP", "PRN", "PRT", "QP", "RRC", "S", "SBAR", "SBARQ", "SINV", "SQ", "UCP", "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X"]
- self.frPOSTagSet = ["ADJ", "ADJWH", "ADV", "ADVWH", "CC", "CLO", "CLR", "CLS", "CS", "DET", "DETWH", "ET", "I", "NC", "NPP", "P", "P+D", "PONCT", "P+PRO", "PREF", "PRO", "PROREL", "PROWH", "V", "VIMP", "VINF", "VPP", "VPR", "VS"]
- self.frPhraseTagSet = ["AdP", "AP", "COORD", "NP", "PP", "SENT", "Sint", "Srel", "Ssub", "VN", "VPinf", "VPpart"]
-
-
-
- def _getLangTagsets(self, pLang):
- '''
- Returns POS and phrase tag set for the language
- '''
-
- vLang = util.normalizeLang(pLang)
-
- if vLang == "en":
- return self.enPOSTagSet, self.enPhraseTagSet
- elif vLang == "fr":
- return self.frPOSTagSet, self.frPhraseTagSet
-
-
-
- def randomParse(self, pSentence, pLang = "en", pflgTokenize = False):
- '''
- Randomly creates a constituency tree for the sentence.
-
- The output is a ConstTree.
- '''
-
- if pflgTokenize:
- vSentence = nlp.tokenizeSegment(pSentence, pLang, False)
- else:
- vSentence = pSentence
-
- vlTokens = vSentence.strip().split()
-
- # setting tag sets for the language
- vlPOSTagset, vlPhraseTagset = self._getLangTagsets(pLang)
-
- # creating a ConsTree
- vCTree = ConstTree()
-
- # parsing
- vCTree.root = self._randParsePhrase(vlTokens, None, vlPOSTagset, vlPhraseTagset)
-
- return vCTree
-
-
-
- def _randParsePhrase(self, plPhraseTokens, pParent, plPOSTagset, plPhraseTagset):
- '''
- Randomly creates a phrase structure for the phrase recursively.
-
- The tokens are recursively randomly split into phrases. Phrase
- tags are randomly assigned from the phrase tag set and POS tags
- from the POS tag set of the language.
-
- It returns a ConsNode for the phrase structure.
- '''
-
- vLen = len(plPhraseTokens)
-
- # creating a node for the phrase
- vNode = ConstNode()
- vNode.parent = pParent
-
- if vLen == 1:
- ## If the phrase is only 1 token and the parent is None, this
- ## is a special case of one-token sentence and terminal node
- ## should created under this node.
- if vNode.parent == None:
- vTerminalNode = ConstNode()
- vTerminalNode.parent = vNode
- vTerminalNode.synTag = self._getRandomTag(plPOSTagset)
- vTerminalNode.terminal = plPhraseTokens[0]
- vNode.synTag = self._getRandomTag(plPhraseTagset)
- vNode.children.append(vTerminalNode)
- else:
- vNode.synTag = self._getRandomTag(plPOSTagset)
- vNode.terminal = plPhraseTokens[0]
- else:
- # randomly labelling the phrase
- vNode.synTag = self._getRandomTag(plPhraseTagset)
-
- # splitting into a random number of sub-phrases:
- vlSubPhrases = self._randSplitPhrase(plPhraseTokens)
-
- ## recursively creating phrase structure for each sub-phrase
- ## (children nodes)
- for vlSubPhrase in vlSubPhrases:
- vNode.children.append(self._randParsePhrase(vlSubPhrase, vNode, plPOSTagset, plPhraseTagset))
-
- return vNode
-
-
-
- def _randSplitPhrase(self, plPhraseTokens):
- '''
- Randomly splits phrase into subphrases recursively.
-
- The output is a list of sub-phrases which are in turn list of
- tokens.
- '''
-
- ## It's first split into two sub-phrases. Then the right sub-phrase
- ## is recursively split again.
-
- vLen = len(plPhraseTokens)
-
- if vLen == 1:
- return [plPhraseTokens]
- elif vLen == 2:
- return [[plPhraseTokens[0]], [plPhraseTokens[1]]]
- else:
- ## A phrase of length N cat be split at positions 2 (second token)
- ## to N (last token).
- vSplitPos = random.randint(2, vLen)
- return [plPhraseTokens[ : vSplitPos - 1]] + self._randSplitPhrase(plPhraseTokens[vSplitPos - 1 :])
-
-
-
- def _getRandomTag(self, plTagset):
- '''
- Returns a random tag from plTagset
- '''
-
- return plTagset[random.randrange(len(plTagset))]
-
-
-
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # ConstParseLoader
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- class ConstParseLoader:
- '''
- Constituency parse loader
-
- Loads constituency parse trees into ConstParses objects. Depending on
- the parse file format it loads n-best parses and parsing details such
- as log probability and time are also loaded.
- '''
-
-
- def __init__(self):
- '''
- Constructor
- '''
-
- # list of ConstParses
- self.parses = None
-
-
-
- def loadPTBTrees(self, pPTBFile, pflgExpandTerminal = False):
- '''
- Loads the parse trees in Penn Treebank format.
-
- If pflgExpandTerminal is set to true, the terminal node will be
- expanded into a pre-terminal (POS tag) and terminal (token) nodes.
- '''
-
- try:
- vlTrees = open(pPTBFile).read().strip().split('\n')
- except IOError:
- raise Exception("Cannot open the tree file: %s" % pPTBFile)
-
- self.parses = []
- for vTree in vlTrees:
- self.parses.append(ConstParses())
- vCTree = self._createNewTree()
- vCTree.loadPTBTree(vTree, pflgExpandTerminal)
- self.parses[-1].parses.append(ConstParse(vCTree))
-
-
-
- def _createNewTree(self):
- '''
- Creates and returns a new constituency tree
- '''
-
- return ConstTree()
-
-
-
- def loadParses(self, pParseFile, pflgExpandTerminal = False):
- '''
- Loads parse trees in PTB format with their log probabilities if
- included.
-
- If pflgExpandTerminal is set to true, the terminal node will be
- expanded into a pre-terminal (POS tag) and terminal (token) nodes.
- '''
-
- try:
- vlLines = open(pParseFile).read().strip().split('\n')
- except IOError:
- raise Exception("Cannot open the parse file: %s" % pParseFile)
-
- self.parses = []
- for vLine in vlLines:
- self.parses.append(ConstParses())
- vCTree = self._createNewTree()
-
- if vLine[0] == '-' or vLine[0] == '0':
- vColon = vLine.find(':')
- vLogProb = float(vLine[ : vColon].strip())
- vTree = vLine[vColon+1 : ].strip()
- vCTree.loadPTBTree(vTree, pflgExpandTerminal)
- self.parses[-1].parses.append(ConstParse(vCTree, vLogProb))
- else:
- vTree = vLine
- vCTree.loadPTBTree(vTree, pflgExpandTerminal)
- self.parses[-1].parses.append(ConstParse(vCTree))
-
-
-
- def loadLorgParses(self, pParseFile, pNoParseRepTreeFile = '', pNoParseRepProbFile = '', pDummyForNoParse = "( (ERR NOPARSE))", pDefaultProbForNoParse = -150, pflgExpandTerminal = False, pflgVerbose = True):
- '''
- Loads the parse tree and details from Lorg output file
-
- Both verbose and quiet output are supported.
-
- If pflgExpandTerminal is set to true, the terminal node will be
- expanded into a pre-terminal (POS tag) and terminal (token) nodes.
-
- Lorg parser output may contain "no parse found" for sentences it
- cannot This method implements two ways to get around that.
- The first is to replace them with alternative parses provided in
- an alternative data set in pNoParseRepTreeFile (as well as their
- probabilities in pNoParseRepProbFile. Similar to the main parses,
- the trees are provided in a file with the same size but one tree
- per line, and only required lines will be retrieved. The probabilities
- are also provided in one per line in a file. So, if the no parse
- sentences are known in advance, the rest can be left blank and
- alternative parses can be provided in the appropriate positions
- in the file. The second way is to replace them with a dummy parse
- tree and a default probability value. If the alternative data set
- is not provided, the second method will be used.
- '''
-
- # loading parses
-
- try:
- vLorgOutput = open(pParseFile).read()
- except IOError:
- raise Exception("Cannot open the parse file: %s" % pParseFile)
-
- # loading replacement parses and probabilities for no-parse
-
- if pNoParseRepTreeFile != '' and pNoParseRepProbFile != '':
- try:
- vlNoParseRepTrees = open(pNoParseRepTreeFile).read().strip().split('\n')
- vlNoParseRepProbs = open(pNoParseRepProbFile).read().strip().split('\n')
- except IOError:
- raise Exception("Cannot open the replacement parse file: %s" % pNoParseRepFile)
- else:
- vlNoParseRepTrees = []
- vlNoParseRepProbs = []
-
-
- self.parses = []
- for vLine in vLorgOutput.strip().split("\n"):
- if vLine.startswith("###"):
- vLineContent = vLine[4:].strip()
- vlFound = []
-
- # ID for sanity check and initilization
- vPattern = re.compile("ID: ([0-9]+)$")
- vlFound = vPattern.findall(vLineContent)
- if len(vlFound) == 1:
- vID = vlFound[0]
- # initialization
- self.parses.append(ConstParses())
-
- # time
- vPattern = re.compile("time: ([0-9]*[.]*[0-9]+)$")
- vlFound = vPattern.findall(vLineContent)
- if len(vlFound) == 1:
- self.parses[-1].time = float(vlFound[0])
- elif vLine[0] == '-' or vLine[0] == '0':
- # n-best parses
- vColon = vLine.find(':')
- vLogProb = float(vLine[ : vColon].strip())
- vParseTree = self._createNewTree()
- vParseTree.loadPTBTree(vLine[vColon+1 : ].strip(), pflgExpandTerminal)
- self.parses[-1].parses.append(ConstParse(vParseTree, vLogProb))
- elif vLine.startswith("no parse found"):
- vParseTree = self._createNewTree()
- if len(vlNoParseRepTrees) > 0:
- vNoParseRepTree = vlNoParseRepTrees[len(self.parses) - 1]
- vRepLogProb = vlNoParseRepProbs[len(self.parses) - 1]
- else:
- vNoParseRepTree = pDummyForNoParse
- vRepLogProb = pDefaultProbForNoParse
- vParseTree.loadPTBTree(vNoParseRepTree, pflgExpandTerminal)
- if not pflgVerbose:
- self.parses.append(ConstParses())
- self.parses[-1].parses = [ConstParse(vParseTree, vRepLogProb)]
- elif vLine.startswith("("): # older Lorg
- vParseTree = self._createNewTree()
- vParseTree.loadPTBTree(vLine.strip(), pflgExpandTerminal)
- if not pflgVerbose:
- self.parses.append(ConstParses())
- self.parses[-1].parses = [ConstParse(vParseTree)]
- else:
- raise Exception("Unknown pattern: %s" % vLine)
-
-
-
- def getParses(self):
- '''
- Returns the loaded constituency parses
-
- The returned value is the list of ConstParses object
- '''
-
- return self.parses
-
-
-
|