constparse.py 65 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601
  1. #! /usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. """
  4. This module defines classes related to constituency parsing.
  5. Version 1.5 (06-Jul-2016)
  6. - lowerTerminals() is added to ConstTree and ConstNode.
  7. Version 1.4 (04-May-2016)
  8. - A bug in stratifyRules() of ConstNode is fixed related to inconsistent
  9. naming changes. Also creating a new node in stratifyPartialRule()
  10. is now done using _getNewNode() method.
  11. Version 1.3 (19-Feb-2016)
  12. - getPathTo() is added to ConstNode.
  13. Version 1.2 (19-Nov-2015 to 24-Nov-2015)
  14. - deepCopy() is added to ConstTree.
  15. - deepCopy() ans shallowCopy() of ConstNode are edited to enable inherited
  16. classes to create new nodes of their own type (by using _getNewNode()).
  17. - SRL attribute is added to the constructor of ConstTree.
  18. Version 1.1 (28-May-2015 to 10-Jun-2015)
  19. - removeSuffix() and hasSuffixedLabel() are moved from ConstTree and
  20. ConstNode to ForeeConstTree and ForeeConstNode in foreebank.py.
  21. - getRightSibling() and getLeftSibling() are added to ConstNode.
  22. - addChild() and replaceChild() of ConstNode are changed to set the
  23. parent of the added child regardless of it being done by the calling
  24. code.
  25. - delChild() is changed to set the parent of the deleted child to None.
  26. - removeFunctionTags() is added to ConstTree and ConstNode.
  27. - ConstParseLoader is moved here from dloader.py.
  28. - _createNewTree() is added to ConstParseLoader() and the changes are
  29. made accordingy to allow the child classes create their corresponding
  30. tree class instances.
  31. - getRoot(), getRootLabel(), getPreTerminals() are added to ConstTree.
  32. - getSynLabels() is added to ConstTree and ConstNode.
  33. Version 1.0 (25-May-2015)
  34. - ConstParses, ConstParse, ConstTree, ConstNode, BConstParser are moved
  35. here from dloader.py.
  36. - getTerminalNodes() is added to ConstTree and ConstNode.
  37. """
  38. import re, copy
  39. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  40. # ConstParses
  41. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  42. class ConstParses:
  43. '''
  44. N-best constituency parses of a sentences
  45. A ConstParses contains the list of n-best parses, each an instance of
  46. ConstParse, together with other derails such as parsing time.
  47. '''
  48. def __init__(self):
  49. '''
  50. Constructor
  51. '''
  52. self.parses = []
  53. self.time = None
  54. @property
  55. def tree(self):
  56. '''
  57. Returns the topmost parse tree in the n-best list
  58. '''
  59. return self.parses[0].tree
  60. @property
  61. def logProb(self):
  62. '''
  63. Returns the log probability of the topmost parse in the n-best list
  64. '''
  65. return self.parses[0].logProb
  66. @property
  67. def prob(self):
  68. '''
  69. Returns the probability of the topmost parse in the n-best list
  70. '''
  71. if self.parses[0].logProb == None:
  72. return None
  73. else:
  74. return math.exp(self.parses[0].logProb)
  75. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  76. # ConstParse
  77. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  78. class ConstParse:
  79. '''
  80. Constituency parse
  81. A ConstParse contains a constituency parse tree of type ConstTree,
  82. together with their log probability.
  83. '''
  84. def __init__(self, pConstTree, pParseLogProb = None):
  85. '''
  86. Constructor
  87. '''
  88. self.tree = pConstTree
  89. self.logProb = pParseLogProb
  90. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  91. # ConstTree
  92. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  93. class ConstTree:
  94. '''
  95. Constituency tree
  96. A ConstTree is the constituency parse tree of a sentence where each
  97. constituent is represented by a ConstNode
  98. Pre-terminals (POS tags) and terminal (tokens) can optionally be in
  99. in one node or in separate nodes, where the pre-terminal is the parent
  100. of terminal.
  101. Note that the constituency tree is implemented independent of any format
  102. like CoNLLSentence to be used more generally.
  103. Note that as of version 1.6 of this module, ConstTree is implemented
  104. differently from DepTree. A DepTree is a list of all its nodes, while
  105. a ConstTree is only root node storing the direct children only.
  106. '''
  107. def __init__(self):
  108. '''
  109. Constructor
  110. '''
  111. self.root = None
  112. ## set to true if the terminal is expanded into pre-terminal and
  113. ## terminal nodes
  114. self.expandedTerminal = False
  115. self.srl = None
  116. def deepCopy(self):
  117. '''
  118. NOTE: it seems python deepcopy() works better. Try the idea before
  119. using this method.
  120. Created a deep copy of the tree.
  121. '''
  122. vCopyTree = self._createNewTree()
  123. vCopyTree.root = self.root.deepCopy(pflgCopyTree = True)
  124. vCopyTree.srl = self.srl
  125. return vCopyTree
  126. def _createNewTree(self):
  127. '''
  128. Creates and returns a new ConstTree
  129. This method is useful in class inheriting.
  130. '''
  131. return ConstTree()
  132. def __str__(self):
  133. '''
  134. String representation of the class
  135. '''
  136. return self.getPTBFormat()
  137. def loadPTBTree(self, pPTBTree, pflgExpandTerminal = False):
  138. '''
  139. Creates the tree by loading from Penn Treebank style tree.
  140. Note that TOP, ROOT and empty are considered tree dummy root node
  141. labels and will not be added as nodes.
  142. If pflgExpandTerminal is set to true, the terminal node will be
  143. expanded into a pre-terminal (POS tag) and terminal (token) nodes.
  144. '''
  145. # taking out the main tree
  146. if pPTBTree.startswith("( ("):
  147. vPTBTree = pPTBTree.strip()[2:-1]
  148. elif pPTBTree.upper().startswith("(TOP ("):
  149. vPTBTree = pPTBTree.strip()[5:-1]
  150. elif pPTBTree.upper().startswith("(ROOT ("):
  151. vPTBTree = pPTBTree.strip()[6:-1]
  152. elif pPTBTree.startswith('('):
  153. vPTBTree = pPTBTree.strip()
  154. else:
  155. sys.exit("Bad tree! It doesn't start with a bracket.")
  156. self.root = self._createRoot()
  157. self.expandedTerminal = pflgExpandTerminal
  158. self.root.loadConstituent(vPTBTree, None, pConstFormat = 'ptb', pflgExpandTerminal = pflgExpandTerminal)
  159. def _createRoot(self):
  160. '''
  161. Creates and returns the root node
  162. '''
  163. return ConstNode()
  164. def loadSRL(self, pSRL, pConversionMethod = "elevate"):
  165. '''
  166. Loads semantic role labeling from SRL object
  167. It checks for the type of SRL and converts it to constituency if
  168. it's in other formalisms like dependnecy.
  169. It both loads the whole annotation into the srl attribute (of type
  170. SRL) and the token annotations into nodes.
  171. - pConversionMethod: if pSRL is not in constituency format, this
  172. specifies the conversion method. Follow srl.convertToConst() for
  173. details.
  174. '''
  175. if not pSRL.isConstituency():
  176. self.srl = pSRL.convertToConst(self, pConversionMethod, pflgLoadIntoTree = True)
  177. else:
  178. self.srl = pSRL
  179. # loading node annotations
  180. #for vProp in self.srl.propositions:
  181. # predicate
  182. # vPredNode = vProp.predicate.node
  183. # vPredNode.predRoles.append((None, vProp.predicate.frameset))
  184. # arguments
  185. # for vArg in vProp.arguments:
  186. # vArgNode = vArg.node
  187. ## preventing duplicates (They may happen if the original
  188. ## SRL is dependency-based and a role is assigned to multiple
  189. ## token which are elevated to the same constituency node.)
  190. # if (vPredNode, vArg.label) not in vArgNode.predRoles:
  191. # vArgNode.predRoles.append((vPredNode, vArg.label))
  192. def getPTBFormat(self):
  193. '''
  194. Returns the tree in PTB brackeing format
  195. '''
  196. return self.root.getPTBFormat()
  197. def getTerminals(self):
  198. '''
  199. Returns the list of terminals (tokens) of the tree in their original
  200. order.
  201. '''
  202. return self.root.getTerminals()
  203. def getPreTerminals(self):
  204. '''
  205. Returns the preterminal labels of the tree in a list
  206. '''
  207. return self.root.getPreTerminals()
  208. @property
  209. def surface(self):
  210. '''
  211. Surface form of the sentence
  212. '''
  213. return self.root.surface
  214. def getTerminalNodes(self):
  215. '''
  216. Returns the list of terminal nodes of the tree in their original
  217. order.
  218. '''
  219. return self.root.getTerminalNodes()
  220. def getPreTerminalNodes(self):
  221. '''
  222. Returns pre-teriminal nodes of the tree
  223. '''
  224. return self.root.getPreTerminalNodes()
  225. def getPreTerminalNodeAt(self, pPosition):
  226. '''
  227. Returns the pre-terminal node of token at given position if the
  228. tree format expandes terminals or terminal node if it does not
  229. '''
  230. return self.root.getPreTerminalNodeAt(pPosition)
  231. def getHeight(self):
  232. '''
  233. Returns the height of the tree which is the number of levels from
  234. the root to the deepest (farthest) terminal.
  235. The height of a tree with only one node is 0.
  236. '''
  237. return self.root.getHeight()
  238. def getSize(self):
  239. '''
  240. Returns the size of the tree which is the number of its nodes.
  241. '''
  242. return self.root.getSize()
  243. def getPOSs(self):
  244. '''
  245. Returns POS tags (pre-trminals) of the terminals in their original
  246. order in a list.
  247. '''
  248. return self.root.getPreTerminals()
  249. def getUniversalPOSs(self, pUniversalTagMap):
  250. '''
  251. Returns POS tags mapped to the universal POS tags of Petrov et al.
  252. (2012).
  253. The map needs to be provided in a tag map file format.
  254. '''
  255. try:
  256. vdTagMap = loadTagMap(open(pUniversalTagMap).read(), pflgSelfMapIn = True)
  257. except IOError:
  258. raise Exception("Cannot open tag map file: %s" % pUniversalTagMap)
  259. vlUPOSTags = []
  260. for vTag in self.getPOSs():
  261. try:
  262. vlUPOSTags.append(vdTagMap[vTag])
  263. except KeyError:
  264. if "_catchall" in vdTagMap:
  265. vlUPOSTags.append(vdTagMap["_catchall"])
  266. else:
  267. vlUPOSTags.append(vTag)
  268. return vlUPOSTags
  269. def mapTags(self, pdMap):
  270. '''
  271. Maps syntactic tags to a new tag set using map in pdMap.
  272. '''
  273. self.root.mapTags(pdMap)
  274. def getCoarseChunks(self):
  275. '''
  276. Extracts and returns coarse-grained chunks which are children
  277. of root node.
  278. '''
  279. return self.root.getChildrenTags()
  280. def getHighestNodeOfSpan(self, pSpan):
  281. '''
  282. Returns the highest (closest to root) node spanning the given token
  283. span
  284. An exception is raised if pSpan is not in the range or it crosses
  285. the left or right boundary of a node in the tree.
  286. '''
  287. vHSNode = self.root.getHighestNodeOfSpan(pSpan)
  288. if vHSNode == None:
  289. vTokenSpan = self.root.getTokenSpan()
  290. raise Exception("Invalid range: (%s %s) is not in (%s %s)" % (pSpan[0], pSpan[1], vTokenSpan[0], vTokenSpan[1]))
  291. else:
  292. return vHSNode
  293. def getSentLength(self):
  294. '''
  295. Returns the length of the sentence of the tree
  296. '''
  297. return self.root.getSpanSize()
  298. def getTagset(self):
  299. '''
  300. Return the syntactic tags of the tree nodes and their counts.
  301. '''
  302. return self.root.getTagset()
  303. def getSynLabels(self):
  304. '''
  305. Return the syntactic labes of the tree nodes including phrase labels
  306. and POS tags in a list
  307. '''
  308. return self.root.getSynLabels()
  309. def extractCFGRules(self, pflgNoLexRule = False):
  310. '''
  311. Extracts and returns the CFG production rules expanding the nodes
  312. of the tree.
  313. The rules are returned in a list. Each rule is a dictionary of
  314. the left and right hand sides of the rules, the former being a
  315. phrase tag and the latter a list of tags.
  316. If pflgNoLexRule is true, lexical rules will not be extracted.
  317. '''
  318. return self.root.extractCFGRules(pflgNoLexRule)
  319. def extractLexRules(self):
  320. '''
  321. Extracts and returns the lexical CFG rules.
  322. The rules are returned in a list. Each rule is a dictionary of
  323. the left and right hand sides of the rules, the former being a
  324. POS tag and the latter a terminal token.
  325. '''
  326. return self.root.extractLexRules()
  327. def extractSubsetTrees(self):
  328. '''
  329. Extracts and returns list of all subset trees of the tree.
  330. Subset trees are used in tree kernels (c.f. Moschitti 2006)
  331. '''
  332. vlFragments = self.root.extractSubsetTrees()
  333. vlSsTrees = []
  334. for vFrag in vlFragments:
  335. vTree = ConstTree()
  336. vTree.root = vFrag
  337. vlSsTrees.append(vTree)
  338. return vlSsTrees
  339. def getPST(self, pPredNode):
  340. '''
  341. Extracts proposition subtree (PST) the predicate node of which is
  342. given
  343. PST is a subtree of the constituency tree spanning only the predicate
  344. and argument nodes of that proposition.
  345. '''
  346. vPST = ConstTree()
  347. vPST.expandedTerminal = self.expandedTerminal
  348. vPSTRoot, vPSTPredNode = self.root.getRawPST(pPredNode, None)
  349. def _postProcPST(pPSTNode, pOldPredNode, pNewPredNode):
  350. '''
  351. Post-processes PST to set the SRL based on the new predicate
  352. node
  353. Only the role of argument of the given predicate and the frameset
  354. of this predicate are kept.
  355. '''
  356. if pPSTNode.isArgumentOf(pOldPredNode):
  357. ## In Theory, no argument can take two roles for a single
  358. ## predicate. However, due to errors (e.g. conversion from
  359. ## dependency SRl to constituency SRL, this may happen.
  360. ## The uncommented code takes care of the situation.
  361. #pPSTNode.setPredRoles([(pNewPredNode, vPSTNode.getArgRoleOf(pOldPredNode))])
  362. pPSTNode.setPredRoles([(pNewPredNode, r) for r in pPSTNode.getArgRoleOf(pOldPredNode)])
  363. elif pPSTNode == pNewPredNode:
  364. pPSTNode.setPredRoles([(None, pPSTNode.getPredicateFrameset())])
  365. else:
  366. pPSTNode.removePredRoles()
  367. for vChild in pPSTNode.getChildren():
  368. _postProcPST(vChild, pOldPredNode, pNewPredNode)
  369. # post-processing SRL in the PST as the predicate node is new
  370. _postProcPST(vPSTRoot, pPredNode, vPSTPredNode)
  371. ## attempt to remove the trailing 1-child nodes from the
  372. ## top of the subtree, e.g. (S (S (VP (VP (ADJP (...
  373. while True:
  374. if vPSTRoot.getChildrenCount() == 1 and not vPSTRoot.getChildren()[0].isArgument() and vPSTRoot.getChildren()[0] != vPSTPredNode:
  375. vPSTRoot = vPSTRoot.getChildren()[0]
  376. else:
  377. break
  378. vPST.root = vPSTRoot
  379. return vPST, vPSTPredNode
  380. def addVPToFTBTree(self, pPostVerbalNodeCount = 2):
  381. '''
  382. Adds a VP node above all VNs in the tree and their post-verbal NP
  383. and PPs siblings.
  384. pPostVerbalNodeCount specifies the number of post-verbal sibling
  385. to go under the VP node.
  386. '''
  387. self.root.addVPToFTBTree(pPostVerbalNodeCount)
  388. def stratifyRules(self):
  389. '''
  390. Stratifies production rules.
  391. See ConsNode.stratifyFTBTree()
  392. '''
  393. self.root.stratifyRules()
  394. def fixFTBCoord(self):
  395. '''
  396. Fixes the FTB coordination in the tree.
  397. See ConsNode.fixFTBCoord()
  398. '''
  399. self.root.fixFTBCoord()
  400. def delNodes(self, pLabelPattern):
  401. '''
  402. Deletes nodes with specified regular expression pattern
  403. A root node cannot be deleted. When a node is removed, the entire
  404. subtree rooted at it is removed.
  405. '''
  406. self.root.delNodes(pLabelPattern = pLabelPattern)
  407. def delPRO(self):
  408. '''
  409. Deletes pro-drop nodes (*PRO* in PTB)
  410. '''
  411. self.root.delPRO()
  412. def removeFunctionTags(self):
  413. '''
  414. Removes function tags from the labels
  415. Function tags are supposed to be glued to the syntactic labels by
  416. a dash (e.g. NP-OBJ)
  417. '''
  418. self.root.removeFunctionTags(pflgAllSubtree = True)
  419. def getRoot(self):
  420. '''
  421. Returns the root node
  422. '''
  423. return self.root
  424. def getRootLabel(self):
  425. '''
  426. Returns the syntactic label of the root node
  427. '''
  428. return self.root.getLabel()
  429. def lowerTerminals(self):
  430. '''
  431. Converts terminals to lowercase
  432. '''
  433. self.root.lowerTerminals()
  434. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  435. # ConstNode
  436. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  437. class ConstNode:
  438. '''
  439. Constituency tree node
  440. A ConstNode represents a constituent in constituency parse tree. It
  441. has at most one parent node of the same type and arbitrary number of
  442. children nodes of the same type including none.
  443. A ConstNode is in fact a constituency subtree because all its children
  444. down to the terminals are recursively accessible.
  445. Note that the root node has no parent. It is accessed by/via ConstTree
  446. where.
  447. Pre-terminals (POS tags) and terminal (tokens) can optionally be in
  448. in one node or in separate nodes, where the pre-terminal is the parent
  449. of terminal.
  450. '''
  451. def __init__(self):
  452. '''
  453. Constructor
  454. '''
  455. self.parent = None
  456. self.children = []
  457. self.synTag = ''
  458. # terminal (word) if the node is leaf: e.g. (DT the)
  459. self.terminal = ''
  460. ## set to true if the terminal is expanded into pre-terminal and
  461. ## terminal nodes
  462. self.expandedTerminal = False
  463. ## list of semantic role and predicate tuples
  464. ## NOTE: a predicate node itself is assigned (None, frameset)
  465. self.predRoles = []
  466. def deepCopy(self, pflgCopyTree = False):
  467. '''
  468. NOTE: it seems in some cases python deepcopy() works better. Try
  469. both and choose the one which suites the purpose.
  470. Creates and returns a deep copy of the node optionally including
  471. the sub tree under it
  472. NOTE: This deep copy does not deep copy the predicate node in the
  473. self.predRoles tuples. The necessary action should be taken after
  474. deep copy if needed.
  475. '''
  476. # copying the node
  477. vNodeCopy = self._getNewNode()
  478. vNodeCopy.parent = self.parent
  479. vNodeCopy.children = self.children
  480. vNodeCopy.synTag = self.synTag
  481. vNodeCopy.terminal = self.terminal
  482. vNodeCopy.expandedTerminal = self.expandedTerminal
  483. # Note the method comments
  484. vNodeCopy.predRoles = self.predRoles
  485. # copying subtree rooted at node
  486. if pflgCopyTree:
  487. vNodeCopy.children = []
  488. for vChild in self.children:
  489. vNodeCopy.children.append(vChild.deepCopy(True))
  490. vNodeCopy.children[-1].parent = vNodeCopy
  491. return vNodeCopy
  492. def shallowCopy(self, pflgCopyTree = False):
  493. '''
  494. Creates and returns a shallow copy of the node
  495. The shallow copy does not have a parent and children.
  496. '''
  497. vNodeCopy = self._getNewNode()
  498. vNodeCopy.synTag = self.synTag
  499. vNodeCopy.terminal = self.terminal
  500. vNodeCopy.expandedTerminal = self.expandedTerminal
  501. vNodeCopy.predRoles = self.predRoles
  502. return vNodeCopy
  503. def __str__(self):
  504. '''
  505. String representation of the class
  506. '''
  507. return self.getPTBFormat()
  508. def setLabel(self, pLabel):
  509. '''
  510. Sets the value of the node label, i.e. its syntactic tag
  511. '''
  512. self.synTag = pLabel
  513. def getLabel(self):
  514. '''
  515. Gets the node label, i.e. its syntactic tag
  516. '''
  517. return self.synTag
  518. def getSynTag(self):
  519. '''
  520. Returns the syntactic tag of the node
  521. '''
  522. return self.getLabel()
  523. def addChild(self, pChildNode):
  524. '''
  525. Addes a new child to the children of this node
  526. '''
  527. pChildNode.setParent(self)
  528. self.children.append(pChildNode)
  529. def insertChild(self, pLabel, pPosition):
  530. '''
  531. Inserts a child node with the given label at the given child number
  532. position
  533. It also returns the inserted child for further use.
  534. The child number positions are counted from the leftmost child.
  535. '''
  536. # creating the new node
  537. vNewNode = self._getNewNode()
  538. # setting the attributes
  539. vNewNode.setLabel(pLabel)
  540. vNewNode.setParent(self)
  541. vNewNode.setExpandedTerminal(self.getExpandedTerminal())
  542. # inserting at the right position
  543. self.children.insert(pPosition - 1, vNewNode)
  544. return vNewNode
  545. def insertIntermChild(self, pLabel, ptDomChildSpan):
  546. '''
  547. Inserts an intermediate child node with the given label dominating
  548. existing children at the given child number span.
  549. It also returns the inserted child for further use.
  550. The child number positions are counted from the leftmost child.
  551. '''
  552. # creating the new node
  553. vNewNode = self._getNewNode()
  554. # setting the attributes
  555. vNewNode.setLabel(pLabel)
  556. vNewNode.setParent(self)
  557. vNewNode.setExpandedTerminal(self.getExpandedTerminal())
  558. # identifying and moving the children under the new node
  559. vlDomChildren = self.getChildrenAt(ptDomChildSpan)
  560. self.delChildrenAt(ptDomChildSpan)
  561. for vChild in vlDomChildren:
  562. vChild.setParent(vNewNode)
  563. vNewNode.addChild(vChild)
  564. # inserting at the right position
  565. self.children.insert(ptDomChildSpan[0] - 1, vNewNode)
  566. return vNewNode
  567. def replaceChild(self, pPosition, pChild):
  568. '''
  569. Replaces the child node at the given position with new child
  570. '''
  571. pChild.setParent(self)
  572. self.children[pPosition - 1] = pChild
  573. def insertParent(self, pLabel):
  574. '''
  575. Inserts a node as the parent node of current node with the given
  576. label and performs the necessary changes.
  577. '''
  578. # creating the new node
  579. vNewNode = self._getNewNode()
  580. # setting the attributes
  581. vNewNode.setLabel(pLabel)
  582. vNewNode.setParent(self.getParent())
  583. vNewNode.setExpandedTerminal(self.getExpandedTerminal())
  584. # setting parent/child relations
  585. vNewNode.getParent().replaceChild(pPosition = vNewNode.getParent().getChildNo(self), pChild = vNewNode)
  586. vNewNode.addChild(self)
  587. self.setParent(vNewNode)
  588. def loadConstituent(self, pConst, pParent, pConstFormat = 'ptb', pflgExpandTerminal = False):
  589. '''
  590. Creates node for the input constituent
  591. Creating a node includes creating all its children recursively
  592. down to the terminals.
  593. Constituent formats supported are:
  594. - ptb: Penn Treebank (e.g. (NP (DT The) (NNS senators)) )
  595. If pflgExpandTerminal is set to true, the terminal node will be
  596. expanded into a pre-terminal (POS tag) and terminal (token) nodes.
  597. '''
  598. self.expandedTerminal = pflgExpandTerminal
  599. if pConstFormat.lower() == "ptb" or pConstFormat.lower().startswith("penn"):
  600. return self._createPTBNode(pConst, pParent, pflgExpandTerminal = pflgExpandTerminal)
  601. def _createPTBNode(self, pConst, pParent, pflgExpandTerminal = False):
  602. '''
  603. Creates node for input constituent which is in PTB format
  604. Creating a node includes creating all its children recursively
  605. down to the terminals.
  606. If pflgExpandTerminal is set to true, the terminal node will be
  607. expanded into a pre-terminal (POS tag) and terminal (token) nodes.
  608. '''
  609. self.parent = pParent
  610. self.children = []
  611. self.expandedTerminal = pflgExpandTerminal
  612. vPTBConst = pConst.strip()
  613. vChildrenStartIdx = vPTBConst.find(' ') + 1
  614. # extracting tag
  615. self.synTag = vPTBConst[1 : vChildrenStartIdx - 1]
  616. # extracting terminal or children constituents
  617. vWhat = vPTBConst[vChildrenStartIdx : ].strip()
  618. ## checking if this is bracketing parenthesis not a sentence
  619. ## token which is not masked by -LRB-)
  620. if vWhat.startswith('(') and vWhat[1] != ')':
  621. vChildren = vWhat[ : -1]
  622. else:
  623. if pflgExpandTerminal:
  624. vChildTerminalNode = self._getNewNode()
  625. vChildTerminalNode.parent = self
  626. vChildTerminalNode.terminal = vWhat[ : -1]
  627. vChildTerminalNode.expandedTerminal = True
  628. self.children = [vChildTerminalNode]
  629. else:
  630. self.terminal = vWhat[ : -1]
  631. return
  632. # recursively creating children nodes
  633. vOpenBCount = 0
  634. vChildStartIdx = 0
  635. for i in range(len(vChildren)):
  636. if vChildren[i] == '(':
  637. ## checking if this is bracketing parenthesis not a sentence
  638. ## token which is not masked by -LRB-)
  639. if vChildren[i+1] != ')':
  640. vOpenBCount += 1
  641. elif vChildren[i] == ')':
  642. ## checking if this is bracketing parenthesis not a sentence
  643. ## token which is not masked by -RRB-)
  644. if vChildren[i-1] != ' ':
  645. vOpenBCount -= 1
  646. else:
  647. j = i - 2
  648. while j >= 0 and vChildren[j] == ' ':
  649. j -= 1
  650. if vChildren[j] == ')':
  651. vOpenBCount -= 1
  652. if vOpenBCount == 0:
  653. vChild = vChildren[vChildStartIdx : i+1].strip()
  654. if vChild != '':
  655. vChildNode = self._getNewNode()
  656. vChildNode._createPTBNode(vChild, self, pflgExpandTerminal = pflgExpandTerminal)
  657. self.children.append(vChildNode)
  658. vChildStartIdx = i + 1
  659. def _getNewNode(self):
  660. '''
  661. Creates and returns a node
  662. '''
  663. return ConstNode()
  664. def isTerminal(self):
  665. '''
  666. Returns true if the node is a terminal node.
  667. '''
  668. return len(self.children) == 0
  669. def isRoot(self):
  670. '''
  671. Returns true is this node is the root
  672. '''
  673. return self.parent == None
  674. def isPreTerminal(self):
  675. '''
  676. Returns true if the node is a pre-terminal node.
  677. Note pre-terminals exist only if the constituent is loaded into a
  678. node with pflgExpandTerminal option. Otherwise, the pre-terminal
  679. and terminal are integrated in one node which is called terminal.
  680. '''
  681. if self.expandedTerminal and len(self.children) == 1 and self.children[0].isTerminal():
  682. return True
  683. else:
  684. return False
  685. def setExpandedTerminal(self, pValue):
  686. '''
  687. Sets the value of expandedTerminal attribute
  688. '''
  689. self.expandedTerminal = pValue
  690. def getExpandedTerminal(self):
  691. '''
  692. Returns the value of expandedTerminal attribute
  693. Not to be confused with isExpandedTerminal().
  694. '''
  695. return self.expandedTerminal
  696. def isExpandedTerminal(self):
  697. '''
  698. Returns true if the node is an expanded terminal node.
  699. A terminal is expanded if the pre-terminal and terminals are two
  700. separate nodes.
  701. '''
  702. if self.parent == None:
  703. return False
  704. elif self.parent.isPreTerminal():
  705. return True
  706. else:
  707. return False
  708. def isCollapsedTerminal(self):
  709. '''
  710. Returns true if the node is a collapsed terminal node.
  711. A terminal is collapsed if the pre-terminal and terminals are
  712. merged into one node, i.e. expandedTerminal is false.
  713. '''
  714. if not self.expandedTerminal and self.isTerminal():
  715. return True
  716. else:
  717. return False
  718. def getPTBFormat(self):
  719. '''
  720. Returns the constituent in PTB brackeing format.
  721. '''
  722. vPTBConst = '(%s ' % self.synTag
  723. if self.isPreTerminal():
  724. vPTBConst += self.children[0].terminal + ')'
  725. else:
  726. if self.terminal != '':
  727. vPTBConst += self.terminal + ')'
  728. else:
  729. vPTBChildren = []
  730. for vChild in self.children:
  731. vPTBChildren.append(vChild.getPTBFormat())
  732. vPTBConst += ' '.join(vPTBChildren) + ')'
  733. return vPTBConst
  734. def getTerminals(self):
  735. '''
  736. Extracts and returns the list of terminals (tokens) the node spans.
  737. '''
  738. vlTerminals = []
  739. if self.isTerminal():
  740. vlTerminals.append(self.terminal)
  741. else:
  742. for vChild in self.children:
  743. vlTerminals += vChild.getTerminals()
  744. return vlTerminals
  745. def setTerminalForm(self, pForm):
  746. '''
  747. Sets the value of the surface form of the terminal node
  748. '''
  749. if self.isTerminal():
  750. self.terminal = pForm
  751. else:
  752. raise Exception("Not a terminal node!")
  753. @property
  754. def surface(self):
  755. '''
  756. Surface form of the sentence
  757. '''
  758. return ' '.join(self.getTerminals())
  759. def getTerminalNodes(self):
  760. '''
  761. Extracts and returns the list of terminal nodes the node spans.
  762. '''
  763. vlTNodes = []
  764. if self.isTerminal():
  765. vlTNodes.append(self)
  766. else:
  767. for vChild in self.children:
  768. vlTNodes += vChild.getTerminalNodes()
  769. return vlTNodes
  770. def getPreTerminals(self):
  771. '''
  772. Extracts and returns the list of pre-terminals (POS tags) the node spans
  773. '''
  774. vlPreTerminals = []
  775. if self.isPreTerminal() or self.isTerminal():
  776. vlPreTerminals.append(self.synTag)
  777. else:
  778. for vChild in self.children:
  779. vlPreTerminals += vChild.getPreTerminals()
  780. return vlPreTerminals
  781. def getPreTerminalNodes(self):
  782. '''
  783. Extracts and returns the list of pre-terminal nodes the node spans
  784. '''
  785. vlPreTerminalNodes = []
  786. if self.isPreTerminal():
  787. vlPreTerminalNodes.append(self)
  788. elif self.isTerminal():
  789. vlPreTerminalNodes.append(self.parent)
  790. else:
  791. for vChild in self.children:
  792. vlPreTerminalNodes += vChild.getPreTerminalNodes()
  793. return vlPreTerminalNodes
  794. def getPreTerminalNodeAt(self, pPosition):
  795. '''
  796. Returns the pre-terminal node at a given token position if the
  797. tree format expands terminals, or terminal node if it does not
  798. '''
  799. if self.isPreTerminal() or self.isCollapsedTerminal():
  800. return self
  801. else:
  802. vHSNode = self.getHighestNodeOfSpan((pPosition, pPosition))
  803. if vHSNode != None:
  804. vPTNode = vHSNode
  805. while not (vPTNode.isPreTerminal() or vPTNode.isCollapsedTerminal()):
  806. vPTNode = vPTNode.children[0]
  807. return vPTNode
  808. else:
  809. return None
  810. def setParent(self, pParent):
  811. '''
  812. Sets the parent node of the node
  813. '''
  814. self.parent = pParent
  815. def getParent(self):
  816. '''
  817. Returns the parent node
  818. '''
  819. return self.parent
  820. def getChildren(self):
  821. '''
  822. Returns the children nodes in a list
  823. The methods used to return the list itself before v4.7, but it returns
  824. a copy since this version.
  825. '''
  826. return self.children[:]
  827. def getChildrenAt(self, pSpan):
  828. '''
  829. Returns children nodes at the given position span from left
  830. '''
  831. return self.children[pSpan[0] - 1 : pSpan[1]]
  832. def getSibling(self):
  833. '''
  834. Returns list of sibling nodes of this node
  835. '''
  836. return [vChild for vChild in self.getParent().getChildren() if vChild != self]
  837. def getLeftSibling(self):
  838. '''
  839. Returns the sibling nodes in the left hand side of the node
  840. '''
  841. vlLSibling = []
  842. for vSibling in self.getParent().getChildren():
  843. if vSibling == self:
  844. break
  845. else:
  846. vlLSibling.append(vSibling)
  847. return vlLSibling
  848. def getRightSibling(self):
  849. '''
  850. Returns the sibling nodes in the right hand side of the node
  851. '''
  852. vlRSibling = []
  853. for vSibling in reversed(self.getParent().getChildren()):
  854. if vSibling == self:
  855. break
  856. else:
  857. vlRSibling.append(vSibling)
  858. return reversed(vlRSibling)
  859. def ifDominates(self, pNode):
  860. '''
  861. Checks if this node dominates a given node
  862. '''
  863. ## Since parse trees tend to be wide rather than deep, a BFS may be
  864. ## better. However, I use an algorithm which sounds like a mixture:
  865. ## it first looks at the children of a node, if no success, recursively
  866. ## traverses each child. I did a quick search and didn't find if
  867. ## this is already an algorithm in use or critisized. I don't see
  868. ## a reason why it should not work.
  869. if pNode in self.getChildren():
  870. return True
  871. else:
  872. for vChild in self.getChildren():
  873. vFound = vChild.ifDominates(pNode)
  874. if vFound:
  875. return True
  876. return False
  877. def getTerminal(self):
  878. '''
  879. Return the terminal sequence of the node
  880. '''
  881. return ' '.join(self.getTerminals())
  882. def getGrandParentUnderRoot(self):
  883. '''
  884. Returns the grandparent of this node which is a child of root
  885. '''
  886. if self.getParent().isRoot():
  887. return self
  888. else:
  889. return self.getParent().getGrandParentUnderRoot()
  890. def getRoot(self):
  891. '''
  892. Returns the grandparent of this node which is a child of root
  893. '''
  894. if self.isRoot():
  895. return self
  896. else:
  897. return self.getGrandParentUnderRoot().getParent()
  898. def getChildrenTags(self):
  899. '''
  900. Returns the sytactic tags of children of the node.
  901. '''
  902. vlChildrenTags = []
  903. if not self.isPreTerminal():
  904. for vChild in self.children:
  905. vlChildrenTags.append(vChild.synTag)
  906. return vlChildrenTags
  907. def getTagset(self):
  908. '''
  909. Extracts and returns the syntactic tags of the tree nodes and
  910. their counts.
  911. '''
  912. vdTagset = {self.synTag: 1}
  913. if not self.isPreTerminal() and not self.isTerminal():
  914. for vChild in self.children:
  915. for vTag, vCount in vChild.getTagset().iteritems():
  916. if vTag in vdTagset:
  917. vdTagset[vTag] += vCount
  918. else:
  919. vdTagset[vTag] = vCount
  920. return vdTagset
  921. def getSynLabels(self):
  922. '''
  923. Extracts and returns the syntactic labels of the subtree nodes
  924. including phrase labels and POS tags
  925. '''
  926. vlLabels = []
  927. if not self.isTerminal():
  928. vlLabels = [self.getLabel()]
  929. for vChild in self.children:
  930. vlLabels += vChild.getSynLabels()
  931. return vlLabels
  932. def getSpanSize(self):
  933. '''
  934. Extracts and returns the number of tokens the node spans.
  935. '''
  936. vSize = 0
  937. if self.isTerminal():
  938. vSize += 1
  939. else:
  940. for vChild in self.children:
  941. vSize += vChild.getSpanSize()
  942. return vSize
  943. def getTokenSpan(self):
  944. '''
  945. Extracts and returns the range of token indexes the node spans.
  946. For example if a node spans tokens at positions 3 to 7 in the sentence,
  947. it returns (3,7)
  948. '''
  949. if self.isRoot():
  950. ## If root node, sum of span sizes of children is used to compute
  951. ## the end of the span.
  952. vStart = 1
  953. vEnd = 0
  954. for vChild in self.children:
  955. vEnd += vChild.getSpanSize()
  956. else:
  957. ## If not root node, the start of the span of its parent is
  958. ## first computed and then summed with span size of its left
  959. ## sibling nodes.
  960. # 1. parent recursive part
  961. vStart = self.getParent().getTokenSpan()[0]
  962. # 2. left sibling part
  963. for vSibling in self.getParent().children:
  964. if vSibling == self:
  965. break
  966. else:
  967. vStart += vSibling.getSpanSize()
  968. vEnd = vStart + self.getSpanSize() - 1
  969. return vStart, vEnd
  970. def getTokenSpanRelation(self, pSpan):
  971. '''
  972. Returns the relationship of the token span of this node to the
  973. given span
  974. 0: equal
  975. 1: inclusive (token span contains pSpan)
  976. -1: contained (pSpan contains token span)
  977. -2: left-cross (pSpan crosses the left boundary of token span but its right boundary is inside token span)
  978. -3: right-cross (pSpan crosses the right boundary of token span but its left boundary is inside token span)
  979. -4: disjoint
  980. '''
  981. vTokenSpan = self.getTokenSpan()
  982. if vTokenSpan == pSpan:
  983. return 0
  984. ## The below two conditions include vTokenSpan == pSpan but never
  985. ## happen because of the above return.
  986. elif vTokenSpan[0] <= pSpan[0] and pSpan[1] <= vTokenSpan[1]:
  987. return 1
  988. elif pSpan[0] <= vTokenSpan[0] and vTokenSpan[1] <= pSpan[1]:
  989. return -1
  990. elif pSpan[0] < vTokenSpan[0] and vTokenSpan[0] <= pSpan[1] and pSpan[1] <= vTokenSpan[1]:
  991. return -2
  992. elif vTokenSpan[0] <= pSpan[0] and pSpan[0] <= vTokenSpan[1] and vTokenSpan[1] < pSpan[1]:
  993. return -3
  994. else:
  995. return -4
  996. def getHighestNodeOfSpan(self, pSpan):
  997. '''
  998. Returns the highest (closest to root) node spanning the given token
  999. span if it's in this subtree
  1000. It returns None if the span is not in this subtree. It raises an
  1001. exception if it the given span crosses any node boundary.
  1002. '''
  1003. vSpanRel = self.getTokenSpanRelation(pSpan)
  1004. vTokenSpan = self.getTokenSpan()
  1005. if vSpanRel == 0:
  1006. vHSNode = self
  1007. elif vSpanRel == 1:
  1008. for vChild in self.children:
  1009. vHSNode = vChild.getHighestNodeOfSpan(pSpan)
  1010. if vHSNode != None:
  1011. break
  1012. elif vSpanRel == -1:
  1013. vHSNode = None
  1014. elif vSpanRel == -2:
  1015. raise Exception("Left-cross: (%s %s) vs. (%s %s)" % (vTokenSpan[0], vTokenSpan[1], pSpan[0], pSpan[1]))
  1016. elif vSpanRel == -3:
  1017. raise Exception("Right-cross: (%s %s) vs. (%s %s)" % (vTokenSpan[0], vTokenSpan[1], pSpan[0], pSpan[1]))
  1018. else:
  1019. vHSNode = None
  1020. return vHSNode
  1021. def getHeight(self):
  1022. '''
  1023. Computes and returns the height of the node which is the number
  1024. of levels from the node to the deepest (farthest) terminal under
  1025. it.
  1026. It is the tree depth minus the depth of the node. Depth is calculated
  1027. from the root, while height from the deepest terminal. Therefore,
  1028. tree depth is equal to tree height.
  1029. The height of a terminal node is 0.
  1030. '''
  1031. ## The height of all children are recursively computed. Their
  1032. ## maximum is then the height of the node.
  1033. if self.isTerminal():
  1034. return 0
  1035. else:
  1036. vMaxChildHeight = 0
  1037. for vChild in self.children:
  1038. vHeight = vChild.getHeight()
  1039. if vHeight > vMaxChildHeight:
  1040. vMaxChildHeight = vHeight
  1041. return vMaxChildHeight + 1
  1042. def getSize(self):
  1043. '''
  1044. Returns the size of the node which is the number of nodes in the
  1045. subtree under the node plus 1.
  1046. '''
  1047. if self.isTerminal():
  1048. return 1
  1049. else:
  1050. vChildrenSizeSum = 0
  1051. for vChild in self.children:
  1052. vChildrenSizeSum += vChild.getSize()
  1053. return vChildrenSizeSum + 1
  1054. def mapTags(self, pdMap):
  1055. '''
  1056. Maps syntactic tags in the subtree under the node to a new tag set
  1057. using map in pdMap.
  1058. If a tag is not specified, a catch-all tag will be sought to map
  1059. to. If it does not exist, it will be mapped into itself.
  1060. '''
  1061. if self.isExpandedTerminal():
  1062. return
  1063. try:
  1064. self.synTag = pdMap[self.synTag]
  1065. except KeyError:
  1066. if "_catchall" in pdMap:
  1067. self.synTag = pdMap["_catchall"]
  1068. for vChild in self.children:
  1069. vChild.mapTags(pdMap)
  1070. def getChildrenCount(self):
  1071. '''
  1072. Returns the number of children
  1073. '''
  1074. return len(self.children)
  1075. def getCFGRule(self, pflgNoLexRule = False):
  1076. '''
  1077. Returns the CFG production rule expanding the node if it is not
  1078. a leaf node.
  1079. A rule is represented using a dictionary with two items: lhs for
  1080. the left hand side of the rules and rhs for the its right hand
  1081. side which is a list of tags.
  1082. If pflgNoLexRule is true, lexical rules will not be extracted.
  1083. '''
  1084. if pflgNoLexRule:
  1085. if self.isPreTerminal() or self.isTerminal():
  1086. return {}
  1087. else:
  1088. return {"lhs": self.synTag, "rhs": self.getChildrenTags()}
  1089. else:
  1090. if self.isPreTerminal():
  1091. return {"lhs": self.synTag, "rhs": self.getTerminals()}
  1092. elif self.isTerminal():
  1093. if self.expandedTerminal:
  1094. return {}
  1095. else:
  1096. return {"lhs": self.synTag, "rhs": [self.terminal]}
  1097. else:
  1098. return {"lhs": self.synTag, "rhs": self.getChildrenTags()}
  1099. def extractCFGRules(self, pflgNoLexRule = False):
  1100. '''
  1101. Returns the CFG production rules expanding the node and all the
  1102. nodes of the subtree under it.
  1103. The rules are returned in a list.
  1104. If pflgNoLexRule is true, lexical rules will not be extracted.
  1105. '''
  1106. # rule of the node itself
  1107. vlRules = [self.getCFGRule(pflgNoLexRule)]
  1108. # rules of the nodes in the subtree
  1109. for vChild in self.children:
  1110. vChildRules = vChild.extractCFGRules(pflgNoLexRule)
  1111. if vChildRules != [{}]:
  1112. vlRules += vChildRules
  1113. return vlRules
  1114. def extractLexRules(self):
  1115. '''
  1116. Returns the lexical rules in the subtree under the node.
  1117. The rules are returned in a list.
  1118. '''
  1119. vlRules = []
  1120. if self.isPreTerminal():
  1121. vlRules = [{"lhs": self.synTag, "rhs": self.getTerminals()}]
  1122. elif self.isTerminal():
  1123. if not self.expandedTerminal:
  1124. vlRules = [{"lhs": self.synTag, "rhs": [self.terminal]}]
  1125. else:
  1126. for vChild in self.children:
  1127. vChildRules = vChild.extractLexRules()
  1128. if vChildRules != [{}]:
  1129. vlRules += vChildRules
  1130. return vlRules
  1131. def extractSubsetTrees(self):
  1132. '''
  1133. NOTE: THE ALGORITHM IS BUGGY BECAUSE OF MISUNDERSTANDING SUBSET
  1134. TREE KERNEL DEFINITION.
  1135. Extracts and returns list of all subset trees of the subtree under
  1136. the node.
  1137. Note that what is returned is the top node of the trees and ConstTree
  1138. objects are not created.
  1139. Subset trees are used in tree kernels (c.f. Moschitti 2006)
  1140. '''
  1141. raise Exception("NOTE: THE ALGORITHM IS BUGGY BECAUSE OF MISUNDERSTANDING SUBSET TREE KERNEL DEFINITION.")
  1142. if self.isTerminal():
  1143. return []
  1144. else:
  1145. vlSsTrees = []
  1146. ## extracting subtrees of children while adding children
  1147. ## themselves to build subtrees with this node as root later
  1148. vlChildrenSsTrees = []
  1149. for vChild in self.children:
  1150. # adding child itself for later use
  1151. vChildCopy = vChild.deepCopy()
  1152. vChildCopy.children = []
  1153. vlChildrenSsTrees.append([vChildCopy])
  1154. # extracting subtrees of children
  1155. vlChildSsTrees = vChild.extractSubsetTrees()
  1156. if len(vlChildSsTrees) != 0:
  1157. ## collecting children's subtrees
  1158. for i in range(len(vlChildSsTrees)):
  1159. ## saving copy of child's subtrees if rooted in
  1160. ## the child for later building subtrees with this
  1161. ## node
  1162. if vlChildSsTrees[i].parent == self:
  1163. vlChildrenSsTrees[-1].append(vlChildSsTrees[i].deepCopy())
  1164. # adding to final set
  1165. vlSsTrees.append(vlChildSsTrees[i])
  1166. ## setting the parent to None because this is the
  1167. ## top node of subtree
  1168. vlSsTrees[-1].parent = None
  1169. ## building subtrees rooted in this node with all subtrees of
  1170. ## its children rooted in the child itself plus children nodes
  1171. ## themselves
  1172. for vNewChildren in itertools.product(*vlChildrenSsTrees):
  1173. # building subtrees with this node as root
  1174. vThisNode = self.deepCopy()
  1175. vThisNode.children = list(vNewChildren)
  1176. # setting parents of children
  1177. for i in range(len(vThisNode.children)):
  1178. vThisNode.children[i].parent = vThisNode
  1179. vlSsTrees.append(vThisNode)
  1180. return vlSsTrees
  1181. def addVPToFTBTree(self, pPostVerbalNodeCount = 2):
  1182. '''
  1183. Adds a VP node above all VNs in the subtree under the node and
  1184. their post-verbal NP and PPs siblings.
  1185. pPostVerbalNodeCount specifies the number of post-verbal sibling
  1186. to go under the VP node.
  1187. '''
  1188. if self.synTag == "VN":
  1189. # slicing children of parent on this node
  1190. for i in range(len(self.parent.children)):
  1191. if self.parent.children[i] == self:
  1192. vlRSibling = self.parent.children[i:]
  1193. self.parent.children = self.parent.children[:i]
  1194. break
  1195. # adding VP node
  1196. vVPNode = ConstNode()
  1197. vVPNode.synTag = "VP"
  1198. vVPNode.parent = self.parent
  1199. vVPNode.expandedTerminal = self.expandedTerminal
  1200. self.parent.children.append(vVPNode)
  1201. # adding this node and sibling on the right under new VP node
  1202. vAddedPVNodeCount = 0
  1203. for i, vNode in enumerate(vlRSibling):
  1204. if vNode == self or vNode.synTag.startswith("NP") or vNode.synTag.startswith("PP") :
  1205. vVPNode.children.append(vNode)
  1206. vNode.parent = vVPNode
  1207. vAddedPVNodeCount += 1
  1208. if vAddedPVNodeCount == pPostVerbalNodeCount + 1:
  1209. vVPNode.parent.children += vlRSibling[i + 1 : ]
  1210. break
  1211. else:
  1212. vVPNode.parent.children += vlRSibling[i : ]
  1213. break
  1214. else:
  1215. # doing it for children
  1216. for vChild in self.children:
  1217. ## checking if this is still a child, because it may have
  1218. ## gone under VP via previous child
  1219. if vChild.parent == self:
  1220. vChild.addVPToFTBTree(pPostVerbalNodeCount)
  1221. def stratifyRules(self):
  1222. '''
  1223. Stratifies production rules in the subtree under the node by
  1224. grouping together every two equal adjecent POS tags under a new
  1225. node with a tag made of the POS tag suffixed with _St.
  1226. For example,
  1227. (AdP (ADV x) (ADV y) (ADV z) (ADV w))
  1228. to
  1229. (AdP (ADV_St (ADV x) (ADV y)) (ADV_St (ADV z) (ADV w)))
  1230. '''
  1231. vChildCount = self.getChildrenCount()
  1232. if vChildCount >= 3:
  1233. # grouping children into groups of two from right
  1234. vRightSiblingIdx = vChildCount - 1
  1235. vLeftSiblingIdx = vRightSiblingIdx -1
  1236. while vLeftSiblingIdx >= 0:
  1237. if self.stratifyPartialRule(self, vLeftSiblingIdx, vRightSiblingIdx):
  1238. vRightSiblingIdx -= 2
  1239. vLeftSiblingIdx -= 2
  1240. else:
  1241. vRightSiblingIdx -= 1
  1242. vLeftSiblingIdx -= 1
  1243. # recursively stratifying for children (if any left)
  1244. for vChild in self.children:
  1245. if not (vChild.synTag.endswith("_St") or vChild.isPreTerminal() or vChild.isCollapsedTerminal()):
  1246. vChild.stratifyRules()
  1247. def stratifyPartialRule(self, pLHSNode, pRHSNodeLeftIdx, pRHSNodeRightIdx):
  1248. '''
  1249. Stratifies the partial rule.
  1250. A partial rule consists of the left hand side node and indexes of
  1251. two adjacent nodes on its right hand side (children). A new node
  1252. will be added in the middle so that it is the in the right hand
  1253. side of pLHSNode at that position and left hand side of pRHSNodeLeft
  1254. and pRHSNodeRight.
  1255. '''
  1256. if (pLHSNode.children[pRHSNodeLeftIdx].isPreTerminal() or pLHSNode.children[pRHSNodeLeftIdx].isCollapsedTerminal()) and \
  1257. (pLHSNode.children[pRHSNodeRightIdx].isPreTerminal() or pLHSNode.children[pRHSNodeRightIdx].isCollapsedTerminal()):
  1258. if pLHSNode.children[pRHSNodeLeftIdx].synTag == pLHSNode.children[pRHSNodeRightIdx].synTag:
  1259. vNewNode = ConstNode()
  1260. vNewNode.synTag = pLHSNode.children[pRHSNodeLeftIdx].synTag + "_St"
  1261. vNewNode.parent = pLHSNode
  1262. vNewNode.children = [pLHSNode.children[pRHSNodeLeftIdx], pLHSNode.children[pRHSNodeRightIdx]]
  1263. vNewNode.expandedTerminal = self.expandedTerminal
  1264. pLHSNode.children[pRHSNodeLeftIdx].parent = vNewNode
  1265. pLHSNode.children[pRHSNodeRightIdx].parent = vNewNode
  1266. pLHSNode.children = pLHSNode.children[ : pRHSNodeLeftIdx] + [vNewNode] + pLHSNode.children[pRHSNodeRightIdx + 1 : ]
  1267. def fixFTBCoord(self):
  1268. '''
  1269. Fixes the FTB coordination all over the subtree under the node.
  1270. To fix, it moves the coordinated node (the immediate left sibling
  1271. of the COORD node) under the COORD node.
  1272. For example,
  1273. (S (NP (NC x)) (COORD (CC et) (NP (NC y))))
  1274. to
  1275. (S (COORD (NP (NC x)) (CC et) (NP (NC y))))
  1276. '''
  1277. vIdx = 1
  1278. while vIdx < self.getChildrenCount():
  1279. if self.children[vIdx].synTag == "COORD":
  1280. self.children[vIdx-1].parent = self.children[vIdx]
  1281. self.children[vIdx].children = [self.children[vIdx-1]] + self.children[vIdx].children
  1282. self.children = self.children[ : vIdx-1] + self.children[vIdx : ]
  1283. else:
  1284. vIdx += 1
  1285. # for i in range(self.getChildrenCount()):
  1286. # if i > 0 and self.children[i].synTag == "COORD":
  1287. # self.children[i-1].parent = self.children[i]
  1288. # self.children[i].children = [self.children[i-1]] + self.children[i].children
  1289. # self.children = self.children[ : i-1] + self.children[i : ]
  1290. # recursively doing for children as well
  1291. for vChild in self.children:
  1292. vChild.fixFTBCoord()
  1293. def isPredicate(self):
  1294. '''
  1295. Returns true if the node is a predicate
  1296. '''
  1297. return len([p for p, r in self.predRoles if p == None]) > 0
  1298. def isArgument(self):
  1299. '''
  1300. Returns true if the node is a an argument for any predicate in the
  1301. SRL of the sentence.
  1302. '''
  1303. return len([r for p, r in self.predRoles if p != None]) > 0
  1304. def isArgumentOf(self, pPredNode):
  1305. '''
  1306. Returns true if the node is a an argument of the predicate the node
  1307. of which is given
  1308. '''
  1309. return len([r for p, r in self.predRoles if p == pPredNode]) > 0
  1310. def setPredRoles(self, plPredRoles):
  1311. '''
  1312. Sets the value of predicate/role attribute
  1313. '''
  1314. self.predRoles = plPredRoles
  1315. def removePredRoles(self):
  1316. '''
  1317. Removes all the predicate/roles
  1318. '''
  1319. self.predRoles = []
  1320. def hasPredRole(self, pPredRole):
  1321. '''
  1322. Returns true if given predicate-role exists in node's annotation
  1323. '''
  1324. return pPredRole in self.predRoles
  1325. def addPredRole(self, pPredRole):
  1326. '''
  1327. Addes given predicate-role to node's annotation if it does not
  1328. already exist
  1329. '''
  1330. if not self.hasPredRole(pPredRole):
  1331. self.predRoles.append(pPredRole)
  1332. def removePredRole(self, pPredRole):
  1333. '''
  1334. Removes given predicate-role from node's annotation if it exist
  1335. '''
  1336. if self.hasPredRole(pPredRole):
  1337. self.predRoles.remove(pPredRole)
  1338. def getPredicateFrameset(self):
  1339. '''
  1340. Returns the predicate frameset from the semantic role labeling
  1341. of the node
  1342. '''
  1343. if self.isPredicate():
  1344. return [r for p, r in self.predRoles if p == None][0]
  1345. else:
  1346. return ''
  1347. def getArgRoles(self):
  1348. '''
  1349. Returns the list of argument role labels from the semantic role
  1350. labeling of the node
  1351. '''
  1352. return [r for p, r in self.predRoles if p != None]
  1353. def getArgRoleOf(self, vPredNode):
  1354. '''
  1355. Returns the role label if the node is an argument of the predicate
  1356. at the given predicate
  1357. '''
  1358. ## NOTE: The following commented code takes care of taking multiple
  1359. ## role of a predicate by one node. Technically, this is not valid.
  1360. ## It however is possible when the original SRL is in dependency
  1361. ## formalism and converted by elevating the role in the constituency
  1362. ## tree. The reasons could be SRL error, parsing error or conversion
  1363. ## error. At the moment, we merge the role labels when needed.
  1364. # vRole = [r for p, r in self.predRoles if p == vPredNode]
  1365. #if len(vRole) == 0:
  1366. # return ''
  1367. #elif len(vRole) > 1:
  1368. # raise Exception("Strange! A node (%s) cannot take more than one argument role (%s) for a given prediate (%s)!" % (self.getPTBFormat(), ' '.join(vRole), vPredNode.getPTBFormat()))
  1369. #else:
  1370. # return vRole[0]
  1371. return [r for p, r in self.predRoles if p == vPredNode]
  1372. def getRawPST(self, pOldPredNode, pNewPredNode):
  1373. '''
  1374. Extracts raw proposition subtree (PST) under the node the predicate
  1375. node of which is given
  1376. It's raw because the SRL annotation on it needs to be post-processed
  1377. at tree level which cannot be done at node level.
  1378. Since new nodes are created, references to old predicate node in
  1379. predRoles attribute are no longer valid. The newly created predicate
  1380. node is returned/passed for keeping track of it until returning
  1381. to the tree level (ConstTree.getPST()) instead of having to look
  1382. for it again at the tree level.
  1383. PST is a subtree of the constituency tree spanning only the predicate
  1384. and argument nodes of that proposition including all full subtrees
  1385. of argument nodes.
  1386. '''
  1387. if self.isArgumentOf(pOldPredNode):
  1388. #vPSTNode = copy.deepcopy(self)
  1389. # deepCopy() works better than python deepcopy in this case
  1390. vPSTNode = self.deepCopy(pflgCopyTree = True)
  1391. elif self == pOldPredNode:
  1392. #vPSTNode = copy.deepcopy(self)
  1393. # deepCopy() works better than python deepcopy in this case
  1394. vPSTNode = self.deepCopy(pflgCopyTree = True)
  1395. pNewPredNode = vPSTNode
  1396. elif self.isPreTerminal() or self.isCollapsedTerminal():
  1397. vPSTNode = None
  1398. else:
  1399. vlChildPSTs = []
  1400. for vChild in self.getChildren():
  1401. vChildPST, pNewPredNode = vChild.getRawPST(pOldPredNode, pNewPredNode)
  1402. if vChildPST:
  1403. vlChildPSTs.append(vChildPST)
  1404. if len(vlChildPSTs) == 0:
  1405. vPSTNode = None
  1406. else:
  1407. vPSTNode = self.shallowCopy()
  1408. for vChildPST in vlChildPSTs:
  1409. vPSTNode.addChild(vChildPST)
  1410. return vPSTNode, pNewPredNode
  1411. def delNodes(self, pLabelPattern):
  1412. '''
  1413. Deletes nodes with specified regular expression pattern in the
  1414. subtree rooted at this node (except the node itself)
  1415. When a node is removed, the entire subtree rooted at it is removed.
  1416. '''
  1417. vlMarkedChildren = []
  1418. for vChild in self.getChildren():
  1419. if re.match(pLabelPattern, vChild.getSynTag()):
  1420. vlMarkedChildren.append(vChild)
  1421. else:
  1422. vChild.delNodes(pLabelPattern = pLabelPattern)
  1423. for vMarkedChild in vlMarkedChildren:
  1424. self.delChild(vMarkedChild)
  1425. def delPRO(self):
  1426. '''
  1427. Deletes pro-drop nodes (*PRO* in PTB) in the subtree rooted at
  1428. this node (except the node itself)
  1429. '''
  1430. vlPROChildren = []
  1431. for vChild in self.getChildren():
  1432. if vChild.isPreTerminal():
  1433. if vChild.getTerminal() == "*PRO*":
  1434. vlPROChildren.append(vChild)
  1435. elif vChild.isTerminal():
  1436. if vChild.surface == "*PRO*":
  1437. vlPROChildren.append(self)
  1438. else:
  1439. vChild.delPRO()
  1440. for vPROChild in vlPROChildren:
  1441. self.delChild(vPROChild)
  1442. def delChild(self, pChildNode):
  1443. '''
  1444. Deletes the given child node of the current node
  1445. '''
  1446. for i, vChild in enumerate(self.children):
  1447. if vChild == pChildNode:
  1448. del self.children[i]
  1449. vChild.setParent(None)
  1450. # node is deleted if left without child
  1451. if self.getChildrenCount() == 0:
  1452. self.parent.delChild(self)
  1453. def delChildrenAt(self, pSpan):
  1454. '''
  1455. Deletes the children at the given position span from left
  1456. '''
  1457. self.children = self.children[ : pSpan[0] - 1] + self.children[pSpan[1] : ]
  1458. def getChildNo(self, pChildNode):
  1459. '''
  1460. Returns the position of the given child node of the current node
  1461. in its children list (starting from left)
  1462. '''
  1463. for i, vChild in enumerate(self.children, start = 1):
  1464. if vChild == pChildNode:
  1465. return i
  1466. # if the child not found
  1467. return -1
  1468. def removeFunctionTags(self, pflgAllSubtree):
  1469. '''
  1470. Removes function tags from the labels
  1471. Function tags are supposed to be attached to the syntactic labels by
  1472. a dash (e.g. NP-OBJ).
  1473. The removal can be done to only the current node or the entire subtree
  1474. under the node.
  1475. '''
  1476. vGluePos = self.getLabel().find('-')
  1477. if vGluePos != -1:
  1478. self.synTag = self.getLabel()[ : vGluePos]
  1479. if pflgAllSubtree:
  1480. for vChild in self.getChildren():
  1481. vChild.removeFunctionTags(pflgAllSubtree = pflgAllSubtree)
  1482. def getPathTo(self, pNode):
  1483. '''
  1484. Extracts and returns the path from this node to the given node
  1485. NOTE: this method is not recursive. It has been designed for efficiency.
  1486. The path is a string consisting of the syntactic label of the node and the directions represented by /, \, > and
  1487. < (up, down, right, left in order). Left and right are used when descending from the lowest common ancestor down
  1488. to the destination node. If the path turns right, \> is used and if left, \<.
  1489. '''
  1490. if self == None:
  1491. return ''
  1492. else:
  1493. # 1. stacking this node and all its parents up until root
  1494. vlNodeParents = [self]
  1495. while not vlNodeParents[-1].isRoot():
  1496. vlNodeParents.append(vlNodeParents[-1].getParent())
  1497. # 2. stacking destination node and all its parents up until root
  1498. vlDestParents = [pNode]
  1499. while not vlDestParents[-1].isRoot():
  1500. vlDestParents.append(vlDestParents[-1].getParent())
  1501. # 3. finding lowest common ancestor (the u-turn point)
  1502. # if there is only one node in either list, LCA is the last one
  1503. if len(vlNodeParents) == 1 or len(vlDestParents) == 1:
  1504. vLCA = vlNodeParents[-1]
  1505. # otherwise, matching the stacks
  1506. else:
  1507. for vNPIdx, vDPIdx in zip(reversed(range(len(vlNodeParents))), reversed(range(len(vlDestParents)))):
  1508. # if the nodes before these two are the same, this node (at the current index) cannot be LCA
  1509. if vlNodeParents[vNPIdx - 1] == vlDestParents[vDPIdx - 1]:
  1510. del vlNodeParents[vNPIdx]
  1511. del vlDestParents[vDPIdx]
  1512. # if the nodes before these two are not the same, this node (at the current index) is LCA
  1513. else:
  1514. vLCA = vlNodeParents[vNPIdx]
  1515. break
  1516. # 4. extracting the path
  1517. # 4.a. this node to LCA
  1518. vPath = '/'.join([n.getSynTag() for n in vlNodeParents])
  1519. # 4.b. u-turn (if vLCA is not the start or the end)
  1520. if len(vlNodeParents) > 1 and len(vlDestParents) > 1:
  1521. if vLCA.getChildNo(vlNodeParents[-2]) < vLCA.getChildNo(vlDestParents[-2]):
  1522. vPath += '\>'
  1523. else:
  1524. vPath += '\<'
  1525. # 4.c. LCA to destination node
  1526. vPath += '\\'.join([n.getSynTag() for n in reversed(vlDestParents[:-1])])
  1527. return vPath
  1528. def lowerTerminals(self):
  1529. '''
  1530. Converts terminals in the subtree to lowercase
  1531. '''
  1532. for vTNode in self.getTerminalNodes():
  1533. vTNode.setTerminalForm(vTNode.getTerminal().lower())
  1534. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  1535. # BConstParser
  1536. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  1537. class BConstParser:
  1538. '''
  1539. A class for baseline constituency parsing
  1540. It implements the following baseline constituency parsing methods:
  1541. - random parsing
  1542. '''
  1543. def __init__(self):
  1544. '''
  1545. Constructor
  1546. '''
  1547. # default tag sets
  1548. self.enPOSTagSet = ["#", ".", ":", ",", "''", "``", "$", "-LRB-", "-RRB-", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN", "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP$", "WP", "WRB"]
  1549. self.enPhraseTagSet = ["ADJP", "ADVP", "CONJP", "FRAG", "INTJ", "LST", "NAC", "NP", "NX", "PP", "PRN", "PRT", "QP", "RRC", "S", "SBAR", "SBARQ", "SINV", "SQ", "UCP", "VP", "WHADJP", "WHADVP", "WHNP", "WHPP", "X"]
  1550. self.frPOSTagSet = ["ADJ", "ADJWH", "ADV", "ADVWH", "CC", "CLO", "CLR", "CLS", "CS", "DET", "DETWH", "ET", "I", "NC", "NPP", "P", "P+D", "PONCT", "P+PRO", "PREF", "PRO", "PROREL", "PROWH", "V", "VIMP", "VINF", "VPP", "VPR", "VS"]
  1551. self.frPhraseTagSet = ["AdP", "AP", "COORD", "NP", "PP", "SENT", "Sint", "Srel", "Ssub", "VN", "VPinf", "VPpart"]
  1552. def _getLangTagsets(self, pLang):
  1553. '''
  1554. Returns POS and phrase tag set for the language
  1555. '''
  1556. vLang = util.normalizeLang(pLang)
  1557. if vLang == "en":
  1558. return self.enPOSTagSet, self.enPhraseTagSet
  1559. elif vLang == "fr":
  1560. return self.frPOSTagSet, self.frPhraseTagSet
  1561. def randomParse(self, pSentence, pLang = "en", pflgTokenize = False):
  1562. '''
  1563. Randomly creates a constituency tree for the sentence.
  1564. The output is a ConstTree.
  1565. '''
  1566. if pflgTokenize:
  1567. vSentence = nlp.tokenizeSegment(pSentence, pLang, False)
  1568. else:
  1569. vSentence = pSentence
  1570. vlTokens = vSentence.strip().split()
  1571. # setting tag sets for the language
  1572. vlPOSTagset, vlPhraseTagset = self._getLangTagsets(pLang)
  1573. # creating a ConsTree
  1574. vCTree = ConstTree()
  1575. # parsing
  1576. vCTree.root = self._randParsePhrase(vlTokens, None, vlPOSTagset, vlPhraseTagset)
  1577. return vCTree
  1578. def _randParsePhrase(self, plPhraseTokens, pParent, plPOSTagset, plPhraseTagset):
  1579. '''
  1580. Randomly creates a phrase structure for the phrase recursively.
  1581. The tokens are recursively randomly split into phrases. Phrase
  1582. tags are randomly assigned from the phrase tag set and POS tags
  1583. from the POS tag set of the language.
  1584. It returns a ConsNode for the phrase structure.
  1585. '''
  1586. vLen = len(plPhraseTokens)
  1587. # creating a node for the phrase
  1588. vNode = ConstNode()
  1589. vNode.parent = pParent
  1590. if vLen == 1:
  1591. ## If the phrase is only 1 token and the parent is None, this
  1592. ## is a special case of one-token sentence and terminal node
  1593. ## should created under this node.
  1594. if vNode.parent == None:
  1595. vTerminalNode = ConstNode()
  1596. vTerminalNode.parent = vNode
  1597. vTerminalNode.synTag = self._getRandomTag(plPOSTagset)
  1598. vTerminalNode.terminal = plPhraseTokens[0]
  1599. vNode.synTag = self._getRandomTag(plPhraseTagset)
  1600. vNode.children.append(vTerminalNode)
  1601. else:
  1602. vNode.synTag = self._getRandomTag(plPOSTagset)
  1603. vNode.terminal = plPhraseTokens[0]
  1604. else:
  1605. # randomly labelling the phrase
  1606. vNode.synTag = self._getRandomTag(plPhraseTagset)
  1607. # splitting into a random number of sub-phrases:
  1608. vlSubPhrases = self._randSplitPhrase(plPhraseTokens)
  1609. ## recursively creating phrase structure for each sub-phrase
  1610. ## (children nodes)
  1611. for vlSubPhrase in vlSubPhrases:
  1612. vNode.children.append(self._randParsePhrase(vlSubPhrase, vNode, plPOSTagset, plPhraseTagset))
  1613. return vNode
  1614. def _randSplitPhrase(self, plPhraseTokens):
  1615. '''
  1616. Randomly splits phrase into subphrases recursively.
  1617. The output is a list of sub-phrases which are in turn list of
  1618. tokens.
  1619. '''
  1620. ## It's first split into two sub-phrases. Then the right sub-phrase
  1621. ## is recursively split again.
  1622. vLen = len(plPhraseTokens)
  1623. if vLen == 1:
  1624. return [plPhraseTokens]
  1625. elif vLen == 2:
  1626. return [[plPhraseTokens[0]], [plPhraseTokens[1]]]
  1627. else:
  1628. ## A phrase of length N cat be split at positions 2 (second token)
  1629. ## to N (last token).
  1630. vSplitPos = random.randint(2, vLen)
  1631. return [plPhraseTokens[ : vSplitPos - 1]] + self._randSplitPhrase(plPhraseTokens[vSplitPos - 1 :])
  1632. def _getRandomTag(self, plTagset):
  1633. '''
  1634. Returns a random tag from plTagset
  1635. '''
  1636. return plTagset[random.randrange(len(plTagset))]
  1637. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  1638. # ConstParseLoader
  1639. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  1640. class ConstParseLoader:
  1641. '''
  1642. Constituency parse loader
  1643. Loads constituency parse trees into ConstParses objects. Depending on
  1644. the parse file format it loads n-best parses and parsing details such
  1645. as log probability and time are also loaded.
  1646. '''
  1647. def __init__(self):
  1648. '''
  1649. Constructor
  1650. '''
  1651. # list of ConstParses
  1652. self.parses = None
  1653. def loadPTBTrees(self, pPTBFile, pflgExpandTerminal = False):
  1654. '''
  1655. Loads the parse trees in Penn Treebank format.
  1656. If pflgExpandTerminal is set to true, the terminal node will be
  1657. expanded into a pre-terminal (POS tag) and terminal (token) nodes.
  1658. '''
  1659. try:
  1660. vlTrees = open(pPTBFile).read().strip().split('\n')
  1661. except IOError:
  1662. raise Exception("Cannot open the tree file: %s" % pPTBFile)
  1663. self.parses = []
  1664. for vTree in vlTrees:
  1665. self.parses.append(ConstParses())
  1666. vCTree = self._createNewTree()
  1667. vCTree.loadPTBTree(vTree, pflgExpandTerminal)
  1668. self.parses[-1].parses.append(ConstParse(vCTree))
  1669. def _createNewTree(self):
  1670. '''
  1671. Creates and returns a new constituency tree
  1672. '''
  1673. return ConstTree()
  1674. def loadParses(self, pParseFile, pflgExpandTerminal = False):
  1675. '''
  1676. Loads parse trees in PTB format with their log probabilities if
  1677. included.
  1678. If pflgExpandTerminal is set to true, the terminal node will be
  1679. expanded into a pre-terminal (POS tag) and terminal (token) nodes.
  1680. '''
  1681. try:
  1682. vlLines = open(pParseFile).read().strip().split('\n')
  1683. except IOError:
  1684. raise Exception("Cannot open the parse file: %s" % pParseFile)
  1685. self.parses = []
  1686. for vLine in vlLines:
  1687. self.parses.append(ConstParses())
  1688. vCTree = self._createNewTree()
  1689. if vLine[0] == '-' or vLine[0] == '0':
  1690. vColon = vLine.find(':')
  1691. vLogProb = float(vLine[ : vColon].strip())
  1692. vTree = vLine[vColon+1 : ].strip()
  1693. vCTree.loadPTBTree(vTree, pflgExpandTerminal)
  1694. self.parses[-1].parses.append(ConstParse(vCTree, vLogProb))
  1695. else:
  1696. vTree = vLine
  1697. vCTree.loadPTBTree(vTree, pflgExpandTerminal)
  1698. self.parses[-1].parses.append(ConstParse(vCTree))
  1699. def loadLorgParses(self, pParseFile, pNoParseRepTreeFile = '', pNoParseRepProbFile = '', pDummyForNoParse = "( (ERR NOPARSE))", pDefaultProbForNoParse = -150, pflgExpandTerminal = False, pflgVerbose = True):
  1700. '''
  1701. Loads the parse tree and details from Lorg output file
  1702. Both verbose and quiet output are supported.
  1703. If pflgExpandTerminal is set to true, the terminal node will be
  1704. expanded into a pre-terminal (POS tag) and terminal (token) nodes.
  1705. Lorg parser output may contain "no parse found" for sentences it
  1706. cannot This method implements two ways to get around that.
  1707. The first is to replace them with alternative parses provided in
  1708. an alternative data set in pNoParseRepTreeFile (as well as their
  1709. probabilities in pNoParseRepProbFile. Similar to the main parses,
  1710. the trees are provided in a file with the same size but one tree
  1711. per line, and only required lines will be retrieved. The probabilities
  1712. are also provided in one per line in a file. So, if the no parse
  1713. sentences are known in advance, the rest can be left blank and
  1714. alternative parses can be provided in the appropriate positions
  1715. in the file. The second way is to replace them with a dummy parse
  1716. tree and a default probability value. If the alternative data set
  1717. is not provided, the second method will be used.
  1718. '''
  1719. # loading parses
  1720. try:
  1721. vLorgOutput = open(pParseFile).read()
  1722. except IOError:
  1723. raise Exception("Cannot open the parse file: %s" % pParseFile)
  1724. # loading replacement parses and probabilities for no-parse
  1725. if pNoParseRepTreeFile != '' and pNoParseRepProbFile != '':
  1726. try:
  1727. vlNoParseRepTrees = open(pNoParseRepTreeFile).read().strip().split('\n')
  1728. vlNoParseRepProbs = open(pNoParseRepProbFile).read().strip().split('\n')
  1729. except IOError:
  1730. raise Exception("Cannot open the replacement parse file: %s" % pNoParseRepFile)
  1731. else:
  1732. vlNoParseRepTrees = []
  1733. vlNoParseRepProbs = []
  1734. self.parses = []
  1735. for vLine in vLorgOutput.strip().split("\n"):
  1736. if vLine.startswith("###"):
  1737. vLineContent = vLine[4:].strip()
  1738. vlFound = []
  1739. # ID for sanity check and initilization
  1740. vPattern = re.compile("ID: ([0-9]+)$")
  1741. vlFound = vPattern.findall(vLineContent)
  1742. if len(vlFound) == 1:
  1743. vID = vlFound[0]
  1744. # initialization
  1745. self.parses.append(ConstParses())
  1746. # time
  1747. vPattern = re.compile("time: ([0-9]*[.]*[0-9]+)$")
  1748. vlFound = vPattern.findall(vLineContent)
  1749. if len(vlFound) == 1:
  1750. self.parses[-1].time = float(vlFound[0])
  1751. elif vLine[0] == '-' or vLine[0] == '0':
  1752. # n-best parses
  1753. vColon = vLine.find(':')
  1754. vLogProb = float(vLine[ : vColon].strip())
  1755. vParseTree = self._createNewTree()
  1756. vParseTree.loadPTBTree(vLine[vColon+1 : ].strip(), pflgExpandTerminal)
  1757. self.parses[-1].parses.append(ConstParse(vParseTree, vLogProb))
  1758. elif vLine.startswith("no parse found"):
  1759. vParseTree = self._createNewTree()
  1760. if len(vlNoParseRepTrees) > 0:
  1761. vNoParseRepTree = vlNoParseRepTrees[len(self.parses) - 1]
  1762. vRepLogProb = vlNoParseRepProbs[len(self.parses) - 1]
  1763. else:
  1764. vNoParseRepTree = pDummyForNoParse
  1765. vRepLogProb = pDefaultProbForNoParse
  1766. vParseTree.loadPTBTree(vNoParseRepTree, pflgExpandTerminal)
  1767. if not pflgVerbose:
  1768. self.parses.append(ConstParses())
  1769. self.parses[-1].parses = [ConstParse(vParseTree, vRepLogProb)]
  1770. elif vLine.startswith("("): # older Lorg
  1771. vParseTree = self._createNewTree()
  1772. vParseTree.loadPTBTree(vLine.strip(), pflgExpandTerminal)
  1773. if not pflgVerbose:
  1774. self.parses.append(ConstParses())
  1775. self.parses[-1].parses = [ConstParse(vParseTree)]
  1776. else:
  1777. raise Exception("Unknown pattern: %s" % vLine)
  1778. def getParses(self):
  1779. '''
  1780. Returns the loaded constituency parses
  1781. The returned value is the list of ConstParses object
  1782. '''
  1783. return self.parses