slsa.py 54 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737
  1. #! /usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. """
  4. This module defines classes for sentence-level sentiment analysis (SLSA).
  5. Version 0.1 (06-Jul-2016)
  6. - SLSASet, SLSASent are added.
  7. """
import re

from parse import constparse, depparse
from ml import tk
from nlp import nlp
  11. class SLSASet:
  12. '''
  13. Class for sentence level sentiment analysis dataset.
  14. '''
  15. def __init__(self):
  16. '''
  17. Constructor
  18. '''
  19. self.sentences = []
  20. # WordVector object containing the word vectors of the vocabulary in the dataset
  21. self.wv = None
  22. def load(self, pTxtFilename, pPolarityFilename, pflgTokenize = False, pLanguage = "en"):
  23. '''
  24. Loads the data from input files
  25. This is the basic loader of SLSA data which expects the sentences
  26. (text) and polarity scores to be provided in separate parallel files.
  27. '''
  28. self.sentences = []
  29. vlTxtLines = open(pTxtFilename).read().strip().split('\n')
  30. vlPolarityScores = open(pPolarityFilename).read().strip().split('\n')
  31. if len(vlTxtLines) != len(vlPolarityScores):
  32. raise Exception("Number of sentences does not match number of scores: %s vs. %s" % (len(vlTxtLines), len(vlPolarityScores)))
  33. for vTxtLine, vPolScore in zip(vlTxtLines, vlPolarityScores):
  34. vSLSASent = SLSASent(pSLSASet = self)
  35. vSLSASent.load(vTxtLine, vPolScore, pflgTokenize = pflgTokenize, pLanguage = pLanguage)
  36. self.sentences.append(vSLSASent)
  37. @property
  38. def size(self):
  39. '''
  40. Returns the size of the data set which is the number of its sentences
  41. '''
  42. return len(self.getSentences())
  43. @property
  44. def tokenLength(self):
  45. '''
  46. Returns the number of tokens in the data set
  47. '''
  48. return sum([s.length for s in self.getSentences()])
  49. def getSentences(self, pSort = ''):
  50. '''
  51. Returns the SLSA sentences
  52. The sort options are:
  53. - None: in document order
  54. - text: in sentence text order
  55. '''
  56. if pSort.lower() == "text":
  57. return [s for s in sorted(self.sentences, key = lambda x: x.getText())]
  58. else:
  59. return [s for s in self.sentences]
  60. def addSentences(self, plSentences):
  61. '''
  62. Adds SLSA sentences to existing sentences
  63. Currently, no care is taken regarding ID duplication.
  64. '''
  65. for vSent in plSentences:
  66. vSent.dataset = self
  67. self.sentences.append(vSent)
  68. def getVocabualry(self):
  69. '''
  70. Extracts and returns the vocabulary of the dataset
  71. '''
  72. return sorted(set([t for s in self.getSentences() for t in s.getTokens()]))
  73. def extractSentenceForms(self, pSort = None):
  74. '''
  75. Returns the surface form of the sentences
  76. The sort options are:
  77. - None: in document order
  78. - id: in sentence ID order
  79. - text: in sentence text order
  80. '''
  81. return [s.getText() for s in self.getSentences(pSort = pSort)]
  82. def loadConstTrees(self, plConstTrees):
  83. '''
  84. Loads the constituency parse trees of the sentences
  85. It assumes that the provided constituency trees are in the order
  86. in which the sentences are loaded.
  87. The constituency trees can be provided in bracketing format or as
  88. constparse.ConstTree objects (in a list).
  89. '''
  90. for vSent, pCTree in zip(self.getSentences(), plConstTrees):
  91. vSent.loadConstTree(pCTree)
  92. def loadDepTrees(self, plDepTrees):
  93. '''
  94. Loads the dependency parse trees of the sentences
  95. It assumes that the provided dependency trees are in the order in
  96. which the sentences are loaded.
  97. The dependency trees are assumed to be provided a list of
  98. depparse.DepTree objects.
  99. '''
  100. for vSent, pDTree in zip(self.getSentences(), plDepTrees):
  101. vSent.loadDepTree(pDTree)
  102. def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
  103. '''
  104. Loads polarity scores to sentences from a sentiment lexicon which
  105. is a Sentexicon object
  106. For details about Sentexicon object, see sentexicon.py
  107. '''
  108. vTotalWordNum = 0 # total number of words in the data set
  109. vTotalEntryWordNum = 0 # number of words found in the lexicon
  110. for vSent in self.getSentences():
  111. vWordNum, vEntryWordNum = vSent.loadSentimentScores(pSentexicon, pNeutralScore)
  112. vTotalWordNum += vWordNum
  113. vTotalEntryWordNum += vEntryWordNum
  114. return vTotalWordNum, vTotalEntryWordNum
  115. def getSentimentScores(self):
  116. '''
  117. Returns a dictionary of words in the data set and the sentiment
  118. scores attached to them
  119. '''
  120. vdResult = {}
  121. for vSent in self.getSentences():
  122. for vWord, vScore in zip(vSent.getTokens(), vSent.getSentimentScores()):
  123. vdResult[vWord] = vScore
  124. return vdResult
  125. def loadSSInDTrees(self):
  126. '''
  127. Loads sentiment scores into dependency tree nodes
  128. '''
  129. for vSent in self.getSentences():
  130. vSent.loadSSInDTree()
  131. def loadSSInCTrees(self, pPropagation = None):
  132. '''
  133. Loads sentiment scores into constituency tree nodes
  134. '''
  135. for vSent in self.getSentences():
  136. vSent.loadSSInCTree(pPropagation = pPropagation)
  137. def loadWordVectors(self, pWordVectors, pflgFilter = True):
  138. '''
  139. Loads word vectors from a file or WordVector object, whichever is given
  140. By default, it filters out the words not in the data vocabulary, which can be changed to not filter (e.g. when
  141. the input is already filtered).
  142. '''
  143. from ml import wv
  144. self.wv = wv.WordVector()
  145. if type(pWordVectors) == str:
  146. vWV.load(pWVFilename = vWVFile, plFilterVocab = self.getVocabualry())
  147. else:
  148. self.wv = pWordVectors
  149. class SLSASent:
  150. '''
  151. Class for SLSA sentence
  152. '''
  153. def __init__(self, pSLSASet):
  154. '''
  155. Constructor
  156. '''
  157. # SLSASet the sentence belongs to
  158. self.dataset = pSLSASet
  159. self.text = None
  160. self.polarity = None
  161. self.cTree = None
  162. self.dTree = None
  163. self.sentScores = [] # sentiment score, one per word in the tokenized self.text
  164. def load(self, pText, pPolarity = None, pflgTokenize = False, pLanguage = "en"):
  165. '''
  166. Loads sentence form and its label
  167. It optionally tokenizes the text.
  168. '''
  169. if pflgTokenize:
  170. self.text = nlp.tokenizeSegment(pText, pLang = pLanguage, pflgTokenizeFSlash = False)
  171. else:
  172. self.text = pText
  173. self.polarity = pPolarity
  174. def getText(self):
  175. '''
  176. Returns sentence text (form)
  177. '''
  178. return self.text
  179. def getTokens(self):
  180. '''
  181. Returns the tokenization of the sentence
  182. The sentence text is assumed to be in tokenized format and only
  183. splits on space.
  184. '''
  185. return self.getText().split()
  186. def getPolarity(self):
  187. '''
  188. Returns the polarity of the sentence
  189. '''
  190. return self.polarity
  191. @property
  192. def length(self):
  193. '''
  194. Returns the sentence length
  195. '''
  196. return len(self.getTokens())
  197. def getConstTree(self):
  198. '''
  199. Returns the constituency parse tree of the sentence
  200. The returned object is of type constparse.ConstTree
  201. '''
  202. return self.cTree
  203. def getPOSTags(self):
  204. '''
  205. Returns the list of POS tags which matches the token list
  206. The POS tags are extracted from the constituency tree or dependency tree
  207. '''
  208. if self.cTree is not None:
  209. return self.cTree.getPOSs()
  210. elif self.dTree is not None:
  211. return self.dTree.getPOSs()
  212. else:
  213. return []
  214. def getDepTree(self):
  215. '''
  216. Returns the dependency parse tree of the sentence
  217. The returned object is of type depparse.DepTree
  218. '''
  219. return self.dTree
  220. def loadConstTree(self, pConstTree):
  221. '''
  222. Loads the constituency parse tree of the sentence
  223. The contituency tree can be provided in bracketing format or as
  224. constparse.ConstTree object.
  225. '''
  226. # loading the tree
  227. if isinstance(pConstTree, constparse.ConstTree):
  228. vConstTree = pConstTree.getPTBFormat()
  229. else:
  230. vConstTree = pConstTree
  231. self.cTree = SLSACTree()
  232. self.cTree.loadPTBTree(vConstTree, pflgExpandTerminal = True)
  233. # sanity check; comment out
  234. #if self.cTree.surface != self.getText():
  235. # print "Sentence and tree mismatch:\nSentence: %s\nTree: %s\n" % (self.getText(), self.cTree.surface)
  236. def loadDepTree(self, pDepTree):
  237. '''
  238. Loads the dependency parse tree of the sentence
  239. The dependency tree is assumed to be depparse.DepTree object.
  240. '''
  241. # loading the tree
  242. if not isinstance(pDepTree, depparse.DepTree):
  243. raise Exception("A DepTree object is expected!")
  244. self.dTree = SLSADTree()
  245. self.dTree.loadFromDepTree(pDepTree = pDepTree)
  246. # sanity check; comment out
  247. #if self.dTree.surface != self.getText():
  248. # print "Sentence and tree mismatch:\nSentence: %s\nTree:%s\n" % (self.getText(), self.dTree.surface)
  249. def getPTBConstTree(self):
  250. '''
  251. Returns the constituency tree of the sentence in PTB bracketing format
  252. '''
  253. return self.cTree.getPTBFormat()
  254. def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
  255. '''
  256. Loads sentiment scores to words in the sentence from a sentiment
  257. lexicon which is a Sentexicon object
  258. It returns a tuple of the total number of words in the sentence
  259. and the number of words found in the lexicon.
  260. If the word is not found in the lexicon, None will be used.
  261. For details about Sentexicon object, see sentexicon.py
  262. '''
  263. vlWords = self.getTokens()
  264. self.sentScores = []
  265. for vWord in vlWords:
  266. vScore = pSentexicon.getScore(vWord)
  267. if vScore is None and pNeutralScore is not None:
  268. self.sentScores.append(pNeutralScore)
  269. else:
  270. self.sentScores.append(vScore)
  271. return self.length, len([s for s in self.sentScores if s is not None])
  272. def getSentimentScores(self):
  273. '''
  274. Returns the sentiment scores in a list corresponding to the token list
  275. '''
  276. return self.sentScores
  277. def loadSSInDTree(self):
  278. '''
  279. Loads sentiment scores into the dependency tree of the sentence
  280. '''
  281. self.dTree.loadSentScores(self.sentScores)
  282. def loadSSInCTree(self, pPropagation = None):
  283. '''
  284. Loads sentiment scores into the constituency tree of the sentence
  285. '''
  286. self.cTree.loadSentScores(self.sentScores, pPropagation = pPropagation)
  287. def generateNGramKTree(self, pNodeContentType = "word", pFormat = "binary", pdOptions = None):
  288. '''
  289. Generates and returns the tree representation of the surface form
  290. of the sentence
  291. '''
  292. vSLSANGramKTree = SLSANGramKTree(pSLSASent = self, pNodeContentType = pNodeContentType, pdOptions = pdOptions)
  293. return vSLSANGramKTree.generateNGramKTree(pFormat = pFormat, pdOptions = pdOptions)
  294. def getAvgSentScore(self):
  295. '''
  296. Calculates and returns the average sentiment score sentence tokens
  297. '''
  298. vlSentScores = self.getSentimentScores()
  299. return sum(vlSentScores) / len(vlSentScores)
  300. def getPolarScores(self, pNeutralScore = None):
  301. '''
  302. Returns sentiment scores with non-neutral polarity
  303. Neutral polarity score can be set as parameter. It is None by default meaning that no polarity score is assigned
  304. to neutral words.
  305. '''
  306. return [s for s in self.sentScores if s != pNeutralScore]
  307. def getWordVectors(self):
  308. '''
  309. Returns the word vectors of the sentence tokens
  310. '''
  311. return [self.dataset.wv.getVector(t) for t in self.getTokens()]
  312. class SLSACTree(constparse.ConstTree):
  313. '''
  314. Class for constituency parse tree of a SLSA sentence
  315. '''
  316. def __init__(self):
  317. '''
  318. Constructor
  319. '''
  320. constparse.ConstTree.__init__(self)
  321. def _createNewTree(self):
  322. '''
  323. Creates and returns a new SLSACTree
  324. This method is useful in class inheriting.
  325. '''
  326. return SLSACTree()
  327. def _createRoot(self):
  328. '''
  329. Creates and returns the root node
  330. '''
  331. return SLSACNode()
  332. def modifyNPStruct(self):
  333. '''
  334. Modifies the structure of the noun phrases in order to avoid
  335. term/constituent mismatch caused by a flat NP structure
  336. '''
  337. self.root.modifyNPStruct()
  338. def loadSentScores(self, plSentScores, pPropagation = None):
  339. '''
  340. Loads sentiment scores to tree nodes
  341. The sentiment scores are given in a list which corresponds to the
  342. terminal (i.e. token) list.
  343. Propagation argument specifies the method by which the score are
  344. propagated from terminals up to the root node. None means scores
  345. are only assigned to the terminal nodes, thus they are 0 for the
  346. phrase nodes.
  347. '''
  348. self.root.loadSentScores(plSentScores, pPropagation = pPropagation)
  349. def extractTopmostVP(self, pSpan):
  350. '''
  351. Extracts and returns the topmost verb phrase node above the given span in the tree
  352. '''
  353. return self.root.extractTopmostVP(pSpan)
  354. class SLSACNode(constparse.ConstNode):
  355. '''
  356. Class for constituency tree node of a SLSA sentence
  357. '''
  358. def __init__(self):
  359. '''
  360. Constructor
  361. '''
  362. constparse.ConstNode.__init__(self)
  363. # sentiment score
  364. self.sentScore = None
  365. # list of type OpinionExpression
  366. self.oe = []
  367. def deepCopy(self, pflgCopyTree = False):
  368. '''
  369. NOTE: it seems python deepcopy() works better. Try the idea before
  370. using this method.
  371. Creates and returns a deep copy of the node optionally including
  372. the sub tree under it
  373. '''
  374. vNodeCopy = constparse.ConstNode.deepCopy(self, pflgCopyTree)
  375. # copying the sentiment score
  376. vNodeCopy.sentScore = self.sentScore
  377. return vNodeCopy
  378. def shallowCopy(self, pflgCopyTree = False):
  379. '''
  380. NOTE: before using, check if python shallowcopy() does not work
  381. as expected.
  382. Creates and returns a shallow copy of the node
  383. The shallow copy does not have a parent and children.
  384. '''
  385. vNodeCopy = constparse.ConstNode.shallowCopy(self, pflgCopyTree)
  386. # copying the sentiment score
  387. vNodeCopy.sentScore = self.sentScore
  388. return vNodeCopy
  389. def _getNewNode(self):
  390. '''
  391. Creates and returns a node
  392. '''
  393. return SLSACNode()
  394. def setSentScore(self, pScore):
  395. '''
  396. Sets the value of sentiment score of the node's word
  397. '''
  398. self.sentScore = pScore
  399. def getSentScore(self):
  400. '''
  401. Returns the sentiment score of the node's word
  402. '''
  403. return self.sentScore
  404. def modifyNPStruct(self):
  405. '''
  406. Modifies the structure of the noun phrases in order to avoid
  407. term/constituent mismatch caused by a flat NP structure
  408. '''
  409. if self.getSynTag() == "NP":
  410. vChildLabelSeq = ' '.join(self.getChildrenTags())
  411. if re.search("^(DT|PRP\$) NN[A-Z]* NN[A-Z]*$", vChildLabelSeq):
  412. self.insertIntermChild("NP", (2, 3))
  413. return
  414. if re.search("^(DT|PRP\$) JJ[A-Z]* NN[A-Z]*$", vChildLabelSeq):
  415. self.insertIntermChild("ADJP", (2, 3))
  416. return
  417. if re.search("^(DT|PRP\$) VBG NN[A-Z]*$", vChildLabelSeq):
  418. self.insertIntermChild("NP", (2, 3))
  419. return
  420. if re.search("^(DT|PRP\$) NN[A-Z]* NN[A-Z]* NN[A-Z]*$", vChildLabelSeq):
  421. self.insertIntermChild("NP", (2, 4))
  422. return
  423. if re.search("^(DT|PRP\$) JJ[A-Z]* NN[A-Z]* NN[A-Z]*$", vChildLabelSeq):
  424. self.insertIntermChild("NP", (3, 4))
  425. self.insertIntermChild("NP", (2, 3))
  426. return
  427. if re.search("^(DT|PRP\$) ADJP NN[A-Z]*$", vChildLabelSeq):
  428. self.insertIntermChild("NP", (2, 3))
  429. return
  430. if re.search("^(DT|PRP\$) CD NN[A-Z]* NN[A-Z]*$", vChildLabelSeq):
  431. self.insertIntermChild("NP", (2, 4))
  432. return
  433. if re.search("^(DT|PRP\$) JJ[A-Z]* JJ[A-Z]* NN[A-Z]*$", vChildLabelSeq):
  434. self.insertIntermChild("NP", (3, 4))
  435. self.insertIntermChild("NP", (2, 3))
  436. return
  437. for vChild in self.getChildren():
  438. vChild.modifyNPStruct()
  439. def loadSentScores(self, plSentScores, pPropagation = None):
  440. '''
  441. Loads sentiment scores to nodes in the subtree
  442. The sentiment scores are given in a list which corresponds to the
  443. terminal (i.e. token) list.
  444. Neutral score will be used instead of None for words (and nodes when
  445. propagating) without a sentiment score (None).
  446. Propagation argument specifies the method by which the score are
  447. propagated from terminals up to this node. None means scores
  448. are only assigned to the terminal nodes, thus they are 0 for the
  449. phrase nodes. The following are the possible methods:
  450. - sum: nodes score is the sum of its children score
  451. - vote: node score is the dominant positive or negative score in the
  452. children nodes (i.e. more +1: score is +1, more -1: score -1)
  453. '''
  454. vTokenSpan = self.getTokenSpan()
  455. if self.isTerminal():
  456. # sanity check
  457. if vTokenSpan[0] != vTokenSpan[1]:
  458. raise Exception("Either the node is not terminal or its span is wrong: %s" % self)
  459. else:
  460. self.setSentScore(plSentScores[vTokenSpan[0] - 1])
  461. else:
  462. vlChildrenSSores = []
  463. for vChild in self.getChildren():
  464. vlChildrenSSores.append(vChild.loadSentScores(plSentScores, pPropagation))
  465. # calculating the sentiment score of the node based on its children's (propagation)
  466. if pPropagation is not None:
  467. if pPropagation.lower() == "sum":
  468. self.setSentScore(sum(vlChildrenSSores))
  469. elif pPropagation.lower() == "vote":
  470. self.setSentScore(self._getDominantSentiment(vlChildrenSSores))
  471. return self.getSentScore()
  472. def _getDominantSentiment(self, plScores):
  473. '''
  474. Returns +1 or -1 whichever is dominant in the given list of sentiment
  475. scores
  476. If the same number of both sentiment scores exist, 0 is returned.
  477. '''
  478. vPosCount = 0
  479. vNegCount = 0
  480. for vScore in plScores:
  481. if vScore == 1:
  482. vPosCount += 1
  483. elif vScore == -1:
  484. vNegCount += 1
  485. if vPosCount > vNegCount:
  486. return 1
  487. elif vPosCount < vNegCount:
  488. return -1
  489. else:
  490. return 0
  491. def extractTopmostVP(self, pSpan):
  492. '''
  493. Extracts and returns the topmost verb phrase node which overlaps the given span in the node subtree
  494. Overlap means that the given span and the span of the VP must not ne disjoint. So, left and right crossing will
  495. also be considered.
  496. '''
  497. if self.getSynTag() == 'VP':
  498. vSpanRel = self.getTokenSpanRelation(pSpan)
  499. if vSpanRel != -4:
  500. return self
  501. else:
  502. return None
  503. else:
  504. for vChild in self.getChildren():
  505. vTopVP = vChild.extractTopmostVP(pSpan)
  506. if vTopVP is not None:
  507. return vTopVP
  508. return None
  509. class SLSADTree(depparse.DepTree):
  510. '''
  511. Class for dependency parse tree of a SLSA sentence
  512. '''
  513. def __init__(self, pLanguage = ''):
  514. '''
  515. Constructor
  516. '''
  517. depparse.DepTree.__init__(self, pLanguage = pLanguage)
  518. def loadFromDepTree(self, pDepTree):
  519. '''
  520. Loads the tree from DepTree object
  521. '''
  522. # 1. nodes
  523. self.nodes = []
  524. for vNode in pDepTree.nodes:
  525. self.nodes.append(SLSADNode())
  526. self.nodes[-1].loadFromDepNode(pSLSADTree = self, pDepNode = vNode)
  527. # 2. SRL
  528. self.srl = pDepTree.srl
  529. # 3. language
  530. self.language = pDepTree.language
  531. def _createNewTree(self, pLanguage = ''):
  532. '''
  533. Creates and returns a new tree
  534. '''
  535. return SLSADTree(pLanguage = pLanguage)
  536. def loadSentScores(self, plSentScores):
  537. '''
  538. Loads sentiment scores to tree nodes
  539. The sentiment scores are given in a list which corresponds to the
  540. tree node list.
  541. '''
  542. for vNode, vSentScore in zip(self.getNodes(), plSentScores):
  543. vNode.setSentScore(vSentScore)
  544. def generateDepKTree(self, pFormat = "(rel form)", pdOptions = {}):
  545. '''
  546. Generates the dependency tree representation in PTB bracketing
  547. for tree kernels
  548. pdOptions provides options specific to each format.
  549. '''
  550. vDepKTree = SLSADKTree(pDepTree = self)
  551. return vDepKTree.generateDepKTree(pNode = "root", pFormat = pFormat, pdOptions = pdOptions)
  552. class SLSADNode(depparse.DepNode):
  553. '''
  554. Class for dependency parse node of a SLSA sentence
  555. '''
  556. def __init__(self, pSLSADTree = None, pForm = "", pPosition = 0, plHeadDeps = None, plDependents = None, pPOSTag = "", plPredRoles = None, pSentScore = None):
  557. '''
  558. Constructor
  559. '''
  560. depparse.DepNode.__init__(self, pDepTree = pSLSADTree, pForm = pForm, pPosition = pPosition, plHeadDeps = plHeadDeps, plDependents = plDependents, pPOSTag = pPOSTag, plPredRoles = plPredRoles)
  561. # sentiment score
  562. self.sentScore = pSentScore
  563. def loadFromDepNode(self, pSLSADTree, pDepNode):
  564. '''
  565. Loads the node data from DepNode object
  566. '''
  567. self.depTree = pSLSADTree # SLSA dependency tree the node belongs to
  568. self.form = pDepNode.form # token surface form
  569. self.position = pDepNode.position # token position in the sentence
  570. self.headDeps = pDepNode.headDeps # list of head and dependency tuples
  571. self.dependents = pDepNode.dependents # children
  572. self.posTag = pDepNode.posTag # POS tag
  573. self.predRoles = pDepNode.predRoles # list of (predicate position, semantic role) tuples
  574. def _createNewNode(self, pDepTree = None, pForm = "", pPosition = 0, plHeadDeps = None, plDependents = None, pPOSTag = "", plPredRoles = None, pSentScore = None):
  575. '''
  576. Creates and returns an new node
  577. '''
  578. return SLSADNode(pSLSADTree = pDepTree,
  579. pForm = pForm,
  580. pPosition = pPosition,
  581. plHeadDeps = plHeadDeps[:],
  582. plDependents = plDependents[:],
  583. pPOSTag = pPOSTag,
  584. plPredRoles = plPredRoles[:],
  585. pSentScore = pSentScore)
  586. def deepCopy(self, pDepTree):
  587. '''
  588. NOTE: it seems python deepcopy() works better. Try the idea before
  589. using this method.
  590. Creates and returns a new dependency node which is a deep copy of
  591. the current node
  592. '''
  593. return self._createNewNode(pDepTree = pDepTree,
  594. pForm = self.form,
  595. pPosition = self.position,
  596. plHeadDeps = self.headDeps[:],
  597. plDependents = self.dependents[:],
  598. pPOSTag = self.posTag,
  599. plPredRoles = self.predRoles[:],
  600. pSentScore = self.sentScore)
  601. def setSentScore(self, pScore):
  602. '''
  603. Sets the value of sentiment score of the node's word
  604. '''
  605. self.sentScore = pScore
  606. def getSentScore(self):
  607. '''
  608. Returns the sentiment score of the node's word
  609. '''
  610. return self.sentScore
  611. class SLSADKTree(tk.DepKTree):
  612. '''
  613. The class for SLSA dependency tree for use in tree kernels.
  614. For use in tree kernels, the tree is represented in PTB bracketing
  615. format.
  616. '''
  617. def generateDepKTree(self, pNode = "root", pFormat = "(rel form)", pdOptions = {}):
  618. '''
  619. Generates dependency kernel tree or subtree under a given node in
  620. the required format
  621. NOTE: The subtree here should not be confused with the notion of
  622. subtree as a tree kernel variation used in parallel to subset tree
  623. kernel.
  624. The default format is (rel form) which is the pure dependency tree
  625. with only dependency relations and token forms as node labels. In
  626. general, the format string is the representation of the innermost
  627. treelet in the format. See each format-specific method for exact
  628. details.
  629. pdOptions provides options specific to each format.
  630. See the documentation of the parent class.
  631. '''
  632. if type(pNode) is str and pNode.lower() == "root":
  633. pNode = self.depTree.root
  634. if pFormat == "(score (rel (pos form)))":
  635. self.kTree = "( %s)" % ''.join([self._generateSLSADKSubtree1(self.depTree.getNode(d), 0, pdOptions) for d in pNode.dependents])
  636. elif pFormat == "(rel (pos score))":
  637. self.kTree = "( %s)" % ''.join([self._generateSLSADKSubtree2(self.depTree.getNode(d), 0, pdOptions) for d in pNode.dependents])
  638. elif pFormat == "(score (rel (roles (pos form))))":
  639. self.kTree = "( %s)" % ''.join([self._generateSLSADKSubtree3(self.depTree.getNode(d), 0, pdOptions) for d in pNode.dependents])
  640. elif pFormat == "(score (rel (pos_roles form)))":
  641. self.kTree = "( %s)" % ''.join([self._generateSLSADKSubtree4(self.depTree.getNode(d), 0, pdOptions) for d in pNode.dependents])
  642. elif pFormat == "(rel_score (pos form))":
  643. self.kTree = "( %s)" % ''.join([self._generateSLSADKSubtree5(self.depTree.getNode(d), 0, pdOptions) for d in pNode.dependents])
  644. else:
  645. self.kTree = tk.DepKTree.generateDepKTree(self, pNode = pNode, pFormat = pFormat)
  646. return self.kTree
  647. # (score (rel (pos form))) -> no equivalent number format in version 0.3
  648. def _generateSLSADKSubtree1(self, pNode, pCurrentHead, pdOptions = {}):
  649. '''
  650. Recursively generates the kernel subtree of the given node in
  651. bracketing representation in (score (rel (pos form))) format
  652. pCurrentHead identifies which head is calling this method in case
  653. the node has multiple heads.
  654. pdOptions contains the following options to be used in formatting:
  655. - neutral: the way the neutral words, i.e. those without a sentiment
  656. score should be treated. The possible values include
  657. an empty string which means do not add any node for such
  658. words, and a string value which will be used as a node
  659. to be inserted in the same way the scores are.
  660. '''
  661. vDependents = ''.join([self._generateSLSADKSubtree1(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents])
  662. if pNode.getSentScore() is not None:
  663. if len(pNode.dependents) == 0:
  664. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getSentScore(),
  665. pNode.getDepRel(pCurrentHead),
  666. pNode.getPOSTag(),
  667. pNode.form)
  668. else:
  669. vKSubtree = "(%s (%s (%s (%s %s))))" % (pNode.getSentScore(),
  670. pNode.getDepRel(pCurrentHead),
  671. pNode.getPOSTag(),
  672. pNode.form,
  673. vDependents)
  674. else:
  675. if "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
  676. if len(pNode.dependents) == 0:
  677. vKSubtree = "(%s (%s (%s %s)))" % (pdOptions["neutral"],
  678. pNode.getDepRel(pCurrentHead),
  679. pNode.getPOSTag(),
  680. pNode.form)
  681. else:
  682. vKSubtree = "(%s (%s (%s (%s %s))))" % (pdOptions["neutral"],
  683. pNode.getDepRel(pCurrentHead),
  684. pNode.getPOSTag(),
  685. pNode.form,
  686. vDependents)
  687. else:
  688. if len(pNode.dependents) == 0:
  689. vKSubtree = "(%s (%s %s))" % (pNode.getDepRel(pCurrentHead),
  690. pNode.getPOSTag(),
  691. pNode.form)
  692. else:
  693. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  694. pNode.getPOSTag(),
  695. pNode.form,
  696. vDependents)
  697. return vKSubtree
  698. # (rel (pos score)) -> no equivalent number format in version 0.3
  699. def _generateSLSADKSubtree2(self, pNode, pCurrentHead, pdOptions = {}):
  700. '''
  701. Recursively generates the kernel subtree of the given node in
  702. bracketing representation in (rel (pos score)) format
  703. pCurrentHead identifies which head is calling this method in case
  704. the node has multiple heads.
  705. '''
  706. vDependents = ''.join([self._generateSLSADKSubtree2(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents])
  707. if pNode.getSentScore() is not None:
  708. if len(pNode.dependents) == 0:
  709. vKSubtree = "(%s (%s %s))" % (pNode.getDepRel(pCurrentHead),
  710. pNode.getPOSTag(),
  711. pNode.getSentScore())
  712. else:
  713. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  714. pNode.getPOSTag(),
  715. pNode.getSentScore(),
  716. vDependents)
  717. else:
  718. if "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
  719. if len(pNode.dependents) == 0:
  720. vKSubtree = "(%s (%s %s))" % (pNode.getDepRel(pCurrentHead),
  721. pNode.getPOSTag(),
  722. pdOptions["neutral"])
  723. else:
  724. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  725. pNode.getPOSTag(),
  726. pdOptions["neutral"],
  727. vDependents)
  728. else:
  729. if len(pNode.dependents) == 0:
  730. vKSubtree = "(%s %s)" % (pNode.getDepRel(pCurrentHead),
  731. pNode.getPOSTag())
  732. else:
  733. vKSubtree = "(%s (%s %s))" % (pNode.getDepRel(pCurrentHead),
  734. pNode.getPOSTag(),
  735. vDependents)
  736. return vKSubtree
  737. # (score (rel (roles (pos form)))) -> no equivalent number format in version 0.3
  738. def _generateSLSADKSubtree3(self, pNode, pCurrentHead, pdOptions = {}):
  739. '''
  740. Recursively generates the kernel subtree of the given node in
  741. bracketing representation in (score (rel (roles (pos form)))) format
  742. pCurrentHead identifies which head is calling this method in case
  743. the node has multiple heads.
  744. pdOptions contains the following options to be used in formatting:
  745. - neutral: the way the neutral words, i.e. those without a sentiment
  746. score should be treated. The possible values include
  747. an empty string which means do not add any node for such
  748. words, and a string value which will be used as a node
  749. to be inserted in the same way the scores are.
  750. - no-arg: the way non-argument nodes are represented. The possible
  751. values include an empty string which means the node will
  752. be represented as in non-semantic format, and a string
  753. value (e.g. null) which will be used as the label for
  754. representing semantic role of such nodes.
  755. '''
  756. vDependents = ''.join([self._generateSLSADKSubtree3(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents])
  757. if pNode.isArgument():
  758. if pNode.getSentScore() is not None:
  759. if len(pNode.dependents) == 0:
  760. vKSubtree = "(%s (%s (%s (%s %s))))" % (pNode.getSentScore(),
  761. pNode.getDepRel(pCurrentHead),
  762. '_'.join(pNode.getArgRoles()),
  763. pNode.getPOSTag(),
  764. pNode.form)
  765. else:
  766. vKSubtree = "(%s (%s (%s (%s (%s %s)))))" % (pNode.getSentScore(),
  767. pNode.getDepRel(pCurrentHead),
  768. '_'.join(pNode.getArgRoles()),
  769. pNode.getPOSTag(),
  770. pNode.form,
  771. vDependents)
  772. else:
  773. if "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
  774. if len(pNode.dependents) == 0:
  775. vKSubtree = "(%s (%s (%s (%s %s))))" % (pdOptions["neutral"],
  776. pNode.getDepRel(pCurrentHead),
  777. '_'.join(pNode.getArgRoles()),
  778. pNode.getPOSTag(),
  779. pNode.form)
  780. else:
  781. vKSubtree = "(%s (%s (%s (%s (%s %s)))))" % (pdOptions["neutral"],
  782. pNode.getDepRel(pCurrentHead),
  783. '_'.join(pNode.getArgRoles()),
  784. pNode.getPOSTag(),
  785. pNode.form,
  786. vDependents)
  787. else:
  788. if len(pNode.dependents) == 0:
  789. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  790. '_'.join(pNode.getArgRoles()),
  791. pNode.getPOSTag(),
  792. pNode.form)
  793. else:
  794. vKSubtree = "(%s (%s (%s (%s %s))))" % (pNode.getDepRel(pCurrentHead),
  795. '_'.join(pNode.getArgRoles()),
  796. pNode.getPOSTag(),
  797. pNode.form,
  798. vDependents)
  799. elif "no-arg" in pdOptions and pdOptions["no-arg"].strip() != '':
  800. if pNode.getSentScore() is not None:
  801. if len(pNode.dependents) == 0:
  802. vKSubtree = "(%s (%s (%s (%s %s))))" % (pNode.getSentScore(),
  803. pNode.getDepRel(pCurrentHead),
  804. pdOptions["no-arg"],
  805. pNode.getPOSTag(),
  806. pNode.form)
  807. else:
  808. vKSubtree = "(%s (%s (%s (%s (%s %s)))))" % (pNode.getSentScore(),
  809. pNode.getDepRel(pCurrentHead),
  810. pdOptions["no-arg"],
  811. pNode.getPOSTag(),
  812. pNode.form,
  813. vDependents)
  814. else:
  815. if "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
  816. if len(pNode.dependents) == 0:
  817. vKSubtree = "(%s (%s (%s (%s %s))))" % (pdOptions["neutral"],
  818. pNode.getDepRel(pCurrentHead),
  819. pdOptions["no-arg"],
  820. pNode.getPOSTag(),
  821. pNode.form)
  822. else:
  823. vKSubtree = "(%s (%s (%s (%s (%s %s)))))" % (pdOptions["neutral"],
  824. pNode.getDepRel(pCurrentHead),
  825. pdOptions["no-arg"],
  826. pNode.getPOSTag(),
  827. pNode.form,
  828. vDependents)
  829. else:
  830. if len(pNode.dependents) == 0:
  831. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  832. pdOptions["no-arg"],
  833. pNode.getPOSTag(),
  834. pNode.form)
  835. else:
  836. vKSubtree = "(%s (%s (%s (%s %s))))" % (pNode.getDepRel(pCurrentHead),
  837. pdOptions["no-arg"],
  838. pNode.getPOSTag(),
  839. pNode.form,
  840. vDependents)
  841. else:
  842. if pNode.getSentScore() is not None:
  843. if len(pNode.dependents) == 0:
  844. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getSentScore(),
  845. pNode.getDepRel(pCurrentHead),
  846. pNode.getPOSTag(),
  847. pNode.form)
  848. else:
  849. vKSubtree = "(%s (%s (%s (%s %s))))" % (pNode.getSentScore(),
  850. pNode.getDepRel(pCurrentHead),
  851. pNode.getPOSTag(),
  852. pNode.form,
  853. vDependents)
  854. else:
  855. if "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
  856. if len(pNode.dependents) == 0:
  857. vKSubtree = "(%s (%s (%s %s)))" % (pdOptions["neutral"],
  858. pNode.getDepRel(pCurrentHead),
  859. pNode.getPOSTag(),
  860. pNode.form)
  861. else:
  862. vKSubtree = "(%s (%s (%s (%s %s))))" % (pdOptions["neutral"],
  863. pNode.getDepRel(pCurrentHead),
  864. pNode.getPOSTag(),
  865. pNode.form,
  866. vDependents)
  867. else:
  868. if len(pNode.dependents) == 0:
  869. vKSubtree = "(%s (%s %s))" % (pNode.getDepRel(pCurrentHead),
  870. pNode.getPOSTag(),
  871. pNode.form)
  872. else:
  873. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  874. pNode.getPOSTag(),
  875. pNode.form,
  876. vDependents)
  877. return vKSubtree
  878. # (score (rel (pos_roles form))) -> no equivalent number format in version 0.3
  879. def _generateSLSADKSubtree4(self, pNode, pCurrentHead, pdOptions = {}):
  880. '''
  881. Recursively generates the kernel subtree of the given node in
  882. bracketing representation in (score (rel (pos_roles form))) format
  883. pCurrentHead identifies which head is calling this method in case
  884. the node has multiple heads.
  885. pdOptions contains the following options to be used in formatting:
  886. - neutral: the way the neutral words, i.e. those without a sentiment
  887. score should be treated. The possible values include
  888. an empty string which means do not add any node for such
  889. words, and a string value which will be used as a node
  890. to be inserted in the same way the scores are.
  891. '''
  892. vDependents = ''.join([self._generateSLSADKSubtree4(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents])
  893. if pNode.isArgument():
  894. if pNode.getSentScore() is not None:
  895. if len(pNode.dependents) == 0:
  896. vKSubtree = "(%s (%s (%s_%s %s)))" % (pNode.getSentScore(),
  897. pNode.getDepRel(pCurrentHead),
  898. pNode.getPOSTag(),
  899. '_'.join(pNode.getArgRoles()),
  900. pNode.form)
  901. else:
  902. vKSubtree = "(%s (%s (%s_%s (%s %s))))" % (pNode.getSentScore(),
  903. pNode.getDepRel(pCurrentHead),
  904. pNode.getPOSTag(),
  905. '_'.join(pNode.getArgRoles()),
  906. pNode.form,
  907. vDependents)
  908. else:
  909. if "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
  910. if len(pNode.dependents) == 0:
  911. vKSubtree = "(%s (%s (%s_%s %s)))" % (pdOptions["neutral"],
  912. pNode.getDepRel(pCurrentHead),
  913. pNode.getPOSTag(),
  914. '_'.join(pNode.getArgRoles()),
  915. pNode.form)
  916. else:
  917. vKSubtree = "(%s (%s (%s_%s (%s %s))))" % (pdOptions["neutral"],
  918. pNode.getDepRel(pCurrentHead),
  919. pNode.getPOSTag(),
  920. '_'.join(pNode.getArgRoles()),
  921. pNode.form,
  922. vDependents)
  923. else:
  924. if len(pNode.dependents) == 0:
  925. vKSubtree = "(%s (%s_%s %s))" % (pNode.getDepRel(pCurrentHead),
  926. pNode.getPOSTag(),
  927. '_'.join(pNode.getArgRoles()),
  928. pNode.form)
  929. else:
  930. vKSubtree = "(%s (%s_%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  931. pNode.getPOSTag(),
  932. '_'.join(pNode.getArgRoles()),
  933. pNode.form,
  934. vDependents)
  935. else:
  936. if pNode.getSentScore() is not None:
  937. if len(pNode.dependents) == 0:
  938. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getSentScore(),
  939. pNode.getDepRel(pCurrentHead),
  940. pNode.getPOSTag(),
  941. pNode.form)
  942. else:
  943. vKSubtree = "(%s (%s (%s (%s %s))))" % (pNode.getSentScore(),
  944. pNode.getDepRel(pCurrentHead),
  945. pNode.getPOSTag(),
  946. pNode.form,
  947. vDependents)
  948. else:
  949. if "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
  950. if len(pNode.dependents) == 0:
  951. vKSubtree = "(%s (%s (%s %s)))" % (pdOptions["neutral"],
  952. pNode.getDepRel(pCurrentHead),
  953. pNode.getPOSTag(),
  954. pNode.form)
  955. else:
  956. vKSubtree = "(%s (%s (%s (%s %s))))" % (pdOptions["neutral"],
  957. pNode.getDepRel(pCurrentHead),
  958. pNode.getPOSTag(),
  959. pNode.form,
  960. vDependents)
  961. else:
  962. if len(pNode.dependents) == 0:
  963. vKSubtree = "(%s (%s %s))" % (pNode.getDepRel(pCurrentHead),
  964. pNode.getPOSTag(),
  965. pNode.form)
  966. else:
  967. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  968. pNode.getPOSTag(),
  969. pNode.form,
  970. vDependents)
  971. return vKSubtree
  972. # (rel_score (pos form)) -> no equivalent number format in version 0.3
  973. def _generateSLSADKSubtree5(self, pNode, pCurrentHead, pdOptions = {}):
  974. '''
  975. Recursively generates the kernel subtree of the given node in
  976. bracketing representation in (score (rel (pos form))) format
  977. pCurrentHead identifies which head is calling this method in case
  978. the node has multiple heads.
  979. pdOptions contains the following options to be used in formatting:
  980. - neutral: the way the neutral words, i.e. those without a sentiment
  981. score should be treated. The possible values include
  982. an empty string which means do not add any node for such
  983. words, and a string value which will be used as a node
  984. to be inserted in the same way the scores are.
  985. '''
  986. vDependents = ''.join([self._generateSLSADKSubtree5(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents])
  987. if pNode.getSentScore() is not None:
  988. if len(pNode.dependents) == 0:
  989. vKSubtree = "(%s_%s (%s %s))" % (pNode.getDepRel(pCurrentHead),
  990. pNode.getSentScore(),
  991. pNode.getPOSTag(),
  992. pNode.form)
  993. else:
  994. vKSubtree = "(%s_%s (%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  995. pNode.getSentScore(),
  996. pNode.getPOSTag(),
  997. pNode.form,
  998. vDependents)
  999. else:
  1000. if "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
  1001. if len(pNode.dependents) == 0:
  1002. vKSubtree = "(%s_%s (%s %s))" % (pNode.getDepRel(pCurrentHead),
  1003. pdOptions["neutral"],
  1004. pNode.getPOSTag(),
  1005. pNode.form)
  1006. else:
  1007. vKSubtree = "(%s_%s (%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  1008. pdOptions["neutral"],
  1009. pNode.getPOSTag(),
  1010. pNode.form,
  1011. vDependents)
  1012. else:
  1013. if len(pNode.dependents) == 0:
  1014. vKSubtree = "(%s (%s %s))" % (pNode.getDepRel(pCurrentHead),
  1015. pNode.getPOSTag(),
  1016. pNode.form)
  1017. else:
  1018. vKSubtree = "(%s (%s (%s %s)))" % (pNode.getDepRel(pCurrentHead),
  1019. pNode.getPOSTag(),
  1020. pNode.form,
  1021. vDependents)
  1022. return vKSubtree
  1023. class SLSACKTree(tk.ConstKTree):
  1024. '''
  1025. The class for SLSA constituency tree for use in tree kernels
  1026. For use in tree kernels, the tree is represented in PTB bracketing
  1027. format.
  1028. '''
  1029. def generateConstKTree(self, pFormat = "(phrase (pos form))", pdOptions = None):
  1030. '''
  1031. Generates constituency kernel tree or subtree under a given node in
  1032. the required format
  1033. NOTE: The subtree here should not be confused with the notion of
  1034. subtree as a tree kernel variation used in parallel to subset tree
  1035. kernel.
  1036. The default format is (phrase (pos form)) which is the pure constituency
  1037. tree in bracketing (s-expression) format. In general, the format string
  1038. is the representation of the adequately innermost treelet in the format.
  1039. See each format-specific method for exact details.
  1040. See the documentation of the parent class.
  1041. pdOptions contains specific options to each format.
  1042. '''
  1043. if pFormat.lower() == "(score (score score))":
  1044. self.kTree = "( %s)" % self._generateSLSACKSubtree1(self.constTree.root, pdOptions = pdOptions)
  1045. elif pFormat.lower() == "(score (phrase (score (pos form))))":
  1046. self.kTree = "( %s)" % self._generateSLSACKSubtree2(self.constTree.root, pdOptions = pdOptions)
  1047. elif pFormat.lower() == "(phrase_score (pos_score form))":
  1048. self.kTree = "( %s)" % self._generateSLSACKSubtree3(self.constTree.root, pdOptions = pdOptions)
  1049. elif pFormat.lower() == "(score (phrase_args (score (pos_args form))))":
  1050. self.kTree = "( %s)" % self._generateSLSACKSubtree4(self.constTree.root, pdOptions = pdOptions)
  1051. elif pFormat.lower() == "(phrase (score )(pos (score )(form )))":
  1052. self.kTree = "( %s)" % self._generateSLSACKSubtree5(self.constTree.root, pdOptions = pdOptions)
  1053. else:
  1054. self.kTree = tk.ConstKTree.generateConstKTree(self, pFormat = pFormat)
  1055. return self.kTree
  1056. def _generateSLSACKSubtree1(self, pNode, pdOptions = None):
  1057. '''
  1058. Recursively generates the kernel subtree of the given node in
  1059. bracketing representation in (score (score score)) format
  1060. In this format, all the nodes represent the sentiment score
  1061. associated with them.
  1062. pdOptions contains the following options:
  1063. - keep-at: if true will keep the AT label and wont use sentiment
  1064. scores to replace it (default false)
  1065. '''
  1066. if pNode.isPreTerminal() or pNode.isTerminal():
  1067. vKSubtree = "(%s %s)" % (pNode.getSentScore(),
  1068. pNode.getTerminalNodes()[0].getSentScore())
  1069. elif pdOptions is not None and "keep-at" in pdOptions and pdOptions["keep-at"] == True and pNode.getSynTag().upper() == "AT":
  1070. vKSubtree = "(AT %s)" % (''.join([self._generateSLSACKSubtree1(n, pdOptions) for n in pNode.children]))
  1071. else:
  1072. vKSubtree = "(%s %s)" % (pNode.getSentScore(),
  1073. ''.join([self._generateSLSACKSubtree1(n, pdOptions) for n in pNode.children]))
  1074. return vKSubtree
  1075. def _generateSLSACKSubtree2(self, pNode, pdOptions = None):
  1076. '''
  1077. Recursively generates the kernel subtree of the given node in
  1078. bracketing representation in (score (phrase (score (pos form))))
  1079. format
  1080. In this format, all the nodes represent the sentiment score
  1081. associated with them.
  1082. pdOptions contains the following options:
  1083. - keep-at: if true will keep the AT label and wont use sentiment
  1084. scores to replace it (default false)
  1085. '''
  1086. if pNode.isPreTerminal():
  1087. vKSubtree = "(%s (%s %s))" % (pNode.getSentScore(),
  1088. pNode.getSynTag(),
  1089. pNode.getTerminal())
  1090. elif pdOptions is not None and "keep-at" in pdOptions and pdOptions["keep-at"] == True and pNode.getSynTag().upper() == "AT":
  1091. vKSubtree = "(AT %s)" % (''.join([self._generateSLSACKSubtree2(n, pdOptions) for n in pNode.children]))
  1092. else:
  1093. vKSubtree = "(%s (%s %s))" % (pNode.getSentScore(),
  1094. pNode.getSynTag(),
  1095. ''.join([self._generateSLSACKSubtree2(n, pdOptions) for n in pNode.children]))
  1096. return vKSubtree
  1097. def _generateSLSACKSubtree3(self, pNode, pdOptions = None):
  1098. '''
  1099. Recursively generates the kernel subtree of the given node in
  1100. bracketing representation in (phrase_score (pos_score form))
  1101. format
  1102. In this format, all the nodes represent the sentiment score
  1103. associated with them.
  1104. pdOptions contains the following options:
  1105. - keep-at: if true will keep the AT label and wont use sentiment
  1106. scores to replace it (default false)
  1107. '''
  1108. if pNode.isPreTerminal():
  1109. vKSubtree = "(%s_%s %s)" % (pNode.getSynTag(),
  1110. pNode.getSentScore(),
  1111. pNode.getTerminal())
  1112. elif pdOptions is not None and "keep-at" in pdOptions and pdOptions["keep-at"] == True and pNode.getSynTag().upper() == "AT":
  1113. vKSubtree = "(AT %s)" % (''.join([self._generateSLSACKSubtree3(n, pdOptions) for n in pNode.children]))
  1114. else:
  1115. vKSubtree = "(%s_%s %s)" % (pNode.getSynTag(),
  1116. pNode.getSentScore(),
  1117. ''.join([self._generateSLSACKSubtree3(n, pdOptions) for n in pNode.children]))
  1118. return vKSubtree
  1119. def _generateSLSACKSubtree4(self, pNode, pdOptions = None):
  1120. '''
  1121. Recursively generates the kernel subtree of the given node in
  1122. bracketing representation in (score (phrase_args (score (pos_args form))))
  1123. format
  1124. In this format, all the nodes represent the sentiment score
  1125. associated with them.
  1126. pdOptions contains the following options:
  1127. - keep-at: if true will keep the AT label and wont use sentiment
  1128. scores to replace it (default false)
  1129. '''
  1130. if pNode.isPreTerminal():
  1131. vKSubtree = "(%s (%s %s))" % (pNode.getSentScore(),
  1132. '_'.join([pNode.getSynTag()] + pNode.getArgRoles()),
  1133. pNode.getTerminal())
  1134. elif pdOptions is not None and "keep-at" in pdOptions and pdOptions["keep-at"] == True and pNode.getSynTag().upper() == "AT":
  1135. vKSubtree = "(AT %s)" % (''.join([self._generateSLSACKSubtree4(n, pdOptions) for n in pNode.children]))
  1136. else:
  1137. vKSubtree = "(%s (%s %s))" % (pNode.getSentScore(),
  1138. '_'.join([pNode.getSynTag()] + pNode.getArgRoles()),
  1139. ''.join([self._generateSLSACKSubtree4(n, pdOptions) for n in pNode.children]))
  1140. return vKSubtree
  1141. def _generateSLSACKSubtree5(self, pNode, pdOptions = None):
  1142. '''
  1143. Recursively generates the kernel subtree of the given node in
  1144. bracketing representation in (phrase (score )(pos (score )(form )))
  1145. format
  1146. In this format, all the nodes represent the sentiment score
  1147. associated with them.
  1148. pdOptions contains the following options:
  1149. - keep-at: if true will keep the AT label and wont use sentiment
  1150. scores to replace it (default false)
  1151. '''
  1152. if pNode.isPreTerminal():
  1153. vKSubtree = "(%s (%s )(%s ))" % (pNode.getSynTag(),
  1154. pNode.getSentScore(),
  1155. pNode.getTerminal())
  1156. elif pdOptions is not None and "keep-at" in pdOptions and pdOptions["keep-at"] == True and pNode.getSynTag().upper() == "AT":
  1157. vKSubtree = "(AT %s)" % (''.join([self._generateSLSACKSubtree5(n, pdOptions) for n in pNode.children]))
  1158. else:
  1159. vKSubtree = "(%s (%s )%s)" % (pNode.getSynTag(),
  1160. pNode.getSentScore(),
  1161. ''.join([self._generateSLSACKSubtree5(n, pdOptions) for n in pNode.children]))
  1162. return vKSubtree
  1163. class SLSACDKTree(tk.ConstDepKTree):
  1164. '''
  1165. Class for integrating SLSA dependency subtrees in SLSA constituency
  1166. trees for tree kernel use
  1167. '''
  1168. def _createDepKTree(self):
  1169. '''
  1170. Creates and returns a new SLSADKTree
  1171. '''
  1172. return SLSADKTree(pDepTree = self.depTree)
  1173. def generateConstDepKTree(self, pFormat = "(phrase (pos (form (rel head))))", pdOptions = {}):
  1174. '''
  1175. Generates the tree representation in the required format
  1176. See the parent class for more details.
  1177. '''
  1178. self.kTree = tk.ConstDepKTree.generateConstDepKTree(self, pFormat = pFormat, pdOptions = pdOptions)
  1179. return self.kTree
  1180. class SLSANGramKTree(tk.NGramKTree):
  1181. '''
  1182. Class for implementing n-gram tree for SLSA sentences
  1183. '''
  1184. def __init__(self, pSLSASent, pNodeContentType = "word", pdOptions = None):
  1185. '''
  1186. Constructor
  1187. pNodeContentTypes can take:
  1188. - word: word n-gram trees are produced, i.e. nodes are word forms
  1189. - POS: POS n-gram trees are produced, i.e. nodes are POS tags
  1190. - sentiment: sentiment score n-gram trees are produced, i.e. nodes are sentiment polarity scores
  1191. - wvp: prefixed words for word vector similarity computation are produced (e.g. with svmlight-tk-we)
  1192. pdOptions contains specific options to each format.
  1193. '''
  1194. if pdOptions is None:
  1195. pdOptions = {}
  1196. if pNodeContentType.lower() == "word":
  1197. tk.NGramKTree.__init__(self, plTokens = pSLSASent.getTokens())
  1198. elif pNodeContentType.lower() == "word-lower":
  1199. tk.NGramKTree.__init__(self, plTokens = [t.lower() for t in pSLSASent.getTokens()])
  1200. elif pNodeContentType.lower() == "pos":
  1201. tk.NGramKTree.__init__(self, plTokens = pSLSASent.getPOSTags())
  1202. elif pNodeContentType.lower() in ["sentiment", "polarity"]:
  1203. tk.NGramKTree.__init__(self, plTokens = [str(s) for s in pSLSASent.getSentimentScores()])
  1204. elif pNodeContentType.lower() == "wvp":
  1205. if "prefix" in pdOptions:
  1206. vPrefix= pdOptions["prefix"]
  1207. else:
  1208. vPrefix= "___"
  1209. tk.NGramKTree.__init__(self, plTokens = [vPrefix + t for t in pSLSASent.getTokens()])
  1210. else:
  1211. raise Exception("%s is an invalid node content type!" % pNodeContentType)
  1212. self.slsaSent = pSLSASent
  1213. def generateNGramKTree(self, pFormat = "binary", pdOptions = None):
  1214. '''
  1215. Generates and returns a tree representation of the sentence tokens
  1216. pdOptions contains specific options to each format.
  1217. '''
  1218. if pdOptions is None:
  1219. pdOptions = {}
  1220. if pFormat == "unary":
  1221. self.kTree = tk.NGramKTree.generateNGramKTree(self, pFormat = "unary")
  1222. elif pFormat == "bigram":
  1223. self.kTree = tk.NGramKTree.generateNGramKTree(self, pFormat = "bigram")
  1224. elif pFormat == "binary":
  1225. self.kTree = tk.NGramKTree.generateNGramKTree(self, pFormat = "binary")
  1226. else:
  1227. self.kTree = tk.NGramKTree.generateNGramKTree(self)
  1228. return self.kTree