absa.py 56 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905
  1. #! /usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. """
  4. This module defines classes for Aspect Based Sentiment Analysis (ABSA).
  5. NOTE: bugs identified by PyCharm should be fixed (e.g. _attachATSuffix()
  6. in ABSADNode). It seems they have never been tested.
  7. Version 3.0 (30-Apr-2018)
  8. - getTokens() and getOverallPolarity() are added to ABSAContext.
  9. Version 2.9 (03-Jan-2018 to 05-Jan-2017)
  10. - mergeWith() is added to ABSASet.
  11. - pickle() and loadFromPickle() are added to ABSASet.
  12. - getOEsIO() is added to AspectTerm.
  13. Version 2.8 (17-Jul-2017 to 18-Jul-2017)
  14. - ABSAContext is added to support SemEval 2016 data. The code is refactored so that
  15. it still works seamlessly with 2014 data.
  16. Version 2.7 (17-May-2017)
  17. - Unary tree kernels in ABSANGramKTree is upgraded.
  18. Version 2.6 (24-Mar-2017)
  19. - AspectTerm.getSentenceMarking() is added.
  20. Version 2.5 (23-Jan-2017)
  21. - Bug in ABSASet.loadBratOE() fixed to account for cases where SE is annotated
  22. from/to the middle of a token due to tokenization problem.
  23. - ABSASet.oeToBIO() is upgraded to include POS tags in the output.
  24. Version 2.4 (01-Dec-2016 to 08-Dec-2016)
  25. - oeToBIO() is added to ABSASent.
  26. Version 2.3 (11-Oct-2016 to 13-Oct-2016)
  27. - loadBratOEs() is added to ABSASent to load opinion/sentiment expressions
  28. annotated in Brat annotator.
  29. - oes is added to AspectTerm to keep opinion/sentiment expressions.
  30. - vABSASent.oe is renamed to oes.
  31. - Type is added to OpinionExpression.
  32. Version 2.2 (22-Jun-2016)
  33. - writeBratInput() is added to ABSASet.
  34. Version 2.1 (02-May-2016 to 05-May-2016)
  35. - ABSANGramKTree is upgraded to use customized unigram auxiliary nodes
  36. instead of X for binary format.
  37. - extractAT2RootPathTree() is added to ABSADTree.
  38. - ABSANGramKTree is upgraded to produce lower case word tree kernels.
  39. - New format is added to ABSANGramKTree kernels for word vector similarity.
  40. Version 2.0 (12-Apr-2016 to 26-Apr-2016)
  41. - ABSANGramKTree is upgraded to produce POS tag and sentiment score
  42. tree kernels similar to word tree kernels.
  43. - getPOSTags() is added to ABSASent.
  44. - New formats are added to ABSADKTree and ABSACKTree.
  45. - getVocabulary() and loadWordVectors() are added to the ABSASet.
  46. - getWordVectors() is added to ABSASent.
  47. - A new attribute is added to ABSASent to store the ABSASet it belongs to.
  48. Its value is set during loading the data.
  49. Version 1.9 (04-Apr-2016 to 11-Apr-2016)
  50. - New formats are added to ABSANGramKTree and getNGramKTreeEmbeding()
  51. of AspectTerm is modified to handle various n-gram tree formats.
  52. - getPOSTags() is added to ABSASent.
  53. Version 1.8 (10-Mar-2016 to 29-Mar-2016)
  54. - extractAT2OEPath() of ABSADepTree is renamed to extractAT2OEDepRelPath().
  55. - getPolarScores() is added to ABSASent.
  56. Version 1.7 (17-Feb-2016 to 09-Mar-2016)
  57. - Constituency and dependency path between aspect term and opinion
  58. expression is extracted.
  59. - Bug in ABSADNode.loadFromDepNode() is fixed which assigned DepTree
  60. object instead of ABSADepTree to depTree attribute.
  61. - Average sentiment score of the sentence and opinion expressions can
  62. be extracted using newly added methods to ABSASent and OpinionExpression.
  63. - getTokens() is added to OpinionExpression and AspectTerm.
  64. - The aspect length groups are now extracted as part of the aspect term
  65. statistics by ABSASet.extractATStat().
  66. Version 1.6 (28-Jan-2016 to 02-Feb-2016)
  67. - Opinion expression is introduced and OpinionExpression is added.
  68. - loadBIOOpinionExpressions() is added to ABSASet and ABSASent.
  69. - Methods are added to embed opinion expressions in the trees.
  70. - ABSADNode.decorate() is renamed to ABSADNode.decorateAT().
  71. - New formats are added to ABSADKTree.
  72. Version 1.5 (19-Nov-2015 to 26-Nov-2015)
  73. - Naming of internal methods in ABSACKTree and ABSADKTree is modified
  74. to fix the bug causing conflict between the methods of these classes
  75. and their parent classes.
  76. - New formats are added to ABSACKTree.
  77. - shallowCopy() is added to ABSAConstTree.
  78. - The deep copying method of the trees in tree embedding generation in the aspect
  79. term class is changed to use python deepcopy as it performed better
  80. with ConstTree in an experiment. However, it has not been tested for
  81. the dependency tree despite applying the idea. In both cases, this
  82. still needs to be further confirmed in the future experiments.
  83. Version 1.4 (13-Nov-2015 to 16-Nov-2015)
  84. - New format is added to ABSADKTree to handle semantic roles.
  85. Version 1.3 (22-Oct-2015 to 02-Nov-2015)
  86. - ABSACTree.loadSentimentScore() is changed and ABSACNode.loadSentScores()
  87. is added to enable optionally propagating sentiment scores in the
  88. tree nodes.
  89. - pNeutralScore is added to loadSentimentScores in ABSASet and ABSASent
  90. to assign a score for neutral words other than None.
  91. - ABSACKTree is added with two new formats.
  92. - ABSASent.getSentimentScores() is changed to return the scores in a
  93. list instead of a dictionary.
  94. Version 1.2 (12-Oct-2015 to 15-Oct-2015)
  95. - ABSADKTree.generateDepKTree() supports subtrees under a node provided
  96. as a parameter in addition to the whole tree under the root.
  97. - ABSACDKTree is added to support the integration of constituency and
  98. dependency trees.
  99. - ABSANGramKTree is added to construct a tree from text token forms.
  100. - getNGramKTreeEmbeding() is added to AspectTerm.
  101. - extractCaraFXInput() is added to ABSASent.
  102. Version 1.1 (28-Sep-2015 to 02-Oct-2015)
  103. - getSentimentScores(), length() are added to ABSASent.
  104. - tokenLength() and getSentimentScores() are added to ABSASet.
  105. - New formats are added to generateDepKTree() of ABSADKTree.
  106. - Sentiment score representation is added to constituency tree decoration.
  107. - loadSSInCTree() is added to ABSASet and ABSASent.
  108. Version 1.0 (03-Sep-2015 to 08-Sep-2015)
  109. - loadSentimentScores() is added to ABSASet and ABSASent.
  110. - sentScores is added to ABSASent to represent sentiment scores of the
  111. sentence words.
  112. - sentScore is added to ABSADNode to represent sentiment score of the
  113. associated node.
  114. - loadSSInDTrees() is added to ABSASet to load sentiment scores into
  115. the dependency trees.
  116. - loadSSInDTree() is added to ABSASent to load sentiment scores into
  117. the dependency tree of the sentence.
  118. - loadSentScores() is added to ABSADTree.
  119. - setSentScore() is added to ABSADNode.
  120. - deepCopy() is added to ABSADNode and the constructor and node creation
  121. methods were accordingly changed.
  122. Version 0.9 (05-Aug-2015 to 26-Aug-2015)
  123. - ABSADTree and ABSADNode are added, and all the corresponding updates
  124. are done in ABSASet, ABSASent, AspectTerm.
  125. Version 0.8 (30-Jul-2015 to 04-Aug-2015)
  126. - The representation of aspect terms in the constituency tree is decoupled
  127. from embedding them in the tree. Once an aspect term is embedded in
  128. the tree (either in pre-terminals in its span or in the constituent
  129. node mapped to it), it can be represented in the tree by inserting
  130. a new node, attaching suffix and attaching polarity in various formats.
  131. This is done in ABSACNode.decorateAT(). AspectTerm.getCTreeEmbeding()
  132. and ABSACNode.embedAspectTerm() were changed accordingly. Also,
  133. ABSACNode.getPTBFormat() is not overridden any more since polarity
  134. attachment is done in decorateAT(). Finally, inserting node during
  135. embedding aspect terms only happens optionally when there is a mismatch.
  136. This has been implemented by changing pInsertATNode to pOnMismatch
  137. which specifies the method for handling aspect term/constituent
  138. mismatch, one of which is inserting a new AT node.
  139. Version 0.7 (27-Jul-2015 to 29-Jul-2015)
  140. - Another option (pAttachATSuffix) is added to embedding aspect term
  141. in constituency tree to mark nodes in the aspect term subtree with
  142. a specific suffix tag (_AT).
  143. - ConstNode.embedAspectTerm() is edited to support embedding the aspect
  144. term in pre-terminals in addition to constituent nodes spanning the
  145. aspect term. _embedATINSpanConst() and _embedATINPreterminals() are
  146. added to ConstNode for this purpose and extractAspectTerms() is edited
  147. to account for repetitions stemming from embedding aspect terms in
  148. pre-terminals.
  149. - ABSADTree and ABSADNode are implemented.
  150. Version 0.6 (22-Jun-2015 to 23-Jun-2015)
  151. - aspectTerm in ABSACNode is changed to aspectTerms which is a list of
  152. aspect terms embedded into the node to allow a node to carry more than
  153. one aspect terms. The methods handling this attribute have consequently
  154. been updated.
  155. - extractATsInCTree() is added to ABSASent.
  156. - getSentencesWithNoAT() is added to ABSASet.
  157. - ABSASet.extractATStat() is updated to extract more statistics.
  158. Version 0.5 18-May-2015
  159. - ABSACNode.embedAspectTerm() is edited to better handle the situation
  160. where pInsertATNode is set to mismatch. ABSACNode._plugAspectTerm
  161. has accordingly been updated.
  162. - getAspectTermCount() is added to ABSASet and ABSASent.
  163. Version 0.4 08-May-2015
  164. - loadConstTrees() in ABSASet and ABSASent are changed not to embed
  165. the aspect terms. Embeding aspect terms is done instead by new methods
  166. ABSASet.embedATsInCTrees() and ABSASent.embedATsInCTree().
  167. - embedAspectTerm() is added to ABSASent to be able to embed individual
  168. aspect terms in the tree.
  169. - sentence (attr), embedInCtree(), getCTreeEmbeding() are added to
  170. AspectTerm.
  171. - The default value for pflgAttachATPolarity argument of getPTBFormat()
  172. method in ABSACTree and ABSACNode is changed to False.
  173. Version 0.3 06-May-2015
  174. - Embedding aspect terms is changed. Accordingly, ABSACNode.embedAspectTerms()
  175. is renamed to embedAspectTerm().
  176. - addSentences() is added to ABSASet.
  177. Version 0.2 18-Feb-2015
  178. - The NP structures in ABSACTree can be modified to avoid term/constituent
  179. mismatch.
  180. Version 0.1 17-Feb-2015
  181. - ABSA, ABSASent, AspectTerm, ABSACTree, ABSACNode are added.
  182. """
  183. from collections import namedtuple
  184. import re, copy
  185. import pickle
  186. class ABSASet:
  187. '''
  188. Class for aspect-based sentiment analysis data set
  189. '''
  190. def __init__(self):
  191. '''
  192. Constructor
  193. '''
  194. self.contexts = []
  195. # WordVector object containing the word vectors of the vocabulary in the dataset
  196. self.wv = None
  @property
  def size(self):
      '''
      Size of the data set, i.e. the number of sentences returned by
      getSentences()
      '''
      vlSentences = self.getSentences()
      return len(vlSentences)
  def getContextCount(self):
      '''
      Returns how many contexts the data set currently holds
      '''
      vlContexts = self.contexts
      return len(vlContexts)
  @property
  def tokenLength(self):
      '''
      Total number of tokens in the data set, obtained by adding up the
      length attribute of every sentence
      '''
      vTokenCount = 0
      for vSent in self.getSentences():
          vTokenCount += vSent.length
      return vTokenCount
  def getContexts(self):
      '''
      Returns the list of ABSA contexts of the data set
      The list itself is returned, not a copy.
      '''
      vlContexts = self.contexts
      return vlContexts
  def getSentences(self, pSort = ''):
      '''
      Returns the ABSA sentences of all contexts in a single list
      The sort options (case-insensitive) are:
      - None or '': in document order, i.e. context by context
      - text: in sentence text order
      Any other value falls back to document order.
      '''
      vlSentences = [s for c in self.getContexts() for s in c.getSentences()]
      # None is a documented option, so it must not reach .lower()
      if pSort is not None and pSort.lower() == "text":
          return sorted(vlSentences, key = lambda x: x.getText())
      return vlSentences
  def getSentencesWithNoAT(self):
      '''
      Returns the ABSA sentences whose aspect term count is zero
      '''
      vlNoATSents = []
      for vSent in self.getSentences():
          if vSent.getAspectTermCount() == 0:
              vlNoATSents.append(vSent)
      return vlNoATSents
  def addContext(self, pContext):
      '''
      Adds an ABSA context to the data set unless it is already in it
      A newly added context gets its dataset attribute pointed at this
      data set.
      '''
      if pContext in self.getContexts():
          # already part of the data set: nothing to do
          return
      pContext.dataset = self
      self.contexts.append(pContext)
  def addContexts(self, plContexts):
      '''
      Adds the given ABSA contexts to the data set one by one through
      addContext(), in the order given
      '''
      for vCtx in plContexts:
          self.addContext(vCtx)
  def delContext(self, pContextIdx):
      '''
      Removes the context stored at position pContextIdx of the context list
      '''
      vlContexts = self.contexts
      del vlContexts[pContextIdx]
  def mergeWith(self, pABSASet):
      '''
      Merges the contexts of the given ABSA data set into this one
      Both data sets must be instances of exactly the same class, otherwise
      an Exception is raised. Word embeddings are not merged; a reminder to
      reload them is printed.
      '''
      vThisClass = self.__class__
      vOtherClass = pABSASet.__class__
      if vThisClass != vOtherClass:
          raise Exception("Two data sets must be of the same ABSA type: %s vs. %s" %(vThisClass , vOtherClass))

      # contexts
      self.addContexts(pABSASet.getContexts())

      # NOTE: word embeddings should also be merged
      print("Word embeddings were not merged. Reload them for the merged data set.")

      # NOTE: take care of other newly added attributes if any
  def addSentences(self, plSentences):
      '''
      Adds the given ABSA sentences to the data set
      The context of a sentence is added to the data set first if it is not
      there yet; the sentence is then added to its own context.
      Currently, no care is taken regarding ID duplication.
      '''
      for vSentence in plSentences:
          vContext = vSentence.context
          if vContext not in self.getContexts():
              self.addContext(vContext)
          # NOTE(review): assumed to run for every sentence, not only when the
          # context is new - confirm against the original indentation
          vContext.addSentence(vSentence)
  def getVocabulary(self):
      '''
      Returns the vocabulary of the data set: the distinct tokens of all
      sentences as a sorted list
      '''
      vsVocabulary = set()
      for vSent in self.getSentences():
          vsVocabulary.update(vSent.getTokens())
      return sorted(vsVocabulary)
  def getAspectTerms(self):
      '''
      Returns the aspect term objects of all sentences in one flat list,
      sentence by sentence
      '''
      return [vAT for vSent in self.getSentences() for vAT in vSent.getAspectTerms()]
  def getAspectTermCount(self):
      '''
      Returns the number of aspect terms in the whole data set, i.e. the sum
      of the aspect term counts of its sentences
      '''
      return sum(vSent.getAspectTermCount() for vSent in self.getSentences())
  def extractSentenceForms(self, pSort = None):
      '''
      Returns the surface form (text) of the sentences as a list
      The sort options (case-insensitive) are:
      - None: in document order
      - text: in sentence text order
      - at: in aspect term order, i.e. one entry per aspect term, so a
            sentence appears once for each of its aspect terms
      Any value other than "at" is handed over to getSentences().
      NOTE(review): "id" used to be listed as an option here, but
      getSentences() as defined in this class only recognises "text" - confirm.
      '''
      # the default None must not reach .lower(); '' means document order
      vSort = '' if pSort is None else pSort
      if vSort.lower() == "at":
          return [at.sentence.getText() for at in self.getAspectTerms()]
      return [s.getText() for s in self.getSentences(pSort = vSort)]
  def extractAspectTerms(self):
      '''
      Collects the aspect term forms of all sentences and groups them with
      util.groupBy()
      NOTE(review): the result is presumably a dictionary mapping each form to
      its count (extractATStat() sums its values) - confirm against util.
      '''
      vlForms = [vForm for vSent in self.getSentences() for vForm in vSent.getAspectTermForms()]
      return util.groupBy(vlForms)
  def extractATStat(self):
      '''
      Extracts various aspect term statistics
      Returns an ATStat named tuple with the following fields:
      - sentsWithAnAT: number of sentences with at least one aspect term
      - sentsWithNoAT: number of sentences without any aspect term
      - allATCount: number of all aspect terms
      - distATCount: number of distinct aspect terms
      - hapaxATCount: number of aspect terms appearing just once
      - polCounts: dictionary of the number of each polarity type
      - CTreeATCount: number of aspect terms transferred to the constituency trees
      - ATConstMismatch: number of mismatched aspect terms and constituents
      - ATLenGroups: aspect term token lengths grouped by util.groupBy()
      A KeyError is raised for a polarity other than positive, negative,
      neutral and conflict.
      '''
      # data structure to store statistics
      ATStat = namedtuple('ATStat', "sentsWithAnAT, sentsWithNoAT, allATCount distATCount hapaxATCount polCounts CTreeATCount, ATConstMismatch, ATLenGroups")

      # fetched once and reused below
      vlSentences = self.getSentences()
      vlAspectTerms = self.getAspectTerms()

      vSentsWithNoAT = len(self.getSentencesWithNoAT())
      vSentsWithAnAT = self.size - vSentsWithNoAT

      vdATs = self.extractAspectTerms()
      # values() instead of itervalues() so that it also runs on Python 3
      vlATCounts = list(vdATs.values())
      vAllATCount = sum(vlATCounts)
      vDistATCount = len(vdATs)
      vHapaxATCount = sum(1 for vCount in vlATCounts if vCount == 1)

      vdPolCounts = {"positive": 0, "negative": 0, "neutral": 0, "conflict": 0}
      for vAT in vlAspectTerms:
          vdPolCounts[vAT.getPolarity()] += 1

      vCTreeATCount = sum(len(s.extractATsInCTree()) for s in vlSentences)
      vATConstMismatch = sum(len(s.extractATermConstMismatch()) for s in vlSentences)
      vdATLenGroups = util.groupBy([len(at.getTokens()) for at in vlAspectTerms])

      return ATStat(vSentsWithAnAT, vSentsWithNoAT, vAllATCount, vDistATCount, vHapaxATCount, vdPolCounts, vCTreeATCount, vATConstMismatch, vdATLenGroups)
  def loadConstTrees(self, plConstTrees):
      '''
      Loads the constituency parse trees of the sentences
      The trees are paired with the sentences by position, so they must be
      given in the order in which the sentences are loaded. If the two lists
      differ in length, the surplus items are ignored.
      Each tree can be in bracketing format or a constparse.ConstTree object.
      '''
      vlPairs = zip(self.getSentences(), plConstTrees)
      for vSentence, vConstTree in vlPairs:
          vSentence.loadConstTree(vConstTree)
  def loadDepTrees(self, plDepTrees):
      '''
      Loads the dependency parse trees of the sentences
      The trees are paired with the sentences by position, so they must be
      given in the order in which the sentences are loaded. If the two lists
      differ in length, the surplus items are ignored.
      The trees are assumed to be given as a list of depparse.DepTree objects.
      '''
      vlPairs = zip(self.getSentences(), plDepTrees)
      for vSentence, vDepTree in vlPairs:
          vSentence.loadDepTree(vDepTree)
  def loadPOSTaggings(self, pPOSTaggingFilename, pSort = ''):
      '''
      Loads POS taggings of the sentences in the dataset
      The POS tagging file should be in a columnar format with a blank line
      between sentences. pSort gives the order of the sentences in the file
      and is passed on to getSentences().
      An Exception is raised when the length of a POS tagging differs from
      the length of the sentence it is paired with.
      '''
      vlSentences = self.getSentences(pSort = pSort)
      # read through a context manager so that the file is always closed
      with open(pPOSTaggingFilename) as vPOSFile:
          vlSentPOSs = vPOSFile.read().strip().split('\n\n')
      for vSent, vSentPOS in zip(vlSentences, vlSentPOSs):
          vPOSTagging = pos.POSTagging()
          vPOSTagging.loadFromColumnar(vSentPOS)
          if vPOSTagging.length != vSent.length:
              raise Exception("Length of sentence and POS tags don't match:\n\n%s\n%s" % (vSent.getTokens(), vPOSTagging.toLorgInput()))
          vSent.loadPOSTagging(vPOSTagging)
  def loadBIOOpinionExpressions(self, pllBIO):
      '''
      Loads opinion expression annotations based on BIO tagging
      pllBIO is a 2D list: its first dimension is paired by position with the
      sentences of the dataset, the second holds the BIO labels of the tokens
      of that sentence.
      '''
      vlPairs = zip(self.getSentences(), pllBIO)
      for vSentence, vlLabels in vlPairs:
          vSentence.loadBIOOpinionExpressions(vlLabels)
  def embedATsInCTrees(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'nothing', pflgExtendATSpanToDT = False):
      '''
      Embeds the aspect terms in the constituency tree of every sentence
      All options are forwarded unchanged to embedATsInCTree() of the sentences.

      pflgModNPStruct specifies whether the NP structures should be edited
      in order to avoid term/constituent mismatch.

      pEmbedPosition specifies where the aspect term should be embedded
      in the subtree. The possible values are:
      - (span)ning-constituent: embedding into a node in subtree which
                                spans the aspect term tokens. This may
                                cause mismatches where the aspect term
                                span is not fully covered by the node
                                span. pOnMismatch can be set to fix the
                                issue.
      - (pre)-terminals: embedding into each and every pre-terminal node
                         in the subtree falling in the aspect term span.

      pOnMismatch specifies the method for handling aspect term/constituent
      mismatch. The possible values are:
      - nothing (or none): do not handle the mismatches
      - node (or insert): insert a new node covering the aspect term span

      pflgExtendATSpanToDT is handled by the sentences.
      NOTE(review): presumably extends the aspect term span to a determiner - confirm.
      '''
      vdOptions = {
          "pflgModNPStruct": pflgModNPStruct,
          "pEmbedPosition": pEmbedPosition,
          "pOnMismatch": pOnMismatch,
          "pflgExtendATSpanToDT": pflgExtendATSpanToDT,
      }
      for vSentence in self.getSentences():
          vSentence.embedATsInCTree(**vdOptions)
  def embedOEsInCTrees(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'nothing'):
      '''
      Embeds the opinion expressions in the constituency tree of every sentence
      All options are forwarded unchanged to embedOEsInCTree() of the sentences.

      pflgModNPStruct specifies whether the NP structures should be edited in
      order to avoid expression/constituent mismatch.

      pEmbedPosition specifies where the opinion expression should be embedded
      in the subtree. The possible values are:
      - (span)ning-constituent: embedding into a node in subtree which spans
                                the OE tokens. This may cause mismatches where
                                the OE span is not fully covered by the node
                                span. pOnMismatch can be set to fix the issue.
      - (pre)-terminals: embedding into each and every pre-terminal node in the
                         subtree falling in the OE span.

      pOnMismatch specifies the method for handling expression/constituent
      mismatch. The possible values are:
      - nothing (or none): do not handle the mismatches
      - node (or insert): insert a new node covering the span
      '''
      vdOptions = {
          "pflgModNPStruct": pflgModNPStruct,
          "pEmbedPosition": pEmbedPosition,
          "pOnMismatch": pOnMismatch,
      }
      for vSentence in self.getSentences():
          vSentence.embedOEsInCTree(**vdOptions)
  def embedATsInDTrees(self):
      '''
      Embeds the aspect terms in the dependency tree of every sentence by
      calling embedATsInDTree() on each of them
      '''
      for vSentence in self.getSentences():
          vSentence.embedATsInDTree()
  def embedOEsInDTrees(self):
      '''
      Embeds the opinion expressions in the dependency tree of every sentence
      by calling embedOEsInDTree() on each of them
      '''
      for vSentence in self.getSentences():
          vSentence.embedOEsInDTree()
  def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
      '''
      Loads polarity scores to the sentences from a sentiment lexicon which
      is a Sentexicon object (for details, see sentexicon.py)
      pSentexicon and pNeutralScore are passed on to loadSentimentScores()
      of every sentence.
      Returns a pair: the total number of words in the data set and the
      number of those words found in the lexicon, both summed up from the
      pairs returned by the sentences.
      '''
      vAllWords = 0      # total number of words in the data set
      vAllMatched = 0    # number of words found in the lexicon
      for vSentence in self.getSentences():
          vWords, vMatched = vSentence.loadSentimentScores(pSentexicon, pNeutralScore)
          vAllWords += vWords
          vAllMatched += vMatched
      return vAllWords, vAllMatched
  def getSentimentScores(self):
      '''
      Returns a dictionary mapping the words of the data set to the sentiment
      scores attached to them
      Tokens and scores of a sentence are paired by position. When a word
      occurs more than once, the score of its last occurrence is kept.
      '''
      vdScores = {}
      for vSentence in self.getSentences():
          vdScores.update(zip(vSentence.getTokens(), vSentence.getSentimentScores()))
      return vdScores
  def loadSSInDTrees(self):
      '''
      Loads sentiment scores into the dependency tree nodes of every sentence
      by calling loadSSInDTree() on each of them
      '''
      for vSentence in self.getSentences():
          vSentence.loadSSInDTree()
  def loadSSInCTrees(self, pPropagation = None):
      '''
      Loads sentiment scores into the constituency tree nodes of every sentence
      pPropagation is passed on unchanged to loadSSInCTree() of the sentences.
      '''
      for vSentence in self.getSentences():
          vSentence.loadSSInCTree(pPropagation = pPropagation)
  def loadWordVectors(self, pWordVectors, pflgFilter = True):
      '''
      Loads word vectors from a file or WordVector object, whichever is given
      pWordVectors is either the name of a word vector file (a string) or an
      already loaded WordVector object, which is stored as is.
      When loading from a file, the words not in the data vocabulary are
      filtered out by default; set pflgFilter to False to not filter (e.g.
      when the input is already filtered).
      The result is stored in the wv attribute.
      '''
      if isinstance(pWordVectors, str):
          # imported here since it is only needed when loading from a file
          from ml import wv
          vWV = wv.WordVector()
          if pflgFilter:
              vWV.load(pWVFilename = pWordVectors, plFilterVocab = self.getVocabulary())
          else:
              # NOTE(review): assumes WordVector.load() does not filter when
              # plFilterVocab is left out - confirm against ml.wv
              vWV.load(pWVFilename = pWordVectors)
          self.wv = vWV
      else:
          self.wv = pWordVectors
  def toBratInput(self):
      '''
      Generates the dataset in Brat annotation input format
      Returns two lists with one entry per aspect term:
      - the text input: the raw text of the sentence of the aspect term, so
        a sentence is repeated for each of its aspect terms
      - the annotation input: a line of the form
        T<n> TAB AT-<first 3 letters of polarity> <from> <to> TAB <form>
        where <from> and <to> are character offsets in the document made
        of the text entries joined by newlines
      NOTE(review): the token span start appears to be 1-based, since the
      tokens before the aspect term are taken up to start - 1 - confirm.
      '''
      vlTexts = []
      vlAnnotations = []
      vDocOffset = 0    # document offset of the current sentence
      for vNo, vAT in enumerate(self.getAspectTerms(), start = 1):
          vSentText = vAT.sentence.getText()
          vlTexts.append(vSentText)

          # part of the sentence before the aspect term token
          vPrefix = ' '.join(vAT.sentence.getTokens()[ : (vAT.getTokenSpan()[0] - 1)])
          vForm = vAT.getForm()
          # character offset of the form within the token(s) containing it
          # (e.g. 5 for built in well-built)
          vPosInToken = ' '.join(vAT.getTokens()).find(vForm)

          vFrom = vDocOffset + len(vPrefix) + vPosInToken
          if vPrefix != '':
              # the space separating the prefix from the aspect term token
              vFrom += 1
          vTo = vFrom + len(vForm)
          vlAnnotations.append("T%s\tAT-%s %s %s\t%s" % (vNo, vAT.getPolarity()[:3], vFrom, vTo, vForm))

          # the next sentence starts after this one and a newline
          vDocOffset += len(vSentText) + 1
      return vlTexts, vlAnnotations
  504. def loadBratOE(self, pBratAnnFilename, pflgVerbos = False, pdNewAnnTypes = None):
  505. '''
  506. Loads opinion or sentiment expression annotation from Brat annotator output format
  507. It assumes that the Brat input was provided using toBratInput() method, so the order
  508. of the sentences are retained.
  509. pdNewAnnNames is a dictionary which translates the annotation type names with new ones. These is primarily added
  510. to rename OE (opinion exression) to SE (sentiment expression) in the SE annotation internship project.
  511. '''
  512. # first loading OE from Brat ann file
  513. vlOEs = []
  514. for l in open(pBratAnnFilename).read().strip().split('\n'):
  515. vlSplit = l.split('\t')
  516. if vlSplit[1].startswith("OE"):
  517. vlOESplit = vlSplit[1].split()
  518. # renaming the annotation types if new names are given
  519. if pdNewAnnTypes is not None and pdNewAnnTypes != {}:
  520. vAnnType = pdNewAnnTypes[vlOESplit[0]]
  521. else:
  522. vAnnType = vlOESplit[0]
  523. vlOEs.append({"type": vAnnType, "from": int(vlOESplit[1]), "to": int(vlOESplit[2]), "surface": vlSplit[2]})
  524. # sorting OE list based on their document offset
  525. vlOEs.sort(key = lambda x: x["from"])
  526. # extracting OEs for each aspect term: assumes that the Brat input was provided using toBratInput() method, so the order
  527. # of the sentences are retained.
  528. vSentDocOffset = 0
  529. l = []
  530. for i, vAT in enumerate(self.getAspectTerms(), start = 1):
  531. if pflgVerbos:
  532. print "%s) %s" % (i, vAT.sentence.getText())
  533. print "\nAT: " + vAT.getForm()
  534. print " Polairty: " + vAT.getPolarity()
  535. print "OEs:"
  536. l.append(vAT.sentence.getText())
  537. # calculating document character span of the aspect term's sentence
  538. vSentDocSpan = (vSentDocOffset, vSentDocOffset + len(vAT.sentence.getText()))
  539. # collecting OEs of the current AT based on character offsets: not very efficient
  540. vlATOEs = [oe for oe in vlOEs if oe["from"] >= vSentDocSpan[0] and oe["to"] <= vSentDocSpan[1]]
  541. if pflgVerbos:
  542. if len(vlATOEs) == 0:
  543. print " No OE annotated for this aspect term (polarity is %s)" % vAT.getPolarity()
  544. # adding the OEs to the aspect term
  545. for oe in vlATOEs:
  546. # sanity check
  547. if oe["to"] > vSentDocSpan[1]:
  548. raise Exception("Invalid OE span: %s > %s; the end falls out of the sentence" % (oe["to"], vSentDocSpan[1]))
  549. # finding token span of the OE
  550. vTokenOffsetStart = vSentDocOffset
  551. vOETokenSpanStart = 0
  552. vOETokenSpanEnd = 0
  553. for i, token in enumerate(vAT.sentence.getTokens(), start = 1):
  554. vTokenOffsetEnd = vTokenOffsetStart + len(token) # end of this token
  555. # start token: sometimes the OE is annotated from the middle of token when the tokenization has problem (e.g. -when)
  556. if oe["from"] >= vTokenOffsetStart and oe["from"] <= vTokenOffsetEnd:
  557. vOETokenSpanStart = i
  558. # end token: sometimes the OE is annotated until the middle of token when the tokenization has problem (e.g. headphones/mic)
  559. if oe["to"] >= vTokenOffsetStart and oe["to"] <= vTokenOffsetEnd:
  560. vOETokenSpanEnd = i
  561. vTokenOffsetStart += len(token) + 1 # start of next token
  562. vOE = OpinionExpression()
  563. vOE.span = (vOETokenSpanStart, vOETokenSpanEnd)
  564. vOE.type = oe["type"]
  565. if vOE.span == (0, 0):
  566. raise Exception("No token match was found!\n%s" % oe)
  567. else:
  568. vAT.addOE(vOE)
  569. if pflgVerbos:
  570. print " %s" % vOE.type
  571. print " Original: %s" % oe["surface"]
  572. print " Extracted: %s" % vOE.getForm()
  573. if pflgVerbos:
  574. print "\n............................................."
  575. # setting the next sentence's character offset in the document
  576. vSentDocOffset += len(vAT.sentence.getText()) + 1 # +1 for newline
  577. def oeToBIO(self, pflgPOSTags = False):
  578. '''
  579. Converts and returns opinion/sentiment expressions of each aspect term to BIO format
  580. The returned output is a 2D list of aspect terms sentences and their tokens in "token\ttag" format. Optionally,
  581. the POS tags can also be included in the output making the format "token\tPOS\ttag".
  582. '''
  583. vllOutput = []
  584. for vAT in self.getAspectTerms():
  585. vllOutput.append([])
  586. vlAnn = ['O' for i in range(vAT.sentence.length)]
  587. if len(vAT.oes) > 0:
  588. for vOE in vAT.oes:
  589. vlAnn[vOE.getTokenSpan()[0] - 1] = 'B'
  590. for i in range(vOE.getTokenSpan()[0], vOE.getTokenSpan()[1]):
  591. vlAnn[i] = 'I'
  592. if pflgPOSTags:
  593. for vAnn, vPOS, vTok in zip(vlAnn, vAT.sentence.getPOSTags(), vAT.sentence.getTokens()):
  594. vllOutput[-1].append("%s\t%s\t%s" % (vTok, vPOS, vAnn))
  595. else:
  596. for vAnn, vTok in zip(vlAnn, vAT.sentence.getTokens()):
  597. vllOutput[-1].append("%s\t%s" % (vTok, vAnn))
  598. return vllOutput
  599. def oeToIO(self):
  600. '''
  601. Converts and returns opinion/sentiment expressions of each aspect term to binary IO format
  602. Binary IO tags can be used when there is only one type of sentiment expression annotated and one sentiment expression
  603. per sentence is possible.
  604. '''
  605. return [at.getOEsIO() for at in self.getAspectTerms()]
  606. def pickle(self, pFilename):
  607. '''
  608. Pickles the object into the give file name
  609. '''
  610. pickle.dump(self.__dict__, open(pFilename, "wb"), protocol = 2)
  611. def loadFromPickle(self, pFilename):
  612. '''
  613. Loads the pickled ABSASet object to this object
  614. '''
  615. self.__dict__.update(pickle.load(open(pFilename)))
  616. def getPOSTagSet(self):
  617. '''
  618. Retunrs the POS tag set of the sentences of the dataset
  619. '''
  620. vlPOSTags = []
  621. for vSent in self.getSentences():
  622. vlPOSTags += vSent.getPOSTags()
  623. return set(vlPOSTags)
  624. class ABSAContext:
  625. """
  626. Class for aspect-based sentiment analysis context (set of sentences)
  627. """
  628. def __init__(self, pABSASet):
  629. '''
  630. Constructor
  631. '''
  632. self.dataset = pABSASet
  633. self.sentences = []
  634. def getSentences(self, pSort=''):
  635. '''
  636. Returns the ABSA sentences
  637. The sort options are:
  638. - None: in document order
  639. - text: in sentence text order
  640. '''
  641. if pSort.lower() == "text":
  642. return [s for s in sorted(self.getSentences(), key=lambda x: x.getText())]
  643. else:
  644. return self.sentences
  645. def addSentence(self, pSentence):
  646. '''
  647. Adds ABSA sentence to the context
  648. '''
  649. if pSentence not in self.sentences:
  650. pSentence.context = self
  651. self.sentences.append(pSentence)
  652. def getTokens(self):
  653. '''
  654. Returns token list of all sentences in the context
  655. For tokens to make sense, make sure the data is tokenized.
  656. '''
  657. return [t for s in self.getSentences() for t in s.getTokens()]
  658. def getATPolarities(self):
  659. '''
  660. Returns list of polarities of all aspect temrs in the context
  661. '''
  662. return [at.getPolarity() for s in self.getSentences() for at in s.getAspectTerms()]
  663. def getOverallPolarity(self):
  664. '''
  665. Returns the overall polarity of the context based on the polarity of its aspect terms
  666. See the code for how the overall is calculated. Basically, number positive or negative polarities in the context
  667. should be at least twice as many as the opposite polarity, or otherwise the polarity will be considered neutral.
  668. ToDo: polarity values are fixated here. They should be variable based on the data.
  669. '''
  670. vdPolarities = util.groupBy(self.getATPolarities())
  671. if "negative" not in vdPolarities and "positive" not in vdPolarities:
  672. return "neutral"
  673. elif "negative" not in vdPolarities and "positive" in vdPolarities:
  674. return "positive"
  675. elif "negative" in vdPolarities and "positive" not in vdPolarities:
  676. return "negative"
  677. elif "negative" in vdPolarities and "positive" in vdPolarities:
  678. if vdPolarities["negative"] >= (2 * vdPolarities["positive"]):
  679. return "negative"
  680. elif vdPolarities["positive"] >= (2 * vdPolarities["negative"]):
  681. return "positive"
  682. else:
  683. return "neutral"
  684. else:
  685. raise Exception("Strange situation: %s" % vdPolarities)
  686. class ABSASent:
  687. '''
  688. Class for aspect-based sentiment analysis sentence
  689. '''
  690. def __init__(self, pABSAContext):
  691. '''
  692. Constructor
  693. '''
  694. # ABSAContext the sentence belongs to
  695. self.context = pABSAContext
  696. self.text = None
  697. self.aspectTerms = []
  698. self.cTree = None
  699. self.dTree = None
  700. self.posTagging = None
  701. self.sentScores = [] # sentiment score, one per word in the tokenized self.text
  702. self.oes = [] # opinion expressions in the sentence (not those of aspect terms)
  703. def getText(self):
  704. '''
  705. Returns sentence text (form)
  706. '''
  707. return self.text
  708. def getTokens(self):
  709. '''
  710. Returns the tokenization of the sentence
  711. The sentence text is assumed to be in tokenized format and only
  712. splits on space.
  713. '''
  714. return self.getText().split()
  715. @property
  716. def length(self):
  717. '''
  718. Returns the sentence length
  719. '''
  720. return len(self.getTokens())
  721. def getAspectTerms(self):
  722. '''
  723. Returns aspect terms (objects) of the sentence
  724. '''
  725. return self.aspectTerms
  726. def getOEs(self):
  727. '''
  728. Returns opinion expressions (objects) of the sentence
  729. '''
  730. return self.oes
  731. def getAspectTermCount(self):
  732. '''
  733. Returns the number of aspect terms in the sentence
  734. '''
  735. return len(self.aspectTerms)
  736. def getAspectTermForms(self):
  737. '''
  738. Returns aspect term forms (term attributes) of the sentence
  739. '''
  740. return [t.getForm() for t in self.getAspectTerms()]
  741. def getConstTree(self):
  742. '''
  743. Returns the constituency parse tree of the sentence
  744. The returned object is of type constparse.ConstTree
  745. '''
  746. return self.cTree
  747. def getPOSTags(self):
  748. '''
  749. Returns the list of POS tags which matches the token list
  750. The POS tags are extracted from the constituency tree or dependency tree
  751. '''
  752. if self.cTree is not None:
  753. return self.cTree.getPOSs()
  754. elif self.dTree is not None:
  755. return self.dTree.getPOSs()
  756. elif self.posTagging is not None:
  757. return self.posTagging.getPOSTags()
  758. else:
  759. return []
  760. def getDepTree(self):
  761. '''
  762. Returns the dependency parse tree of the sentence
  763. The returned object is of type depparse.DepTree
  764. '''
  765. return self.dTree
  766. def loadConstTree(self, pConstTree):
  767. '''
  768. Loads the constituency parse tree of the sentence
  769. The contituency tree can be provided in bracketing format or as
  770. constparse.ConstTree object.
  771. '''
  772. # loading the tree
  773. if isinstance(pConstTree, constparse.ConstTree):
  774. vConstTree = pConstTree.getPTBFormat()
  775. else:
  776. vConstTree = pConstTree
  777. self.cTree = ABSACTree()
  778. self.cTree.loadPTBTree(vConstTree, pflgExpandTerminal = True)
  779. # sanity check; comment out
  780. #if self.cTree.surface != self.getText():
  781. # print "Sentence and tree mismatch:\nSentence: %s\nTree: %s\n" % (self.getText(), self.cTree.getPTBFormat())
  782. def loadDepTree(self, pDepTree):
  783. '''
  784. Loads the dependency parse tree of the sentence
  785. The dependency tree is assumed to be depparse.DepTree object.
  786. '''
  787. # loading the tree
  788. if not isinstance(pDepTree, depparse.DepTree):
  789. raise Exception("A DepTree object is expected!")
  790. self.dTree = ABSADTree()
  791. self.dTree.loadFromDepTree(pDepTree = pDepTree)
  792. # sanity check; comment out
  793. #if self.dTree.surface != self.getText():
  794. # print "Sentence and tree mismatch:\nSentence: %s\nTree:%s\n" % (self.getText(), self.dTree.surface)
  795. def loadPOSTagging(self, pPOSTagging):
  796. '''
  797. Loads POS tagging of the sentence
  798. POS tagging is a pos.POSTagging object.
  799. '''
  800. self.posTagging = pPOSTagging
  801. def loadBIOOpinionExpressions(self, plBIO):
  802. '''
  803. Loads opinion expression annotations of the sentence based on BIO tagging
  804. The BIO labels are provided in a list which is supposed to match the tokens in the sentence.
  805. '''
  806. # start and end of 1-based span
  807. vSpanStart = 0
  808. vSpanEnd = 0
  809. for i, vLabel in enumerate(plBIO, start = 1):
  810. if vLabel.lower() == 'b':
  811. vSpanStart = i
  812. vSpanEnd = i
  813. elif vLabel.lower() == 'i':
  814. vSpanEnd += 1
  815. elif vLabel.lower() == 'o':
  816. if vSpanStart != 0: # means the first token after span
  817. vOE = OpinionExpression()
  818. vOE.span = (vSpanStart, vSpanEnd)
  819. vOE.sentence = self
  820. self.oes.append(vOE)
  821. vSpanStart = 0
  822. vSpanEnd = 0
  823. def embedATsInCTree(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'no', pflgExtendATSpanToDT = False):
  824. '''
  825. Embeds the aspect terms in the constituency tree
  826. pflgModNPStruct specifies whether the NP structures should be edited
  827. in order to avoid term/constituent mismatch.
  828. pEmbedPosition specifies where the aspect term should be embedded
  829. in the subtree. The possible values are:
  830. - (span)ning-constituent: embedding into a node in subtree which
  831. spans the aspect term tokens. This may
  832. cause mismatches where the aspect term
  833. span is not fully covered by the node
  834. span. pOnMismatch can be set to fix the
  835. issue.
  836. - (pre)-terminals: embedding into each and every pre-terminal node
  837. in the subtree falling in the aspect term span.
  838. pOnMismatch specifies the method for handling aspect term/constituent
  839. mismatch. The possible values are:
  840. - nothing (or none): do not handle the mismatches
  841. - node (or insert): insert a new node covering the aspect term span
  842. If pflgExtendATSpanToDT is set to true, the span of aspect terms which
  843. only exclude the determiner of the NP is extended to cover the
  844. determiner to reduce the number of mismatches.
  845. '''
  846. # modifying NP structures
  847. if pflgModNPStruct:
  848. self.cTree.modifyNPStruct()
  849. # embeding the aspect terms
  850. self.cTree.embedAspectTerms(plAspectTerm = self.getAspectTerms(), pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch, pflgExtendToDT = pflgExtendATSpanToDT)
  851. def embedOEsInCTree(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'no'):
  852. '''
  853. Embeds the aspect terms in the constituency tree
  854. pflgModNPStruct specifies whether the NP structures should be edited in order to avoid expression/constituent
  855. mismatch.
  856. pEmbedPosition specifies where the opinion expression should be embedded in the subtree. The possible values are:
  857. - (span)ning-constituent: embedding into a node in subtree which spans the OE tokens. This may cause mismatches
  858. where the OE span is not fully covered by the node span. pOnMismatch can be set to fix
  859. the issue.
  860. - (pre)-terminals: embedding into each and every pre-terminal node in the subtree falling in the OE span.
  861. pOnMismatch specifies the method for handling expression/constituent mismatch. The possible values are:
  862. - nothing (or none): do not handle the mismatches
  863. - node (or insert): insert a new node covering the aspect term span
  864. '''
  865. # modifying NP structures
  866. if pflgModNPStruct:
  867. self.cTree.modifyNPStruct()
  868. # embeding the opinion expressions
  869. self.cTree.embedOpinionExpressions(plOEs = self.getOEs(), pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch)
  870. def embedATsInDTree(self):
  871. '''
  872. Embeds the aspect terms in the dependency tree
  873. '''
  874. # embeding the aspect terms
  875. self.dTree.embedAspectTerms(plAspectTerms = self.getAspectTerms())
  876. def embedOEsInDTree(self):
  877. '''
  878. Embeds the opinion expressions in the dependency tree
  879. '''
  880. self.dTree.embedOpinionExpressions(plOEs = self.getOEs())
  881. def getPTBConstTree(self):
  882. '''
  883. Returns the constituency tree of the sentence in PTB bracketing format
  884. '''
  885. return self.cTree.getPTBFormat()
  886. def extractATsInCTree(self):
  887. '''
  888. Extracts and returns the list of aspect terms embedded into the
  889. constituency tree if the tree is already loaded and an empty list
  890. otherwise
  891. '''
  892. if self.cTree == None:
  893. return []
  894. else:
  895. return self.cTree.extractAspectTerms()
  896. def extractATermConstMismatch(self):
  897. '''
  898. Extracts the aspect terms which do not match a constituent node
  899. in the tree thus missing in the constituency tree
  900. Aspect term/constituent node mismatches happen because of not being
  901. embeded in the tree which occur due to the inconsistency between
  902. syntactic phrases and the phrases to which these terms are originally
  903. assigned. Parsing errors can be one reason for this but also the
  904. annotation scheme, such as flat noun phrase annotation, can also
  905. cause this problem.
  906. '''
  907. if self.cTree == None:
  908. return []
  909. vlSentATerms = self.getAspectTerms()
  910. vlCTreeATerms = self.cTree.extractAspectTerms()
  911. vlMismatches = []
  912. if len(vlCTreeATerms) != len(vlSentATerms):
  913. for vSentAT in vlSentATerms:
  914. if vSentAT not in vlCTreeATerms:
  915. vlMismatches.append(vSentAT)
  916. return vlMismatches
  917. def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
  918. '''
  919. Loads sentiment scores to words in the sentence from a sentiment
  920. lexicon which is a Sentexicon object
  921. It returns a tuple of the total number of words in the sentence
  922. and the number of words found in the lexicon.
  923. If the word is not found in the lexicon, None will be used.
  924. For details about Sentexicon object, see sentexicon.py
  925. '''
  926. vlWords = self.getTokens()
  927. self.sentScores = []
  928. for vWord in vlWords:
  929. vScore = pSentexicon.getScore(vWord)
  930. if vScore is None and pNeutralScore is not None:
  931. self.sentScores.append(pNeutralScore)
  932. else:
  933. self.sentScores.append(vScore)
  934. return self.length, len([s for s in self.sentScores if s is not None])
  935. def getSentimentScores(self):
  936. '''
  937. Returns the sentiment scores in a list corresponding to the token list
  938. '''
  939. return self.sentScores
  940. def loadSSInDTree(self):
  941. '''
  942. Loads sentiment scores into the dependency tree of the sentence
  943. '''
  944. self.dTree.loadSentScores(self.sentScores)
  945. def loadSSInCTree(self, pPropagation = None):
  946. '''
  947. Loads sentiment scores into the constituency tree of the sentence
  948. '''
  949. self.cTree.loadSentScores(self.sentScores, pPropagation = pPropagation)
  950. def generateNGramKTree(self):
  951. '''
  952. Generates and returns the tree representation of the surface form
  953. of the sentence
  954. '''
  955. vABSANGramKTree = ABSANGramKTree(pABSASent = self)
  956. return vABSANGramKTree.generateNGramKTree()
  957. def extractCaraFXInput(self):
  958. '''
  959. Extracts the data in the format required for Cara feature extractor
  960. Cara feature extractor requires three files: normalized text, POS tagged sentences and aspect term indexes. See
  961. https://github.com/CNGL-repo/Cara/wiki/Cara pipeline.
  962. '''
  963. # The data must be tokenized as loading time.
  964. vNormalizedTxt = self.getText()
  965. vNormalizedTxt = vNormalizedTxt.replace("(", "-LRB-")
  966. vNormalizedTxt = vNormalizedTxt.replace(")", "-RRB-")
  967. # format: (POS_1 token_1)(POS_2 token_2)...(POS_n token_n)
  968. vPOSTagged = ''.join(["(%s %s)" % (p, t) for p, t in zip(self.cTree.getPOSs(), self.getTokens())])
  969. vPOSTagged = vPOSTagged.replace(" (", " -LRB-")
  970. vPOSTagged = vPOSTagged.replace(" )", " -RRB-")
  971. return vNormalizedTxt, vPOSTagged
  972. def getAvgOESentScore(self):
  973. '''
  974. Calculates and returns the average sentiment score of the opinion expression tokens of the sentence
  975. '''
  976. vlOEs = self.getOEs()
  977. if len(vlOEs) == 0:
  978. return 0
  979. else:
  980. return sum([oe.getAvgSentScore() for oe in vlOEs]) / len(vlOEs)
  981. def getAvgSentScore(self):
  982. '''
  983. Calculates and returns the average sentiment score sentence tokens
  984. '''
  985. vlSentScores = self.getSentimentScores()
  986. return sum(vlSentScores) / len(vlSentScores)
  987. def getPolarScores(self, pNeutralScore = None):
  988. '''
  989. Returns sentiment scores with non-neutral polarity
  990. Neutral polarity score can be set as parameter. It is None by default meaning that no polarity score is assigned
  991. to neutral words.
  992. '''
  993. return [s for s in self.sentScores if s != pNeutralScore]
  994. def getWordVectors(self):
  995. '''
  996. Returns the word vectors of the sentence tokens
  997. '''
  998. return [self.dataset.wv.getVector(t) for t in self.getTokens()]
  999. class AspectTerm:
  1000. '''
  1001. Class for aspect-based sentiment analysis aspect term
  1002. '''
  1003. def __init__(self):
  1004. '''
  1005. Constructor
  1006. '''
  1007. self.term = None
  1008. self.polarity = None
  1009. self.sentence = None
  1010. # the token span of the term in the text (1-base indexes)
  1011. self.span = None
  1012. # opinion/sentiment expressions towards the aspect term (not those of the sentence, i.e. ABSASent.oe
  1013. self.oes = []
  1014. def getForm(self):
  1015. '''
  1016. Returns the form of the aspect term
  1017. '''
  1018. return self.term
  1019. def getTokens(self):
  1020. '''
  1021. Returns list of tokens of the aspect term
  1022. '''
  1023. if self.span is not None:
  1024. return self.sentence.getTokens()[self.span[0]-1 : self.span[1]]
  1025. else:
  1026. return ''
  1027. def getPolarity(self):
  1028. '''
  1029. Returns the polarity of the aspect term
  1030. '''
  1031. return self.polarity
  1032. def getTokenSpan(self):
  1033. '''
  1034. Returns the token span of the term
  1035. '''
  1036. return self.span
  1037. def getSentenceMarking(self):
  1038. '''
  1039. Returns the marking of the aspect terms on its sentence
  1040. The marking is a list corresponding to the aspect term's sentence tokens, every element of which is either 0 or 1
  1041. depending on if the corresponding token is in the aspect term span or not.
  1042. '''
  1043. return [1 if self.span[0] <= (i + 1) <= self.span[1] else 0 for i in range(self.sentence.length)]
  1044. def embedInCTree(self, pEmbedPosition, pOnMismatch = "nothing", pflgExtendToDT = False):
  1045. '''
  1046. Embeds the aspect term in the constituency tree of the sentence
  1047. pEmbedPosition specifies where the aspect term should be embedded
  1048. in the subtree. The possible values are:
  1049. - (span)ning-constituent: embedding into a node in subtree which
  1050. spans the aspect term tokens. This may
  1051. cause mismatches where the aspect term
  1052. span is not fully covered by the node
  1053. span. pAspectTerm can be set to fix the
  1054. issue.
  1055. - (pre)-terminals: embedding into each and every pre-terminal node
  1056. in the subtree falling in the aspect term span.
  1057. pOnMismatch specifies the method for handling aspect term/constituent
  1058. mismacth. The possible values are:
  1059. each aspect term covering the term span. The values are:
  1060. - nothing (or none): do not handle the mismatches
  1061. - node (or insert): insert a new node covering the aspect term span
  1062. If pflgExtendATSpanToDT is set to true, the span of aspect terms which
  1063. only exclude the determiner of the NP is extended to cover the
  1064. determiner to reduce the number of mismatches.
  1065. '''
  1066. self.sentence.getConstTree().embedAspectTerm(pAspectTerm = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch, pflgExtendToDT = pflgExtendToDT)
  1067. def getCTreeEmbeding(self, pEmbedPosition, pOnMismatch = "nothing", pflgExtendToDT = False, pdATReprOptions = {}):
  1068. '''
  1069. Embeds the aspect terms in a copy of the constituency tree of the
  1070. sentence and returns the resulting tree
  1071. The representation of aspect term in the tree is based on the options
  1072. specified in a dictionary (pdATReprOptions). The options involve
  1073. attaching aspect term suffix, inserting aspect term node and attaching
  1074. aspect term polarity, each with several possibilities:
  1075. - {suffix: [node/subtree/pre-terminal/parents/parents-partial]}
  1076. - {node: [parent/sister]}
  1077. - {polarity: [node/subtree]}
  1078. For details of the options, see ABSACNode.decorateAT().
  1079. '''
  1080. # copying the tree to keep the original tree intact
  1081. # python deepcopy is used as it did a better job in an experiment than the ConstTree.deepCopy().
  1082. vCTreeCopy = copy.deepcopy(self.sentence.getConstTree())
  1083. if vCTreeCopy.embedAspectTerm(pAspectTerm = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch, pflgExtendToDT = pflgExtendToDT):
  1084. vCTreeCopy.decorateAT(pdATReprOptions = pdATReprOptions)
  1085. return vCTreeCopy
  1086. else:
  1087. return False
  1088. def getDTreeEmbeding(self, pdATReprOptions = {}):
  1089. '''
  1090. Embeds the aspect terms in a copy of the dependency tree of the
  1091. sentence and returns the resulting tree
  1092. The representation of aspect term in the tree is based on the options
  1093. specified in a dictionary (pdATReprOptions). The options involve
  1094. attaching aspect term suffix and attaching aspect term polarity,
  1095. each with several possibilities.
  1096. NOTE: unlike in constituency tree decoration, AT node insertion
  1097. is not done for the dependency tree. The reason is that in dependency
  1098. tree, there is a one to one relation between the nodes and sentence
  1099. tokens. Inserting a node causes a mismatch between the surface
  1100. form of the sentence and its dependency tree. Also, concepts like
  1101. word form, POS tag and dependency relation are not meaningful in
  1102. an inserted AT node. Instead, the aspect term representation using
  1103. inserted AT nodes can be done at the PTB bracketing format representation
  1104. level of the tree. See ABSADTree.generateDepKTree(). Note that
  1105. the current design of this method in terms of the argument passed
  1106. to it may not be coherent and meaningful, because what is expected
  1107. from an aspect term representation parameter (pdATReprOptions) is
  1108. to carry all the setting required for this purpose, not only the
  1109. part concerned with suffix and polarity attachement. This may be
  1110. handled in the future.
  1111. For details of the options, see ABSADNode.decorateAT().
  1112. '''
  1113. # copying the tree to keep the original tree intact
  1114. # NOTE: consider testing with python deepcopy (Python deepcopy did a
  1115. # better job in an experiment than the ConstTree.deepCopy() in the
  1116. # getCTreeEmbeding())
  1117. vDTreeCopy = self.sentence.getDepTree().deepCopy()
  1118. #vDTreeCopy = copy.deepcopy(self.sentence.getDepTree())
  1119. vDTreeCopy.embedAspectTerm(pAspectTerm = self)
  1120. vDTreeCopy.decorateAT(pdATReprOptions = pdATReprOptions)
  1121. return vDTreeCopy
  1122. def getNGramKTreeEmbeding(self, pFormat = "unary", pNodeContentType = "word", pdNGramKTreeOptions = None, pdATReprOptions = None, pdOptions = None):
  1123. '''
  1124. Generates and returns the tree representation of the surface form
  1125. of the corresponding sentence in various formats with aspect term
  1126. optionally embedded in it
  1127. Formats include:
  1128. - unary: each word is the child of its previous word
  1129. - bigram: each bigram forms a parent/child subtree and all of these subtrees
  1130. are dominated by a root node at the top. These captures unigrams as
  1131. well if subset tree kernels are used.
  1132. - binary: each node has two children which are both nodes representing the word next to the current node's word.
  1133. The first children has a dummy terminal child X which helps capture unigrams via subset tree fragments.
  1134. The second child recursively continues the format by having the next word as its children in the same
  1135. way.
  1136. pNodeContentTypes specifies the content of the n-gram tree nodes, such as word forms or POS tags. For details,
  1137. see the ABSANGramKTree constructor.
  1138. Aspect term representation options include:
  1139. - "node": inserts AT node at:
  1140. - "x": replacing X node under/above the aspect term token
  1141. - "suffix": attached AT suffix to:
  1142. - "token": to aspect term token
  1143. '''
  1144. vABSANGramKTree = ABSANGramKTree(pABSASent = self.sentence, pNodeContentType = pNodeContentType, pAspectTerm = self, pdOptions = pdNGramKTreeOptions)
  1145. return vABSANGramKTree.generateNGramKTree(pFormat, pdATReprOptions, pdOptions)
  1146. def addOE(self, pOE):
  1147. '''
  1148. Adds opinion exression object to the aspect term
  1149. '''
  1150. pOE.sentence = self.sentence
  1151. pOE.aspectTerm = self
  1152. self.oes.append(pOE)
  1153. def getOEsBIO(self, pflgPOSTags=False):
  1154. '''
  1155. Returns BIO tagging of the opinion/sentiment expressions of the aspect term
  1156. The returned output is a list of BIO tags corresponding to aspect term sentence tokens.
  1157. '''
  1158. vlAnn = ['O' for i in range(self.sentence.length)]
  1159. if len(self.oes) > 0:
  1160. for vOE in self.oes:
  1161. vlAnn[vOE.getTokenSpan()[0] - 1] = 'B'
  1162. for i in range(vOE.getTokenSpan()[0], vOE.getTokenSpan()[1]):
  1163. vlAnn[i] = 'I'
  1164. return vlAnn
  1165. def getOEsIO(self):
  1166. '''
  1167. Returns the binary IO tagging of the opinion/sentiment expression boundaries of the aspect term
  1168. Binary IO tags can be used when there is only one type of sentiment expression annotated and one sentiment expression
  1169. per sentence is possible.
  1170. '''
  1171. vlAnn = ['O'] * self.sentence.length
  1172. if len(self.oes) > 0:
  1173. for vOE in self.oes:
  1174. for i in range(vOE.getTokenSpan()[0] - 1, vOE.getTokenSpan()[1]):
  1175. vlAnn[i] = 'I'
  1176. return vlAnn
  1177. class OpinionExpression():
  1178. '''
  1179. Class for opinion expression annotations
  1180. '''
  1181. def __init__(self):
  1182. '''
  1183. Constructor
  1184. '''
  1185. self.type = None # type of the opinion expression if there is a categorization
  1186. # e.g. SE (sentiment expression), SE-pcomp (preceeding complement of SE)
  1187. self.sentence = None
  1188. # the aspect term towards which the opinion is expressed.
  1189. self.aspectTerm = None
  1190. self.span = None
  1191. def getTokens(self):
  1192. '''
  1193. Returns list of tokens of the opinion expression
  1194. '''
  1195. return self.sentence.getTokens()[self.span[0]-1 : self.span[1]]
  1196. def getForm(self):
  1197. '''
  1198. Returns the form of the opinion expression
  1199. '''
  1200. return ' '.join(self.getTokens())
  1201. def getTokenSpan(self):
  1202. '''
  1203. Returns the token span of the term
  1204. '''
  1205. return self.span
  1206. @property
  1207. def length(self):
  1208. '''
  1209. Return the token length of the opinion expression
  1210. '''
  1211. return self.span[1] - self.span[0] + 1
  1212. def embedInCTree(self, pEmbedPosition, pOnMismatch = "nothing"):
  1213. '''
  1214. Embeds the opinion expression in the constituency tree of the sentence
  1215. pEmbedPosition specifies where the opinion expression should be embedded in the subtree. The possible values are:
  1216. - (span)ning-constituent: embedding into a node in subtree which spans the OE tokens. This may cause mismatches
  1217. where the OE span is not fully covered by the node span. pOnMismatch can be set to fix
  1218. the issue.
  1219. - (pre)-terminals: embedding into each and every pre-terminal node in the subtree falling in the OE span.
  1220. pOnMismatch specifies the method for handling expression/constituent mismatch. The possible values are:
  1221. - nothing (or none): do not handle the mismatches
  1222. - node (or insert): insert a new node covering the aspect term span
  1223. '''
  1224. self.sentence.getConstTree().embedOpinionExpression(pOE = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch)
  1225. def getCTreeEmbeding(self, pEmbedPosition, pOnMismatch = "nothing", pdOEReprOptions = {}):
  1226. '''
  1227. Embeds the opinion expression in a copy of the constituency tree of the sentence and returns the resulting tree
  1228. The representation of OE in the tree is based on the options specified in a dictionary (pdOEReprOptions). The
  1229. options involve attaching OE suffix, inserting OE node and attaching OE polarity, each with several possibilities:
  1230. - {suffix: [node/subtree/pre-terminal/parents/parents-partial]}
  1231. - {node: [parent/sister]}
  1232. - {polarity: [node/subtree]}
  1233. For details of the options, see ABSACNode.decorateOE().
  1234. '''
  1235. # copying the tree to keep the original tree intact
  1236. # python deepcopy is used as it did a better job in an experiment than the ConstTree.deepCopy().
  1237. vCTreeCopy = copy.deepcopy(self.sentence.getConstTree())
  1238. if vCTreeCopy.embedOpinionExpression(pOE = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch):
  1239. vCTreeCopy.decorateOE(pdOEReprOptions = pdOEReprOptions)
  1240. return vCTreeCopy
  1241. else:
  1242. return False
  1243. def getDTreeEmbeding(self, pdOEReprOptions = {}):
  1244. '''
  1245. Embeds the opinion expression in a copy of the dependency tree of the
  1246. sentence and returns the resulting tree
  1247. The representation of OE in the tree is based on the options specified
  1248. in a dictionary (pdOEReprOptions). The options involve attaching OE
  1249. suffix and attaching its polarity, each with several possibilities.
  1250. NOTE: see the same method for aspect term for further documentation.
  1251. For details of the options, see ABSADNode.decorateOE().
  1252. '''
  1253. # copying the tree to keep the original tree intact
  1254. # NOTE: consider testing with python deepcopy (Python deepcopy did a
  1255. # better job in an experiment than the ConstTree.deepCopy() in the
  1256. # getCTreeEmbeding())
  1257. vDTreeCopy = self.sentence.getDepTree().deepCopy()
  1258. #vDTreeCopy = copy.deepcopy(self.sentence.getDepTree())
  1259. vDTreeCopy.embedOpinionExpression(pOE = self)
  1260. vDTreeCopy.decorateOE(pdOEReprOptions = pdOEReprOptions)
  1261. return vDTreeCopy
  1262. def getAvgSentScore(self):
  1263. '''
  1264. Calculates and returns the average sentiment score of the opinion expression tokens
  1265. '''
  1266. return sum(self.sentence.getSentimentScores()[self.span[0] - 1 : self.span[1]]) / self.length