12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905 |
- #! /usr/bin/python
- # -*- coding: utf-8 -*-
- """
- This module defines classes for Aspect Based Sentiment Analysis (ABSA).
-
- NOTE: bugs identified by PyCharm should be fixed (e.g. _attachATSuffix()
- in ABSADNode). It seems they have never been tested.
-
-
- Version 3.0 (30-Apr-2018)
- - getTokens() and getOverallPolarity() are added to ABSAContext.
-
- Version 2.9 (03-Jan-2018 to 05-Jan-2017)
- - mergeWith() is added to ABSASet.
- - pickle() and loadFromPickle() are added to ABSASet.
- - getOEsIO() is added to AspectTerm.
-
- Version 2.8 (17-Jul-2017 to 18-Jul-2017)
- - ABSAContext is added to support SemEval 2016 data. The code is refactored so that
- it still works seamlessly with 2014 data.
-
- Version 2.7 (17-May-2017)
- - Unary tree kernels in ABSANGramKTree is upgraded.
-
- Version 2.6 (24-Mar-2017)
- - AspectTerm.getSentenceMarking() is added.
-
- Version 2.5 (23-Jan-2017)
- - Bug in ABSASet.loadBratOE() fixed to account for cases where SE is annotated
- from/to the middle of a token due to tokenization problem.
- - ABSASet.oeToBIO() is upgraded to include POS tags in the output.
-
- Version 2.4 (01-Dec-2016 to 08-Dec-2016)
- - oeToBIO() is added to ABSASent.
-
- Version 2.3 (11-Oct-2016 to 13-Oct-2016)
- - loadBratOEs() is added to ABSASent to load opinion/sentiment expressions
- annotated in Brat annotator.
- - oes is added to AspectTerm to keep opinion/sentiment expressions.
- - vABSASent.oe is renamed to oes.
- - Type is added to OpinionExpression.
-
- Version 2.2 (22-Jun-2016)
- - writeBratInput() is added to ABSASet.
-
- Version 2.1 (02-May-2016 to 05-May-2016)
- - ABSANGramKTree is upgraded to use customized unigram auxiliary nodes
- instead of X for binary format.
- - extractAT2RootPathTree() is added to ABSADTree.
- - ABSANGramKTree is upgraded to produce lower case word tree kernels.
- - New format is added to ABSANGramKTree kernels for word vector similarity.
-
- Version 2.0 (12-Apr-2016 to 26-Apr-2016)
- - ABSANGramKTree is upgraded to produce POS tag and sentiment score
- tree kernels similar to word tree kernels.
- - getPOSTags() is added to ABSASent.
- - New formats are added to ABSADKTree and ABSACKTree.
- - getVocabulary() and loadWordVectors() are added to the ABSASet.
- - getWordVectors() is added to ABSASent.
- - A new attribute is added to ABSASent to store the ABSASet it belongs to.
- Its value is set during loading the data.
-
- Version 1.9 (04-Apr-2016 to 11-Apr-2016)
- - New formats are added to ABSANGramKTree and getNGramKTreeEmbeding()
- of AspectTerm is modified to handle various n-gram tree formats.
- - getPOSTags() is added to ABSASent.
-
- Version 1.8 (10-Mar-2016 to 29-Mar-2016)
- - extractAT2OEPath() of ABSADepTree is renamed to extractAT2OEDepRelPath().
- - getPolarScores() is added to ABSASent.
-
- Version 1.7 (17-Feb-2016 to 09-Mar-2016)
- - Constituency and dependency path between aspect term and opinion
- expression is extracted.
- - Bug in ABSADNode.loadFromDepNode() is fixed which assigned DepTree
- object instead of ABSADepTree to depTree attribute.
- - Average sentiment score of the sentence and opinion expressions can
- be extracted using newly added methods to ABSASent and OpinionExpression.
- - getTokens() is added to OpinionExpression and AspectTerm.
- - The aspect length groups are now extracted as part of the aspect term
- statistics by ABSASet.extractATStat().
-
- Version 1.6 (28-Jan-2016 to 02-Feb-2016)
- - Opinion expression is introduced and OpinionExpression is added.
- - loadBIOOpinionExpressions() is added to ABSASet and ABSASent.
- - Methods are added to embed opinion expressions in the trees.
- - ABSADNode.decorate() is renamed to ABSADNode.decorateAT().
- - New formats are added to ABSADKTree.
-
- Version 1.5 (19-Nov-2015 to 26-Nov-2015)
- - Naming of internal methods in ABSACKTree and ABSADKTree is modified
- to fix the bug causing conflict between the methods of these classes
- and their parent classes.
- - New formats are added to ABSACKTree.
- - shallowCopy() is added to ABSAConstTree.
- - The deep copying method for the trees in tree embedding generation in the aspect
- term class is changed to use python deepcopy as it performed better
- with ConstTree in an experiment. However, it has not been tested for
- the dependency tree despite applying the idea. In both cases, this
- still needs to be further confirmed in the future experiments.
-
- Version 1.4 (13-Nov-2015 to 16-Nov-2015)
- - New format is added to ABSADKTree to handle semantic roles.
-
- Version 1.3 (22-Oct-2015 to 02-Nov-2015)
- - ABSACTree.loadSentimentScore() is changed and ABSACNode.loadSentScores()
- is added to enable optionally propagating sentiment scores in the
- tree nodes.
- - pNeutralScore is added to loadSentimentScores in ABSASet and ABSASent
- to assign a score for neutral words other than None.
- - ABSACKTree is added with two new formats.
- - ABSASent.getSentimentScores() is changed to return the scores in a
- list instead of a dictionary.
-
- Version 1.2 (12-Oct-2015 to 15-Oct-2015)
- - ABSADKTree.generateDepKTree() supports subtrees under a node provided
- as a parameter in addition to the whole tree under the root.
- - ABSACDKTree is added to support the integration of constituency and
- dependency trees.
- - ABSANGramKTree is added to construct a tree from text token forms.
- - getNGramKTreeEmbeding() is added to AspectTerm.
- - extractCaraFXInput() is added to ABSASent.
-
- Version 1.1 (28-Sep-2015 to 02-Oct-2015)
- - getSentimentScores(), length() are added to ABSASent.
- - tokenLength() and getSentimentScores() are added to ABSASet.
- - New formats are added to generateDepKTree() of ABSADKTree.
- - Sentiment score representation is added to constituency tree decoration.
- - loadSSInCTree() is added to ABSASet and ABSASent.
-
- Version 1.0 (03-Sep-2015 to 08-Sep-2015)
- - loadSentimentScores() is added to ABSASet and ABSASent.
- - sentScores is added to ABSASent to represent sentiment scores of the
- sentence words.
- - sentScore is added to ABSADNode to represent sentiment score of the
- associated node.
- - loadSSInDTrees() is added to ABSASet to load sentiment scores into
- the dependency trees.
- - loadSSInDTree() is added to ABSASent to load sentiment scores into
- the dependency tree of the sentence.
- - loadSentScores() is added to ABSADTree.
- - setSentScore() is added to ABSADNode.
- - deepCopy() is added to ABSADNode and the constructor and node creation
- methods were accordingly changed.
-
- Version 0.9 (05-Aug-2015 to 26-Aug-2015)
- - ABSADTree and ABSADNode are added, and all the corresponding updates
- are done in ABSASet, ABSASent, AspectTerm.
-
- Version 0.8 (30-Jul-2015 to 04-Aug-2015)
- - The representation of aspect terms in the constituency tree is decoupled
- from embedding them in the tree. Once an aspect term is embedded in
- the tree (either in pre-terminals in its span or in the constituent
- node mapped to it), it can be represented in the tree by inserting
- a new node, attaching suffix and attaching polarity in various formats.
- This is done in ABSACNode.decorateAT(). AspectTerm.getCTreeEmbeding()
- and ABSACNode.embedAspectTerm() were changed accordingly. Also,
- ABSACNode.getPTBFormat() is not overridden any more since polarity
- attachment is done in decorateAT(). Finally, inserting node during
- embedding aspect terms only happens optionally when there is a mismatch.
- This has been implemented by changing pInsertATNode to pOnMismatch
- which specifies the method for handling aspect term/constituent
- mismatch, one of which being inserting new AT node.
-
- Version 0.7 (27-Jul-2015 to 29-Jul-2015)
- - Another option (pAttachATSuffix) is added to embedding aspect term
- in constituency tree to mark nodes in the aspect term subtree with
- a specific suffix tag (_AT).
- - ConstNode.embedAspectTerm() is edited to support embedding the aspect
- term in pre-terminals in addition to constituent nodes spanning the
- aspect term. _embedATINSpanConst() and _embedATINPreterminals() are
- added to ConstNode for this purpose and extractAspectTerms() is edited
- to account for repetitions stemming from embedding aspect terms in
- pre-terminals.
- - ABSADTree and ABSADNode are implemented.
-
- Version 0.6 (22-Jun-2015 to 23-Jun-2015)
- - aspectTerm in ABSACNode is changed to aspectTerms which is a list of
- aspect terms embedded into the node to allow a node to carry more than
- one aspect terms. The methods handling this attribute have consequently
- been updated.
- - extractATsInCTree() is added to ABSASent.
- - getSentencesWithNoAT() is added to ABSASet.
- - ABSASet.extractATStat() is updated to extract more statistics.
-
- Version 0.5 18-May-2015
- - ABSACNode.embedAspectTerm() is edited to better handle the situation
- where pInsertATNode is set to mismatch. ABSACNode._plugAspectTerm
- has accordingly been updated.
- - getAspectTermCount() is added to ABSASet and ABSASent.
-
- Version 0.4 08-May-2015
- - loadConstTrees() in ABSASet and ABSASent are changed not to embed
- the aspect terms. Embeding aspect terms is done instead by new methods
- ABSASet.embedATsInCTrees() and ABSASent.embedATsInCTree().
- - embedAspectTerm() is added to ABSASent to be able to embed individual
- aspect terms in the tree.
- - sentence (attr), embedInCtree(), getCTreeEmbeding() are added to
- AspectTerm.
- - The default value for pflgAttachATPolarity argument of getPTBFormat()
- method in ABSACTree and ABSACNode is changed to False.
-
- Version 0.3 06-May-2015
- - Embedding aspect terms is changed. Accordingly, ABSACNode.embedAspectTerms()
- is renamed to embedAspectTerm().
- - addSentences() is added to ABSASet.
-
- Version 0.2 18-Feb-2015
- - The NP structures in ABSACTree can be modified to avoid term/constituent
- mismatch.
-
- Version 0.1 17-Feb-2015
- - ABSA, ABSASent, AspectTerm, ABSACTree, ABSACNode are added.
-
- """
- from collections import namedtuple
- import re, copy
- import pickle
- class ABSASet:
- '''
- Class for aspect-based sentiment analysis data set
- '''
-
-
def __init__(self):
    '''
    Creates an empty data set: no contexts and no word vectors loaded
    '''

    # WordVector object for the data set vocabulary; None until word vectors are loaded
    self.wv = None

    # ABSA contexts of the data set, kept in the order they are added
    self.contexts = []
-
-
-
@property
def size(self):
    '''
    Size of the data set, measured in sentences (not in contexts)
    '''

    vlAllSentences = self.getSentences()

    return len(vlAllSentences)
-
-
-
def getContextCount(self):
    '''
    Returns the number of contexts in the data set
    '''

    vlContexts = self.contexts

    return len(vlContexts)
-
-
-
@property
def tokenLength(self):
    '''
    Total number of tokens in the data set, i.e. the sum of the lengths of its sentences
    '''

    vTokenCount = 0

    for vSentence in self.getSentences():
        vTokenCount += vSentence.length

    return vTokenCount
-
-
-
def getContexts(self):
    '''
    Returns the list of ABSA contexts held by the data set

    The internal list itself is returned, not a copy.
    '''

    vlContexts = self.contexts

    return vlContexts
-
-
-
def getSentences(self, pSort = ''):
    '''
    Returns the ABSA sentences of all contexts in one flat list

    The sort options (case-insensitive) are:
        - None or '': in document order (context by context)
        - text: in sentence text order

    Any other value falls back to document order.
    '''

    vlSentences = [s for c in self.getContexts() for s in c.getSentences()]

    # None is an advertised option, so it must not reach .lower()
    if (pSort or '').lower() == "text":
        return sorted(vlSentences, key = lambda x: x.getText())

    return vlSentences
-
-
-
def getSentencesWithNoAT(self):
    '''
    Returns the ABSA sentences whose aspect term count is zero
    '''

    vlResult = []

    for vSentence in self.getSentences():
        if vSentence.getAspectTermCount() == 0:
            vlResult.append(vSentence)

    return vlResult
-
-
-
def addContext(self, pContext):
    '''
    Adds an ABSA context to the data set unless it is already part of it

    The context's dataset attribute is pointed at this data set when it is added.
    '''

    # already present: leave both the context and the data set untouched
    if pContext in self.getContexts():
        return

    pContext.dataset = self
    self.contexts.append(pContext)
-
-
-
def addContexts(self, plContexts):
    '''
    Adds every context in the given iterable through addContext(), in the given order
    '''

    vAdd = self.addContext

    for vNewContext in plContexts:
        vAdd(vNewContext)
-
-
-
def delContext(self, pContextIdx):
    '''
    Removes the context stored at position pContextIdx of the context list
    '''

    vlContexts = self.contexts

    del vlContexts[pContextIdx]
-
-
-
def mergeWith(self, pABSASet):
    '''
    Merges the contexts of the given ABSA data set into this one

    Both data sets must be instances of exactly the same class (ABSASet or the same class derived from it);
    otherwise an Exception is raised and nothing is merged.

    Only contexts are merged. Word embeddings are left as they are and a reminder to reload them is printed.
    '''

    vThisClass = self.__class__
    vOtherClass = pABSASet.__class__

    if vThisClass != vOtherClass:
        raise Exception("Two data sets must be of the same ABSA type: %s vs. %s" %(vThisClass , vOtherClass))

    # contexts already present in this data set are skipped by addContext()
    vlIncoming = pABSASet.getContexts()
    self.addContexts(vlIncoming)

    # NOTE: word embeddings should also be merged
    print("Word embeddings were not merged. Reload them for the merged data set.")

    # NOTE: take care of other newly added attributes if any
-
-
-
def addSentences(self, plSentences):
    '''
    Adds the given ABSA sentences to the data set

    Each sentence is added to its own context (vSent.context); a context not yet in the data set is added first.

    Currently, no care is taken regarding ID duplication.
    '''

    for vSentence in plSentences:
        vOwner = vSentence.context

        if vOwner not in self.getContexts():
            self.addContext(vOwner)

        vOwner.addSentence(vSentence)
-
-
-
def getVocabulary(self):
    '''
    Returns the sorted list of distinct tokens occurring in the sentences of the data set
    '''

    vsTokens = set()

    for vSentence in self.getSentences():
        vsTokens.update(vSentence.getTokens())

    return sorted(vsTokens)
-
-
-
def getAspectTerms(self):
    '''
    Returns the aspect term objects of all sentences in one flat list, in sentence order
    '''

    return [vTerm for vSentence in self.getSentences() for vTerm in vSentence.getAspectTerms()]
-
-
-
def getAspectTermCount(self):
    '''
    Returns the number of aspect terms in the dataset, summed over its sentences
    '''

    return sum(vSentence.getAspectTermCount() for vSentence in self.getSentences())
-
-
-
def extractSentenceForms(self, pSort = None):
    '''
    Returns the surface form (text) of the sentences

    The sort options (case-insensitive) are:
        - None: in document order
        - text: in sentence text order
        - at: in aspect term order, i.e. one entry per aspect term, so a sentence is repeated once for each of
              its aspect terms and sentences without aspect terms are left out

    Any other value is passed on to getSentences(), which decides how to order the sentences.
    '''

    # None is the default, so normalize it before calling .lower()
    vSort = pSort or ''

    if vSort.lower() == "at":
        return [at.sentence.getText() for at in self.getAspectTerms()]

    return [s.getText() for s in self.getSentences(pSort = vSort)]
-
-
-
def extractAspectTerms(self):
    '''
    Collects the aspect term forms of all sentences and groups them with util.groupBy()

    The result is whatever util.groupBy() returns for the flat list of forms (per the original note, the terms
    and their counts).
    '''

    vlForms = [vForm for vSentence in self.getSentences() for vForm in vSentence.getAspectTermForms()]

    return util.groupBy(vlForms)
-
-
-
def extractATStat(self):
    '''
    Extracts various aspect term statistics

    Returns an ATStat named tuple with the following fields:
        - sentsWithAnAT: number of sentences with at least one aspect term
        - sentsWithNoAT: number of sentences without any aspect term
        - allATCount: number of all aspect terms
        - distATCount: number of distinct aspect terms
        - hapaxATCount: number of aspect terms appearing just once
        - polCounts: dictionary of polarity label to number of aspect terms carrying it
        - CTreeATCount: number of aspect terms transferred to the constituency trees
        - ATConstMismatch: number of mismatched aspect terms and constituents
        - ATLenGroups: aspect term token lengths grouped by util.groupBy()

    A polarity other than positive, negative, neutral or conflict raises KeyError.
    '''

    # data structure to store statistics
    ATStat = namedtuple('ATStat', "sentsWithAnAT, sentsWithNoAT, allATCount distATCount hapaxATCount polCounts CTreeATCount, ATConstMismatch, ATLenGroups")

    vSentsWithNoAT = len(self.getSentencesWithNoAT())
    vSentsWithAnAT = self.size - vSentsWithNoAT

    # assumes extractAspectTerms() returns a dictionary of term form to count - TODO confirm against util.groupBy()
    vdATs = self.extractAspectTerms()

    # values() instead of the Python 2 only itervalues(), so the method runs under both Python 2 and 3
    vlATCounts = list(vdATs.values())

    # allATCount: number of all aspect terms
    vAllATCount = sum(vlATCounts)

    # distATCount: number of distinct aspect terms
    vDistATCount = len(vdATs)

    # hapaxATCount: number of hapax aspect terms (terms appearing just once)
    vHapaxATCount = sum(1 for vCount in vlATCounts if vCount == 1)

    # the sentence and aspect term lists are reused by several statistics below
    vlSentences = self.getSentences()
    vlAspectTerms = self.getAspectTerms()

    # polCounts: number of each polarity type
    vdPolCounts = {"positive": 0, "negative": 0, "neutral": 0, "conflict": 0}
    for vAT in vlAspectTerms:
        vdPolCounts[vAT.getPolarity()] += 1

    # CTreeATCount: number of aspect terms transferred to the constituency tree
    vCTreeATCount = sum(len(s.extractATsInCTree()) for s in vlSentences)

    # ATConstMismatch: number of mismatched aspect terms and constituents
    vATConstMismatch = sum(len(s.extractATermConstMismatch()) for s in vlSentences)

    # ATLenGroups: grouped AT token lengths in a dictionary
    vdATLenGroups = util.groupBy([len(at.getTokens()) for at in vlAspectTerms])

    return ATStat(vSentsWithAnAT, vSentsWithNoAT, vAllATCount, vDistATCount, vHapaxATCount, vdPolCounts, vCTreeATCount, vATConstMismatch, vdATLenGroups)
-
-
-
def loadConstTrees(self, plConstTrees):
    '''
    Loads the constituency parse trees into the sentences

    The i-th tree is handed to the i-th sentence, so the trees must be given in the order in which the sentences
    are loaded. If the two sequences differ in length, the surplus items of the longer one are silently ignored.

    The constituency trees can be provided in bracketing format or as constparse.ConstTree objects (in a list).
    '''

    vlPairs = zip(self.getSentences(), plConstTrees)

    for vSentence, vConstTree in vlPairs:
        vSentence.loadConstTree(vConstTree)
-
-
-
def loadDepTrees(self, plDepTrees):
    '''
    Loads the dependency parse trees into the sentences

    The i-th tree is handed to the i-th sentence, so the trees must be given in the order in which the sentences
    are loaded. If the two sequences differ in length, the surplus items of the longer one are silently ignored.

    The dependency trees are assumed to be provided as a list of depparse.DepTree objects.
    '''

    vlPairs = zip(self.getSentences(), plDepTrees)

    for vSentence, vDepTree in vlPairs:
        vSentence.loadDepTree(vDepTree)
-
-
-
def loadPOSTaggings(self, pPOSTaggingFilename, pSort = ''):
    '''
    Loads POS taggings of the sentences in the dataset

    The POS tagging file should be in a columnar format with the sentences separated by an empty line. The order of
    the sentences in the file should be given in pSort (see getSentences() for the options).

    Raises an Exception as soon as a tagging and its sentence differ in length; the sentences before it keep
    the taggings already loaded.
    '''

    # read everything up front so that the file is closed before any tagging is processed
    with open(pPOSTaggingFilename) as vPOSFile:
        vlSentPOSs = vPOSFile.read().strip().split('\n\n')

    for vSent, vSentPOS in zip(self.getSentences(pSort = pSort), vlSentPOSs):
        vPOSTagging = pos.POSTagging()
        vPOSTagging.loadFromColumnar(vSentPOS)

        if vPOSTagging.length != vSent.length:
            raise Exception("Length of sentence and POS tags don't match:\n\n%s\n%s" % (vSent.getTokens(), vPOSTagging.toLorgInput()))

        vSent.loadPOSTagging(vPOSTagging)
-
-
-
def loadBIOOpinionExpressions(self, pllBIO):
    '''
    Loads opinion expression annotations based on BIO tagging

    pllBIO is a 2D list: its first dimension is paired with the sentences of the dataset in order, and each
    inner list (the BIO labels of the tokens of that sentence) is handed to the sentence's own
    loadBIOOpinionExpressions().
    '''

    vlPairs = zip(self.getSentences(), pllBIO)

    for vSentence, vlLabels in vlPairs:
        vSentence.loadBIOOpinionExpressions(vlLabels)
-
-
-
def embedATsInCTrees(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'nothing', pflgExtendATSpanToDT = False):
    '''
    Embeds the aspect terms in the constituency tree of every sentence

    All options are forwarded unchanged, as keyword arguments, to each sentence's embedATsInCTree().

    pflgModNPStruct specifies whether the NP structures should be edited in order to avoid term/constituent
    mismatch.

    pEmbedPosition specifies where the aspect term should be embedded in the subtree. The possible values are:
        - (span)ning-constituent: embedding into a node in subtree which spans the aspect term tokens. This may
                                  cause mismatches where the aspect term span is not fully covered by the node
                                  span. pOnMismatch can be set to fix the issue.
        - (pre)-terminals: embedding into each and every pre-terminal node in the subtree falling in the aspect
                           term span.

    pOnMismatch specifies the method for handling aspect term/constituent mismatch. The possible values are:
        - nothing (or none): do not handle the mismatches
        - node (or insert): insert a new node covering the aspect term span

    pflgExtendATSpanToDT is passed through as is; its effect is defined by the sentence-level method.
    '''

    vdOptions = {
        "pflgModNPStruct": pflgModNPStruct,
        "pEmbedPosition": pEmbedPosition,
        "pOnMismatch": pOnMismatch,
        "pflgExtendATSpanToDT": pflgExtendATSpanToDT,
    }

    for vSentence in self.getSentences():
        vSentence.embedATsInCTree(**vdOptions)
-
-
-
def embedOEsInCTrees(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'nothing'):
    '''
    Embeds the opinion expressions (OE) in the constituency tree of every sentence

    All options are forwarded unchanged, as keyword arguments, to each sentence's embedOEsInCTree().

    pflgModNPStruct specifies whether the NP structures should be edited in order to avoid expression/constituent
    mismatch.

    pEmbedPosition specifies where the opinion expression should be embedded in the subtree. The possible values are:
        - (span)ning-constituent: embedding into a node in subtree which spans the OE tokens. This may cause
                                  mismatches where the OE span is not fully covered by the node span. pOnMismatch
                                  can be set to fix the issue.
        - (pre)-terminals: embedding into each and every pre-terminal node in the subtree falling in the OE span.

    pOnMismatch specifies the method for handling expression/constituent mismatch. The possible values are:
        - nothing (or none): do not handle the mismatches
        - node (or insert): insert a new node covering the opinion expression span
    '''

    vdOptions = {
        "pflgModNPStruct": pflgModNPStruct,
        "pEmbedPosition": pEmbedPosition,
        "pOnMismatch": pOnMismatch,
    }

    for vSentence in self.getSentences():
        vSentence.embedOEsInCTree(**vdOptions)
-
-
-
def embedATsInDTrees(self):
    '''
    Embeds the aspect terms in the dependency trees by calling embedATsInDTree() on every sentence
    '''

    vlSentences = self.getSentences()

    for vSentence in vlSentences:
        vSentence.embedATsInDTree()
-
-
-
def embedOEsInDTrees(self):
    '''
    Embeds the opinion expressions in the dependency trees by calling embedOEsInDTree() on every sentence
    '''

    vlSentences = self.getSentences()

    for vSentence in vlSentences:
        vSentence.embedOEsInDTree()
-
-
-
def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
    '''
    Loads polarity scores into the sentences from a sentiment lexicon given as a Sentexicon object

    Each sentence loads its own scores; pNeutralScore is passed on to it unchanged.

    Returns a pair: the total number of words in the data set and the number of those words found in the
    lexicon, both summed over the per-sentence pairs.

    For details about Sentexicon object, see sentexicon.py
    '''

    # one (word count, words found in lexicon) pair per sentence
    vlCounts = [vSentence.loadSentimentScores(pSentexicon, pNeutralScore) for vSentence in self.getSentences()]

    vWordTotal = sum(vWordNum for vWordNum, _ in vlCounts)
    vEntryWordTotal = sum(vEntryWordNum for _, vEntryWordNum in vlCounts)

    return vWordTotal, vEntryWordTotal
-
-
-
def getSentimentScores(self):
    '''
    Returns a dictionary mapping the words of the data set to the sentiment scores attached to them

    Tokens and scores of a sentence are paired by position. When a word occurs more than once, the score of its
    last occurrence (in sentence order) is the one kept.
    '''

    vdWordScores = {}

    for vSentence in self.getSentences():
        vdWordScores.update(zip(vSentence.getTokens(), vSentence.getSentimentScores()))

    return vdWordScores
-
-
-
def loadSSInDTrees(self):
    '''
    Loads sentiment scores into dependency tree nodes by calling loadSSInDTree() on every sentence
    '''

    vlSentences = self.getSentences()

    for vSentence in vlSentences:
        vSentence.loadSSInDTree()
-
-
-
def loadSSInCTrees(self, pPropagation = None):
    '''
    Loads sentiment scores into constituency tree nodes by calling loadSSInCTree() on every sentence

    pPropagation is forwarded unchanged (as a keyword argument) to each sentence.
    '''

    vlSentences = self.getSentences()

    for vSentence in vlSentences:
        vSentence.loadSSInCTree(pPropagation = pPropagation)
-
-
-
def loadWordVectors(self, pWordVectors, pflgFilter = True):
    '''
    Loads word vectors from a file or WordVector object, whichever is given

    pWordVectors is either the name of a word vector file (a string) or an already loaded WordVector object.
    A given object is stored in self.wv as is.

    When loading from a file, by default the words not in the data vocabulary are filtered out. This can be
    changed with pflgFilter so that nothing is filtered (e.g. when the input is already filtered). pflgFilter has
    no effect when a WordVector object is given.
    '''

    if not isinstance(pWordVectors, str):
        self.wv = pWordVectors
        return

    # imported only on the file path, so a ready-made object can be stored without the ml package
    from ml import wv

    self.wv = wv.WordVector()

    if pflgFilter:
        self.wv.load(pWVFilename = pWordVectors, plFilterVocab = self.getVocabulary())
    else:
        # NOTE(review): assumes WordVector.load() treats plFilterVocab as optional - confirm against ml.wv
        self.wv.load(pWVFilename = pWordVectors)
-
-
-
def toBratInput(self):
    '''
    Generates the dataset in Brat annotation input format

    Two lists are returned, with one entry per aspect term in each:
        - the text input: the raw sentence of the aspect term (a sentence is repeated for each of its aspect terms)
        - the annotation input: one "T<n>" line giving the aspect term type (AT- followed by the first three
          letters of the polarity), its character span in the document and its form

    The character offsets assume that the text entries are joined with a single newline to form the document.
    '''

    vlTextLines = []
    vlAnnLines = []

    # character offset in the document at which the current sentence starts
    vDocOffset = 0

    for vTermNo, vTerm in enumerate(self.getAspectTerms(), start = 1):
        vSentText = vTerm.sentence.getText()
        vTermForm = vTerm.getForm()

        vlTextLines.append(vSentText)

        # part of the sentence before the first aspect term token (token spans are 1-based)
        vFirstTokenNo = vTerm.getTokenSpan()[0]
        vPrefix = ' '.join(vTerm.sentence.getTokens()[ : (vFirstTokenNo - 1)])

        # character offset of the aspect term in token(s) containing the aspect term (e.g. 5 for built in well-built)
        vInTokenOffset = ' '.join(vTerm.getTokens()).find(vTermForm)

        # a non-empty prefix is followed by the space separating it from the aspect term tokens
        vSeparatorLen = 0 if vPrefix == '' else 1

        vTermStart = vDocOffset + len(vPrefix) + vInTokenOffset + vSeparatorLen
        vTermEnd = vTermStart + len(vTermForm)

        vlAnnLines.append("T%s\tAT-%s %s %s\t%s" % (vTermNo, vTerm.getPolarity()[:3], vTermStart, vTermEnd, vTermForm))

        # the next sentence starts after this one and its newline
        vDocOffset += len(vSentText) + 1

    return vlTextLines, vlAnnLines
-
-
-
def loadBratOE(self, pBratAnnFilename, pflgVerbos = False, pdNewAnnTypes = None):
    '''
    Loads opinion or sentiment expression annotation from Brat annotator output format

    It assumes that the Brat input was provided using toBratInput() method, so the order of the sentences is
    retained: there is one sentence per aspect term, and the sentences are separated by a single newline.

    Only annotations whose type starts with "OE" are read. Each of them is attached, as an OpinionExpression
    with a 1-based token span, to the aspect term whose sentence contains its character span.

    pflgVerbos prints the sentence, aspect term and extracted expressions while loading.

    pdNewAnnTypes is a dictionary which translates the annotation type names to new ones. This was primarily
    added to rename OE (opinion expression) to SE (sentiment expression) in the SE annotation internship project.
    When it is given and not empty, every OE type found in the file must be one of its keys (KeyError otherwise).

    Raises an Exception when no token of the sentence matches either end of an annotated expression.
    '''

    # first loading OE from Brat ann file; the file is closed before processing starts
    with open(pBratAnnFilename) as vAnnFile:
        vlAnnLines = vAnnFile.read().strip().split('\n')

    vlOEs = []

    for vLine in vlAnnLines:
        vlSplit = vLine.split('\t')

        # blank lines have no type column; anything but OE annotations is ignored
        if len(vlSplit) < 2 or not vlSplit[1].startswith("OE"):
            continue

        # type column: "<type> <from> <to>"
        vlOESplit = vlSplit[1].split()

        # renaming the annotation types if new names are given
        if pdNewAnnTypes:
            vAnnType = pdNewAnnTypes[vlOESplit[0]]
        else:
            vAnnType = vlOESplit[0]

        vlOEs.append({"type": vAnnType, "from": int(vlOESplit[1]), "to": int(vlOESplit[2]), "surface": vlSplit[2]})

    # sorting OE list based on their document offset
    vlOEs.sort(key = lambda x: x["from"])

    # extracting OEs for each aspect term: assumes that the Brat input was provided using toBratInput() method,
    # so the order of the sentences is retained.
    vSentDocOffset = 0

    for vATNo, vAT in enumerate(self.getAspectTerms(), start = 1):
        vSentText = vAT.sentence.getText()

        if pflgVerbos:
            print("%s) %s" % (vATNo, vSentText))
            print("\nAT: " + vAT.getForm())
            print(" Polairty: " + vAT.getPolarity())
            print("OEs:")

        # calculating document character span of the aspect term's sentence
        vSentDocSpan = (vSentDocOffset, vSentDocOffset + len(vSentText))

        # collecting OEs of the current AT based on character offsets: not very efficient
        # (only OEs lying entirely inside the sentence are kept, so no further span check is needed)
        vlATOEs = [oe for oe in vlOEs if oe["from"] >= vSentDocSpan[0] and oe["to"] <= vSentDocSpan[1]]

        if pflgVerbos and len(vlATOEs) == 0:
            print(" No OE annotated for this aspect term (polarity is %s)" % vAT.getPolarity())

        # adding the OEs to the aspect term
        for oe in vlATOEs:
            # finding token span of the OE (1-based token numbers; 0 means no match)
            # NOTE(review): token offsets assume the sentence text is its tokens joined by single spaces - confirm
            vTokenOffsetStart = vSentDocOffset
            vOETokenSpanStart = 0
            vOETokenSpanEnd = 0

            for vTokenNo, vToken in enumerate(vAT.sentence.getTokens(), start = 1):
                vTokenOffsetEnd = vTokenOffsetStart + len(vToken)    # end of this token

                # start token: sometimes the OE is annotated from the middle of token when the tokenization has problem (e.g. -when)
                if vTokenOffsetStart <= oe["from"] <= vTokenOffsetEnd:
                    vOETokenSpanStart = vTokenNo

                # end token: sometimes the OE is annotated until the middle of token when the tokenization has problem (e.g. headphones/mic)
                if vTokenOffsetStart <= oe["to"] <= vTokenOffsetEnd:
                    vOETokenSpanEnd = vTokenNo

                vTokenOffsetStart += len(vToken) + 1    # start of next token

            vOE = OpinionExpression()
            vOE.span = (vOETokenSpanStart, vOETokenSpanEnd)
            vOE.type = oe["type"]

            if vOE.span == (0, 0):
                raise Exception("No token match was found!\n%s" % oe)

            vAT.addOE(vOE)

            if pflgVerbos:
                print(" %s" % vOE.type)
                print(" Original: %s" % oe["surface"])
                print(" Extracted: %s" % vOE.getForm())

        if pflgVerbos:
            print("\n.............................................")

        # setting the next sentence's character offset in the document
        vSentDocOffset += len(vSentText) + 1    # +1 for newline
-
-
-
def oeToBIO(self, pflgPOSTags = False):
    '''
    Converts and returns the opinion/sentiment expressions of every aspect term in BIO format

    The result is a 2D list: one inner list per aspect term, holding one string per token of the aspect
    term's sentence. Each string is "token<TAB>tag", or "token<TAB>POS<TAB>tag" when pflgPOSTags is set.
    Token spans of the expressions are 1-based and inclusive.
    '''

    vllRows = []

    for vTerm in self.getAspectTerms():
        vSentence = vTerm.sentence

        # every token starts outside of any expression
        vlTags = ['O'] * vSentence.length

        for vExpr in vTerm.oes:
            vFirst, vLast = vExpr.getTokenSpan()[0], vExpr.getTokenSpan()[1]

            # 1-based first token -> 0-based index of the B tag
            vlTags[vFirst - 1] = 'B'

            # the remaining tokens of the span (0-based vFirst .. vLast - 1) are inside
            for vIdx in range(vFirst, vLast):
                vlTags[vIdx] = 'I'

        # columns are zipped, so the row is as long as the shortest column
        if pflgPOSTags:
            vlColumns = zip(vSentence.getTokens(), vSentence.getPOSTags(), vlTags)
            vlRow = ["%s\t%s\t%s" % (vTok, vPOS, vTag) for vTok, vPOS, vTag in zip(vlTags, vSentence.getPOSTags(), vSentence.getTokens()) for vTok, vTag in [(vTag, vTok)]] if False else ["%s\t%s\t%s" % vCols for vCols in vlColumns]
        else:
            # zip(tags, tokens) bounds the row exactly as the tag list does
            vlRow = ["%s\t%s" % (vTok, vTag) for vTag, vTok in zip(vlTags, vSentence.getTokens())]

        vllRows.append(vlRow)

    return vllRows
-
-
-
def oeToIO(self):
    '''
    Converts and returns the opinion/sentiment expressions of every aspect term in binary IO format

    The result holds, per aspect term and in getAspectTerms() order, whatever that aspect term's
    getOEsIO() returns.

    Binary IO tags can be used when there is only one type of sentiment expression annotated and one
    sentiment expression per sentence is possible.
    '''

    vllTags = []

    for vTerm in self.getAspectTerms():
        vllTags.append(vTerm.getOEsIO())

    return vllTags
-
-
-
def pickle(self, pFilename):
    '''
    Pickles the object into the given file name

    Only the instance dictionary (self.__dict__) is written, using pickle protocol 2. The file is
    opened in binary mode and is closed (hence flushed) before the method returns, also when dumping
    raises.
    '''

    # imported under an alias so that this method's own name can never hide the module
    import pickle as vPickleModule

    with open(pFilename, "wb") as vFile:
        vPickleModule.dump(self.__dict__, vFile, protocol = 2)
-
-
-
def loadFromPickle(self, pFilename):
    '''
    Loads the pickled ABSASet object to this object

    The file is expected to hold a pickled attribute dictionary (as written by the pickle() method);
    its entries are merged into self.__dict__, overwriting attributes of the same name and leaving
    the others untouched.

    SECURITY NOTE: unpickling executes code embedded in the file. Only load files from a trusted
    source.
    '''

    # pickles are binary data: text mode breaks on Python 3 and can corrupt the stream on Windows
    with open(pFilename, "rb") as vFile:
        vdState = pickle.load(vFile)

    self.__dict__.update(vdState)
-
-
-
def getPOSTagSet(self):
    '''
    Returns the POS tag set of the sentences of the dataset

    The set is the union of the tags returned by getPOSTags() of every sentence in getSentences().
    '''

    vsTags = set()

    for vSentence in self.getSentences():
        vsTags.update(vSentence.getPOSTags())

    return vsTags
-
-
-
class ABSAContext:
    """
    Class for aspect-based sentiment analysis context (set of sentences)
    """

    def __init__(self, pABSASet):
        '''
        Constructor

        pABSASet is kept as the dataset the context belongs to. The context starts with no sentences.
        '''

        self.dataset = pABSASet

        self.sentences = []

    def getSentences(self, pSort=''):
        '''
        Returns the ABSA sentences

        The sort options are:
        - None (or any value other than "text"): in document order; the internal list itself is returned
        - text: in sentence text order (case-insensitive option name); a new sorted list is returned
        '''

        # None is a documented option, so it must not reach .lower()
        if pSort is not None and pSort.lower() == "text":
            return sorted(self.sentences, key=lambda s: s.getText())

        return self.sentences

    def addSentence(self, pSentence):
        '''
        Adds ABSA sentence to the context

        A sentence already in the context is not added again. The context of a newly added sentence
        is set to this object.
        '''

        if pSentence not in self.sentences:
            pSentence.context = self
            self.sentences.append(pSentence)

    def getTokens(self):
        '''
        Returns token list of all sentences in the context

        For tokens to make sense, make sure the data is tokenized.
        '''

        return [t for s in self.getSentences() for t in s.getTokens()]

    def getATPolarities(self):
        '''
        Returns list of polarities of all aspect terms in the context
        '''

        return [at.getPolarity() for s in self.getSentences() for at in s.getAspectTerms()]

    def getOverallPolarity(self):
        '''
        Returns the overall polarity of the context based on the polarity of its aspect terms

        The polarities are grouped with util.groupBy(). When only one of "positive"/"negative" occurs,
        that polarity is returned; when neither occurs, "neutral" is returned. When both occur, one of
        them wins only if its group value is at least twice that of the opposite polarity, otherwise
        the context is considered "neutral".

        NOTE(review): the comparison assumes util.groupBy() maps each polarity to a count - confirm
        against util.

        ToDo: polarity values are fixated here. They should be variable based on the data.
        '''

        vdPolarities = util.groupBy(self.getATPolarities())

        vflgNegative = "negative" in vdPolarities
        vflgPositive = "positive" in vdPolarities

        if vflgNegative and vflgPositive:
            if vdPolarities["negative"] >= (2 * vdPolarities["positive"]):
                return "negative"
            if vdPolarities["positive"] >= (2 * vdPolarities["negative"]):
                return "positive"
            return "neutral"

        if vflgPositive:
            return "positive"

        if vflgNegative:
            return "negative"

        return "neutral"
-
-
-
class ABSASent:
    '''
    Class for aspect-based sentiment analysis sentence
    '''

    def __init__(self, pABSAContext):
        '''
        Constructor

        pABSAContext is the ABSAContext the sentence belongs to.
        '''

        # ABSAContext the sentence belongs to
        self.context = pABSAContext

        self.text = None
        self.aspectTerms = []
        self.cTree = None
        self.dTree = None
        self.posTagging = None
        self.sentScores = []    # sentiment score, one per word in the tokenized self.text
        self.oes = []           # opinion expressions in the sentence (not those of aspect terms)

    def getText(self):
        '''
        Returns sentence text (form)
        '''

        return self.text

    def getTokens(self):
        '''
        Returns the tokenization of the sentence

        The sentence text is assumed to be in tokenized format and only
        splits on space.
        '''

        return self.getText().split()

    @property
    def length(self):
        '''
        Returns the sentence length (number of tokens)
        '''

        return len(self.getTokens())

    def getAspectTerms(self):
        '''
        Returns aspect terms (objects) of the sentence
        '''

        return self.aspectTerms

    def getOEs(self):
        '''
        Returns opinion expressions (objects) of the sentence
        '''

        return self.oes

    def getAspectTermCount(self):
        '''
        Returns the number of aspect terms in the sentence
        '''

        return len(self.aspectTerms)

    def getAspectTermForms(self):
        '''
        Returns aspect term forms (term attributes) of the sentence
        '''

        return [t.getForm() for t in self.getAspectTerms()]

    def getConstTree(self):
        '''
        Returns the constituency parse tree of the sentence

        The returned object is the tree built by loadConstTree(), or None if no tree is loaded.
        '''

        return self.cTree

    def getPOSTags(self):
        '''
        Returns the list of POS tags which matches the token list

        The POS tags are taken from the first available source in this order: the constituency tree,
        the dependency tree, the POS tagging. An empty list is returned when none is loaded.
        '''

        if self.cTree is not None:
            return self.cTree.getPOSs()
        elif self.dTree is not None:
            return self.dTree.getPOSs()
        elif self.posTagging is not None:
            return self.posTagging.getPOSTags()
        else:
            return []

    def getDepTree(self):
        '''
        Returns the dependency parse tree of the sentence

        The returned object is the tree built by loadDepTree(), or None if no tree is loaded.
        '''

        return self.dTree

    def loadConstTree(self, pConstTree):
        '''
        Loads the constituency parse tree of the sentence

        The constituency tree can be provided in bracketing format or as
        constparse.ConstTree object.
        '''

        # a ConstTree object is first converted to its PTB bracketing
        if isinstance(pConstTree, constparse.ConstTree):
            vConstTree = pConstTree.getPTBFormat()
        else:
            vConstTree = pConstTree

        self.cTree = ABSACTree()
        self.cTree.loadPTBTree(vConstTree, pflgExpandTerminal = True)

        # NOTE: a (disabled) sanity check used to compare self.cTree.surface with self.getText() here.

    def loadDepTree(self, pDepTree):
        '''
        Loads the dependency parse tree of the sentence

        The dependency tree is assumed to be depparse.DepTree object; an Exception is raised otherwise.
        '''

        if not isinstance(pDepTree, depparse.DepTree):
            raise Exception("A DepTree object is expected!")

        self.dTree = ABSADTree()
        self.dTree.loadFromDepTree(pDepTree = pDepTree)

        # NOTE: a (disabled) sanity check used to compare self.dTree.surface with self.getText() here.

    def loadPOSTagging(self, pPOSTagging):
        '''
        Loads POS tagging of the sentence

        POS tagging is a pos.POSTagging object.
        '''

        self.posTagging = pPOSTagging

    def _addOESpan(self, pSpanStart, pSpanEnd):
        '''
        Creates an opinion expression with the given 1-based token span and adds it to the sentence
        '''

        vOE = OpinionExpression()
        vOE.span = (pSpanStart, pSpanEnd)
        vOE.sentence = self
        self.oes.append(vOE)

    def loadBIOOpinionExpressions(self, plBIO):
        '''
        Loads opinion expression annotations of the sentence based on BIO tagging

        The BIO labels (case-insensitive) are provided in a list which is supposed to match the tokens
        in the sentence. Every B label opens a span which is extended by the following I labels. A span
        is closed by an O label, by the next B label or by the end of the list. An I label outside of
        any span is ignored.
        '''

        # start and end of the 1-based span; a start of 0 means that no span is open
        vSpanStart = 0
        vSpanEnd = 0

        for i, vLabel in enumerate(plBIO, start = 1):
            vTag = vLabel.lower()

            if vTag == 'b':
                # an adjacent expression: the span still open must not be lost
                if vSpanStart != 0:
                    self._addOESpan(vSpanStart, vSpanEnd)
                vSpanStart = i
                vSpanEnd = i
            elif vTag == 'i':
                if vSpanStart != 0:
                    vSpanEnd += 1
            elif vTag == 'o':
                if vSpanStart != 0:     # means the first token after span
                    self._addOESpan(vSpanStart, vSpanEnd)
                vSpanStart = 0
                vSpanEnd = 0

        # a span reaching the last token has no O label after it to close it
        if vSpanStart != 0:
            self._addOESpan(vSpanStart, vSpanEnd)

    def embedATsInCTree(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'no', pflgExtendATSpanToDT = False):
        '''
        Embeds the aspect terms in the constituency tree

        pflgModNPStruct specifies whether the NP structures should be edited
        in order to avoid term/constituent mismatch.

        pEmbedPosition specifies where the aspect term should be embedded
        in the subtree. The possible values are:
        - (span)ning-constituent: embedding into a node in subtree which
                                  spans the aspect term tokens. This may
                                  cause mismatches where the aspect term
                                  span is not fully covered by the node
                                  span. pOnMismatch can be set to fix the
                                  issue.
        - (pre)-terminals: embedding into each and every pre-terminal node
                           in the subtree falling in the aspect term span.

        pOnMismatch specifies the method for handling aspect term/constituent
        mismatch. The possible values are:
        - nothing (or none): do not handle the mismatches
        - node (or insert): insert a new node covering the aspect term span

        If pflgExtendATSpanToDT is set to true, the span of aspect terms which
        only exclude the determiner of the NP is extended to cover the
        determiner to reduce the number of mismatches.
        '''

        # modifying NP structures
        if pflgModNPStruct:
            self.cTree.modifyNPStruct()

        # embedding the aspect terms
        self.cTree.embedAspectTerms(plAspectTerm = self.getAspectTerms(), pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch, pflgExtendToDT = pflgExtendATSpanToDT)

    def embedOEsInCTree(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'no'):
        '''
        Embeds the opinion expressions of the sentence in the constituency tree

        pflgModNPStruct specifies whether the NP structures should be edited in order to avoid
        expression/constituent mismatch.

        pEmbedPosition specifies where the opinion expression should be embedded in the subtree. The
        possible values are:
        - (span)ning-constituent: embedding into a node in subtree which spans the OE tokens. This may
                                  cause mismatches where the OE span is not fully covered by the node
                                  span. pOnMismatch can be set to fix the issue.
        - (pre)-terminals: embedding into each and every pre-terminal node in the subtree falling in
                           the OE span.

        pOnMismatch specifies the method for handling expression/constituent mismatch. The possible
        values are:
        - nothing (or none): do not handle the mismatches
        - node (or insert): insert a new node covering the opinion expression span
        '''

        # modifying NP structures
        if pflgModNPStruct:
            self.cTree.modifyNPStruct()

        # embedding the opinion expressions
        self.cTree.embedOpinionExpressions(plOEs = self.getOEs(), pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch)

    def embedATsInDTree(self):
        '''
        Embeds the aspect terms in the dependency tree
        '''

        self.dTree.embedAspectTerms(plAspectTerms = self.getAspectTerms())

    def embedOEsInDTree(self):
        '''
        Embeds the opinion expressions in the dependency tree
        '''

        self.dTree.embedOpinionExpressions(plOEs = self.getOEs())

    def getPTBConstTree(self):
        '''
        Returns the constituency tree of the sentence in PTB bracketing format
        '''

        return self.cTree.getPTBFormat()

    def extractATsInCTree(self):
        '''
        Extracts and returns the list of aspect terms embedded into the
        constituency tree if the tree is already loaded and an empty list
        otherwise
        '''

        if self.cTree is None:
            return []

        return self.cTree.extractAspectTerms()

    def extractATermConstMismatch(self):
        '''
        Extracts the aspect terms which do not match a constituent node
        in the tree thus missing in the constituency tree

        Aspect term/constituent node mismatches happen because of not being
        embedded in the tree which occur due to the inconsistency between
        syntactic phrases and the phrases to which these terms are originally
        assigned. Parsing errors can be one reason for this but also the
        annotation scheme, such as flat noun phrase annotation, can also
        cause this problem.

        An empty list is returned when no constituency tree is loaded. The terms are only compared
        when the sentence and the tree hold a different number of aspect terms.
        '''

        if self.cTree is None:
            return []

        vlSentATerms = self.getAspectTerms()
        vlCTreeATerms = self.cTree.extractAspectTerms()

        if len(vlCTreeATerms) == len(vlSentATerms):
            return []

        return [vSentAT for vSentAT in vlSentATerms if vSentAT not in vlCTreeATerms]

    def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
        '''
        Loads sentiment scores to words in the sentence from a sentiment
        lexicon which is a Sentexicon object

        It returns a tuple of the total number of words in the sentence
        and the number of words which ended up with a score other than None.

        If the word is not found in the lexicon (its score is None), pNeutralScore
        is used, which is None by default.

        For details about Sentexicon object, see sentexicon.py
        '''

        self.sentScores = []

        for vWord in self.getTokens():
            vScore = pSentexicon.getScore(vWord)
            if vScore is None and pNeutralScore is not None:
                vScore = pNeutralScore
            self.sentScores.append(vScore)

        return self.length, len([s for s in self.sentScores if s is not None])

    def getSentimentScores(self):
        '''
        Returns the sentiment scores in a list corresponding to the token list
        '''

        return self.sentScores

    def loadSSInDTree(self):
        '''
        Loads sentiment scores into the dependency tree of the sentence
        '''

        self.dTree.loadSentScores(self.sentScores)

    def loadSSInCTree(self, pPropagation = None):
        '''
        Loads sentiment scores into the constituency tree of the sentence

        pPropagation is passed on to the tree's loadSentScores().
        '''

        self.cTree.loadSentScores(self.sentScores, pPropagation = pPropagation)

    def generateNGramKTree(self):
        '''
        Generates and returns the tree representation of the surface form
        of the sentence
        '''

        vABSANGramKTree = ABSANGramKTree(pABSASent = self)

        return vABSANGramKTree.generateNGramKTree()

    def extractCaraFXInput(self):
        '''
        Extracts the data in the format required for Cara feature extractor

        Cara feature extractor requires three files: normalized text, POS tagged sentences and aspect
        term indexes. See https://github.com/CNGL-repo/Cara/wiki/Cara pipeline.

        This method returns a tuple of the first two: the normalized text and the POS tagged sentence,
        both with brackets replaced by -LRB-/-RRB-. The POS tags are read from the constituency tree.
        '''

        # The data must be tokenized at loading time.
        vNormalizedTxt = self.getText()
        vNormalizedTxt = vNormalizedTxt.replace("(", "-LRB-")
        vNormalizedTxt = vNormalizedTxt.replace(")", "-RRB-")

        # format: (POS_1 token_1)(POS_2 token_2)...(POS_n token_n)
        vPOSTagged = ''.join(["(%s %s)" % (p, t) for p, t in zip(self.cTree.getPOSs(), self.getTokens())])
        # only bracket tokens follow a space, so the wrapping brackets are left alone
        vPOSTagged = vPOSTagged.replace(" (", " -LRB-")
        vPOSTagged = vPOSTagged.replace(" )", " -RRB-")

        return vNormalizedTxt, vPOSTagged

    def getAvgOESentScore(self):
        '''
        Calculates and returns the average sentiment score of the opinion expression tokens of the
        sentence

        The result is the mean of the averages of the individual opinion expressions, or 0 when the
        sentence has no opinion expression.
        '''

        vlOEs = self.getOEs()

        if len(vlOEs) == 0:
            return 0

        # float() keeps the average from being truncated by integer division
        return float(sum([oe.getAvgSentScore() for oe in vlOEs])) / len(vlOEs)

    def getAvgSentScore(self):
        '''
        Calculates and returns the average sentiment score of the sentence tokens

        0 is returned when no sentiment score is loaded. The scores must all be numbers: a None score
        (word not found in the lexicon and no neutral score given) makes the sum fail.
        '''

        vlSentScores = self.getSentimentScores()

        # nothing loaded: same convention as getAvgOESentScore() instead of a division by zero
        if len(vlSentScores) == 0:
            return 0

        # float() keeps the average from being truncated by integer division
        return float(sum(vlSentScores)) / len(vlSentScores)

    def getPolarScores(self, pNeutralScore = None):
        '''
        Returns sentiment scores with non-neutral polarity

        Neutral polarity score can be set as parameter. It is None by default meaning that no polarity
        score is assigned to neutral words.
        '''

        return [s for s in self.sentScores if s != pNeutralScore]

    def getWordVectors(self):
        '''
        Returns the word vectors of the sentence tokens

        The vectors are looked up in the wv attribute of the dataset. A dataset attribute assigned to
        the sentence itself is used if present; otherwise the dataset of the sentence's context is
        used.
        '''

        # this class never sets self.dataset itself; the dataset is reachable through the context
        vDataset = getattr(self, "dataset", None)
        if vDataset is None:
            vDataset = self.context.dataset

        return [vDataset.wv.getVector(t) for t in self.getTokens()]
-
-
-
class AspectTerm:
    '''
    Class for aspect-based sentiment analysis aspect term
    '''

    def __init__(self):
        '''
        Constructor
        '''

        self.term = None
        self.polarity = None

        self.sentence = None

        # the token span of the term in the text (1-base indexes)
        self.span = None

        # opinion/sentiment expressions towards the aspect term (not those of the sentence, i.e. ABSASent.oes)
        self.oes = []

    def getForm(self):
        '''
        Returns the form of the aspect term
        '''

        return self.term

    def getTokens(self):
        '''
        Returns list of tokens of the aspect term

        The tokens are those of the sentence covered by the 1-based inclusive span. An empty list is
        returned when the span is not set.
        '''

        if self.span is None:
            # an empty list (not an empty string) keeps the return type consistent
            return []

        return self.sentence.getTokens()[self.span[0]-1 : self.span[1]]

    def getPolarity(self):
        '''
        Returns the polarity of the aspect term
        '''

        return self.polarity

    def getTokenSpan(self):
        '''
        Returns the token span of the term
        '''

        return self.span

    def getSentenceMarking(self):
        '''
        Returns the marking of the aspect terms on its sentence

        The marking is a list corresponding to the aspect term's sentence tokens, every element of
        which is 1 if the corresponding token is in the aspect term span and 0 otherwise.
        '''

        return [1 if self.span[0] <= (i + 1) <= self.span[1] else 0 for i in range(self.sentence.length)]

    def embedInCTree(self, pEmbedPosition, pOnMismatch = "nothing", pflgExtendToDT = False):
        '''
        Embeds the aspect term in the constituency tree of the sentence

        pEmbedPosition specifies where the aspect term should be embedded
        in the subtree. The possible values are:
        - (span)ning-constituent: embedding into a node in subtree which
                                  spans the aspect term tokens. This may
                                  cause mismatches where the aspect term
                                  span is not fully covered by the node
                                  span. pOnMismatch can be set to fix the
                                  issue.
        - (pre)-terminals: embedding into each and every pre-terminal node
                           in the subtree falling in the aspect term span.

        pOnMismatch specifies the method for handling aspect term/constituent
        mismatch. The possible values are:
        - nothing (or none): do not handle the mismatches
        - node (or insert): insert a new node covering the aspect term span

        If pflgExtendToDT is set to true, the span of aspect terms which
        only exclude the determiner of the NP is extended to cover the
        determiner to reduce the number of mismatches.
        '''

        self.sentence.getConstTree().embedAspectTerm(pAspectTerm = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch, pflgExtendToDT = pflgExtendToDT)

    def getCTreeEmbeding(self, pEmbedPosition, pOnMismatch = "nothing", pflgExtendToDT = False, pdATReprOptions = None):
        '''
        Embeds the aspect terms in a copy of the constituency tree of the
        sentence and returns the resulting tree

        False is returned when the tree's embedAspectTerm() reports a failure (a false value).

        The representation of aspect term in the tree is based on the options
        specified in a dictionary (pdATReprOptions, empty if not given). The options involve
        attaching aspect term suffix, inserting aspect term node and attaching
        aspect term polarity, each with several possibilities:
        - {suffix: [node/subtree/pre-terminal/parents/parents-partial]}
        - {node: [parent/sister]}
        - {polarity: [node/subtree]}

        For details of the options, see ABSACNode.decorateAT().
        '''

        # a fresh dictionary per call: a {} default would be shared between all calls
        if pdATReprOptions is None:
            pdATReprOptions = {}

        # copying the tree to keep the original tree intact
        # python deepcopy is used as it did a better job in an experiment than the ConstTree.deepCopy().
        vCTreeCopy = copy.deepcopy(self.sentence.getConstTree())

        if vCTreeCopy.embedAspectTerm(pAspectTerm = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch, pflgExtendToDT = pflgExtendToDT):
            vCTreeCopy.decorateAT(pdATReprOptions = pdATReprOptions)
            return vCTreeCopy

        return False

    def getDTreeEmbeding(self, pdATReprOptions = None):
        '''
        Embeds the aspect terms in a copy of the dependency tree of the
        sentence and returns the resulting tree

        The representation of aspect term in the tree is based on the options
        specified in a dictionary (pdATReprOptions, empty if not given). The options involve
        attaching aspect term suffix and attaching aspect term polarity,
        each with several possibilities.

        NOTE: unlike in constituency tree decoration, AT node insertion
        is not done for the dependency tree. The reason is that in dependency
        tree, there is a one to one relation between the nodes and sentence
        tokens. Inserting a node causes a mismatch between the surface
        form of the sentence and its dependency tree. Also, concepts like
        word form, POS tag and dependency relation are not meaningful in
        an inserted AT node. Instead, the aspect term representation using
        inserted AT nodes can be done at the PTB bracketing format representation
        level of the tree. See ABSADTree.generateDepKTree(). Note that
        the current design of this method in terms of the argument passed
        to it may not be coherent and meaningful, because what is expected
        from an aspect term representation parameter (pdATReprOptions) is
        to carry all the setting required for this purpose, not only the
        part concerned with suffix and polarity attachment. This may be
        handled in the future.

        For details of the options, see ABSADNode.decorateAT().
        '''

        # a fresh dictionary per call: a {} default would be shared between all calls
        if pdATReprOptions is None:
            pdATReprOptions = {}

        # copying the tree to keep the original tree intact
        # NOTE: consider testing with python deepcopy (Python deepcopy did a
        #       better job in an experiment than the ConstTree.deepCopy() in the
        #       getCTreeEmbeding())
        vDTreeCopy = self.sentence.getDepTree().deepCopy()

        vDTreeCopy.embedAspectTerm(pAspectTerm = self)
        vDTreeCopy.decorateAT(pdATReprOptions = pdATReprOptions)

        return vDTreeCopy

    def getNGramKTreeEmbeding(self, pFormat = "unary", pNodeContentType = "word", pdNGramKTreeOptions = None, pdATReprOptions = None, pdOptions = None):
        '''
        Generates and returns the tree representation of the surface form
        of the corresponding sentence in various formats with aspect term
        optionally embedded in it

        Formats include:
        - unary: each word is the child of its previous word
        - bigram: each bigram forms a parent/child subtree and all of these subtrees
                  are dominated by a root node at the top. These capture unigrams as
                  well if subset tree kernels are used.
        - binary: each node has two children which are both nodes representing the word next to the
                  current node's word. The first child has a dummy terminal child X which helps
                  capture unigrams via subset tree fragments. The second child recursively continues
                  the format by having the next word as its children in the same way.

        pNodeContentType specifies the content of the n-gram tree nodes, such as word forms or POS
        tags. For details, see the ABSANGramKTree constructor.

        Aspect term representation options include:
        - "node": inserts AT node at:
            - "x": replacing X node under/above the aspect term token
        - "suffix": attached AT suffix to:
            - "token": to aspect term token
        '''

        vABSANGramKTree = ABSANGramKTree(pABSASent = self.sentence, pNodeContentType = pNodeContentType, pAspectTerm = self, pdOptions = pdNGramKTreeOptions)

        return vABSANGramKTree.generateNGramKTree(pFormat, pdATReprOptions, pdOptions)

    def addOE(self, pOE):
        '''
        Adds opinion expression object to the aspect term

        The sentence and the aspect term of the expression are set to those of this aspect term.
        '''

        pOE.sentence = self.sentence
        pOE.aspectTerm = self

        self.oes.append(pOE)

    def getOEsBIO(self, pflgPOSTags=False):
        '''
        Returns BIO tagging of the opinion/sentiment expressions of the aspect term

        The returned output is a list of BIO tags corresponding to aspect term sentence tokens.

        pflgPOSTags is accepted but not used by this method.
        '''

        vlAnn = ['O'] * self.sentence.length

        for vOE in self.oes:
            vStart, vEnd = vOE.getTokenSpan()[0], vOE.getTokenSpan()[1]

            # the span is 1-based and inclusive: its first token gets B, the rest get I
            vlAnn[vStart - 1] = 'B'

            for i in range(vStart, vEnd):
                vlAnn[i] = 'I'

        return vlAnn

    def getOEsIO(self):
        '''
        Returns the binary IO tagging of the opinion/sentiment expression boundaries of the aspect term

        Binary IO tags can be used when there is only one type of sentiment expression annotated and
        one sentiment expression per sentence is possible.
        '''

        vlAnn = ['O'] * self.sentence.length

        for vOE in self.oes:
            # 1-based inclusive span -> 0-based index range
            for i in range(vOE.getTokenSpan()[0] - 1, vOE.getTokenSpan()[1]):
                vlAnn[i] = 'I'

        return vlAnn
-
-
-
class OpinionExpression():
    '''
    Class for opinion expression annotations
    '''

    def __init__(self):
        '''
        Constructor
        '''

        self.type = None    # type of the opinion expression if there is a categorization
                            # e.g. SE (sentiment expression), SE-pcomp (preceding complement of SE)

        self.sentence = None

        # the aspect term towards which the opinion is expressed.
        self.aspectTerm = None

        # the token span of the expression in its sentence (1-based and inclusive, see getTokens())
        self.span = None

    def getTokens(self):
        '''
        Returns list of tokens of the opinion expression

        The tokens are those of the sentence covered by the 1-based inclusive span.
        '''

        return self.sentence.getTokens()[self.span[0]-1 : self.span[1]]

    def getForm(self):
        '''
        Returns the form of the opinion expression (its tokens joined by space)
        '''

        return ' '.join(self.getTokens())

    def getTokenSpan(self):
        '''
        Returns the token span of the opinion expression
        '''

        return self.span

    @property
    def length(self):
        '''
        Return the token length of the opinion expression
        '''

        return self.span[1] - self.span[0] + 1

    def embedInCTree(self, pEmbedPosition, pOnMismatch = "nothing"):
        '''
        Embeds the opinion expression in the constituency tree of the sentence

        pEmbedPosition specifies where the opinion expression should be embedded in the subtree. The
        possible values are:
        - (span)ning-constituent: embedding into a node in subtree which spans the OE tokens. This may
                                  cause mismatches where the OE span is not fully covered by the node
                                  span. pOnMismatch can be set to fix the issue.
        - (pre)-terminals: embedding into each and every pre-terminal node in the subtree falling in
                           the OE span.

        pOnMismatch specifies the method for handling expression/constituent mismatch. The possible
        values are:
        - nothing (or none): do not handle the mismatches
        - node (or insert): insert a new node covering the opinion expression span
        '''

        self.sentence.getConstTree().embedOpinionExpression(pOE = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch)

    def getCTreeEmbeding(self, pEmbedPosition, pOnMismatch = "nothing", pdOEReprOptions = None):
        '''
        Embeds the opinion expression in a copy of the constituency tree of the sentence and returns
        the resulting tree

        False is returned when the tree's embedOpinionExpression() reports a failure (a false value).

        The representation of OE in the tree is based on the options specified in a dictionary
        (pdOEReprOptions, empty if not given). The options involve attaching OE suffix, inserting OE
        node and attaching OE polarity, each with several possibilities:
        - {suffix: [node/subtree/pre-terminal/parents/parents-partial]}
        - {node: [parent/sister]}
        - {polarity: [node/subtree]}

        For details of the options, see ABSACNode.decorateOE().
        '''

        # a fresh dictionary per call: a {} default would be shared between all calls
        if pdOEReprOptions is None:
            pdOEReprOptions = {}

        # copying the tree to keep the original tree intact
        # python deepcopy is used as it did a better job in an experiment than the ConstTree.deepCopy().
        vCTreeCopy = copy.deepcopy(self.sentence.getConstTree())

        if vCTreeCopy.embedOpinionExpression(pOE = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch):
            vCTreeCopy.decorateOE(pdOEReprOptions = pdOEReprOptions)
            return vCTreeCopy

        return False

    def getDTreeEmbeding(self, pdOEReprOptions = None):
        '''
        Embeds the opinion expression in a copy of the dependency tree of the
        sentence and returns the resulting tree

        The representation of OE in the tree is based on the options specified
        in a dictionary (pdOEReprOptions, empty if not given). The options involve attaching OE
        suffix and attaching its polarity, each with several possibilities.

        NOTE: no OE node is inserted into the dependency tree; the options only concern suffix and
        polarity attachment.

        For details of the options, see ABSADNode.decorateOE().
        '''

        # a fresh dictionary per call: a {} default would be shared between all calls
        if pdOEReprOptions is None:
            pdOEReprOptions = {}

        # copying the tree to keep the original tree intact
        # NOTE: consider testing with python deepcopy (Python deepcopy did a
        #       better job in an experiment than the ConstTree.deepCopy() in the
        #       getCTreeEmbeding())
        vDTreeCopy = self.sentence.getDepTree().deepCopy()

        vDTreeCopy.embedOpinionExpression(pOE = self)
        vDTreeCopy.decorateOE(pdOEReprOptions = pdOEReprOptions)

        return vDTreeCopy

    def getAvgSentScore(self):
        '''
        Calculates and returns the average sentiment score of the opinion expression tokens

        The scores are those of the sentence (getSentimentScores()) at the token positions covered by
        the span; they must all be numbers.
        '''

        vlScores = self.sentence.getSentimentScores()[self.span[0] - 1 : self.span[1]]

        # float() keeps the average from being truncated by integer division
        return float(sum(vlScores)) / self.length
-
-
-
|