12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737 |
- #! /usr/bin/python
- # -*- coding: utf-8 -*-
- """
- This module defines classes for sentence-level sentiment analysis (SLSA).
-
- Version 0.1 (06-Jul-2016)
- - SLSASet, SLSASent are added.
- """
import re

from parse import constparse, depparse
from ml import tk
from nlp import nlp
class SLSASet(object):
    '''
    Class for sentence level sentiment analysis dataset.

    A data set is an ordered list of SLSASent objects (self.sentences) plus
    an optional word vector object (self.wv) which the sentences reach
    through their back reference to the data set.
    '''


    def __init__(self):
        '''
        Constructor
        '''

        # SLSASent objects in the order they were loaded/added
        self.sentences = []

        # WordVector object containing the word vectors of the vocabulary in the dataset
        self.wv = None



    def load(self, pTxtFilename, pPolarityFilename, pflgTokenize = False, pLanguage = "en"):
        '''
        Loads the data from input files

        This is the basic loader of SLSA data which expects the sentences
        (text) and polarity scores to be provided in separate parallel
        files, one sentence/score per line. Any previously loaded sentence
        is discarded. The polarity scores are stored as read, i.e. as
        strings.

        An Exception is raised if the two files do not have the same number
        of lines.
        '''

        self.sentences = []

        # context managers make sure the files are closed even if reading fails
        with open(pTxtFilename) as vTxtFile:
            vlTxtLines = vTxtFile.read().strip().split('\n')

        with open(pPolarityFilename) as vPolarityFile:
            vlPolarityScores = vPolarityFile.read().strip().split('\n')

        if len(vlTxtLines) != len(vlPolarityScores):
            raise Exception("Number of sentences does not match number of scores: %s vs. %s" % (len(vlTxtLines), len(vlPolarityScores)))

        for vTxtLine, vPolScore in zip(vlTxtLines, vlPolarityScores):
            vSLSASent = SLSASent(pSLSASet = self)
            vSLSASent.load(vTxtLine, vPolScore, pflgTokenize = pflgTokenize, pLanguage = pLanguage)
            self.sentences.append(vSLSASent)



    @property
    def size(self):
        '''
        Returns the size of the data set which is the number of its sentences
        '''

        return len(self.getSentences())



    @property
    def tokenLength(self):
        '''
        Returns the number of tokens in the data set
        '''

        return sum(s.length for s in self.getSentences())



    def getSentences(self, pSort = ''):
        '''
        Returns the SLSA sentences in a new list

        The sort options are:
            - None or empty string: in document order
            - text: in sentence text order

        Any other value falls back to document order.
        '''

        # None is accepted as "no sorting"; extractSentenceForms() passes it by default
        vSort = (pSort or '').lower()

        if vSort == "text":
            return sorted(self.sentences, key = lambda x: x.getText())
        else:
            return list(self.sentences)



    def addSentences(self, plSentences):
        '''
        Adds SLSA sentences to existing sentences

        The dataset reference of every added sentence is redirected to this
        data set. Currently, no care is taken regarding ID duplication.
        '''

        for vSent in plSentences:
            vSent.dataset = self
            self.sentences.append(vSent)



    def getVocabualry(self):
        '''
        Extracts and returns the vocabulary of the dataset as a sorted list
        of unique tokens

        The (misspelled) name is kept for backward compatibility; see
        getVocabulary() for the correctly spelled alias.
        '''

        return sorted(set(t for s in self.getSentences() for t in s.getTokens()))



    def getVocabulary(self):
        '''
        Correctly spelled alias of getVocabualry()
        '''

        return self.getVocabualry()



    def extractSentenceForms(self, pSort = None):
        '''
        Returns the surface form of the sentences

        The sort options are those of getSentences():
            - None: in document order
            - text: in sentence text order
        '''

        return [s.getText() for s in self.getSentences(pSort = pSort)]



    def loadConstTrees(self, plConstTrees):
        '''
        Loads the constituency parse trees of the sentences

        It assumes that the provided constituency trees are in the order
        in which the sentences are loaded. If the number of trees and
        sentences differ, the extra items of the longer list are ignored.

        The constituency trees can be provided in bracketing format or as
        constparse.ConstTree objects (in a list).
        '''

        for vSent, vCTree in zip(self.getSentences(), plConstTrees):
            vSent.loadConstTree(vCTree)



    def loadDepTrees(self, plDepTrees):
        '''
        Loads the dependency parse trees of the sentences

        It assumes that the provided dependency trees are in the order in
        which the sentences are loaded. If the number of trees and
        sentences differ, the extra items of the longer list are ignored.

        The dependency trees are assumed to be provided as a list of
        depparse.DepTree objects.
        '''

        for vSent, vDTree in zip(self.getSentences(), plDepTrees):
            vSent.loadDepTree(vDTree)



    def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
        '''
        Loads polarity scores to sentences from a sentiment lexicon which
        is a Sentexicon object

        It returns a tuple of the total number of words in the data set and
        the number of those reported by the sentences as found in the
        lexicon.

        For details about Sentexicon object, see sentexicon.py
        '''

        vTotalWordNum = 0          # total number of words in the data set
        vTotalEntryWordNum = 0     # number of words found in the lexicon

        for vSent in self.getSentences():
            vWordNum, vEntryWordNum = vSent.loadSentimentScores(pSentexicon, pNeutralScore)

            vTotalWordNum += vWordNum
            vTotalEntryWordNum += vEntryWordNum

        return vTotalWordNum, vTotalEntryWordNum



    def getSentimentScores(self):
        '''
        Returns a dictionary of words in the data set and the sentiment
        scores attached to them

        If a word occurs more than once, the score of its last occurrence
        is kept.
        '''

        vdResult = {}

        for vSent in self.getSentences():
            vdResult.update(zip(vSent.getTokens(), vSent.getSentimentScores()))

        return vdResult



    def loadSSInDTrees(self):
        '''
        Loads sentiment scores into dependency tree nodes
        '''

        for vSent in self.getSentences():
            vSent.loadSSInDTree()



    def loadSSInCTrees(self, pPropagation = None):
        '''
        Loads sentiment scores into constituency tree nodes

        pPropagation is handed over to the sentences unchanged.
        '''

        for vSent in self.getSentences():
            vSent.loadSSInCTree(pPropagation = pPropagation)



    def loadWordVectors(self, pWordVectors, pflgFilter = True):
        '''
        Loads word vectors from a file or WordVector object, whichever is given

        If pWordVectors is a string, it is taken as the name of a word
        vector file and loaded into a new WordVector object. Otherwise it
        is stored as is.

        By default, it filters out the words not in the data vocabulary, which can be changed to not filter (e.g. when
        the input is already filtered). The flag only applies when loading from a file.
        '''

        if isinstance(pWordVectors, str):
            # only needed when the vectors have to be read from a file
            from ml import wv

            vWV = wv.WordVector()

            if pflgFilter:
                vWV.load(pWVFilename = pWordVectors, plFilterVocab = self.getVocabualry())
            else:
                # NOTE(review): assumes plFilterVocab is optional in WordVector.load -- confirm
                vWV.load(pWVFilename = pWordVectors)

            self.wv = vWV
        else:
            self.wv = pWordVectors
-
-
-
class SLSASent(object):
    '''
    Class for SLSA sentence

    A sentence holds its (tokenized) text, its polarity label, optional
    constituency and dependency trees and one sentiment score per token.
    '''


    def __init__(self, pSLSASet):
        '''
        Constructor
        '''

        # SLSASet the sentence belongs to
        self.dataset = pSLSASet

        self.text = None
        self.polarity = None

        self.cTree = None
        self.dTree = None
        self.sentScores = []     # sentiment score, one per word in the tokenized self.text



    def load(self, pText, pPolarity = None, pflgTokenize = False, pLanguage = "en"):
        '''
        Loads sentence form and its label

        It optionally tokenizes the text using nlp.tokenizeSegment. The
        polarity is stored as given.
        '''

        if pflgTokenize:
            self.text = nlp.tokenizeSegment(pText, pLang = pLanguage, pflgTokenizeFSlash = False)
        else:
            self.text = pText

        self.polarity = pPolarity



    def getText(self):
        '''
        Returns sentence text (form)
        '''

        return self.text



    def getTokens(self):
        '''
        Returns the tokenization of the sentence

        The sentence text is assumed to be in tokenized format and only
        splits on whitespace.
        '''

        return self.getText().split()



    def getPolarity(self):
        '''
        Returns the polarity of the sentence
        '''

        return self.polarity



    @property
    def length(self):
        '''
        Returns the sentence length in tokens
        '''

        return len(self.getTokens())



    def getConstTree(self):
        '''
        Returns the constituency parse tree of the sentence

        The returned object is the SLSACTree built by loadConstTree() or
        None if no tree has been loaded.
        '''

        return self.cTree



    def getPOSTags(self):
        '''
        Returns the list of POS tags which matches the token list

        The POS tags are extracted from the constituency tree if loaded,
        otherwise from the dependency tree. An empty list is returned if
        neither is loaded.
        '''

        if self.cTree is not None:
            return self.cTree.getPOSs()

        if self.dTree is not None:
            return self.dTree.getPOSs()

        return []



    def getDepTree(self):
        '''
        Returns the dependency parse tree of the sentence

        The returned object is the SLSADTree built by loadDepTree() or None
        if no tree has been loaded.
        '''

        return self.dTree



    def loadConstTree(self, pConstTree):
        '''
        Loads the constituency parse tree of the sentence

        The contituency tree can be provided in bracketing format or as
        constparse.ConstTree object. In the latter case, it is converted to
        PTB bracketing first and reloaded into a new SLSACTree.
        '''

        if isinstance(pConstTree, constparse.ConstTree):
            vConstTree = pConstTree.getPTBFormat()
        else:
            vConstTree = pConstTree

        self.cTree = SLSACTree()
        self.cTree.loadPTBTree(vConstTree, pflgExpandTerminal = True)



    def loadDepTree(self, pDepTree):
        '''
        Loads the dependency parse tree of the sentence

        The dependency tree is assumed to be depparse.DepTree object;
        otherwise an Exception is raised.
        '''

        if not isinstance(pDepTree, depparse.DepTree):
            raise Exception("A DepTree object is expected!")

        self.dTree = SLSADTree()
        self.dTree.loadFromDepTree(pDepTree = pDepTree)



    def getPTBConstTree(self):
        '''
        Returns the constituency tree of the sentence in PTB bracketing format

        The constituency tree must have been loaded before.
        '''

        return self.cTree.getPTBFormat()



    def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
        '''
        Loads sentiment scores to words in the sentence from a sentiment
        lexicon which is a Sentexicon object

        It returns a tuple of the total number of words in the sentence
        and the number of words found in the lexicon, i.e. those for which
        the lexicon returned a score other than None.

        If the word is not found in the lexicon, pNeutralScore (None by
        default) will be used.

        For details about Sentexicon object, see sentexicon.py
        '''

        self.sentScores = []
        vEntryWordNum = 0

        for vWord in self.getTokens():
            vScore = pSentexicon.getScore(vWord)

            if vScore is None:
                vScore = pNeutralScore
            else:
                # counted here, because a neutral fill-in must not count as a lexicon hit
                vEntryWordNum += 1

            self.sentScores.append(vScore)

        return len(self.sentScores), vEntryWordNum



    def getSentimentScores(self):
        '''
        Returns the sentiment scores in a list corresponding to the token list
        '''

        return self.sentScores



    def loadSSInDTree(self):
        '''
        Loads sentiment scores into the dependency tree of the sentence

        The dependency tree must have been loaded before.
        '''

        self.dTree.loadSentScores(self.sentScores)



    def loadSSInCTree(self, pPropagation = None):
        '''
        Loads sentiment scores into the constituency tree of the sentence

        The constituency tree must have been loaded before.
        '''

        self.cTree.loadSentScores(self.sentScores, pPropagation = pPropagation)



    def generateNGramKTree(self, pNodeContentType = "word", pFormat = "binary", pdOptions = None):
        '''
        Generates and returns the tree representation of the surface form
        of the sentence

        The work is delegated to a SLSANGramKTree object built for this
        sentence.
        '''

        vSLSANGramKTree = SLSANGramKTree(pSLSASent = self, pNodeContentType = pNodeContentType, pdOptions = pdOptions)

        return vSLSANGramKTree.generateNGramKTree(pFormat = pFormat, pdOptions = pdOptions)



    def getAvgSentScore(self):
        '''
        Calculates and returns the average sentiment score of the sentence
        tokens as a float

        0.0 is returned when no score is loaded. The scores must be numeric,
        so words missing from the lexicon need a numeric neutral score
        at loading time.
        '''

        vlSentScores = self.getSentimentScores()

        if not vlSentScores:
            return 0.0

        # float() avoids integer division of integer scores under Python 2
        return float(sum(vlSentScores)) / len(vlSentScores)



    def getPolarScores(self, pNeutralScore = None):
        '''
        Returns sentiment scores with non-neutral polarity

        Neutral polarity score can be set as parameter. It is None by default meaning that no polarity score is assigned
        to neutral words.
        '''

        return [s for s in self.sentScores if s != pNeutralScore]



    def getWordVectors(self):
        '''
        Returns the word vectors of the sentence tokens

        The vectors are looked up in the word vector object of the data set
        the sentence belongs to.
        '''

        return [self.dataset.wv.getVector(t) for t in self.getTokens()]
-
-
-
class SLSACTree(constparse.ConstTree):
    '''
    Constituency parse tree of a SLSA sentence

    The tree-level operations added here are all delegated to the root
    node.
    '''


    def __init__(self):
        '''
        Constructor; only initializes the parent class
        '''

        constparse.ConstTree.__init__(self)



    def _createNewTree(self):
        '''
        Factory method returning an empty tree of this class
        '''

        vNewTree = SLSACTree()

        return vNewTree



    def _createRoot(self):
        '''
        Factory method returning a new SLSACNode to be used as root node
        '''

        vNewRoot = SLSACNode()

        return vNewRoot



    def modifyNPStruct(self):
        '''
        Restructures flat noun phrases in the tree to avoid
        term/constituent mismatch, starting from the root node
        '''

        vRoot = self.root
        vRoot.modifyNPStruct()



    def loadSentScores(self, plSentScores, pPropagation = None):
        '''
        Assigns sentiment scores to the tree nodes, starting from the root
        node

        plSentScores is the list of scores corresponding to the terminal
        (i.e. token) list. pPropagation names the method used to propagate
        the scores from the terminals up to the root node and is passed to
        the root node unchanged; with None, the root node is not asked to
        propagate.
        '''

        vRoot = self.root
        vRoot.loadSentScores(plSentScores, pPropagation = pPropagation)



    def extractTopmostVP(self, pSpan):
        '''
        Returns what the root node finds as the topmost verb phrase node
        for the given span
        '''

        vRoot = self.root

        return vRoot.extractTopmostVP(pSpan)
-
-
-
class SLSACNode(constparse.ConstNode):
    '''
    Class for constituency tree node of a SLSA sentence

    On top of the parent class, a node carries a sentiment score and a
    list of opinion expressions.
    '''

    # Rules for restructuring flat noun phrases: the compiled pattern is
    # matched against the space-joined syntactic tags of the children and,
    # on a match, the listed (label, span) pairs are handed to
    # insertIntermChild() in the given order.
    # NOTE(review): the spans are presumably 1-based child positions -- confirm
    # against insertIntermChild().
    _NP_RESTRUCT_RULES = (
        (re.compile(r"^(DT|PRP\$) NN[A-Z]* NN[A-Z]*$"), (("NP", (2, 3)),)),
        (re.compile(r"^(DT|PRP\$) JJ[A-Z]* NN[A-Z]*$"), (("ADJP", (2, 3)),)),
        (re.compile(r"^(DT|PRP\$) VBG NN[A-Z]*$"), (("NP", (2, 3)),)),
        (re.compile(r"^(DT|PRP\$) NN[A-Z]* NN[A-Z]* NN[A-Z]*$"), (("NP", (2, 4)),)),
        (re.compile(r"^(DT|PRP\$) JJ[A-Z]* NN[A-Z]* NN[A-Z]*$"), (("NP", (3, 4)), ("NP", (2, 3)))),
        (re.compile(r"^(DT|PRP\$) ADJP NN[A-Z]*$"), (("NP", (2, 3)),)),
        (re.compile(r"^(DT|PRP\$) CD NN[A-Z]* NN[A-Z]*$"), (("NP", (2, 4)),)),
        (re.compile(r"^(DT|PRP\$) JJ[A-Z]* JJ[A-Z]* NN[A-Z]*$"), (("NP", (3, 4)), ("NP", (2, 3)))),
    )


    def __init__(self):
        '''
        Constructor
        '''

        constparse.ConstNode.__init__(self)

        # sentiment score
        self.sentScore = None

        # list of type OpinionExpression
        self.oe = []



    def deepCopy(self, pflgCopyTree = False):
        '''
        NOTE: it seems python deepcopy() works better. Try the idea before
        using this method.

        Creates and returns a deep copy of the node optionally including
        the sub tree under it

        The copy is made by the parent class and the sentiment score is
        then copied over. The opinion expression list is not copied here.
        '''

        vNodeCopy = constparse.ConstNode.deepCopy(self, pflgCopyTree)

        # copying the sentiment score
        vNodeCopy.sentScore = self.sentScore

        return vNodeCopy



    def shallowCopy(self, pflgCopyTree = False):
        '''
        NOTE: before using, check if python shallowcopy() does not work
        as expected.

        Creates and returns a shallow copy of the node

        The copy is made by the parent class and the sentiment score is
        then copied over. The opinion expression list is not copied here.
        '''

        vNodeCopy = constparse.ConstNode.shallowCopy(self, pflgCopyTree)

        # copying the sentiment score
        vNodeCopy.sentScore = self.sentScore

        return vNodeCopy



    def _getNewNode(self):
        '''
        Creates and returns a node
        '''

        return SLSACNode()



    def setSentScore(self, pScore):
        '''
        Sets the value of sentiment score of the node
        '''

        self.sentScore = pScore



    def getSentScore(self):
        '''
        Returns the sentiment score of the node
        '''

        return self.sentScore



    def modifyNPStruct(self):
        '''
        Modifies the structure of the noun phrases in order to avoid
        term/constituent mismatch caused by a flat NP structure

        If the node is an NP whose children tag sequence matches one of the
        rules in _NP_RESTRUCT_RULES, the intermediate nodes of the first
        matching rule are inserted and the subtree is not visited any
        further. Otherwise, the children are processed recursively.
        '''

        if self.getSynTag() == "NP":
            vChildLabelSeq = ' '.join(self.getChildrenTags())

            for vPattern, vlInsertions in self._NP_RESTRUCT_RULES:
                if vPattern.search(vChildLabelSeq):
                    for vLabel, vSpan in vlInsertions:
                        self.insertIntermChild(vLabel, vSpan)
                    return

        for vChild in self.getChildren():
            vChild.modifyNPStruct()



    def loadSentScores(self, plSentScores, pPropagation = None):
        '''
        Loads sentiment scores to nodes in the subtree and returns the
        score of this node

        The sentiment scores are given in a list which corresponds to the
        terminal (i.e. token) list. A terminal node takes the score at the
        position of its token (token spans are 1-based, the list is
        0-based).

        Propagation argument specifies the method by which the score are
        propagated from terminals up to this node. None (or an unknown
        method) means scores are only assigned to the terminal nodes, thus
        the score of the phrase nodes are left untouched. The following are
        the possible methods (case-insensitive):
            - sum:  nodes score is the sum of its children score; children
                    without a score (None) are ignored
            - vote: node score is the dominant positive or negative score in the
                    children nodes (i.e. more +1: score is +1, more -1: score -1)

        An Exception is raised if a terminal node spans more than one token.
        '''

        vTokenSpan = self.getTokenSpan()

        if self.isTerminal():
            # sanity check
            if vTokenSpan[0] != vTokenSpan[1]:
                raise Exception("Either the node is not terminal or its span is wrong: %s" % self)

            self.setSentScore(plSentScores[vTokenSpan[0] - 1])
        else:
            vlChildrenSScores = [vChild.loadSentScores(plSentScores, pPropagation) for vChild in self.getChildren()]

            # calculating the sentiment score of the node based on its children's (propagation)
            if pPropagation is not None:
                vPropagation = pPropagation.lower()

                if vPropagation == "sum":
                    # None marks a child without score and cannot be summed
                    self.setSentScore(sum(s for s in vlChildrenSScores if s is not None))
                elif vPropagation == "vote":
                    self.setSentScore(self._getDominantSentiment(vlChildrenSScores))

        return self.getSentScore()



    def _getDominantSentiment(self, plScores):
        '''
        Returns +1 or -1 whichever is dominant in the given list of sentiment
        scores

        Only scores equal to +1 and -1 are counted. If the same number of
        both sentiment scores exist, 0 is returned.
        '''

        vPosCount = sum(1 for s in plScores if s == 1)
        vNegCount = sum(1 for s in plScores if s == -1)

        if vPosCount > vNegCount:
            return 1

        if vPosCount < vNegCount:
            return -1

        return 0



    def extractTopmostVP(self, pSpan):
        '''
        Extracts and returns the topmost verb phrase node which overlaps the given span in the node subtree

        Overlap means that the given span and the span of the VP must not be disjoint. So, left and right crossing will
        also be considered.

        The search is depth-first and stops at the first VP on each path: if
        that VP does not overlap the span, the VPs under it are not
        examined. None is returned if no VP is found.
        '''

        if self.getSynTag() == 'VP':
            # NOTE(review): -4 is presumably the code getTokenSpanRelation() uses for disjoint spans -- confirm
            if self.getTokenSpanRelation(pSpan) != -4:
                return self

            return None

        for vChild in self.getChildren():
            vTopVP = vChild.extractTopmostVP(pSpan)
            if vTopVP is not None:
                return vTopVP

        return None
-
-
-
class SLSADTree(depparse.DepTree):
    '''
    Class for dependency parse tree of a SLSA sentence
    '''


    def __init__(self, pLanguage = ''):
        '''
        Constructor
        '''

        depparse.DepTree.__init__(self, pLanguage = pLanguage)



    def loadFromDepTree(self, pDepTree):
        '''
        Loads the tree from DepTree object

        A new SLSADNode is created for every node of the given tree. The
        SRL and language attributes are taken over as they are.
        '''

        # 1. nodes

        self.nodes = []

        for vNode in pDepTree.nodes:
            vSLSADNode = SLSADNode()
            self.nodes.append(vSLSADNode)
            vSLSADNode.loadFromDepNode(pSLSADTree = self, pDepNode = vNode)

        # 2. SRL

        self.srl = pDepTree.srl

        # 3. language

        self.language = pDepTree.language



    def _createNewTree(self, pLanguage = ''):
        '''
        Creates and returns a new tree
        '''

        return SLSADTree(pLanguage = pLanguage)



    def loadSentScores(self, plSentScores):
        '''
        Loads sentiment scores to tree nodes

        The sentiment scores are given in a list which corresponds to the
        tree node list. If the two lists differ in length, the extra items
        of the longer one are ignored.
        '''

        for vNode, vSentScore in zip(self.getNodes(), plSentScores):
            vNode.setSentScore(vSentScore)



    def generateDepKTree(self, pFormat = "(rel form)", pdOptions = None):
        '''
        Generates the dependency tree representation in PTB bracketing
        for tree kernels

        The tree is generated from the root by a SLSADKTree object built
        for this tree.

        pdOptions provides options specific to each format. None (default)
        stands for no option.
        '''

        # a fresh dictionary per call instead of a shared mutable default
        if pdOptions is None:
            pdOptions = {}

        vDepKTree = SLSADKTree(pDepTree = self)

        return vDepKTree.generateDepKTree(pNode = "root", pFormat = pFormat, pdOptions = pdOptions)
-
-
-
class SLSADNode(depparse.DepNode):
    '''
    Class for dependency parse node of a SLSA sentence

    On top of the parent class, a node carries a sentiment score.
    '''


    def __init__(self, pSLSADTree = None, pForm = "", pPosition = 0, plHeadDeps = None, plDependents = None, pPOSTag = "", plPredRoles = None, pSentScore = None):
        '''
        Constructor

        All arguments but pSentScore are passed to the parent class.
        '''

        depparse.DepNode.__init__(self, pDepTree = pSLSADTree, pForm = pForm, pPosition = pPosition, plHeadDeps = plHeadDeps, plDependents = plDependents, pPOSTag = pPOSTag, plPredRoles = plPredRoles)

        # sentiment score
        self.sentScore = pSentScore



    def loadFromDepNode(self, pSLSADTree, pDepNode):
        '''
        Loads the node data from DepNode object

        The attributes are assigned as they are, so the list attributes
        are shared with the given node. The sentiment score is not touched.
        '''

        self.depTree = pSLSADTree                    # SLSA dependency tree the node belongs to
        self.form = pDepNode.form                    # token surface form
        self.position = pDepNode.position            # token position in the sentence
        self.headDeps = pDepNode.headDeps            # list of head and dependency tuples
        self.dependents = pDepNode.dependents        # children
        self.posTag = pDepNode.posTag                # POS tag
        self.predRoles = pDepNode.predRoles          # list of (predicate position, semantic role) tuples



    def _createNewNode(self, pDepTree = None, pForm = "", pPosition = 0, plHeadDeps = None, plDependents = None, pPOSTag = "", plPredRoles = None, pSentScore = None):
        '''
        Creates and returns an new node

        The list arguments are (shallow) copied so that the new node does
        not share them with the caller. None is passed on as None.
        '''

        return SLSADNode(pSLSADTree = pDepTree,
                         pForm = pForm,
                         pPosition = pPosition,
                         plHeadDeps = None if plHeadDeps is None else plHeadDeps[:],
                         plDependents = None if plDependents is None else plDependents[:],
                         pPOSTag = pPOSTag,
                         plPredRoles = None if plPredRoles is None else plPredRoles[:],
                         pSentScore = pSentScore)



    def deepCopy(self, pDepTree):
        '''
        NOTE: it seems python deepcopy() works better. Try the idea before
        using this method.

        Creates and returns a new dependency node which is a deep copy of
        the current node

        The new node is attached to the given tree. The lists are copied by
        _createNewNode().
        '''

        return self._createNewNode(pDepTree = pDepTree,
                                   pForm = self.form,
                                   pPosition = self.position,
                                   plHeadDeps = self.headDeps,
                                   plDependents = self.dependents,
                                   pPOSTag = self.posTag,
                                   plPredRoles = self.predRoles,
                                   pSentScore = self.sentScore)



    def setSentScore(self, pScore):
        '''
        Sets the value of sentiment score of the node's word
        '''

        self.sentScore = pScore



    def getSentScore(self):
        '''
        Returns the sentiment score of the node's word
        '''

        return self.sentScore
-
-
-
- class SLSADKTree(tk.DepKTree):
- '''
- The class for SLSA dependency tree for use in tree kernels.
-
- For use in tree kernels, the tree is represented in PTB bracketing
- format.
- '''
-
-
def generateDepKTree(self, pNode = "root", pFormat = "(rel form)", pdOptions = None):
    '''
    Generates dependency kernel tree or subtree under a given node in
    the required format

    NOTE: The subtree here should not be confused with the notion of
    subtree as a tree kernel variation used in parallel to subset tree
    kernel.

    The default format is (rel form) which is the pure dependency tree
    with only dependency relations and token forms as node labels. In
    general, the format string is the representation of the innermost
    treelet in the format. See each format-specific method for exact
    details.

    The formats including sentiment scores are generated here by
    concatenating the subtrees of the dependents of the given node. Any
    other format is handed over to the parent class.

    pNode is a node or the string "root" (case-insensitive) standing for
    the root of the dependency tree.

    pdOptions provides options specific to each format. None (default)
    stands for no option.

    The generated tree is stored in self.kTree and returned.

    See the documentation of the parent class.
    '''

    # a fresh dictionary per call instead of a shared mutable default
    if pdOptions is None:
        pdOptions = {}

    if isinstance(pNode, str) and pNode.lower() == "root":
        pNode = self.depTree.root

    # formats handled in this class and the methods generating their subtrees
    vdGenerators = {"(score (rel (pos form)))": "_generateSLSADKSubtree1",
                    "(rel (pos score))": "_generateSLSADKSubtree2",
                    "(score (rel (roles (pos form))))": "_generateSLSADKSubtree3",
                    "(score (rel (pos_roles form)))": "_generateSLSADKSubtree4",
                    "(rel_score (pos form))": "_generateSLSADKSubtree5"}

    vGeneratorName = vdGenerators.get(pFormat)

    if vGeneratorName is None:
        self.kTree = tk.DepKTree.generateDepKTree(self, pNode = pNode, pFormat = pFormat)
    else:
        vGenerator = getattr(self, vGeneratorName)
        # 0 is passed as the current head of the dependents of the starting node
        self.kTree = "( %s)" % ''.join([vGenerator(self.depTree.getNode(d), 0, pdOptions) for d in pNode.dependents])

    return self.kTree
-
-
-
- # (score (rel (pos form))) -> no equivalent number format in version 0.3
def _generateSLSADKSubtree1(self, pNode, pCurrentHead, pdOptions = None):
    '''
    Recursively generates the kernel subtree of the given node in
    bracketing representation in (score (rel (pos form))) format

    For a node with dependents, the subtrees of the dependents are put
    next to the form: (score (rel (pos (form dependents)))).

    pCurrentHead identifies which head is calling this method in case
    the node has multiple heads.

    pdOptions contains the following options to be used in formatting:
        - neutral: the way the neutral words, i.e. those without a sentiment
                   score should be treated. The possible values include
                   an empty string which means do not add any node for such
                   words, and a string value which will be used as a node
                   to be inserted in the same way the scores are.
    None (default) stands for no option.
    '''

    # a fresh dictionary per call instead of a shared mutable default
    if pdOptions is None:
        pdOptions = {}

    vDependents = ''.join([self._generateSLSADKSubtree1(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents])

    # innermost treelet: (pos form) for a leaf, (pos (form dependents)) otherwise
    if len(pNode.dependents) == 0:
        vPOSForm = "(%s %s)" % (pNode.getPOSTag(), pNode.form)
    else:
        vPOSForm = "(%s (%s %s))" % (pNode.getPOSTag(), pNode.form, vDependents)

    vKSubtree = "(%s %s)" % (pNode.getDepRel(pCurrentHead), vPOSForm)

    # topmost label: the score, or the neutral label for words without a score if one is set
    vLabel = pNode.getSentScore()
    if vLabel is None and pdOptions.get("neutral", '').strip() != '':
        vLabel = pdOptions["neutral"]

    if vLabel is not None:
        vKSubtree = "(%s %s)" % (vLabel, vKSubtree)

    return vKSubtree
-
-
-
- # (rel (pos score)) -> no equivalent number format in version 0.3
def _generateSLSADKSubtree2(self, pNode, pCurrentHead, pdOptions = None):
    '''
    Recursively generates the kernel subtree of the given node in
    bracketing representation in (rel (pos score)) format

    For a node with dependents, the subtrees of the dependents are put
    next to the score: (rel (pos (score dependents))).

    pCurrentHead identifies which head is calling this method in case
    the node has multiple heads.

    pdOptions contains the following options to be used in formatting:
        - neutral: a non-blank string to be used in place of the score
                   for the words without a sentiment score. If it is
                   missing or blank, no score node is added for such
                   words, i.e. (rel pos) or (rel (pos dependents)) is
                   generated.
    None (default) stands for no option.
    '''

    # a fresh dictionary per call instead of a shared mutable default
    if pdOptions is None:
        pdOptions = {}

    vDependents = ''.join([self._generateSLSADKSubtree2(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents])

    vflgLeaf = len(pNode.dependents) == 0

    # label under the POS tag: the score, or the neutral label for words without a score if one is set
    vLabel = pNode.getSentScore()
    if vLabel is None and pdOptions.get("neutral", '').strip() != '':
        vLabel = pdOptions["neutral"]

    if vLabel is None:
        # no score node: the POS tag is the innermost label
        if vflgLeaf:
            vPOSSubtree = pNode.getPOSTag()
        else:
            vPOSSubtree = "(%s %s)" % (pNode.getPOSTag(), vDependents)
    else:
        if vflgLeaf:
            vPOSSubtree = "(%s %s)" % (pNode.getPOSTag(), vLabel)
        else:
            vPOSSubtree = "(%s (%s %s))" % (pNode.getPOSTag(), vLabel, vDependents)

    return "(%s %s)" % (pNode.getDepRel(pCurrentHead), vPOSSubtree)
-
-
-
- # (score (rel (roles (pos form)))) -> no equivalent number format in version 0.3
def _generateSLSADKSubtree3(self, pNode, pCurrentHead, pdOptions = None):
    '''
    Recursively generates the kernel subtree of the given node in
    bracketing representation in (score (rel (roles (pos form)))) format

    The subtree is assembled from the inside out: (pos form), with the
    dependents' subtrees nested beside the form when there are any, is
    wrapped by the semantic role label (if one applies), then by the
    dependency relation and finally by the score label (if one applies).

    pCurrentHead identifies which head is calling this method in case
    the node has multiple heads.

    pdOptions contains the following options to be used in formatting:
     - neutral: the way the neutral words, i.e. those without a sentiment
                score should be treated. The possible values include
                an empty string which means do not add any node for such
                words, and a string value which will be used as a node
                to be inserted in the same way the scores are.
     - no-arg:  the way non-argument nodes are represented. The possible
                values include an empty string which means the node will
                be represented as in non-semantic format, and a string
                value (e.g. null) which will be used as the label for
                representing semantic role of such nodes.

    pdOptions may be None, which is treated as no options.

    Returns the bracketed subtree as a string.
    '''

    # None replaces the former mutable {} default and is also tolerated
    # when passed explicitly by a caller
    if pdOptions is None:
        pdOptions = {}

    vDependents = ''.join([self._generateSLSADKSubtree3(self.depTree.getNode(d), pNode.position, pdOptions)
                           for d in pNode.dependents])

    # role layer: joined argument roles for arguments, the configured
    # placeholder for non-arguments, or no layer at all (None)
    vRoles = None
    if pNode.isArgument():
        vRoles = '_'.join(pNode.getArgRoles())
    elif "no-arg" in pdOptions and pdOptions["no-arg"].strip() != '':
        vRoles = pdOptions["no-arg"]

    # score layer: the real score, the configured neutral label, or no
    # layer at all (None)
    vScore = pNode.getSentScore()
    if vScore is None and "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
        vScore = pdOptions["neutral"]

    # innermost treelet
    if len(pNode.dependents) == 0:
        vKSubtree = "(%s %s)" % (pNode.getPOSTag(), pNode.form)
    else:
        vKSubtree = "(%s (%s %s))" % (pNode.getPOSTag(), pNode.form, vDependents)

    # wrap outwards; "is not None" keeps an empty role string as a layer
    if vRoles is not None:
        vKSubtree = "(%s %s)" % (vRoles, vKSubtree)

    vKSubtree = "(%s %s)" % (pNode.getDepRel(pCurrentHead), vKSubtree)

    if vScore is not None:
        vKSubtree = "(%s %s)" % (vScore, vKSubtree)

    return vKSubtree
-
-
-
- # (score (rel (pos_roles form))) -> no equivalent number format in version 0.3
def _generateSLSADKSubtree4(self, pNode, pCurrentHead, pdOptions = None):
    '''
    Recursively generates the kernel subtree of the given node in
    bracketing representation in (score (rel (pos_roles form))) format

    For argument nodes the POS tag is suffixed with the argument roles,
    all joined by underscores; for other nodes the bare POS tag is used.
    The dependents' subtrees, when there are any, are nested beside the
    word form.

    pCurrentHead identifies which head is calling this method in case
    the node has multiple heads.

    pdOptions contains the following options to be used in formatting:
     - neutral: the way the neutral words, i.e. those without a sentiment
                score should be treated. The possible values include
                an empty string which means do not add any node for such
                words, and a string value which will be used as a node
                to be inserted in the same way the scores are.

    pdOptions may be None, which is treated as no options.

    Returns the bracketed subtree as a string.
    '''

    # None replaces the former mutable {} default and is also tolerated
    # when passed explicitly by a caller
    if pdOptions is None:
        pdOptions = {}

    vDependents = ''.join([self._generateSLSADKSubtree4(self.depTree.getNode(d), pNode.position, pdOptions)
                           for d in pNode.dependents])

    # POS label, carrying the roles only for argument nodes
    if pNode.isArgument():
        vPOSLabel = "%s_%s" % (pNode.getPOSTag(), '_'.join(pNode.getArgRoles()))
    else:
        vPOSLabel = pNode.getPOSTag()

    # score layer: the real score, the configured neutral label, or no
    # layer at all (None)
    vScore = pNode.getSentScore()
    if vScore is None and "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
        vScore = pdOptions["neutral"]

    # innermost treelet
    if len(pNode.dependents) == 0:
        vKSubtree = "(%s %s)" % (vPOSLabel, pNode.form)
    else:
        vKSubtree = "(%s (%s %s))" % (vPOSLabel, pNode.form, vDependents)

    vKSubtree = "(%s %s)" % (pNode.getDepRel(pCurrentHead), vKSubtree)

    if vScore is not None:
        vKSubtree = "(%s %s)" % (vScore, vKSubtree)

    return vKSubtree
-
-
-
- # (rel_score (pos form)) -> no equivalent number format in version 0.3
def _generateSLSADKSubtree5(self, pNode, pCurrentHead, pdOptions = None):
    '''
    Recursively generates the kernel subtree of the given node in
    bracketing representation in (rel_score (pos form)) format

    The score is appended to the dependency relation with an underscore
    instead of getting a node of its own. The dependents' subtrees, when
    there are any, are nested beside the word form.

    pCurrentHead identifies which head is calling this method in case
    the node has multiple heads.

    pdOptions contains the following options to be used in formatting:
     - neutral: the way the neutral words, i.e. those without a sentiment
                score should be treated. An empty (or blank) string or a
                missing key means the relation label is left bare for
                such words; any other string is appended to the relation
                in the same way the scores are.

    pdOptions may be None, which is treated as no options.

    Returns the bracketed subtree as a string.
    '''

    # None replaces the former mutable {} default and is also tolerated
    # when passed explicitly by a caller
    if pdOptions is None:
        pdOptions = {}

    vDependents = ''.join([self._generateSLSADKSubtree5(self.depTree.getNode(d), pNode.position, pdOptions)
                           for d in pNode.dependents])

    # suffix of the relation label: the real score, the configured
    # neutral label, or nothing (None)
    vScore = pNode.getSentScore()
    if vScore is None and "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
        vScore = pdOptions["neutral"]

    if vScore is not None:
        vRelLabel = "%s_%s" % (pNode.getDepRel(pCurrentHead), vScore)
    else:
        vRelLabel = pNode.getDepRel(pCurrentHead)

    if len(pNode.dependents) == 0:
        return "(%s (%s %s))" % (vRelLabel, pNode.getPOSTag(), pNode.form)

    return "(%s (%s (%s %s)))" % (vRelLabel, pNode.getPOSTag(), pNode.form, vDependents)
-
-
-
class SLSACKTree(tk.ConstKTree):
    '''
    SLSA constituency tree prepared for tree kernels

    The tree is serialised in PTB bracketing format; the SLSA-specific
    formats put sentiment scores into, or next to, the node labels.
    '''

    def generateConstKTree(self, pFormat = "(phrase (pos form))", pdOptions = None):
        '''
        Generates the constituency kernel tree in the requested format,
        stores it in self.kTree and returns it

        The format string is the representation of the innermost treelet
        of the format and is matched case-insensitively. The SLSA-specific
        formats are:
         - (score (score score))
         - (score (phrase (score (pos form))))
         - (phrase_score (pos_score form))
         - (score (phrase_args (score (pos_args form))))
         - (phrase (score )(pos (score )(form )))

        Any other format, including the default (phrase (pos form)), is
        delegated to the parent class; see its documentation.

        pdOptions contains specific options to each format and is only
        handed to the SLSA-specific generators.
        '''

        vdGenerators = {
            "(score (score score))": self._generateSLSACKSubtree1,
            "(score (phrase (score (pos form))))": self._generateSLSACKSubtree2,
            "(phrase_score (pos_score form))": self._generateSLSACKSubtree3,
            "(score (phrase_args (score (pos_args form))))": self._generateSLSACKSubtree4,
            "(phrase (score )(pos (score )(form )))": self._generateSLSACKSubtree5,
        }

        vGenerator = vdGenerators.get(pFormat.lower())

        if vGenerator is not None:
            # the generated tree is wrapped in an extra bracket with an empty label
            self.kTree = "( %s)" % vGenerator(self.constTree.root, pdOptions = pdOptions)
        else:
            self.kTree = tk.ConstKTree.generateConstKTree(self, pFormat = pFormat)

        return self.kTree

    def _generateSLSACKSubtree1(self, pNode, pdOptions = None):
        '''
        Recursively generates the kernel subtree of the given node in
        bracketing representation in (score (score score)) format

        Every label is a sentiment score: preterminal and terminal nodes
        are rendered as their own score followed by the score of their
        first terminal node; other nodes as their score followed by the
        subtrees of their children.

        pdOptions contains the following options:
         - keep-at: if true will keep the AT label and wont use sentiment
                    scores to replace it (default false)
        '''

        if pNode.isPreTerminal() or pNode.isTerminal():
            vLabel = pNode.getSentScore()
            vContent = pNode.getTerminalNodes()[0].getSentScore()
            return "(%s %s)" % (vLabel, vContent)

        # the syntactic tag is only inspected when keep-at is switched on
        if pdOptions is not None and "keep-at" in pdOptions and pdOptions["keep-at"] == True and pNode.getSynTag().upper() == "AT":
            vLabel = "AT"
        else:
            vLabel = pNode.getSentScore()

        vChildren = ''.join([self._generateSLSACKSubtree1(n, pdOptions) for n in pNode.children])

        return "(%s %s)" % (vLabel, vChildren)

    def _generateSLSACKSubtree2(self, pNode, pdOptions = None):
        '''
        Recursively generates the kernel subtree of the given node in
        bracketing representation in (score (phrase (score (pos form))))
        format

        Each node is rendered as its sentiment score on top of its
        syntactic tag, which in turn dominates the terminal (for
        preterminal nodes) or the subtrees of the children.

        pdOptions contains the following options:
         - keep-at: if true will keep the AT label and wont use sentiment
                    scores to replace it (default false)
        '''

        if pNode.isPreTerminal():
            vContent = pNode.getTerminal()
        else:
            # the syntactic tag is only inspected when keep-at is switched on
            if pdOptions is not None and "keep-at" in pdOptions and pdOptions["keep-at"] == True and pNode.getSynTag().upper() == "AT":
                return "(AT %s)" % (''.join([self._generateSLSACKSubtree2(n, pdOptions) for n in pNode.children]))
            vContent = ''.join([self._generateSLSACKSubtree2(n, pdOptions) for n in pNode.children])

        return "(%s (%s %s))" % (pNode.getSentScore(), pNode.getSynTag(), vContent)

    def _generateSLSACKSubtree3(self, pNode, pdOptions = None):
        '''
        Recursively generates the kernel subtree of the given node in
        bracketing representation in (phrase_score (pos_score form))
        format

        Each node label is the syntactic tag joined to the sentiment
        score by an underscore; it dominates the terminal (for
        preterminal nodes) or the subtrees of the children.

        pdOptions contains the following options:
         - keep-at: if true will keep the AT label and wont use sentiment
                    scores to replace it (default false)
        '''

        if pNode.isPreTerminal():
            vContent = pNode.getTerminal()
        else:
            # the syntactic tag is only inspected when keep-at is switched on
            if pdOptions is not None and "keep-at" in pdOptions and pdOptions["keep-at"] == True and pNode.getSynTag().upper() == "AT":
                return "(AT %s)" % (''.join([self._generateSLSACKSubtree3(n, pdOptions) for n in pNode.children]))
            vContent = ''.join([self._generateSLSACKSubtree3(n, pdOptions) for n in pNode.children])

        return "(%s_%s %s)" % (pNode.getSynTag(), pNode.getSentScore(), vContent)

    def _generateSLSACKSubtree4(self, pNode, pdOptions = None):
        '''
        Recursively generates the kernel subtree of the given node in
        bracketing representation in (score (phrase_args (score (pos_args form))))
        format

        Each node is rendered as its sentiment score on top of its
        syntactic tag joined with its argument roles by underscores; that
        label dominates the terminal (for preterminal nodes) or the
        subtrees of the children.

        pdOptions contains the following options:
         - keep-at: if true will keep the AT label and wont use sentiment
                    scores to replace it (default false)
        '''

        if pNode.isPreTerminal():
            vContent = pNode.getTerminal()
        else:
            # the syntactic tag is only inspected when keep-at is switched on
            if pdOptions is not None and "keep-at" in pdOptions and pdOptions["keep-at"] == True and pNode.getSynTag().upper() == "AT":
                return "(AT %s)" % (''.join([self._generateSLSACKSubtree4(n, pdOptions) for n in pNode.children]))
            vContent = ''.join([self._generateSLSACKSubtree4(n, pdOptions) for n in pNode.children])

        vTagWithRoles = '_'.join([pNode.getSynTag()] + pNode.getArgRoles())

        return "(%s (%s %s))" % (pNode.getSentScore(), vTagWithRoles, vContent)

    def _generateSLSACKSubtree5(self, pNode, pdOptions = None):
        '''
        Recursively generates the kernel subtree of the given node in
        bracketing representation in (phrase (score )(pos (score )(form )))
        format

        The sentiment score is attached as the first child of each node
        instead of being part of its label; for preterminal nodes the
        terminal follows as a bracketed sibling of the score, for other
        nodes the subtrees of the children follow it.

        pdOptions contains the following options:
         - keep-at: if true will keep the AT label and wont use sentiment
                    scores to replace it (default false)
        '''

        if pNode.isPreTerminal():
            return "(%s (%s )(%s ))" % (pNode.getSynTag(),
                                        pNode.getSentScore(),
                                        pNode.getTerminal())

        # the syntactic tag is only inspected when keep-at is switched on
        if pdOptions is not None and "keep-at" in pdOptions and pdOptions["keep-at"] == True and pNode.getSynTag().upper() == "AT":
            return "(AT %s)" % (''.join([self._generateSLSACKSubtree5(n, pdOptions) for n in pNode.children]))

        vTag = pNode.getSynTag()
        vScore = pNode.getSentScore()
        vChildren = ''.join([self._generateSLSACKSubtree5(n, pdOptions) for n in pNode.children])

        return "(%s (%s )%s)" % (vTag, vScore, vChildren)
-
-
-
class SLSACDKTree(tk.ConstDepKTree):
    '''
    Class for integrating SLSA dependency subtrees in SLSA constituency
    trees for tree kernel use
    '''

    def _createDepKTree(self):
        '''
        Creates and returns a new SLSADKTree built over the dependency
        tree of this object (self.depTree)
        '''

        return SLSADKTree(pDepTree = self.depTree)

    def generateConstDepKTree(self, pFormat = "(phrase (pos (form (rel head))))", pdOptions = None):
        '''
        Generates the tree representation in the required format, stores
        it in self.kTree and returns it

        The generation itself is done by the parent class; see it for
        more details on the formats.

        pdOptions contains specific options to each format. None (the
        default) is replaced by a fresh empty dictionary before the
        parent is called.
        '''

        # a fresh dict per call: a {} default in the signature would be a
        # single object shared by every call and handed on to the parent
        if pdOptions is None:
            pdOptions = {}

        self.kTree = tk.ConstDepKTree.generateConstDepKTree(self, pFormat = pFormat, pdOptions = pdOptions)

        return self.kTree
-
-
-
class SLSANGramKTree(tk.NGramKTree):
    '''
    Class for implementing n-gram tree for SLSA sentences
    '''

    def __init__(self, pSLSASent, pNodeContentType = "word", pdOptions = None):
        '''
        Constructor

        pNodeContentType is matched case-insensitively and can take:
         - word: word n-gram trees are produced, i.e. nodes are word forms
         - word-lower: same as word, with the word forms lowercased
         - POS: POS n-gram trees are produced, i.e. nodes are POS tags
         - sentiment (or polarity): sentiment score n-gram trees are
           produced, i.e. nodes are sentiment polarity scores converted
           to strings
         - wvp: prefixed words for word vector similarity computation are
           produced (e.g. with svmlight-tk-we)

        pdOptions contains specific options to each format:
         - prefix: the string prepended to every token for the wvp type
                   (default "___")

        Raises ValueError for any other node content type.
        '''

        if pdOptions is None:
            pdOptions = {}

        vContentType = pNodeContentType.lower()

        if vContentType == "word":
            vlTokens = pSLSASent.getTokens()
        elif vContentType == "word-lower":
            vlTokens = [t.lower() for t in pSLSASent.getTokens()]
        elif vContentType == "pos":
            vlTokens = pSLSASent.getPOSTags()
        elif vContentType in ("sentiment", "polarity"):
            vlTokens = [str(s) for s in pSLSASent.getSentimentScores()]
        elif vContentType == "wvp":
            if "prefix" in pdOptions:
                vPrefix = pdOptions["prefix"]
            else:
                vPrefix = "___"
            vlTokens = [vPrefix + t for t in pSLSASent.getTokens()]
        else:
            # ValueError is a subclass of Exception, so callers catching
            # the previously raised generic Exception keep working
            raise ValueError("%s is an invalid node content type!" % pNodeContentType)

        tk.NGramKTree.__init__(self, plTokens = vlTokens)

        self.slsaSent = pSLSASent

    def generateNGramKTree(self, pFormat = "binary", pdOptions = None):
        '''
        Generates and returns a tree representation of the sentence tokens,
        which is also stored in self.kTree

        The formats unary, bigram and binary (matched case-sensitively)
        are passed on to the parent class; for any other value the parent
        is called without a format, i.e. with its own default.

        pdOptions contains specific options to each format; it is accepted
        for interface compatibility and currently not used.
        '''

        if pFormat in ("unary", "bigram", "binary"):
            self.kTree = tk.NGramKTree.generateNGramKTree(self, pFormat = pFormat)
        else:
            self.kTree = tk.NGramKTree.generateNGramKTree(self)

        return self.kTree
-
-
-
|