|
- #! /usr/bin/python
- # -*- coding: utf-8 -*-
- """
- This module defines classes for sentence-level sentiment analysis (SLSA).
-
- Version 0.1 (06-Jul-2016)
- - SLSASet, SLSASent are added.
- """
- from parse import constparse, depparse
- from ml import tk
- from nlp import nlp
class SLSASet:
    '''
    Class for sentence level sentiment analysis dataset.

    A dataset is an ordered list of SLSASent objects (self.sentences)
    plus an optional word vector store (self.wv).
    '''


    def __init__(self):
        '''
        Constructor
        '''

        # SLSASent objects in the order in which they were loaded/added
        self.sentences = []

        # WordVector object containing the word vectors of the vocabulary in the dataset
        self.wv = None



    def load(self, pTxtFilename, pPolarityFilename, pflgTokenize = False, pLanguage = "en"):
        '''
        Loads the data from input files

        This is the basic loader of SLSA data which expects the sentences
        (text) and polarity scores to be provided in separate parallel
        files, one sentence/score per line. Previously loaded sentences
        are discarded. The polarity scores are stored as read, i.e. as
        strings.

        An Exception is raised if the two files do not contain the same
        number of lines.
        '''

        self.sentences = []

        # context managers release the file handles even when reading fails
        with open(pTxtFilename) as vTxtFile:
            vlTxtLines = vTxtFile.read().strip().split('\n')

        with open(pPolarityFilename) as vPolarityFile:
            vlPolarityScores = vPolarityFile.read().strip().split('\n')

        if len(vlTxtLines) != len(vlPolarityScores):
            raise Exception("Number of sentences does not match number of scores: %s vs. %s" % (len(vlTxtLines), len(vlPolarityScores)))

        for vTxtLine, vPolScore in zip(vlTxtLines, vlPolarityScores):
            vSLSASent = SLSASent(pSLSASet = self)
            vSLSASent.load(vTxtLine, vPolScore, pflgTokenize = pflgTokenize, pLanguage = pLanguage)
            self.sentences.append(vSLSASent)



    @property
    def size(self):
        '''
        Returns the size of the data set which is the number of its sentences
        '''

        return len(self.getSentences())



    @property
    def tokenLength(self):
        '''
        Returns the number of tokens in the data set
        '''

        return sum(s.length for s in self.getSentences())



    def getSentences(self, pSort = ''):
        '''
        Returns the SLSA sentences in a new list

        The sort options are:
        - text: in sentence text order (case-insensitive option name)
        - anything else, including None and the empty string: in
          document order
        '''

        # None is accepted because extractSentenceForms() passes it as its default
        if pSort is not None and pSort.lower() == "text":
            return sorted(self.sentences, key = lambda x: x.getText())

        return list(self.sentences)



    def addSentences(self, plSentences):
        '''
        Adds SLSA sentences to existing sentences

        Each added sentence is re-attached to this dataset. Currently, no
        care is taken regarding ID duplication.
        '''

        for vSent in plSentences:
            vSent.dataset = self
            self.sentences.append(vSent)



    def getVocabualry(self):
        '''
        Extracts and returns the vocabulary of the dataset as a sorted
        list of the distinct tokens
        '''

        return sorted(set(t for s in self.getSentences() for t in s.getTokens()))



    def extractSentenceForms(self, pSort = None):
        '''
        Returns the surface form of the sentences

        The sort options are those of getSentences():
        - None: in document order
        - text: in sentence text order
        '''

        return [s.getText() for s in self.getSentences(pSort = pSort)]



    def loadConstTrees(self, plConstTrees):
        '''
        Loads the constituency parse trees of the sentences

        It assumes that the provided constituency trees are in the order
        in which the sentences are loaded. If the two lists differ in
        length, the extra items of the longer one are ignored.

        The constituency trees can be provided in bracketing format or as
        constparse.ConstTree objects (in a list).
        '''

        for vSent, pCTree in zip(self.getSentences(), plConstTrees):
            vSent.loadConstTree(pCTree)



    def loadDepTrees(self, plDepTrees):
        '''
        Loads the dependency parse trees of the sentences

        It assumes that the provided dependency trees are in the order in
        which the sentences are loaded. If the two lists differ in
        length, the extra items of the longer one are ignored.

        The dependency trees are assumed to be provided as a list of
        depparse.DepTree objects.
        '''

        for vSent, pDTree in zip(self.getSentences(), plDepTrees):
            vSent.loadDepTree(pDTree)



    def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
        '''
        Loads polarity scores to sentences from a sentiment lexicon which
        is a Sentexicon object

        It returns a tuple of the total number of words in the data set
        and the number of those words found in the lexicon, both summed
        over the per-sentence counts.

        For details about Sentexicon object, see sentexicon.py
        '''

        vTotalWordNum = 0           # total number of words in the data set
        vTotalEntryWordNum = 0      # number of words found in the lexicon

        for vSent in self.getSentences():
            vWordNum, vEntryWordNum = vSent.loadSentimentScores(pSentexicon, pNeutralScore)

            vTotalWordNum += vWordNum
            vTotalEntryWordNum += vEntryWordNum

        return vTotalWordNum, vTotalEntryWordNum



    def getSentimentScores(self):
        '''
        Returns a dictionary of words in the data set and the sentiment
        scores attached to them

        If a word occurs more than once, the score of its last occurrence
        is the one kept.
        '''

        vdResult = {}

        for vSent in self.getSentences():
            for vWord, vScore in zip(vSent.getTokens(), vSent.getSentimentScores()):
                vdResult[vWord] = vScore

        return vdResult



    def loadSSInDTrees(self):
        '''
        Loads sentiment scores into dependency tree nodes
        '''

        for vSent in self.getSentences():
            vSent.loadSSInDTree()



    def loadSSInCTrees(self, pPropagation = None):
        '''
        Loads sentiment scores into constituency tree nodes
        '''

        for vSent in self.getSentences():
            vSent.loadSSInCTree(pPropagation = pPropagation)



    def loadWordVectors(self, pWordVectors, pflgFilter = True):
        '''
        Loads word vectors from a file or WordVector object, whichever is given

        When a file name (str) is given, a new WordVector is created and
        loaded from it. By default, the words not in the data vocabulary
        are filtered out, which can be changed to not filter (e.g. when
        the input is already filtered).

        Any other object is taken to be a ready WordVector and is stored
        as is; pflgFilter has no effect in that case.
        '''

        if isinstance(pWordVectors, str):
            # imported here so that the ml package is only needed when loading from a file
            from ml import wv

            self.wv = wv.WordVector()

            if pflgFilter:
                self.wv.load(pWVFilename = pWordVectors, plFilterVocab = self.getVocabualry())
            else:
                # NOTE(review): assumes plFilterVocab is optional in WordVector.load() - confirm
                self.wv.load(pWVFilename = pWordVectors)
        else:
            self.wv = pWordVectors
-
-
-
class SLSASent:
    '''
    Class for SLSA sentence

    It keeps the sentence text, its polarity label, its parse trees and
    the per-token sentiment scores.
    '''


    def __init__(self, pSLSASet):
        '''
        Constructor
        '''

        # SLSASet the sentence belongs to
        self.dataset = pSLSASet

        self.text = None
        self.polarity = None

        self.cTree = None
        self.dTree = None
        self.sentScores = []    # sentiment score, one per word in the tokenized self.text



    def load(self, pText, pPolarity = None, pflgTokenize = False, pLanguage = "en"):
        '''
        Loads sentence form and its label

        It optionally tokenizes the text using nlp.tokenizeSegment().
        '''

        if pflgTokenize:
            self.text = nlp.tokenizeSegment(pText, pLang = pLanguage, pflgTokenizeFSlash = False)
        else:
            self.text = pText

        self.polarity = pPolarity



    def getText(self):
        '''
        Returns sentence text (form)
        '''

        return self.text



    def getTokens(self):
        '''
        Returns the tokenization of the sentence

        The sentence text is assumed to be in tokenized format and only
        splits on space.
        '''

        return self.getText().split()



    def getPolarity(self):
        '''
        Returns the polarity of the sentence
        '''

        return self.polarity



    @property
    def length(self):
        '''
        Returns the sentence length, i.e. its number of tokens
        '''

        return len(self.getTokens())



    def getConstTree(self):
        '''
        Returns the constituency parse tree of the sentence

        The returned object is the SLSACTree created by loadConstTree(),
        or None if no tree is loaded.
        '''

        return self.cTree



    def getPOSTags(self):
        '''
        Returns the list of POS tags which matches the token list

        The POS tags are extracted from the constituency tree or, if it
        is not loaded, from the dependency tree. An empty list is
        returned if neither tree is loaded.
        '''

        if self.cTree is not None:
            return self.cTree.getPOSs()
        elif self.dTree is not None:
            return self.dTree.getPOSs()
        else:
            return []



    def getDepTree(self):
        '''
        Returns the dependency parse tree of the sentence

        The returned object is the SLSADTree created by loadDepTree(),
        or None if no tree is loaded.
        '''

        return self.dTree



    def loadConstTree(self, pConstTree):
        '''
        Loads the constituency parse tree of the sentence

        The constituency tree can be provided in bracketing format or as
        constparse.ConstTree object. In both cases a new SLSACTree is
        built from the bracketing.
        '''

        # loading the tree

        if isinstance(pConstTree, constparse.ConstTree):
            vConstTree = pConstTree.getPTBFormat()
        else:
            vConstTree = pConstTree

        self.cTree = SLSACTree()
        self.cTree.loadPTBTree(vConstTree, pflgExpandTerminal = True)



    def loadDepTree(self, pDepTree):
        '''
        Loads the dependency parse tree of the sentence

        The dependency tree is assumed to be depparse.DepTree object; an
        Exception is raised otherwise.
        '''

        # loading the tree

        if not isinstance(pDepTree, depparse.DepTree):
            raise Exception("A DepTree object is expected!")

        self.dTree = SLSADTree()
        self.dTree.loadFromDepTree(pDepTree = pDepTree)



    def getPTBConstTree(self):
        '''
        Returns the constituency tree of the sentence in PTB bracketing format
        '''

        return self.cTree.getPTBFormat()



    def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
        '''
        Loads sentiment scores to words in the sentence from a sentiment
        lexicon which is a Sentexicon object

        It returns a tuple of the total number of words in the sentence
        and the number of words found in the lexicon.

        If the word is not found in the lexicon (i.e. the lexicon returns
        None for it), pNeutralScore will be used as its score, which is
        None by default.

        For details about Sentexicon object, see sentexicon.py
        '''

        self.sentScores = []
        vEntryWordNum = 0

        for vWord in self.getTokens():
            vScore = pSentexicon.getScore(vWord)

            if vScore is None:
                vScore = pNeutralScore
            else:
                # hits are counted here and not from the stored scores, because a
                # neutral-filled miss cannot be told apart from a hit afterwards
                vEntryWordNum += 1

            self.sentScores.append(vScore)

        return len(self.sentScores), vEntryWordNum



    def getSentimentScores(self):
        '''
        Returns the sentiment scores in a list corresponding to the token list
        '''

        return self.sentScores



    def loadSSInDTree(self):
        '''
        Loads sentiment scores into the dependency tree of the sentence
        '''

        self.dTree.loadSentScores(self.sentScores)



    def loadSSInCTree(self, pPropagation = None):
        '''
        Loads sentiment scores into the constituency tree of the sentence
        '''

        self.cTree.loadSentScores(self.sentScores, pPropagation = pPropagation)



    def generateNGramKTree(self, pNodeContentType = "word", pFormat = "binary", pdOptions = None):
        '''
        Generates and returns the tree representation of the surface form
        of the sentence
        '''

        vSLSANGramKTree = SLSANGramKTree(pSLSASent = self, pNodeContentType = pNodeContentType, pdOptions = pdOptions)

        return vSLSANGramKTree.generateNGramKTree(pFormat = pFormat, pdOptions = pdOptions)



    def getAvgSentScore(self):
        '''
        Calculates and returns the average sentiment score of the sentence tokens

        The loaded scores must be non-empty and all numeric: it raises
        ZeroDivisionError if no score is loaded and TypeError if any of
        the scores is None.
        '''

        vlSentScores = self.getSentimentScores()

        return sum(vlSentScores) / len(vlSentScores)



    def getPolarScores(self, pNeutralScore = None):
        '''
        Returns sentiment scores with non-neutral polarity

        Neutral polarity score can be set as parameter. It is None by default meaning that no polarity score is assigned
        to neutral words.
        '''

        return [s for s in self.sentScores if s != pNeutralScore]



    def getWordVectors(self):
        '''
        Returns the word vectors of the sentence tokens

        The vectors are looked up in the word vector store of the dataset
        the sentence belongs to.
        '''

        return [self.dataset.wv.getVector(t) for t in self.getTokens()]
-
-
-
class SLSACTree(constparse.ConstTree):
    '''
    Constituency parse tree of a SLSA sentence

    The sentiment-related operations are all delegated to the root node
    of the tree.
    '''


    def __init__(self):
        '''
        Constructor; only runs the initialization of the parent class
        '''

        constparse.ConstTree.__init__(self)



    def _createNewTree(self):
        '''
        Factory method returning an empty SLSACTree

        It lets the code inherited from the parent class create trees of
        this class.
        '''

        vNewTree = SLSACTree()

        return vNewTree



    def _createRoot(self):
        '''
        Factory method returning a new SLSACNode to be used as root
        '''

        vRootNode = SLSACNode()

        return vRootNode



    def modifyNPStruct(self):
        '''
        Restructures flat noun phrases, starting from the root, in order
        to avoid term/constituent mismatch caused by a flat NP structure
        '''

        vRoot = self.root
        vRoot.modifyNPStruct()



    def loadSentScores(self, plSentScores, pPropagation = None):
        '''
        Assigns the given sentiment scores to the nodes of the tree

        plSentScores is a list with one score per terminal (i.e. token),
        in token order.

        pPropagation names the method by which the scores are propagated
        from the terminals up to the root. It is passed to the root node
        as is; with None only the terminal nodes are scored.
        '''

        vRoot = self.root
        vRoot.loadSentScores(plSentScores, pPropagation = pPropagation)



    def extractTopmostVP(self, pSpan):
        '''
        Returns the topmost verb phrase node for the given span as found
        by the root node
        '''

        vTopVP = self.root.extractTopmostVP(pSpan)

        return vTopVP
-
-
-
class SLSACNode(constparse.ConstNode):
    '''
    Class for constituency tree node of a SLSA sentence

    It adds a sentiment score and a list of opinion expressions to the
    constituency node.
    '''

    # Flat NP child tag sequences and the intermediate nodes to insert for
    # each of them. Every entry is (pattern, [(new node tag, child span), ...])
    # and the insertions of an entry are applied in the given order.
    _NP_STRUCT_RULES = (
        (r"^(DT|PRP\$) NN[A-Z]* NN[A-Z]*$", (("NP", (2, 3)),)),
        (r"^(DT|PRP\$) JJ[A-Z]* NN[A-Z]*$", (("ADJP", (2, 3)),)),
        (r"^(DT|PRP\$) VBG NN[A-Z]*$", (("NP", (2, 3)),)),
        (r"^(DT|PRP\$) NN[A-Z]* NN[A-Z]* NN[A-Z]*$", (("NP", (2, 4)),)),
        (r"^(DT|PRP\$) JJ[A-Z]* NN[A-Z]* NN[A-Z]*$", (("NP", (3, 4)), ("NP", (2, 3)))),
        (r"^(DT|PRP\$) ADJP NN[A-Z]*$", (("NP", (2, 3)),)),
        (r"^(DT|PRP\$) CD NN[A-Z]* NN[A-Z]*$", (("NP", (2, 4)),)),
        (r"^(DT|PRP\$) JJ[A-Z]* JJ[A-Z]* NN[A-Z]*$", (("NP", (3, 4)), ("NP", (2, 3)))),
    )


    def __init__(self):
        '''
        Constructor
        '''

        constparse.ConstNode.__init__(self)

        # sentiment score
        self.sentScore = None

        # list of type OpinionExpression
        self.oe = []



    def deepCopy(self, pflgCopyTree = False):
        '''
        NOTE: it seems python deepcopy() works better. Try the idea before
        using this method.

        Creates and returns a deep copy of the node optionally including
        the sub tree under it

        The sentiment score is copied on top of what the parent class
        copies.
        '''

        vNodeCopy = constparse.ConstNode.deepCopy(self, pflgCopyTree)

        # copying the sentiment score
        vNodeCopy.sentScore = self.sentScore

        return vNodeCopy



    def shallowCopy(self, pflgCopyTree = False):
        '''
        NOTE: before using, check if python shallowcopy() does not work
        as expected.

        Creates and returns a shallow copy of the node

        The shallow copy does not have a parent and children. The
        sentiment score is copied on top of what the parent class copies.
        '''

        vNodeCopy = constparse.ConstNode.shallowCopy(self, pflgCopyTree)

        # copying the sentiment score
        vNodeCopy.sentScore = self.sentScore

        return vNodeCopy



    def _getNewNode(self):
        '''
        Creates and returns a node
        '''

        return SLSACNode()



    def setSentScore(self, pScore):
        '''
        Sets the value of sentiment score of the node's word
        '''

        self.sentScore = pScore



    def getSentScore(self):
        '''
        Returns the sentiment score of the node's word
        '''

        return self.sentScore



    def modifyNPStruct(self):
        '''
        Modifies the structure of the noun phrases in order to avoid
        term/constituent mismatch caused by a flat NP structure

        If the node is an NP whose children tags match one of the known
        flat patterns, the intermediate node(s) of that pattern are
        inserted and the processing of this subtree stops. Otherwise, the
        children are processed recursively.
        '''

        # the module does not import re at the top, so it is imported here
        import re

        if self.getSynTag() == "NP":
            vChildLabelSeq = ' '.join(self.getChildrenTags())

            for vPattern, vlInsertions in self._NP_STRUCT_RULES:
                if re.search(vPattern, vChildLabelSeq):
                    for vTag, vSpan in vlInsertions:
                        self.insertIntermChild(vTag, vSpan)
                    return

        for vChild in self.getChildren():
            vChild.modifyNPStruct()



    def loadSentScores(self, plSentScores, pPropagation = None):
        '''
        Loads sentiment scores to nodes in the subtree and returns the
        score of this node

        The sentiment scores are given in a list which corresponds to the
        terminal (i.e. token) list. Token spans are taken to be 1-based
        when indexing into the list.

        Propagation argument specifies the method by which the scores are
        propagated from terminals up to this node. None (or an unknown
        method) means scores are only assigned to the terminal nodes and
        the phrase nodes keep their current score. The following are the
        possible methods (case-insensitive):
        - sum: node score is the sum of its children scores; children
               without a score (None) do not contribute
        - vote: node score is the dominant positive or negative score in the
                children nodes (i.e. more +1: score is +1, more -1: score -1)

        An Exception is raised for a terminal node with a token span
        which covers more than one token.
        '''

        vTokenSpan = self.getTokenSpan()

        if self.isTerminal():
            # sanity check
            if vTokenSpan[0] != vTokenSpan[1]:
                raise Exception("Either the node is not terminal or its span is wrong: %s" % self)

            self.setSentScore(plSentScores[vTokenSpan[0] - 1])
        else:
            vlChildrenScores = [vChild.loadSentScores(plSentScores, pPropagation) for vChild in self.getChildren()]

            # calculating the sentiment score of the node based on its children's (propagation)
            if pPropagation is not None:
                vMethod = pPropagation.lower()

                if vMethod == "sum":
                    # words missing from the lexicon have None as score, which cannot be summed
                    self.setSentScore(sum(s for s in vlChildrenScores if s is not None))
                elif vMethod == "vote":
                    self.setSentScore(self._getDominantSentiment(vlChildrenScores))

        return self.getSentScore()



    def _getDominantSentiment(self, plScores):
        '''
        Returns +1 or -1 whichever is dominant in the given list of sentiment
        scores

        Only scores equal to +1 and -1 are counted. If the same number of
        both sentiment scores exist, 0 is returned.
        '''

        vPosCount = sum(1 for s in plScores if s == 1)
        vNegCount = sum(1 for s in plScores if s == -1)

        if vPosCount > vNegCount:
            return 1
        elif vPosCount < vNegCount:
            return -1
        else:
            return 0



    def extractTopmostVP(self, pSpan):
        '''
        Extracts and returns the topmost verb phrase node which overlaps the given span in the node subtree

        Overlap means that the given span and the span of the VP must not be disjoint. So, left and right crossing will
        also be considered.

        The search is depth-first and does not go below a VP: if the
        first VP met on a path does not overlap the span, None is
        returned for that path.
        '''

        if self.getSynTag() == 'VP':
            # NOTE(review): -4 is presumably the "disjoint" code of getTokenSpanRelation() - confirm
            if self.getTokenSpanRelation(pSpan) != -4:
                return self

            return None

        for vChild in self.getChildren():
            vTopVP = vChild.extractTopmostVP(pSpan)
            if vTopVP is not None:
                return vTopVP

        return None
-
-
-
class SLSADTree(depparse.DepTree):
    '''
    Dependency parse tree of a SLSA sentence

    Its nodes are SLSADNode objects which carry a sentiment score.
    '''


    def __init__(self, pLanguage = ''):
        '''
        Constructor; only runs the initialization of the parent class
        '''

        depparse.DepTree.__init__(self, pLanguage = pLanguage)



    def loadFromDepTree(self, pDepTree):
        '''
        Fills the tree with the content of the given DepTree object

        A new SLSADNode is created for each node of the given tree, while
        the SRL and language attributes are taken over as they are.
        '''

        # nodes: one new SLSA node per source node, in the same order
        self.nodes = []

        for vSrcNode in pDepTree.nodes:
            vNewNode = SLSADNode()
            self.nodes.append(vNewNode)
            vNewNode.loadFromDepNode(pSLSADTree = self, pDepNode = vSrcNode)

        # SRL and language
        self.srl = pDepTree.srl
        self.language = pDepTree.language



    def _createNewTree(self, pLanguage = ''):
        '''
        Factory method returning an empty SLSADTree for the given language
        '''

        vNewTree = SLSADTree(pLanguage = pLanguage)

        return vNewTree



    def loadSentScores(self, plSentScores):
        '''
        Assigns the given sentiment scores to the nodes of the tree

        The scores and the nodes returned by getNodes() are paired in
        order; if their numbers differ, the extra items are ignored.
        '''

        vlPairs = zip(self.getNodes(), plSentScores)

        for vNode, vSentScore in vlPairs:
            vNode.setSentScore(vSentScore)



    def generateDepKTree(self, pFormat = "(rel form)", pdOptions = {}):
        '''
        Returns the representation of the whole dependency tree in PTB
        bracketing for tree kernels

        The representation is generated by a SLSADKTree built on this
        tree, starting from the root. pdOptions provides options specific
        to each format.
        '''

        vKTreeGenerator = SLSADKTree(pDepTree = self)
        vKTree = vKTreeGenerator.generateDepKTree(pNode = "root", pFormat = pFormat, pdOptions = pdOptions)

        return vKTree
-
-
-
class SLSADNode(depparse.DepNode):
    '''
    Class for dependency parse node of a SLSA sentence

    It adds a sentiment score to the dependency node.
    '''


    def __init__(self, pSLSADTree = None, pForm = "", pPosition = 0, plHeadDeps = None, plDependents = None, pPOSTag = "", plPredRoles = None, pSentScore = None):
        '''
        Constructor

        All arguments except pSentScore are handed to the constructor of
        the parent class.
        '''

        depparse.DepNode.__init__(self, pDepTree = pSLSADTree, pForm = pForm, pPosition = pPosition, plHeadDeps = plHeadDeps, plDependents = plDependents, pPOSTag = pPOSTag, plPredRoles = plPredRoles)

        # sentiment score
        self.sentScore = pSentScore



    @staticmethod
    def _copyList(plItems):
        '''
        Returns a shallow copy of the given list, or None if None is given
        '''

        if plItems is None:
            return None

        return plItems[:]



    def loadFromDepNode(self, pSLSADTree, pDepNode):
        '''
        Loads the node data from DepNode object

        The list attributes are not copied, i.e. they are shared with the
        given node. The sentiment score is left untouched.
        '''

        self.depTree = pSLSADTree               # SLSA dependency tree the node belongs to
        self.form = pDepNode.form               # token surface form
        self.position = pDepNode.position       # token position in the sentence
        self.headDeps = pDepNode.headDeps       # list of head and dependency tuples
        self.dependents = pDepNode.dependents   # children
        self.posTag = pDepNode.posTag           # POS tag
        self.predRoles = pDepNode.predRoles     # list of (predicate position, semantic role) tuples



    def _createNewNode(self, pDepTree = None, pForm = "", pPosition = 0, plHeadDeps = None, plDependents = None, pPOSTag = "", plPredRoles = None, pSentScore = None):
        '''
        Creates and returns a new node

        The given lists are copied, so the new node does not share them
        with the caller. A list left as None is passed on as None, as
        when constructing the node directly.
        '''

        return SLSADNode(pSLSADTree = pDepTree,
                         pForm = pForm,
                         pPosition = pPosition,
                         plHeadDeps = self._copyList(plHeadDeps),
                         plDependents = self._copyList(plDependents),
                         pPOSTag = pPOSTag,
                         plPredRoles = self._copyList(plPredRoles),
                         pSentScore = pSentScore)



    def deepCopy(self, pDepTree):
        '''
        NOTE: it seems python deepcopy() works better. Try the idea before
        using this method.

        Creates and returns a new dependency node which is a deep copy of
        the current node

        The copy is attached to the given tree. The lists of the node are
        copied one level deep; a list attribute which is None stays None.
        '''

        return self._createNewNode(pDepTree = pDepTree,
                                   pForm = self.form,
                                   pPosition = self.position,
                                   plHeadDeps = self._copyList(self.headDeps),
                                   plDependents = self._copyList(self.dependents),
                                   pPOSTag = self.posTag,
                                   plPredRoles = self._copyList(self.predRoles),
                                   pSentScore = self.sentScore)



    def setSentScore(self, pScore):
        '''
        Sets the value of sentiment score of the node's word
        '''

        self.sentScore = pScore



    def getSentScore(self):
        '''
        Returns the sentiment score of the node's word
        '''

        return self.sentScore
-
-
-
- class SLSADKTree(tk.DepKTree):
- '''
- The class for SLSA dependency tree for use in tree kernels.
-
- For use in tree kernels, the tree is represented in PTB bracketing
- format.
- '''
-
-
def generateDepKTree(self, pNode = "root", pFormat = "(rel form)", pdOptions = {}):
    '''
    Generates, stores in self.kTree and returns the dependency kernel
    tree or subtree under a given node in the required format

    NOTE: The subtree here should not be confused with the notion of
    subtree as a tree kernel variation used in parallel to subset tree
    kernel.

    pNode is either a node or the string "root" (in any case) standing
    for the root of the dependency tree.

    The format string is the representation of the innermost treelet in
    the format. The sentiment-aware formats listed below are generated
    by this class, one subtree per dependent of pNode. Any other format,
    including the default (rel form), is left to the parent class, to
    which pdOptions is not passed.

    pdOptions provides options specific to each format.
    '''

    # sentiment-aware formats and the name of the method generating each of them
    vdSubtreeGenerators = {
        "(score (rel (pos form)))": "_generateSLSADKSubtree1",
        "(rel (pos score))": "_generateSLSADKSubtree2",
        "(score (rel (roles (pos form))))": "_generateSLSADKSubtree3",
        "(score (rel (pos_roles form)))": "_generateSLSADKSubtree4",
        "(rel_score (pos form))": "_generateSLSADKSubtree5",
    }

    if type(pNode) is str and pNode.lower() == "root":
        pNode = self.depTree.root

    vGeneratorName = vdSubtreeGenerators.get(pFormat)

    if vGeneratorName is None:
        self.kTree = tk.DepKTree.generateDepKTree(self, pNode = pNode, pFormat = pFormat)
    else:
        # the dependents of the start node are generated with 0 as their current head
        vlSubtrees = [getattr(self, vGeneratorName)(self.depTree.getNode(d), 0, pdOptions) for d in pNode.dependents]
        self.kTree = "( %s)" % ''.join(vlSubtrees)

    return self.kTree
-
-
-
- # (score (rel (pos form))) -> no equivalent number format in version 0.3
def _generateSLSADKSubtree1(self, pNode, pCurrentHead, pdOptions = {}):
    '''
    Recursively generates the kernel subtree of the given node in
    bracketing representation in (score (rel (pos form))) format

    pCurrentHead identifies which head is calling this method in case
    the node has multiple heads.

    The subtrees of the dependents of the node, if any, are placed next
    to its form, i.e. (score (rel (pos (form dependents)))).

    pdOptions contains the following options to be used in formatting:
    - neutral: the way the neutral words, i.e. those without a sentiment
               score should be treated. The possible values include
               an empty string which means do not add any node for such
               words, and a string value which will be used as a node
               to be inserted in the same way the scores are.
    '''

    # subtrees of the dependents, generated with this node as their current head
    vlDepSubtrees = [self._generateSLSADKSubtree1(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents]
    vDependents = ''.join(vlDepSubtrees)

    # label of the topmost node: the score, else the neutral label if one is set, else nothing
    vTopLabel = pNode.getSentScore()
    if vTopLabel is None and "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
        vTopLabel = pdOptions["neutral"]

    # innermost part: the bare form for a leaf, otherwise the form with the dependents
    if len(pNode.dependents) == 0:
        vFormPart = pNode.form
    else:
        vFormPart = "(%s %s)" % (pNode.form, vDependents)

    vKSubtree = "(%s (%s %s))" % (pNode.getDepRel(pCurrentHead), pNode.getPOSTag(), vFormPart)

    if vTopLabel is not None:
        vKSubtree = "(%s %s)" % (vTopLabel, vKSubtree)

    return vKSubtree
-
-
-
- # (rel (pos score)) -> no equivalent number format in version 0.3
def _generateSLSADKSubtree2(self, pNode, pCurrentHead, pdOptions = {}):
    '''
    Recursively generates the kernel subtree of the given node in
    bracketing representation in (rel (pos score)) format

    pCurrentHead identifies which head is calling this method in case
    the node has multiple heads.

    The subtrees of the dependents of the node, if any, are placed next
    to its score, i.e. (rel (pos (score dependents))).

    pdOptions contains the following option to be used in formatting:
    - neutral: the label used in place of the score for the words
               without a sentiment score. If it is missing or blank,
               the score position is left out for such words, i.e.
               (rel pos) or (rel (pos dependents)).
    '''

    vflgLeaf = len(pNode.dependents) == 0

    # subtrees of the dependents, generated with this node as their current head
    vlDepSubtrees = [self._generateSLSADKSubtree2(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents]
    vDependents = ''.join(vlDepSubtrees)

    # what stands in the score position: the score, else the neutral label if one is set
    vScoreLabel = pNode.getSentScore()
    if vScoreLabel is None and "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
        vScoreLabel = pdOptions["neutral"]

    vDepRel = pNode.getDepRel(pCurrentHead)
    vPOSTag = pNode.getPOSTag()

    if vScoreLabel is None:
        # no score position at all: the dependents, if any, hang directly under the POS tag
        if vflgLeaf:
            return "(%s %s)" % (vDepRel, vPOSTag)

        return "(%s (%s %s))" % (vDepRel, vPOSTag, vDependents)

    if vflgLeaf:
        vScorePart = vScoreLabel
    else:
        vScorePart = "(%s %s)" % (vScoreLabel, vDependents)

    return "(%s (%s %s))" % (vDepRel, vPOSTag, vScorePart)
-
-
-
- # (score (rel (roles (pos form)))) -> no equivalent number format in version 0.3
def _generateSLSADKSubtree3(self, pNode, pCurrentHead, pdOptions = {}):
    '''
    Recursively generates the kernel subtree of the given node in
    bracketing representation in (score (rel (roles (pos form)))) format

    pCurrentHead identifies which head is calling this method in case
    the node has multiple heads.

    The subtrees of the dependents of the node, if any, are placed next
    to its form, i.e. (score (rel (roles (pos (form dependents))))). The
    roles of an argument node are joined by underscore.

    pdOptions contains the following options to be used in formatting:
    - neutral: the way the neutral words, i.e. those without a sentiment
               score should be treated. The possible values include
               an empty string which means do not add any node for such
               words, and a string value which will be used as a node
               to be inserted in the same way the scores are.
    - no-arg:  the way non-argument nodes are represented. The possible
               values include an empty string which means the node will
               be represented as in non-semantic format, and a string
               value (e.g. null) which will be used as the label for
               representing semantic role of such nodes.
    '''

    # subtrees of the dependents, generated with this node as their current head
    vlDepSubtrees = [self._generateSLSADKSubtree3(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents]
    vDependents = ''.join(vlDepSubtrees)

    # label of the role node: the roles of an argument, else the no-arg label if one is set, else nothing
    if pNode.isArgument():
        vRoleLabel = '_'.join(pNode.getArgRoles())
    elif "no-arg" in pdOptions and pdOptions["no-arg"].strip() != '':
        vRoleLabel = pdOptions["no-arg"]
    else:
        vRoleLabel = None

    # label of the topmost node: the score, else the neutral label if one is set, else nothing
    vTopLabel = pNode.getSentScore()
    if vTopLabel is None and "neutral" in pdOptions and pdOptions["neutral"].strip() != '':
        vTopLabel = pdOptions["neutral"]

    # the treelet is built inside out: form, POS tag, role, relation and finally score

    if len(pNode.dependents) == 0:
        vFormPart = pNode.form
    else:
        vFormPart = "(%s %s)" % (pNode.form, vDependents)

    vKSubtree = "(%s %s)" % (pNode.getPOSTag(), vFormPart)

    if vRoleLabel is not None:
        vKSubtree = "(%s %s)" % (vRoleLabel, vKSubtree)

    vKSubtree = "(%s %s)" % (pNode.getDepRel(pCurrentHead), vKSubtree)

    if vTopLabel is not None:
        vKSubtree = "(%s %s)" % (vTopLabel, vKSubtree)

    return vKSubtree
-
-
-
- # (score (rel (pos_roles form))) -> no equivalent number format in version 0.3
def _generateSLSADKSubtree4(self, pNode, pCurrentHead, pdOptions = None):
    '''
    Recursively generates the kernel subtree of the given node in
    bracketing representation in (score (rel (pos_roles form))) format

    The subtree is built from the inside out:
        - (pos form) for a node without dependents, or
          (pos (form dependents)) when it has dependents, where dependents
          is the concatenation of the subtrees of its dependents;
        - for a node for which isArgument() is true, pos is suffixed with
          its argument roles joined by underscores (pos_role1_role2);
        - the result is wrapped under the dependency relation of the node
          to pCurrentHead;
        - finally it is wrapped under the sentiment score of the node, or
          under the neutral label when the node has no score and the
          neutral option is set.

    pCurrentHead identifies which head is calling this method in case
    the node has multiple heads.

    pdOptions contains the following options to be used in formatting:
        - neutral: the way the neutral words, i.e. those without a sentiment
                   score should be treated. The possible values include
                   an empty string which means do not add any node for such
                   words, and a string value which will be used as a node
                   to be inserted in the same way the scores are.
    Passing None (the default) is the same as passing no options.

    Returns the subtree as a bracketed string.
    '''

    # A fresh dict per call instead of a shared mutable default; this also
    # makes an explicitly passed None behave like "no options"
    if pdOptions is None:
        pdOptions = {}

    vDependents = ''.join([self._generateSLSADKSubtree4(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents])

    # POS level: arguments carry their roles on the POS tag
    if pNode.isArgument():
        vPOSLabel = "%s_%s" % (pNode.getPOSTag(), '_'.join(pNode.getArgRoles()))
    else:
        vPOSLabel = pNode.getPOSTag()

    if len(pNode.dependents) == 0:
        vKSubtree = "(%s %s)" % (vPOSLabel, pNode.form)
    else:
        vKSubtree = "(%s (%s %s))" % (vPOSLabel, pNode.form, vDependents)

    # Relation level
    vKSubtree = "(%s %s)" % (pNode.getDepRel(pCurrentHead), vKSubtree)

    # Score level: the real score wins; a non-blank neutral label is only
    # used for nodes without a score, otherwise no score node is added
    vTopLabel = pNode.getSentScore()
    if vTopLabel is None and pdOptions.get("neutral", '').strip() != '':
        vTopLabel = pdOptions["neutral"]

    if vTopLabel is not None:
        vKSubtree = "(%s %s)" % (vTopLabel, vKSubtree)

    return vKSubtree
-
-
-
- # (rel_score (pos form)) -> no equivalent number format in version 0.3
def _generateSLSADKSubtree5(self, pNode, pCurrentHead, pdOptions = None):
    '''
    Recursively generates the kernel subtree of the given node in
    bracketing representation in (rel_score (pos form)) format

    The sentiment score is not a node of its own in this format: it is
    appended to the dependency relation label with an underscore. A node
    with dependents nests them next to its form: (pos (form dependents)),
    where dependents is the concatenation of the subtrees of its dependents.

    pCurrentHead identifies which head is calling this method in case
    the node has multiple heads.

    pdOptions contains the following options to be used in formatting:
        - neutral: the way the neutral words, i.e. those without a sentiment
                   score should be treated. The possible values include
                   an empty string which means leave the relation label of
                   such words bare, and a string value which will be
                   appended to the relation label in the same way the
                   scores are.
    Passing None (the default) is the same as passing no options.

    Returns the subtree as a bracketed string.
    '''

    # A fresh dict per call instead of a shared mutable default; this also
    # makes an explicitly passed None behave like "no options"
    if pdOptions is None:
        pdOptions = {}

    vDependents = ''.join([self._generateSLSADKSubtree5(self.depTree.getNode(d), pNode.position, pdOptions) for d in pNode.dependents])

    # The real score wins; a non-blank neutral label is only used for
    # nodes without a score
    vScoreLabel = pNode.getSentScore()
    if vScoreLabel is None and pdOptions.get("neutral", '').strip() != '':
        vScoreLabel = pdOptions["neutral"]

    if vScoreLabel is None:
        vRelLabel = pNode.getDepRel(pCurrentHead)
    else:
        vRelLabel = "%s_%s" % (pNode.getDepRel(pCurrentHead), vScoreLabel)

    if len(pNode.dependents) == 0:
        vKSubtree = "(%s (%s %s))" % (vRelLabel,
                                      pNode.getPOSTag(),
                                      pNode.form)
    else:
        vKSubtree = "(%s (%s (%s %s)))" % (vRelLabel,
                                           pNode.getPOSTag(),
                                           pNode.form,
                                           vDependents)

    return vKSubtree
-
-
-
class SLSACKTree(tk.ConstKTree):
    '''
    The class for SLSA constituency tree for use in tree kernels

    For use in tree kernels, the tree is represented in PTB bracketing
    format.
    '''

    # Lower-cased SLSA-specific format strings mapped to the name of the
    # method generating them. Any other format is handled by the parent class.
    _dSLSAGenerators = {
        "(score (score score))": "_generateSLSACKSubtree1",
        "(score (phrase (score (pos form))))": "_generateSLSACKSubtree2",
        "(phrase_score (pos_score form))": "_generateSLSACKSubtree3",
        "(score (phrase_args (score (pos_args form))))": "_generateSLSACKSubtree4",
        "(phrase (score )(pos (score )(form )))": "_generateSLSACKSubtree5",
    }



    def generateConstKTree(self, pFormat = "(phrase (pos form))", pdOptions = None):
        '''
        Generates constituency kernel tree or subtree under a given node in
        the required format

        NOTE: The subtree here should not be confused with the notion of
        subtree as a tree kernel variation used in parallel to subset tree
        kernel.

        The default format is (phrase (pos form)) which is the pure constituency
        tree in bracketing (s-expression) format. In general, the format string
        is the representation of the adequately innermost treelet in the format.
        See each format-specific method for exact details.

        The format string is matched case-insensitively against the SLSA
        specific formats. For these, the subtree generated from the root of
        the constituency tree is wrapped as "( subtree)". Any other format
        is delegated to the parent class, to which pdOptions is not passed.

        pdOptions contains specific options to each format.

        The generated tree is stored in self.kTree and returned.
        '''

        vGeneratorName = self._dSLSAGenerators.get(pFormat.lower())

        if vGeneratorName is None:
            self.kTree = tk.ConstKTree.generateConstKTree(self, pFormat = pFormat)
        else:
            self.kTree = "( %s)" % getattr(self, vGeneratorName)(self.constTree.root, pdOptions = pdOptions)

        return self.kTree



    def _keepsATLabel(self, pNode, pdOptions):
        '''
        Tells whether the AT label of the given node must be kept as is

        This is the case when the keep-at option is present in pdOptions
        and equal to True and the syntactic tag of the node is AT
        (compared case-insensitively).
        '''

        # == True rather than truthiness on purpose: arbitrary truthy values
        # (e.g. non-empty strings) have never switched the option on
        return (pdOptions is not None
                and "keep-at" in pdOptions
                and pdOptions["keep-at"] == True
                and pNode.getSynTag().upper() == "AT")



    def _joinChildKSubtrees(self, pNode, pGenerator, pdOptions):
        '''
        Returns the concatenation of the subtrees pGenerator generates for
        the children of the given node
        '''

        return ''.join([pGenerator(n, pdOptions) for n in pNode.children])



    def _generateSLSACKSubtree1(self, pNode, pdOptions = None):
        '''
        Recursively generates the kernel subtree of the given node in
        bracketing representation in (score (score score)) format

        In this format, all the nodes represent the sentiment score
        associated with them: a terminal or pre-terminal node becomes
        (score score-of-its-first-terminal-node) and any other node becomes
        (score children).

        pdOptions contains the following options:
            - keep-at: if true will keep the AT label and wont use sentiment
                       scores to replace it (default false)
        '''

        if pNode.isPreTerminal() or pNode.isTerminal():
            return "(%s %s)" % (pNode.getSentScore(),
                                pNode.getTerminalNodes()[0].getSentScore())

        if self._keepsATLabel(pNode, pdOptions):
            return "(AT %s)" % self._joinChildKSubtrees(pNode, self._generateSLSACKSubtree1, pdOptions)

        return "(%s %s)" % (pNode.getSentScore(),
                            self._joinChildKSubtrees(pNode, self._generateSLSACKSubtree1, pdOptions))



    def _generateSLSACKSubtree2(self, pNode, pdOptions = None):
        '''
        Recursively generates the kernel subtree of the given node in
        bracketing representation in (score (phrase (score (pos form))))
        format

        In this format, every node keeps its syntactic tag and is placed
        under an extra node carrying its sentiment score: a pre-terminal
        becomes (score (tag terminal)) and any other node becomes
        (score (tag children)).

        pdOptions contains the following options:
            - keep-at: if true will keep the AT label as a bare (AT children)
                       node without a score node above it (default false)
        '''

        if pNode.isPreTerminal():
            return "(%s (%s %s))" % (pNode.getSentScore(),
                                     pNode.getSynTag(),
                                     pNode.getTerminal())

        if self._keepsATLabel(pNode, pdOptions):
            return "(AT %s)" % self._joinChildKSubtrees(pNode, self._generateSLSACKSubtree2, pdOptions)

        return "(%s (%s %s))" % (pNode.getSentScore(),
                                 pNode.getSynTag(),
                                 self._joinChildKSubtrees(pNode, self._generateSLSACKSubtree2, pdOptions))



    def _generateSLSACKSubtree3(self, pNode, pdOptions = None):
        '''
        Recursively generates the kernel subtree of the given node in
        bracketing representation in (phrase_score (pos_score form))
        format

        In this format, the sentiment score of every node is appended to
        its syntactic tag with an underscore: a pre-terminal becomes
        (tag_score terminal) and any other node becomes (tag_score children).

        pdOptions contains the following options:
            - keep-at: if true will keep the AT label as a bare (AT children)
                       node without a score suffix (default false)
        '''

        if pNode.isPreTerminal():
            return "(%s_%s %s)" % (pNode.getSynTag(),
                                   pNode.getSentScore(),
                                   pNode.getTerminal())

        if self._keepsATLabel(pNode, pdOptions):
            return "(AT %s)" % self._joinChildKSubtrees(pNode, self._generateSLSACKSubtree3, pdOptions)

        return "(%s_%s %s)" % (pNode.getSynTag(),
                               pNode.getSentScore(),
                               self._joinChildKSubtrees(pNode, self._generateSLSACKSubtree3, pdOptions))



    def _generateSLSACKSubtree4(self, pNode, pdOptions = None):
        '''
        Recursively generates the kernel subtree of the given node in
        bracketing representation in (score (phrase_args (score (pos_args form))))
        format

        This format is built like (score (phrase (score (pos form)))) except
        that the argument roles of every node are appended to its syntactic
        tag, all joined by underscores.

        pdOptions contains the following options:
            - keep-at: if true will keep the AT label as a bare (AT children)
                       node without a score node above it (default false)
        '''

        if pNode.isPreTerminal():
            return "(%s (%s %s))" % (pNode.getSentScore(),
                                     '_'.join([pNode.getSynTag()] + pNode.getArgRoles()),
                                     pNode.getTerminal())

        if self._keepsATLabel(pNode, pdOptions):
            return "(AT %s)" % self._joinChildKSubtrees(pNode, self._generateSLSACKSubtree4, pdOptions)

        return "(%s (%s %s))" % (pNode.getSentScore(),
                                 '_'.join([pNode.getSynTag()] + pNode.getArgRoles()),
                                 self._joinChildKSubtrees(pNode, self._generateSLSACKSubtree4, pdOptions))



    def _generateSLSACKSubtree5(self, pNode, pdOptions = None):
        '''
        Recursively generates the kernel subtree of the given node in
        bracketing representation in (phrase (score )(pos (score )(form )))
        format

        In this format, the sentiment score of a node is emitted as its
        first child, written "(score )": a pre-terminal becomes
        (tag (score )(terminal )) and any other node becomes
        (tag (score )children).

        pdOptions contains the following options:
            - keep-at: if true will keep the AT label as a bare (AT children)
                       node without a score child (default false)
        '''

        if pNode.isPreTerminal():
            return "(%s (%s )(%s ))" % (pNode.getSynTag(),
                                        pNode.getSentScore(),
                                        pNode.getTerminal())

        if self._keepsATLabel(pNode, pdOptions):
            return "(AT %s)" % self._joinChildKSubtrees(pNode, self._generateSLSACKSubtree5, pdOptions)

        return "(%s (%s )%s)" % (pNode.getSynTag(),
                                 pNode.getSentScore(),
                                 self._joinChildKSubtrees(pNode, self._generateSLSACKSubtree5, pdOptions))
-
-
-
class SLSACDKTree(tk.ConstDepKTree):
    '''
    Class for integrating SLSA dependency subtrees in SLSA constituency
    trees for tree kernel use
    '''


    def _createDepKTree(self):
        '''
        Creates and returns a new SLSADKTree built on the dependency tree
        of this object
        '''

        return SLSADKTree(pDepTree = self.depTree)



    def generateConstDepKTree(self, pFormat = "(phrase (pos (form (rel head))))", pdOptions = None):
        '''
        Generates the tree representation in the required format

        The generation is fully delegated to the parent class; the result
        is stored in self.kTree and returned.

        pdOptions contains the formatting options forwarded to the parent
        class. None (the default) stands for no options.

        See the parent class for more details.
        '''

        # A fresh dict per call: a shared {} default would be handed to the
        # parent implementation and could leak state between calls
        if pdOptions is None:
            pdOptions = {}

        self.kTree = tk.ConstDepKTree.generateConstDepKTree(self, pFormat = pFormat, pdOptions = pdOptions)

        return self.kTree
-
-
-
class SLSANGramKTree(tk.NGramKTree):
    '''
    N-gram kernel tree built over the tokens of an SLSA sentence
    '''


    def __init__(self, pSLSASent, pNodeContentType = "word", pdOptions = None):
        '''
        Constructor

        pNodeContentType (matched case-insensitively) selects what the
        nodes of the n-gram tree contain:
            - word: the tokens of the sentence
            - word-lower: the lower-cased tokens of the sentence
            - POS: the POS tags of the sentence
            - sentiment or polarity: the sentiment scores of the sentence
                                     converted to strings
            - wvp: the tokens of the sentence prefixed for word vector
                   similarity computation (e.g. with svmlight-tk-we)

        Any other value raises an Exception.

        pdOptions contains specific options to each content type:
            - prefix: the prefix used by wvp (default "___")
        '''

        vdOptions = {} if pdOptions is None else pdOptions
        vContentType = pNodeContentType.lower()

        # Select the node contents first; the parent constructor is then
        # called once with them
        if vContentType == "word":
            vlNodeContents = pSLSASent.getTokens()
        elif vContentType == "word-lower":
            vlNodeContents = [t.lower() for t in pSLSASent.getTokens()]
        elif vContentType == "pos":
            vlNodeContents = pSLSASent.getPOSTags()
        elif vContentType in ("sentiment", "polarity"):
            vlNodeContents = [str(s) for s in pSLSASent.getSentimentScores()]
        elif vContentType == "wvp":
            vPrefix = vdOptions["prefix"] if "prefix" in vdOptions else "___"
            vlNodeContents = [vPrefix + t for t in pSLSASent.getTokens()]
        else:
            raise Exception("%s is an invalid node content type!" % pNodeContentType)

        tk.NGramKTree.__init__(self, plTokens = vlNodeContents)

        self.slsaSent = pSLSASent



    def generateNGramKTree(self, pFormat = "binary", pdOptions = None):
        '''
        Generates and returns a tree representation of the sentence tokens

        The unary, bigram and binary formats are passed on to the parent
        class; for any other value the parent class is called without a
        format. The result is stored in self.kTree.

        pdOptions is accepted for format-specific options but is not used
        by any of the formats here.
        '''

        if pFormat in ("unary", "bigram", "binary"):
            self.kTree = tk.NGramKTree.generateNGramKTree(self, pFormat = pFormat)
        else:
            self.kTree = tk.NGramKTree.generateNGramKTree(self)

        return self.kTree
-
-
-
|