- #! /usr/bin/python
- """
- This module provides a class that extracts features for aspect-based sentiment analysis.
-
-
- Version 0.1 (02-Feb-2016 to 02-May-2016)
- - ABSAFE is added.
-
- """
- __author__ = 'rszk'
- from ml import fecp, data
- from utils import util
- import numpy as np
- class ABSAFE:
- '''
- Feature extractor class for aspect-based sentiment analysis
- '''
-
-
- def __init__(self, pABSADataset):
- '''
- Constructor
- '''
-
- # an absa.ABSASet object
- self.absaDS = pABSADataset
-
-
-
- ## Generating outputs #####
-
- def generateDataset(self, pConfig = '', pdPolarityNumMap = None, pflgGenRawDataset = False):
- '''
- Extracts features based on the specified configuration in YAML-compatible format and generates a dataset using
- them
-
- The generated dataset contains feature values post-processed based on the configuration. Optionally, a dataset
- with raw feature values can also be generated and returned. Both datasets are objects of type ml.data.Dataset.
-
- In addition to the datasets, a configuration string will be returned, updated based on the feature extraction
- and processing outcome. For example, if a feature is binarized or scaled, the binarization information (e.g. the
- binarized feature names) or the scaling information will be embedded into the configuration.
-
- The pConfig file contains the feature configuration. See fecp.FSConfig for the description of its format.
-
- pdPolarityNumMap is a dictionary which maps the polarity labels to numbers.
- '''
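- # Example usage (an illustrative sketch only; the variable names and the
- # polarity labels below are hypothetical, and pConfig is whatever
- # fecp.FSConfig accepts):
- #
- #   vFE = ABSAFE(vABSADataset)   # vABSADataset: an absa.ABSASet object
- #   vdPolMap = {"positive": 1, "negative": -1, "neutral": 0}
- #   vDS, vConf = vFE.generateDataset(pConfig = vConfig, pdPolarityNumMap = vdPolMap)
- #   vDS, vRawDS, vConf = vFE.generateDataset(pConfig = vConfig,
- #                                            pdPolarityNumMap = vdPolMap,
- #                                            pflgGenRawDataset = True)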
-
- # dataset with processed final feature values
- vDataset = data.Dataset()
- vDataset.loadTargets(self.extractATPolarities(pdPolarityNumMap))
-
- # dataset with raw feature values
- if pflgGenRawDataset:
- vRawDataset = data.Dataset()
- vRawDataset.loadTargets(self.extractATPolarities(pdPolarityNumMap))
-
- # feature extraction configuration processor
- vFSConfig = fecp.FSConfig(pConfig)
-
- for i, vFConfig in enumerate(vFSConfig.getFConfigs(), start = 1):
- # extracting feature values
- vFValues = self.extractFeature(vFConfig.name, vFConfig.featureParams)
-
- # creating feature based on the feature configuration and extracted values
- if vFConfig.normNominal or vFConfig.expandable:
- vlFeatures = vFSConfig.createFeature(vFConfig, vFValues)
- # adding features to the dataset
- vDataset.features.add(vlFeatures)
- vFConfig.setIndexes(i, len(vlFeatures))
- else:
- vFeature = vFSConfig.createFeature(vFConfig, vFValues)
- # adding feature to the dataset
- vDataset.features.append(vFeature)
- vFConfig.setIndexes(i)
-
- if pflgGenRawDataset:
- # generating raw-value feature
- vRawFeature = data.Feature(pName = vFConfig.name, pDataType = vFConfig.dataType)
- vRawFeature.loadValues(vFValues)
- # adding feature to the dataset
- vRawDataset.features.append(vRawFeature)
-
- # dumping configuration
- vConfig = vFSConfig.dumpConfig()
-
- if pflgGenRawDataset:
- return vDataset, vRawDataset, vConfig
- else:
- return vDataset, vConfig
-
-
-
- # extracting targets
-
- def extractATPolarities(self, pdPolarityNumMap):
- '''
- Extracts and returns aspect term polarities
-
- pdPolarityNumMap is a dictionary which maps the polarity labels to numbers.
- '''
-
- if pdPolarityNumMap is None:
- return [at.getPolarity() for at in self.absaDS.getAspectTerms()]
- else:
- return [pdPolarityNumMap[at.getPolarity()] for at in self.absaDS.getAspectTerms()]
-
-
-
- ## Feature extraction #####
-
- def extractFeature(self, pFeatureName, pdFeatureParams):
- '''
- Extracts values for the given feature name
-
- pFeatureName is the name specified in the configuration file. To allow multiple settings of a single
- feature type, an entry is created in the configuration file for each setting, with the feature name
- suffixed by a unique string. The setting-specific feature parameters then go under each entry. The
- reason is that the configuration file is in YAML format, which is dictionary-like and requires unique
- keys, so a feature name can appear only once (otherwise only one of its appearances would be
- considered). For example, n-grams can be extracted for different orders (n). To handle this, one entry
- per required order is put in the configuration and the feature name (e.g. n-gram) of each entry is
- suffixed with the order (e.g. n-gram-1). The specific setting then goes under each entry.
- '''
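- # Illustrative configuration snippet (a sketch; the exact YAML layout is
- # assumed to follow fecp.FSConfig): two settings of the sentence n-gram
- # feature are kept distinct by suffixing the feature name with the order,
- # and each entry carries its own parameters.
- #
- #   sent-ngram-1:
- #     order: 1
- #   sent-ngram-2:
- #     order: 2
- #   oe-avg-sent-score:
- #     filter: closest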
-
- if pFeatureName.startswith("at-surface") or pFeatureName.startswith("aspect-term-surface-form"):
- vlValues = self.extractATSurface(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("oe-surfaces") or pFeatureName.startswith("opinion-expression-surface-forms"):
- vlValues = self.extractOESurface(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("at-oe-const-path") or pFeatureName.startswith("at-oe-constituency-path"):
- vlValues = self.extractAT2OEConstPath(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("at-oe-dep-path") or pFeatureName.startswith("at-oe-dependency-path"):
- vlValues = self.extractAT2OEDepPath(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("oe-avg-sent-score") or pFeatureName.startswith("oe-average-sentiment-score"):
- vlValues = self.extractOEAvgSentScore(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("avg-sent-score") or pFeatureName.startswith("sentence-average-sentiment-score"):
- vlValues = self.extractAvgSentScore(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("at-avg-word-vector") or pFeatureName.startswith("at-averaged-word-vector"):
- vlValues = self.extractATAvgWV(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("oe-avg-word-vector") or pFeatureName.startswith("oe-averaged-word-vector"):
- vlValues = self.extractOEAvgWV(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("avg-word-vector") or pFeatureName.startswith("sentence-average-word-vector"):
- vlValues = self.extractSentAvgWV(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("at-ngram") or pFeatureName.startswith("aspect-term-ngrams"):
- vlValues = self.extractATNgrams(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("oe-ngram") or pFeatureName.startswith("opinion-expression-ngrams"):
- vlValues = self.extractOENgrams(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("sent-ngram") or pFeatureName.startswith("sentence-ngrams"):
- vlValues = self.extractSentNgrams(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("oe-word-vectors") or pFeatureName.startswith("oe-all-words-vectors"):
- vlValues = self.extractOEWV(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("at-word-vectors") or pFeatureName.startswith("at-all-words-vectors"):
- vlValues = self.extractATWV(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("at-oe-wv-dep-path") or pFeatureName.startswith("at-oe-word-vectors-in-dependency-path"):
- vlValues = self.extractAT2OEDepPathWV(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("top-at-vp-ngrams") or pFeatureName.startswith("topmost-at-verb-phrase-ngrams"):
- vlValues = self.extractTopATVPNgrams(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("polar-word-presence"):
- vlValues = self.extractPolarWordPresence(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("verb-form-in-at-heads") or pFeatureName.startswith("first-verb-form-in-at-head-chain"):
- vlValues = self.extractVerbFormInATHeads(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("sent-score-stat") or pFeatureName.startswith("sentence-sentiment-score-statistic"):
- vlValues = self.extractSentScoreStat(pFeatureName, pdFeatureParams)
- elif pFeatureName.startswith("pre-extracted") or pFeatureName.startswith("preex"):
- vlValues = self.extractPreex(pdFeatureParams)
- else:
- raise Exception("Feature %s is unknown! Check the spelling." % pFeatureName)
-
- return vlValues
-
-
-
- # pre-extracted features
-
- def extractPreex(self, pdFeatureParams):
- '''
- Extracts a previously-extracted feature from a file
-
- The file contains a column of values for the feature.
- '''
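- # Sketch of a configuration entry and source file for this feature
- # (hypothetical names; only the parameter keys are taken from the code below):
- #
- #   preex-sent-score:
- #     source: scores.txt
- #
- # where scores.txt holds one numeric value per line, e.g. "0.5", "-1.0", "0.0", ...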
-
- vlValues = []
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam.lower() in ["source", "file", "input"]:
- vlValues = [float(s) for s in open(vValue).read().strip().split('\n')]
- else:
- raise Exception("'%s' is not a valid feature parameter for pre-extracted features!" % vParam)
-
- return vlValues
-
-
-
- # surface features
-
- def extractATSurface(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the aspect term surfaces
- '''
-
- # character used to replace spaces to avoid confusion in data files
- vSpaceFiller = '_'
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["space-filler"]:
- if vValue != None:
- vSpaceFiller = vValue
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- vlValues = [at.getForm().replace(' ', vSpaceFiller) for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractOESurface(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the opinion expression surfaces
-
- Feature parameters include:
- filter: filters the opinion expressions in a sentence. The following are the possible values:
- closest: picks only the closest OE to the AT at hand
- '''
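- # Output shapes, as produced by the code below: with filter = closest, each
- # aspect term yields a single OE surface (or '' when its sentence has no OE);
- # without a filter, it yields the list of all OE surfaces in the sentence, e.g.
- #
- #   filter: closest  ->  ["very_good", "", "not_bad", ...]
- #   no filter        ->  [["very_good", "too_salty"], [], ["not_bad"], ...]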
-
- # character used to replace spaces to avoid confusion in data files
- vSpaceFiller = '_'
-
- vFilter = None
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["space-filler"]:
- if vValue != None:
- vSpaceFiller = vValue
- elif vParam in ["filter"]:
- if vValue.lower() == "closest":
- vFilter = "closest"
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- if vFilter == "closest":
- vlValues = []
-
- for vAT in self.absaDS.getAspectTerms():
- vCOE = self._extractClosestOE(vAT)
- if vCOE is None:
- vlValues.append('')
- else:
- vlValues.append(vCOE.getForm().replace(' ', vSpaceFiller))
- else:
- vlValues = [[oe.getForm().replace(' ', vSpaceFiller) for oe in at.sentence.getOEs()] for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractAT2OEConstPath(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the constituency path between the aspect term and the opinion expressions
-
- Feature parameters include:
- filter: filters the opinion expressions in a sentence. The following are the possible values:
- closest: picks only the closest OE to the AT at hand
- '''
-
- # character used to replace spaces to avoid confusion in data files
- vSpaceFiller = '_'
-
- vFilter = None
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["space-filler"]:
- if vValue != None:
- vSpaceFiller = vValue
- elif vParam in ["filter"]:
- if vValue.lower() == "closest":
- vFilter = "closest"
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- if vFilter == "closest":
- vlValues = []
-
- for vAT in self.absaDS.getAspectTerms():
- vCOE = self._extractClosestOE(vAT)
- if vCOE is None:
- vlValues.append('')
- else:
- vlValues.append(vAT.sentence.getConstTree().extractAT2OEPath(vAT, vCOE))
- else:
- vlValues = [[at.sentence.getConstTree().extractAT2OEPath(at, oe) for oe in at.sentence.getOEs()] for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractAT2OEDepPath(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the dependency path between the aspect term and the opinion expressions
-
- Feature parameters include:
- filter: filters the opinion expressions in a sentence. The following are the possible values:
- closest: picks only the closest OE to the AT at hand
- '''
-
- # character used to replace spaces to avoid confusion in data files
- vSpaceFiller = '_'
-
- vFilter = None
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["space-filler"]:
- if vValue != None:
- vSpaceFiller = vValue
- elif vParam in ["filter"]:
- if vValue.lower() == "closest":
- vFilter = "closest"
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- if vFilter == "closest":
- vlValues = []
-
- for vAT in self.absaDS.getAspectTerms():
- vCOE = self._extractClosestOE(vAT)
- if vCOE is None:
- vlValues.append('')
- else:
- vlValues.append(vAT.sentence.getDepTree().extractAT2OEDepRelPath(vAT, vCOE))
- else:
- vlValues = [[at.sentence.getDepTree().extractAT2OEDepRelPath(at, oe) for oe in at.sentence.getOEs()] for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractOEAvgSentScore(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the average sentiment score of the opinion expression(s) tokens
-
- Feature parameters include:
- filter: filters the opinion expressions in a sentence. The following are the possible values:
- closest: picks only the closest OE to the AT at hand
- '''
-
- vFilter = None
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["filter"]:
- if vValue.lower() == "closest":
- vFilter = "closest"
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- if vFilter == "closest":
- vlValues = []
-
- for vAT in self.absaDS.getAspectTerms():
- vCOE = self._extractClosestOE(vAT)
- if vCOE is None:
- vlValues.append(0.0)
- else:
- vlValues.append(vCOE.getAvgSentScore())
- else:
- vlValues = [at.sentence.getAvgOESentScore() for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractAvgSentScore(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the average sentiment score of the sentence
- '''
-
- vlValues = [at.sentence.getAvgSentScore() for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractATAvgWV(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the averaged word vectors of the aspect term tokens
-
- Feature parameters include:
- - wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
- supported by ml/wv.WordVector.
- '''
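- # Worked example (assuming wv.WordVector.getAvgVector returns the element-wise
- # mean of the tokens' vectors): for the aspect term "battery life" with
- # battery -> [0.2, 0.4] and life -> [0.0, 0.8], the feature value is [0.1, 0.6].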
-
- vWVFile = ''
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["wv-file", "word-vectors"]:
- vWVFile = vValue
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- # loading the word vectors
-
- from ml import wv
-
- vWV = wv.WordVector()
- vWV.load(pWVFilename = vWVFile)
-
- # extracting word vectors
-
- vlValues = [vWV.getAvgVector(at.getTokens()) for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractOEAvgWV(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the averaged word vectors of the opinion expression(s) tokens
-
- Feature parameters include:
- filter: filters the opinion expressions in a sentence. The following are the possible values:
- closest: picks only the closest OE to the AT at hand
- - wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
- supported by ml/wv.WordVector.
- '''
-
- vFilter = None
- vWVFile = ''
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["filter"]:
- if vValue.lower() == "closest":
- vFilter = "closest"
- elif vParam in ["wv-file", "word-vectors"]:
- vWVFile = vValue
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- # loading the word vectors
-
- from ml import wv
-
- vWV = wv.WordVector()
- vWV.load(pWVFilename = vWVFile)
-
- # extracting word vectors
-
- if vFilter == "closest":
- vlValues = []
-
- for vAT in self.absaDS.getAspectTerms():
- vCOE = self._extractClosestOE(vAT)
- if vCOE is None:
- vlValues.append([0.0] * vWV.dimension)
- else:
- vlValues.append(vWV.getAvgVector(vCOE.getTokens()))
- else:
- vlValues = [vWV.getAvgVector([t for oe in at.sentence.getOEs() for t in oe.getTokens()]) for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractSentAvgWV(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the averaged word vector of the sentence words
- '''
-
- vWVFile = ''
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["wv-file", "word-vectors"]:
- vWVFile = vValue
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- # loading the word vectors
-
- from ml import wv
-
- vWV = wv.WordVector()
- vWV.load(pWVFilename = vWVFile)
-
- vlValues = [vWV.getAvgVector(at.sentence.getTokens()) for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractATNgrams(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the aspect term n-grams
- '''
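- # Worked example (assuming nlp.extractNGrams returns the token n-grams as
- # sequences): for an aspect term with tokens ["battery", "life", "span"] and
- # order = 2, the joined values are ["battery-life", "life-span"].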
-
- vOrder = 1
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["order", 'n']:
- if vValue != None:
- vOrder = vValue
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- from nlp import nlp
-
- vlValues = [['-'.join(ngrams) for ngrams in nlp.extractNGrams(at.getTokens(), vOrder)] for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractOENgrams(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the opinion expression n-grams
- '''
-
- vFilter = None
- vOrder = 1
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["order", 'n']:
- if vValue != None:
- vOrder = vValue
- elif vParam in ["filter"]:
- if vValue.lower() == "closest":
- vFilter = "closest"
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
-
- from nlp import nlp
-
- vlValues = []
-
- if vFilter == "closest":
- for vAT in self.absaDS.getAspectTerms():
- vCOE = self._extractClosestOE(vAT)
- if vCOE is None:
- vlValues.append([])
- else:
- vlValues.append(['-'.join(ngrams) for ngrams in nlp.extractNGrams(vCOE.getTokens(), vOrder)])
- else:
- for vAT in self.absaDS.getAspectTerms():
- vlOENgramsOfAT = []
- for vOE in vAT.sentence.getOEs():
- vlOENgramsOfAT += ['-'.join(ngrams) for ngrams in nlp.extractNGrams(vOE.getTokens(), vOrder)]
-
- vlValues.append(vlOENgramsOfAT)
-
- return vlValues
-
-
-
- def extractSentNgrams(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the sentence n-grams
- '''
-
- vOrder = 1
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["order", 'n']:
- if vValue != None:
- vOrder = vValue
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- from nlp import nlp
-
- vlValues = [['-'.join(ngrams) for ngrams in nlp.extractNGrams(at.sentence.getTokens(), vOrder)] for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractOEWV(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the concatenated word vectors of the opinion expression(s) tokens
-
- Feature parameters include:
- filter: filters the opinion expressions in a sentence. The following are the possible values:
- closest: picks only the closest OE to the AT at hand
- - wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
- supported by ml/wv.WordVector.
- vector-count: number of words from the beginning of the opinion expression to extract vectors for. Since the
- lengths of opinion expressions vary, a fixed number is needed so that the feature can be used by the learning
- algorithm. For OEs shorter than this count, zero vectors will be padded at the end.
- '''
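- # Worked padding example, following the size computation below: with word
- # vectors of dimension 100 and vector-count = 3, the concatenated vector has
- # 300 elements; an OE with only 2 tokens contributes 200 elements and is
- # padded with 100 trailing zeros.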
-
- vFilter = None
- vWVFile = ''
- vWVCnt = 1
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["filter"]:
- if vValue.lower() == "closest":
- vFilter = "closest"
- elif vParam in ["wv-file", "word-vectors"]:
- vWVFile = vValue
- elif vParam in ["vector-count", "word-count"]:
- vWVCnt = int(vValue)
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
-
- # loading the word vectors
-
- from ml import wv
-
- vWV = wv.WordVector()
- vWV.load(pWVFilename = vWVFile)
-
- # calculating the concatenated vector size
-
- vConcatVectorSize = vWV.dimension * vWVCnt
-
- # extracting word vectors
-
- vlValues = []
-
- if vFilter == "closest":
- for vAT in self.absaDS.getAspectTerms():
- vCOE = self._extractClosestOE(vAT)
- if vCOE is None:
- vlValues.append([np.float64(0.0)] * vWV.dimension * vWVCnt)
- else:
- # extracting the concatenated vectors of the closest OE's tokens
- vlOEConcatVectors = [e for t in vCOE.getTokens()[:vWVCnt] for e in vWV.getVector(t)]
- # padding for short OEs
- vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
- if vPadSize > 0:
- vlOEConcatVectors += [np.float64(0.0)] * vPadSize
- vlValues.append(vlOEConcatVectors)
- else:
- for vAT in self.absaDS.getAspectTerms():
- # extracting concatenated vectors of all OEs in aspect term's sentence
- vlOEConcatVectors = []
- for vOE in vAT.sentence.getOEs():
- vlOEConcatVectors += [e for t in vOE.getTokens()[:vWVCnt] for e in vWV.getVector(t)]
- # padding for short OEs
- vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
- if vPadSize > 0:
- vlOEConcatVectors += [np.float64(0.0)] * vPadSize
- vlValues.append(vlOEConcatVectors)
-
- return vlValues
-
-
-
- def extractATWV(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the concatenated word vectors of the aspect term tokens
-
- Feature parameters include:
- - wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
- supported by ml/wv.WordVector.
- vector-count: number of words from the beginning of the aspect term to extract vectors for. Since the lengths
- of aspect terms vary, a fixed number is needed so that the feature can be used by the learning
- algorithm. For ATs shorter than this count, zero vectors will be padded at the end.
- '''
-
- vWVFile = ''
- vWVCnt = 1
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["wv-file", "word-vectors"]:
- vWVFile = vValue
- elif vParam in ["vector-count", "word-count"]:
- vWVCnt = int(vValue)
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
-
- # loading the word vectors
-
- from ml import wv
-
- vWV = wv.WordVector()
- vWV.load(pWVFilename = vWVFile)
-
- # calculating the concatenated vector size
-
- vConcatVectorSize = vWV.dimension * vWVCnt
-
- # extracting word vectors
-
- vlValues = []
-
- for vAT in self.absaDS.getAspectTerms():
- # extracting the concatenated vectors of the aspect term's tokens
- vlATConcatVectors = [e for t in vAT.getTokens()[:vWVCnt] for e in vWV.getVector(t)]
-
- # padding for short ATs
- vPadSize = vConcatVectorSize - len(vlATConcatVectors)
- if vPadSize > 0:
- vlATConcatVectors += [np.float64(0.0)] * vPadSize
- vlValues.append(vlATConcatVectors)
-
- return vlValues
-
-
-
- def extractAT2OEDepPathWV(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the word vectors of the words in the dependency path between the aspect term and the opinion expressions
-
- Feature parameters include:
- filter: filters the opinion expressions in a sentence. The following are the possible values:
- closest: picks only the closest OE to the AT at hand
- - wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
- supported by ml/wv.WordVector.
- vector-count: number of words from the beginning of the path to extract vectors for. Since the length of the
- path varies, a fixed number is needed so that the feature can be used by the learning algorithm. For
- paths shorter than this count, zero vectors will be padded at the end.
- '''
-
- vFilter = None
- vWVFile = ''
- vWVCnt = 1
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["filter"]:
- if vValue.lower() == "closest":
- vFilter = "closest"
- elif vParam in ["wv-file", "word-vectors"]:
- vWVFile = vValue
- elif vParam in ["vector-count", "word-count"]:
- vWVCnt = int(vValue)
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- # loading the word vectors
-
- from ml import wv
-
- vWV = wv.WordVector()
- vWV.load(pWVFilename = vWVFile)
-
- # calculating the concatenated vector size
-
- vConcatVectorSize = vWV.dimension * vWVCnt
-
- # extracting word vectors
-
- vlValues = []
-
- if vFilter == "closest":
- for vAT in self.absaDS.getAspectTerms():
- vCOE = self._extractClosestOE(vAT)
- if vCOE is None:
- vlValues.append([np.float64(0.0)] * vWV.dimension * vWVCnt)
- else:
- # extracting the concatenated vectors of the words on the dependency path to the closest OE
- vlOEConcatVectors = [e for t in vAT.sentence.getDepTree().extractAT2OEDepPathWords(vAT, vCOE)[:vWVCnt] for e in vWV.getVector(t)]
- # padding for short paths
- vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
- if vPadSize > 0:
- vlOEConcatVectors += [np.float64(0.0)] * vPadSize
- vlValues.append(vlOEConcatVectors)
- else:
- for vAT in self.absaDS.getAspectTerms():
- # extracting the concatenated vectors of the words on the dependency paths to all OEs in the aspect term's sentence
- vlOEConcatVectors = []
- for vOE in vAT.sentence.getOEs():
- vlOEConcatVectors += [e for t in vAT.sentence.getDepTree().extractAT2OEDepPathWords(vAT, vOE)[:vWVCnt] for e in vWV.getVector(t)]
- # padding for short paths
- vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
- if vPadSize > 0:
- vlOEConcatVectors += [np.float64(0.0)] * vPadSize
- vlValues.append(vlOEConcatVectors)
-
- return vlValues
-
-
-
- def extractTopATVPNgrams(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the n-grams under the topmost VP node dominating the aspect term
- '''
-
- vOrder = 1
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["order", 'n']:
- if vValue != None:
- vOrder = vValue
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- from nlp import nlp
-
- vlValues = []
-
- for vAT in self.absaDS.getAspectTerms():
- vTopVPNode = vAT.sentence.getConstTree().extractTopmostVP(vAT.getTokenSpan())
- if vTopVPNode is not None:
- vlValues.append(['-'.join(ngrams) for ngrams in nlp.extractNGrams(vTopVPNode.getTokens(), vOrder)])
- else:
- vlValues.append([])
-
- return vlValues
-
-
-
- def extractPolarWordPresence(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the presence of polar words in the sentence: the value is 1 if the sentence contains at least one
- word with a non-neutral sentiment score and 0 otherwise
-
- Feature parameters include:
- neutral (or neutral-score): the sentiment score to be treated as neutral (default 0)
- '''
-
- vNeutralScore = 0
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["neutral", 'neutral-score']:
- if vValue != None:
- vNeutralScore = vValue
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- vlValues = [0 if len(at.sentence.getPolarScores(pNeutralScore = vNeutralScore)) == 0 else 1 for at in self.absaDS.getAspectTerms()]
-
- return vlValues
-
-
-
- def extractVerbFormInATHeads(self, pFeatureName, pdFeatureParams):
- '''
- Extracts the word form of the first verb in the head chain of the aspect term
-
- The following are the parameters:
- - sign: if true, the negation of the verb will be reflected by inserting a - before the verb form. For example,
- "does not have" will be represented as -have.
- '''
-
- vflgSign = False
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam in ["sign", "negation"]:
- if vValue != None:
- vflgSign = vValue
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- vlValues = []
-
- for vAT in self.absaDS.getAspectTerms():
- vATNode = vAT.sentence.getDepTree().getNode(vAT.getTokenSpan()[0])
-
- # finding the verb in the head chain
- vVerbNode = None
- for vNode in vATNode.getHeadChain():
- if vNode.isVerb():
- vVerbNode = vNode
- break
-
- if vVerbNode is not None:
- if vflgSign and vVerbNode.isNegated():
- vlValues.append("-%s" % vVerbNode.getForm())
- else:
- vlValues.append(vVerbNode.getForm())
- else:
- vlValues.append('')
-
- return vlValues
-
-
-
- def extractSentScoreStat(self, pFeatureName, pdFeatureParams):
- '''
- Extracts a statistic about the sentiment scores of the sentence words
-
- Statistics include counts or percentages of positive/negative/neutral words.
- '''
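- # Worked example, consistent with the mapping illustrated below: for sentence
- # scores [0, +1, -1, 0, 0, -1] and polarity = negative, the count statistic is
- # 2 and the percent statistic is 2 / 6 = 0.33; for polarity = positive they
- # are 1 and 1 / 6 = 0.17.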
-
- vPolarity = "positive"
- vStat = "count"
-
- for vParam, vValue in pdFeatureParams.iteritems():
- if vParam == "stat":
- if vValue != None:
- vStat = vValue
- elif vParam in ["polarity", "class", "category"]:
- if vValue != None:
- vPolarity = vValue
- else:
- raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
-
- # 1. creating a list of sentiment scores per instance where the target sentiment is replaced by 1 and others
- # by 0, e.g. in [0, +1, -1, 0, 0, -1] with target being negative polarity the outcome is [0, 0, 1, 0, 0, 1]
- if vPolarity.lower().startswith("pos"):
- vlMap = [[1 if s > 0 else 0 for s in at.sentence.getSentimentScores()] for at in self.absaDS.getAspectTerms()]
- elif vPolarity.lower().startswith("neg"):
- vlMap = [[1 if s < 0 else 0 for s in at.sentence.getSentimentScores()] for at in self.absaDS.getAspectTerms()]
- elif vPolarity.lower().startswith("neu"):
- vlMap = [[1 if s == 0 else 0 for s in at.sentence.getSentimentScores()] for at in self.absaDS.getAspectTerms()]
-
- # 2. creating the statistic
- if vStat.lower().startswith("count"):
- vlValues = [sum(m) for m in vlMap]
- elif vStat.lower().startswith("percent"):
- vlValues = [sum(m) * 1.0 / len(m) for m in vlMap]
- else:
- raise Exception("'%s' is not a valid statistic for %s!" % (vStat, pFeatureName))
-
-
- return vlValues
-
-
-
- # auxiliary methods
-
- def _extractClosestOE(self, pAspectTerm):
- '''
- Extracts and returns the opinion expression closest to the given aspect term in the sentence, in terms of token
- distance
- '''
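- # Sketch of the behaviour (assuming util.getSpanDistance returns a signed
- # token distance): for distances {oe1: -4, oe2: 2}, oe2 is returned because
- # |2| < |-4|; when the sentence contains no opinion expression, None is
- # returned.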
-
- vdDists = {oe: util.getSpanDistance(pAspectTerm.getTokenSpan(), oe.getTokenSpan()) for oe in pAspectTerm.sentence.getOEs()}
-
- if vdDists == {}:
- return None
- else:
- return min(vdDists, key = lambda x: abs(vdDists[x]))
-
-
-
|