__author__ = 'rszk'
#! /usr/bin/python

"""
	This module provides a class to extract features for aspect-based sentiment analysis.
	
	
	Version 0.1                                                                             (02-Feb-2016 to 02-May-2016)
	- ABSAFE is added.
	
"""


from ml import fecp, data
from utils import util
import numpy as np


class ABSAFE:
	'''
	Feature extractor class for aspect-based sentiment analysis
	'''
	
	
	def __init__(self, pABSADataset):
		'''
		Constructor
		'''
		
		# an absa.ABSASet object
		self.absaDS = pABSADataset
		
	
	## Generating outputs #####
	
	def generateDataset(self, pConfig = '', pdPolarityNumMap = None, pflgGenRawDataset = False):
		'''
		Extracts the features listed in the given configuration and builds a dataset out of them
		
		pConfig is the feature configuration in YAML-compatible format; it is handed to fecp.FSConfig (see that
		class for the description of the format).
		
		pdPolarityNumMap is a dictionary which maps the polarity labels to numbers. When it is None, the polarity
		labels themselves are loaded as targets.
		
		If pflgGenRawDataset is set, a second dataset holding the raw (unprocessed) feature values is generated too.
		Both datasets are objects of type ml.data.Dataset.
		
		Returns (dataset, config), or (dataset, raw dataset, config) when pflgGenRawDataset is set. The returned
		config is the configuration dumped by the configuration processor after all features have been created, so
		it reflects whatever the processor recorded during feature creation (e.g. binarization or scaling
		information).
		'''
		
		# the dataset holding the processed final feature values
		vProcessedDS = data.Dataset()
		vProcessedDS.loadTargets(self.extractATPolarities(pdPolarityNumMap))
		
		# the optional dataset holding the raw feature values (None when not requested)
		vRawDS = None
		if pflgGenRawDataset:
			vRawDS = data.Dataset()
			vRawDS.loadTargets(self.extractATPolarities(pdPolarityNumMap))
		
		# feature extraction configuration processor
		vConfigProcessor = fecp.FSConfig(pConfig)
		
		vPosition = 0
		for vFeatureConfig in vConfigProcessor.getFConfigs():
			# 1-based position of the feature configuration
			vPosition += 1
			
			vlRawValues = self.extractFeature(vFeatureConfig.name, vFeatureConfig.featureParams)
			
			# decided before the feature is created, exactly as the branches were ordered originally
			vflgMultiple = vFeatureConfig.normNominal or vFeatureConfig.expandable
			vCreated = vConfigProcessor.createFeature(vFeatureConfig, vlRawValues)
			
			if vflgMultiple:
				# a list of features is created in this case
				vProcessedDS.features.add(vCreated)
				vFeatureConfig.setIndexes(vPosition, len(vCreated))
			else:
				# a single feature is created in this case
				vProcessedDS.features.append(vCreated)
				vFeatureConfig.setIndexes(vPosition)
			
			if vRawDS is not None:
				# the same values, untouched, wrapped into a feature of their own
				vRawFeature = data.Feature(pName = vFeatureConfig.name, pDataType = vFeatureConfig.dataType)
				vRawFeature.loadValues(vlRawValues)
				vRawDS.features.append(vRawFeature)
		
		# dumping the configuration updated during feature creation
		vDumpedConfig = vConfigProcessor.dumpConfig()
		
		if vRawDS is not None:
			return vProcessedDS, vRawDS, vDumpedConfig
		
		return vProcessedDS, vDumpedConfig
		
	
	# extracting targets
	
	def extractATPolarities(self, pdPolarityNumMap):
		'''
		Extracts and returns aspect term polarities 
		
		pdPolarityNumMap is a dictionary which maps the polarity labels to numbers.
		'''
		
		if pdPolarityNumMap is None:
			return [at.getPolarity() for at in self.absaDS.getAspectTerms()]
		else:
			return [pdPolarityNumMap[at.getPolarity()] for at in self.absaDS.getAspectTerms()]
		
	
	## Feature extraction #####
	
	def extractFeature(self, pFeatureName, pdFeatureParams):
		'''
		Extracts values for the given feature name 
		
		pFeatureName is the one specified in configuration file. In order
		to be able to have multiple settings of a single feature type, an 
		entry for each setting is created in the configuration file with 
		the feature name suffixed by any unique string. The specific feature
		parameters then goes under each entry. The reason is that configuration
		file is in YAML format which is dictionary-like and requires unique 
		keys. So, the feature name can appear only once or only one of the
		appearances would be considered. For example, an n-gram can	be extracted 
		for different orders (n). To handle this, one entry per required order
		is put in the configuration and the feature name (e.g. n-gram) of each
		entry is suffixed with the order (e.g. n-gram-1). The specific setting
		then goes under each entry.
		'''
		
		if pFeatureName.startswith("at-surface") or pFeatureName.startswith("aspect-term-surface-form"):
			vlValues = self.extractATSurface(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("oe-surfaces") or pFeatureName.startswith("opinion-expression-surface-forms"):
			vlValues = self.extractOESurface(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("at-oe-const-path") or pFeatureName.startswith("at-oe-constituency-path"):
			vlValues = self.extractAT2OEConstPath(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("at-oe-dep-path") or pFeatureName.startswith("at-oe-dependency-path"):
			vlValues = self.extractAT2OEDepPath(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("oe-avg-sent-score") or pFeatureName.startswith("oe-average-sentiment-score"):
			vlValues = self.extractOEAvgSentScore(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("avg-sent-score") or pFeatureName.startswith("sentence-average-sentiment-score"):
			vlValues = self.extractAvgSentScore(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("at-avg-word-vector") or pFeatureName.startswith("at-averaged-word-vector"):
			vlValues = self.extractATAvgWV(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("oe-avg-word-vector") or pFeatureName.startswith("oe-averaged-word-vector"):
			vlValues = self.extractOEAvgWV(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("avg-word-vector") or pFeatureName.startswith("sentence-average-word-vector"):
			vlValues = self.extractSentAvgWV(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("at-ngram") or pFeatureName.startswith("aspect-term-ngrams"):
			vlValues = self.extractATNgrams(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("oe-ngram") or pFeatureName.startswith("opinion-expression-ngrams"):
			vlValues = self.extractOENgrams(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("sent-ngram") or pFeatureName.startswith("sentence-ngrams"):
			vlValues = self.extractSentNgrams(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("oe-word-vectors") or pFeatureName.startswith("oe-all-words-vectors"):
			vlValues = self.extractOEWV(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("at-word-vectors") or pFeatureName.startswith("at-all-words-vectors"):
			vlValues = self.extractATWV(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("at-oe-wv-dep-path") or pFeatureName.startswith("at-oe-word-vectors-in-dependency-path"):
			vlValues = self.extractAT2OEDepPathWV(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("top-at-vp-ngrams") or pFeatureName.startswith("topmost-at-verb-phrase-ngrams"):
			vlValues = self.extractTopATVPNgrams(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("polar-word-presence"):
			vlValues = self.extractPolarWordPresence(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("verb-form-in-at-heads") or pFeatureName.startswith("first-verb-form-in-at-head-chain"):
			vlValues = self.extractVerbFormInATHeads(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("sent-score-stat") or pFeatureName.startswith("sentence-sentiment-score-statistic"):
			vlValues = self.extractSentScoreStat(pFeatureName, pdFeatureParams)
		elif pFeatureName.startswith("pre-extracted") or pFeatureName.startswith("preex"):
			vlValues = self.extractPreex(pdFeatureParams)
		else:
			raise Exception("Feature %s is unknown! Check the spelling." % pFeatureName)
		
		return vlValues
		
	
	# pre-extracted features
	
	def extractPreex(self, pdFeatureParams):
		'''
		Extracts a previously-extracted feature from a file
		
		The file contains a column of values for the feature.
		'''
		
		vlValues = []
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam.lower() in ["source", "file", "input"]:
				vlValues = [float(s) for s in open(vValue).read().strip().split('\n')]
			else:
				raise Exception("'%s' is not a valid feature parameter for pre-extracted features!" % vParam)
		
		return vlValues
		
	
	# surface features
	
	def extractATSurface(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the aspect term surfaces
		'''
		
		# character used to replace spaces to avoid confusion in data files
		vSpaceFiller = '_'
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["space-filler"]:
				if vValue != None:
					vSpaceFiller = vValue
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		vlValues = [at.getForm().replace(' ', vSpaceFiller) for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractOESurface(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the opinion expression surfaces
		
		Feature parameters include:
		- filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
		  - closest: picks only closest OE to the AT in hand
		'''
		
		# character used to replace spaces to avoid confusion in data files
		vSpaceFiller = '_'
		
		vFilter = None
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["space-filler"]:
				if vValue != None:
					vSpaceFiller = vValue
			elif vParam in ["filter"]:
				if vValue.lower() == "closest":
					vFilter = "closest"
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		if vFilter == "closest":
			vlValues = []
			
			for vAT in self.absaDS.getAspectTerms():
				vCOE = self._extractClosestOE(vAT)
				if vCOE is None:
					vlValues.append('')
				else:
					vlValues.append(vCOE.getForm().replace(' ', vSpaceFiller))
		else:
			vlValues = [[oe.getForm().replace(' ', vSpaceFiller) for oe in at.sentence.getOEs()] for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractAT2OEConstPath(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the constituency path between the aspect term and objective expressions
		
		Feature parameters include:
		- filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
		  - closest: picks only closest OE to the AT in hand
		'''
		
		# character used to replace spaces to avoid confusion in data files
		vSpaceFiller = '_'
		
		vFilter = None
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["space-filler"]:
				if vValue != None:
					vSpaceFiller = vValue
			elif vParam in ["filter"]:
				if vValue.lower() == "closest":
					vFilter = "closest"
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		if vFilter == "closest":
			vlValues = []
			
			for vAT in self.absaDS.getAspectTerms():
				vCOE = self._extractClosestOE(vAT)
				if vCOE is None:
					vlValues.append('')
				else:
					vlValues.append(vAT.sentence.getConstTree().extractAT2OEPath(vAT, vCOE))
		else:
			vlValues = [[at.sentence.getConstTree().extractAT2OEPath(at, oe) for oe in at.sentence.getOEs()] for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractAT2OEDepPath(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the dependency path between the aspect term and objective expressions
		
		Feature parameters include:
		- filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
		  - closest: picks only closest OE to the AT in hand
		'''
		
		# character used to replace spaces to avoid confusion in data files
		vSpaceFiller = '_'
		
		vFilter = None
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["space-filler"]:
				if vValue != None:
					vSpaceFiller = vValue
			elif vParam in ["filter"]:
				if vValue.lower() == "closest":
					vFilter = "closest"
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		if vFilter == "closest":
			vlValues = []
			
			for vAT in self.absaDS.getAspectTerms():
				vCOE = self._extractClosestOE(vAT)
				if vCOE is None:
					vlValues.append('')
				else:
					vlValues.append(vAT.sentence.getDepTree().extractAT2OEDepRelPath(vAT, vCOE))
		else:
			vlValues = [[at.sentence.getDepTree().extractAT2OEDepRelPath(at, oe) for oe in at.sentence.getOEs()] for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractOEAvgSentScore(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the average sentiment score of the opinion expression(s) tokens
		
		Feature parameters include:
		- filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
		  - closest: picks only closest OE to the AT in hand
		'''
		
		vFilter = None
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["filter"]:
				if vValue.lower() == "closest":
					vFilter = "closest"
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		if vFilter == "closest":
			vlValues = []
			
			for vAT in self.absaDS.getAspectTerms():
				vCOE = self._extractClosestOE(vAT)
				if vCOE is None:
					vlValues.append(0.0)
				else:
					vlValues.append(vCOE.getAvgSentScore())
		else:
			vlValues = [at.sentence.getAvgOESentScore() for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractAvgSentScore(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the average sentiment score of the sentence
		'''
		
		vlValues = [at.sentence.getAvgSentScore() for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractATAvgWV(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the averaged word vectors of the aspect term tokens
		
		Feature parameters include:
		- wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
		           supported by ml/wv.WordVector.
		'''
		
		vWVFile = ''
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["wv-file", "word-vectors"]:
				vWVFile = vValue
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		# loading the word vectors
		
		from ml import wv
		
		vWV = wv.WordVector()
		vWV.load(pWVFilename = vWVFile)
		
		# extracting word vectors
		
		vlValues = [vWV.getAvgVector(at.getTokens()) for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractOEAvgWV(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the averaged word vectors of the opinion expression(s) tokens
		
		Feature parameters include:
		- filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
		  - closest: picks only closest OE to the AT in hand
		- wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
		           supported by ml/wv.WordVector.
		'''
		
		vFilter = None
		vWVFile = ''
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["filter"]:
				if vValue.lower() == "closest":
					vFilter = "closest"
			elif vParam in ["wv-file", "word-vectors"]:
				vWVFile = vValue
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		# loading the word vectors
		
		from ml import wv
		
		vWV = wv.WordVector()
		vWV.load(pWVFilename = vWVFile)
		
		# extracting word vectors
		
		if vFilter == "closest":
			vlValues = []
			
			for vAT in self.absaDS.getAspectTerms():
				vCOE = self._extractClosestOE(vAT)
				if vCOE is None:
					vlValues.append([0.0] * vWV.dimension)
				else:
					vlValues.append(vWV.getAvgVector(vCOE.getTokens()))
		else:
			vlValues = [vWV.getAvgVector([t for oe in at.sentence.getOEs() for t in oe.getTokens()]) for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractSentAvgWV(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the averaged word vector of the sentence words
		'''
		
		vWVFile = ''
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["wv-file", "word-vectors"]:
				vWVFile = vValue
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		# loading the word vectors
		
		from ml import wv
		
		vWV = wv.WordVector()
		vWV.load(pWVFilename = vWVFile)
		
		vlValues = [vWV.getAvgVector(at.sentence.getTokens()) for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractATNgrams(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the aspect term n-grams
		'''
		
		vOrder = 1
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["order", 'n']:
				if vValue != None:
					vOrder = vValue
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		from nlp import nlp
		
		vlValues = [['-'.join(ngrams) for ngrams in nlp.extractNGrams(at.getTokens(), vOrder)] for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractOENgrams(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the opinion expression  n-grams
		'''
		
		vFilter = None
		vOrder = 1
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["order", 'n']:
				if vValue != None:
					vOrder = vValue
			elif vParam in ["filter"]:
				if vValue.lower() == "closest":
					vFilter = "closest"
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		
		from nlp import nlp
		
		vlValues = []
		
		if vFilter == "closest":
			for vAT in self.absaDS.getAspectTerms():
				vCOE = self._extractClosestOE(vAT)
				if vCOE is None:
					vlValues.append([])
				else:
					vlValues.append(['-'.join(ngrams) for ngrams in nlp.extractNGrams(vCOE.getTokens(), vOrder)])
		else:
			for vAT in self.absaDS.getAspectTerms():
				vlOENgramsOfAT = []
				for vOE in vAT.sentence.getOEs():
					vlOENgramsOfAT += ['-'.join(ngrams) for ngrams in nlp.extractNGrams(vOE.getTokens(), vOrder)]
				
				vlValues.append(vlOENgramsOfAT)
		
		return vlValues
		
	
	def extractSentNgrams(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the sentence n-grams
		'''
		
		vOrder = 1
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["order", 'n']:
				if vValue != None:
					vOrder = vValue
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		from nlp import nlp
		
		vlValues = [['-'.join(ngrams) for ngrams in nlp.extractNGrams(at.sentence.getTokens(), vOrder)] for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractOEWV(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the concatenated word vectors of the opinion expression(s) tokens
		
		Feature parameters include:
		- filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
		  - closest: picks only closest OE to the AT in hand
		- wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
		           supported by ml/wv.WordVector.
		- vector-count: number of words from the beginning of opinion expression to extract the vector for. Since the length
		                of opinion expressions are varying, a fixed number should be considered to be able to use in learning
		                algorithm. For shorter OE than this count, zero vectors will be padded at the end. 
		'''
		
		vFilter = None
		vWVFile = ''
		vWVCnt = 1
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["filter"]:
				if vValue.lower() == "closest":
					vFilter = "closest"
			elif vParam in ["wv-file", "word-vectors"]:
				vWVFile = vValue
			elif vParam in ["vector-count", "word-count"]:
				vWVCnt = int(vValue)
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		
		# loading the word vectors
		
		from ml import wv
		
		vWV = wv.WordVector()
		vWV.load(pWVFilename = vWVFile)
		
		# calculating the concatenated vector size
		
		vConcatVectorSize = vWV.dimension * vWVCnt
		
		# extracting word vectors
		
		vlValues = []
		
		if vFilter == "closest":
			for vAT in self.absaDS.getAspectTerms():
				vCOE = self._extractClosestOE(vAT)
				if vCOE is None:
					vlValues.append([np.float64(0.0)] * vWV.dimension * vWVCnt)
				else:
					# extracting concatenated vectors of all OEs in aspect term's sentence
					vlOEConcatVectors = [e for t in vCOE.getTokens()[:vWVCnt] for e in vWV.getVector(t)]
					# padding for short OEs
					vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
					if vPadSize > 0:
						vlOEConcatVectors += [np.float64(0.0)] * vPadSize
					vlValues.append(vlOEConcatVectors)
		else:
			for vAT in self.absaDS.getAspectTerms():
				# extracting concatenated vectors of all OEs in aspect term's sentence
				vlOEConcatVectors = []
				for vOE in vAT.sentence.getOEs():
					vlOEConcatVectors += [e for t in vOE.getTokens()[:vWVCnt] for e in vWV.getVector(t)]
				# padding for short OEs
				vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
				if vPadSize > 0:
					vlOEConcatVectors += [np.float64(0.0)] * vPadSize
				vlValues.append(vlOEConcatVectors)
		
		return vlValues
		
	
	def extractATWV(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the concatenated word vectors of the aspect term tokens
		
		Feature parameters include:
		- wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
		           supported by ml/wv.WordVector.
		- vector-count: number of words from the beginning of aspect term to extract the vector for. Since the length
		                of aspect terms are varying, a fixed number should be considered to be able to use in learning
		                algorithm. For shorter AT than this count, zero vectors will be padded at the end. 
		'''
		
		vWVFile = ''
		vWVCnt = 1
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["wv-file", "word-vectors"]:
				vWVFile = vValue
			elif vParam in ["vector-count", "word-count"]:
				vWVCnt = int(vValue)
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		
		# loading the word vectors
		
		from ml import wv
		
		vWV = wv.WordVector()
		vWV.load(pWVFilename = vWVFile)
		
		# calculating the concatenated vector size
		
		vConcatVectorSize = vWV.dimension * vWVCnt
		
		# extracting word vectors
		
		vlValues = []
		
		for vAT in self.absaDS.getAspectTerms():
			# extracting concatenated vectors of all ATs in aspect term's sentence
			vlATConcatVectors = [e for t in vAT.getTokens()[:vWVCnt] for e in vWV.getVector(t)]
			
			# padding for short ATs
			vPadSize = vConcatVectorSize - len(vlATConcatVectors)
			if vPadSize > 0:
				vlATConcatVectors += [np.float64(0.0)] * vPadSize
			vlValues.append(vlATConcatVectors)
		
		return vlValues
		
	
	def extractAT2OEDepPathWV(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the word vectors of the words in the dependency path between the aspect term and objective expressions
		
		Feature parameters include:
		- filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
		  - closest: picks only closest OE to the AT in hand
		- wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
		           supported by ml/wv.WordVector.
		- vector-count: number of words from the beginning of the path to extract the vector for. Since the length of the
		                path is varying, a fixed number should be considered to be able to use in learning algorithm. For
		                shorter path than this count, zero vectors will be padded at the end. 
		'''
		
		vFilter = None
		vWVFile = ''
		vWVCnt = 1
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["filter"]:
				if vValue.lower() == "closest":
					vFilter = "closest"
			elif vParam in ["wv-file", "word-vectors"]:
				vWVFile = vValue
			elif vParam in ["vector-count", "word-count"]:
				vWVCnt = int(vValue)
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		# loading the word vectors
		
		from ml import wv
		
		vWV = wv.WordVector()
		vWV.load(pWVFilename = vWVFile)
		
		# calculating the concatenated vector size
		
		vConcatVectorSize = vWV.dimension * vWVCnt
		
		# extracting word vectors
		
		vlValues = []
		
		if vFilter == "closest":
			for vAT in self.absaDS.getAspectTerms():
				vCOE = self._extractClosestOE(vAT)
				if vCOE is None:
					vlValues.append([np.float64(0.0)] * vWV.dimension * vWVCnt)
				else:
					# extracting concatenated vectors of all OEs in aspect term's sentence
					vlOEConcatVectors = [e for t in vAT.sentence.getDepTree().extractAT2OEDepPathWords(vAT, vCOE)[:vWVCnt] for e in vWV.getVector(t)]
					# padding for short OEs
					vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
					if vPadSize > 0:
						vlOEConcatVectors += [np.float64(0.0)] * vPadSize
					vlValues.append(vlOEConcatVectors)
		else:
			for vAT in self.absaDS.getAspectTerms():
				# extracting concatenated vectors of all OEs in aspect term's sentence
				vlOEConcatVectors = []
				for vOE in vAT.sentence.getOEs():
					vlOEConcatVectors += [e for t in vAT.sentence.getDepTree().extractAT2OEDepPathWords(vAT, vOE)[:vWVCnt] for e in vWV.getVector(t)]
				# padding for short OEs
				vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
				if vPadSize > 0:
					vlOEConcatVectors += [np.float64(0.0)] * vPadSize
				vlValues.append(vlOEConcatVectors)
		
		return vlValues
		
	
	def extractTopATVPNgrams(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the n-grams under the topmost VP node dominating the aspect term
		'''
		
		vOrder = 1
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["order", 'n']:
				if vValue != None:
					vOrder = vValue
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		from nlp import nlp
		
		vlValues = []
		
		for vAT in self.absaDS.getAspectTerms():
			vTopVPNode = vAT.sentence.getConstTree().extractTopmostVP(vAT.getTokenSpan())
			if vTopVPNode is not None:
				vlValues.append(['-'.join(ngrams) for ngrams in nlp.extractNGrams(vTopVPNode.getTokens(), vOrder)])
			else:
				vlValues.append([])
		
		return vlValues
		
	
	def extractPolarWordPresence(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the presence of 
		'''
		
		vNeutralScore = 0
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["neutral", 'neutral-score']:
				if vValue != None:
					vNeutralScore = vValue
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		vlValues = [0 if len(at.sentence.getPolarScores(pNeutralScore = vNeutralScore)) == 0 else 1 for at in self.absaDS.getAspectTerms()]
		
		return vlValues
		
	
	def extractVerbFormInATHeads(self, pFeatureName, pdFeatureParams):
		'''
		Extracts the word form of the first verb in the head chain of the aspect term
		
		The following are the parameters:
		- sign: if true, the negation of the verb will be reflected by inserting a - before the verb form. For example,
		"does not have" will be represented as -have.
		'''
		
		vflgSign = False
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam in ["sign", "negation"]:
				if vValue != None:
					vflgSign = vValue
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		vlValues = []
		
		for vAT in self.absaDS.getAspectTerms():
			vATNode = vAT.sentence.getDepTree().getNode(vAT.getTokenSpan()[0])
			
			# finding the verb in the head chain
			vVerbNode = None
			for vNode in vATNode.getHeadChain():
				if vNode.isVerb():
					vVerbNode = vNode
					break
			
			if vVerbNode is not None:
				if vVerbNode.isNegated():
					vlValues.append("-%s" % vVerbNode.getForm())
				else:
					vlValues.append(vVerbNode.getForm())
			else:
				vlValues.append('')
		
		return vlValues
		
	
	def extractSentScoreStat(self, pFeatureName, pdFeatureParams):
		'''
		Extracts statistic about sentiment score of the sentence
		
		Statistics include counts or percentages of positive/negative/neutral words. 
		'''
		
		vPolarity = "positive"
		vStat = "count"
		
		for vParam, vValue in pdFeatureParams.iteritems():
			if vParam == "stat":
				if vValue != None:
					vStat = vValue
			elif vParam in ["polarity", "class", "category"]:
				if vValue != None:
					vPolarity = vValue
			else:
				raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
		
		## 1. creating a list of sentiment scores per instance where the target sentiment is replaced by 1 and others
		## by 0, e.g. in [0, +1, -1, 0, 0, -1] with target being negative polarity the outcome is [0, 0, 1, 0, 0, 1] 
		if vPolarity.lower().startswith("pos"):
			vlMap = [[1 if s > 0 else 0 for s in at.sentence.getSentimentScores()] for at in self.absaDS.getAspectTerms()]
		elif vPolarity.lower().startswith("neg"):
			vlMap = [[1 if s < 0 else 0 for s in at.sentence.getSentimentScores()] for at in self.absaDS.getAspectTerms()]
		elif vPolarity.lower().startswith("neu"):
			vlMap = [[1 if s == 0 else 0 for s in at.sentence.getSentimentScores()] for at in self.absaDS.getAspectTerms()]
		
		# 2. creating the statistic
		if vStat.lower().startswith("count"):
			vlValues = [sum(m) for m in vlMap]
		elif vStat.lower().startswith("percent"):
			vlValues = [sum(m) * 1.0 / len(m) for m in vlMap]
		else:
			raise Exception("'%s' is not a valid statistic for %s!" % (vStat, pFeatureName))
		
		
		return vlValues
		
	
	# auxiliary methods
	
	def _extractClosestOE(self, pAspectTerm):
		'''
		Extracts and returns the closest opinion expression to the given aspect term in the sentence in terms of token
		number
		'''
		
		vdDists = {oe: util.getSpanDistance(pAspectTerm.getTokenSpan(), oe.getTokenSpan()) for oe in pAspectTerm.sentence.getOEs()}
		
		if vdDists == {}:
			return None
		else:
			return min(vdDists, key = lambda x: abs(vdDists[x]))