#! /usr/bin/python
# -*- coding: utf-8 -*-

"""	
	This module defines classes for Aspect Based Sentiment Analysis (ABSA).
	
	NOTE: bugs identified by PyCharm should be fixed (e.g. _attachATSuffix() 
	      in ABSADNode). It seems they have never been tested. 
	
	
	Version 3.0											   (30-Apr-2018)
	- getTokens() and getOverallPolarity() are added to ABSAContext.
	
	Version 2.9											   (03-Jan-2018 to 05-Jan-2018)
	- mergeWith() is added to ABSASet.
	- pickle() and loadFromPickle() are added to ABSASet.
	- getOEsIO() is added to AspectTerm.
	
	Version 2.8											   (17-Jul-2017 to 18-Jul-2017)
	- ABSAContext is added to support SemEval 2016 data. The code is refactored so that
	  it still works seamlessly with 2014 data. 
	
	Version 2.7											   (17-May-2017)
	- Unary tree kernels in ABSANGramKTree is upgraded.
	
	Version 2.6											   (24-Mar-2017)
	- AspectTerm.getSentenceMarking() is added. 
	
	Version 2.5											   (23-Jan-2017)
	- Bug in ABSASet.loadBratOE() fixed to account for cases where SE is annotated
	  from/to the middle of a token due to tokenization problem.
	- ABSASet.oeToBIO() is upgraded to include POS tags in the output.
	
	Version 2.4								(01-Dec-2016 to 08-Dec-2016)
	- oeToBIO() is added to ABSASent.
	
	Version 2.3								(11-Oct-2016 to 13-Oct-2016)
	- loadBratOEs() is added to ABSASent to load opinion/sentiment expressions
	  annotated in Brat annotator.
	- oes is added to AspectTerm to keep opinion/sentiment expressions.
	- vABSASent.oe is renamed to oes.
	- Type is added to OpinionExpression.
	
	Version 2.2								(22-Jun-2016)
	- writeBratInput() is added to ABSASet.
	
	Version 2.1								(02-May-2016 to 05-May-2016)
	- ABSANGramKTree is upgraded to use customized unigram auxiliary nodes
	  instead of X for binary format.
	- extractAT2RootPathTree() is added to ABSADTree.
	- ABSANGramKTree is upgraded to produce lower case word tree kernels.
	- New format is added to ABSANGramKTree kernels for word vector similarity.
	
	Version 2.0								(12-Apr-2016 to 26-Apr-2016)
	- ABSANGramKTree is upgraded to produce POS tag and sentiment score 
	  tree kernels similar to word tree kernels.
	- getPOSTags() is added to ABSASent.
	- New formats are added to ABSADKTree and ABSACKTree.
	- getVocabulary() and loadWordVectors() are added to the ABSASet.
	- getWordVectors() is added to ABSASent.
	- A new attribute is added to ABSASent to store the ABSASet it belongs to.
	  Its value is set during loading the data.
	
	Version 1.9								(04-Apr-2016 to 11-Apr-2016)
	- New formats are added to ABSANGramKTree and getNGramKTreeEmbeding()
	  of AspectTerm is modified to handle various n-gram tree formats.
	- getPOSTags() is added to ABSASent.
	
	Version 1.8								(10-Mar-2016 to 29-Mar-2016)
	- extractAT2OEPath() of ABSADepTree is renamed to  extractAT2OEDepRelPath().
	- getPolarScores() is added to ABSASent.
	
	Version 1.7								(17-Feb-2016 to 09-Mar-2016)
	- Constituency and dependency path between aspect term and opinion 
	  expression is extracted.
	- Bug in ABSADNode.loadFromDepNode() is fixed which assigned DepTree
	  object instead of ABSADepTree to depTree attribute.
	- Average sentiment score of the sentence and opinion expressions can
	  be extracted using newly added methods to ABSASent and OpinionExpression.
	- getTokens() is added to OpinionExpression and AspectTerm.
	- The aspect length groups are now extracted as part of the aspect term
	  statistics by ABSASet.extractATStat().
	
	Version 1.6								(28-Jan-2016 to 02-Feb-2016)
	- Opinion expression is introduced and OpinionExpression is added.
	- loadBIOOpinionExpressions() is added to ABSASet and ABSASent.
	- Methods are added to embed opinion expressions in the trees.
	- ABSADNode.decorate() is renamed to ABSADNode.decorateAT(). 
	- New formats are added to ABSADKTree.
	
	Version 1.5								(19-Nov-2015 to 26-Nov-2015)
	- Naming of internal methods in ABSACKTree and ABSADKTree is modified
	  to fix the bug causing conflict between the methods of these classes
	  and their parent classes.
	- New formats are added to ABSACKTree.
	- shallowCopy() is added to ABSAConstTree.
	- The deep copying method for the trees in tree embedding generation in the aspect
	  term class is changed to use python deepcopy as it performed better
	  with ConstTree in an experiment. However, it has not been tested for
	  the dependency tree despite applying the idea. In both cases, this 
	  still needs to be further confirmed in the future experiments.
	
	Version 1.4								(13-Nov-2015 to 16-Nov-2015)
	- New format is added to ABSADKTree to handle semantic roles.
	
	Version 1.3								(22-Oct-2015 to 02-Nov-2015)
	- ABSACTree.loadSentimentScore() is changed and ABSACNode.loadSentScores()
	  is added to enable optionally propagating sentiment scores in the 
	  tree nodes.
	- pNeutralScore is added to loadSentimentScores in ABSASet and ABSASent
	  to assign a score for neutral words other than None.
	- ABSACKTree is added with two new formats.
	- ABSASent.getSentimentScores() is changed to return the scores in a 
	  list instead of a dictionary.
	
	Version 1.2								(12-Oct-2015 to 15-Oct-2015)
	- ABSADKTree.generateDepKTree() supports subtrees under a node provided
	  as a parameter in addition to the whole tree under the root.
	- ABSACDKTree is added to support the integration of constituency and 
	  dependency trees.
	- ABSANGramKTree is added to construct a tree from text token forms.
	- getNGramKTreeEmbeding() is added to AspectTerm.
	- extractCaraFXInput() is added to ABSASent.
	
	Version 1.1								(28-Sep-2015 to 02-Oct-2015)
	- getSentimentScores(), length() are added to ABSASent.
	- tokenLength() and getSentimentScores() are added to ABSASet.
	- New formats are added to generateDepKTree() of ABSADKTree.
	- Sentiment score representation is added to constituency tree decoration.
	- loadSSInCTree() is added to ABSASet and ABSASent.
	
	Version 1.0								(03-Sep-2015 to 08-Sep-2015)
	- loadSentimentScores() is added to ABSASet and ABSASent.
	- sentScores is added to ABSASent to represent sentiment scores of the
	  sentence words.
	- sentScore is added to ABSADNode to represent sentiment score of the
	  associated node.
	- loadSSInDTrees() is added to ABSASet to load sentiment scores into
	  the dependency trees.
	- loadSSInDTree() is added to ABSASent to load sentiment scores into
	  the dependency tree of the sentence.
	- loadSentScores() is added to ABSADTree.
	- setSentScore() is added to ABSADNode.
	- deepCopy() is added to ABSADNode and the constructor and node creation
	  methods were accordingly changed.
	
	Version 0.9								(05-Aug-2015 to 26-Aug-2015)
	- ABSADTree and ABSADNode are added, and all the corresponding updates
	  are done in ABSASet, ABSASent, AspectTerm.
	
	Version 0.8								(30-Jul-2015 to 04-Aug-2015)
	- The representation of aspect terms in the constituency tree is decoupled
	  from embedding them in the tree. Once an aspect term is embedded in
	  the tree (either in pre-terminals in its span or in the constituent
	  node mapped to it), it can be represented in the tree by inserting
	  a new node, attaching suffix and attaching polarity in various formats.
	  This is done in ABSACNode.decorateAT(). AspectTerm.getCTreeEmbeding()
	  and ABSACNode.embedAspectTerm() were changed accordingly. Also,
	  ABSACNode.getPTBFormat() is not overridden any more since polarity 
	  attachment is done in decorateAT(). Finally, inserting node during 
	  embedding aspect terms only happens optionally when there is a mismatch.
	  This has been implemented by changing pInsertATNode to pOnMismatch
	  which specifies the method for handling aspect term/constituent 
	  mismatch, one of which being inserting new AT node.
	
	Version 0.7								(27-Jul-2015 to 29-Jul-2015)
	- Another option (pAttachATSuffix) is added to embedding aspect term 
	  in constituency tree to mark nodes in the aspect term subtree with 
	  a specific suffix tag (_AT). 
	- ConstNode.embedAspectTerm() is edited to support embedding the aspect 
	  term in pre-terminals in addition to constituent nodes spanning the
	  aspect term. _embedATINSpanConst() and _embedATINPreterminals() are 
	  added to ConstNode for this purpose and extractAspectTerms() is edited
	  to account for repetitions stemming from embedding aspect terms in
	  pre-terminals.
	- ABSADTree and ABSADNode are implemented.
	
	Version 0.6								(22-Jun-2015 to 23-Jun-2015)
	- aspectTerm in ABSACNode is changed to aspectTerms which is a list of
	  aspect terms embedded into the node to allow a node to carry more than
	  one aspect term. The methods handling this attribute have consequently
	  been updated.
	- extractATsInCTree() is added to ABSASent.
	- getSentencesWithNoAT() is added to ABSASet.
	- ABSASet.extractATStat() is updated to extract more statistics.
	
	Version 0.5													18-May-2015
	- ABSACNode.embedAspectTerm() is edited to better handle the situation 
	  where pInsertATNode is set to mismatch. ABSACNode._plugAspectTerm
	  has accordingly been updated.
	- getAspectTermCount() is added to ABSASet and ABSASent.
	
	Version 0.4													08-May-2015
	- loadConstTrees() in ABSASet and ABSASent are changed not to embed 
	  the aspect terms. Embedding aspect terms is done instead by new methods
	  ABSASet.embedATsInCTrees() and ABSASent.embedATsInCTree().
	- embedAspectTerm() is added to ABSASent to be able to embed individual
	  aspect terms in the tree.
	- sentence (attr), embedInCtree(), getCTreeEmbeding() are added to 
	  AspectTerm.
	- The default value for pflgAttachATPolarity argument of getPTBFormat()
	  method in ABSACTree and ABSACNode is changed to False. 
	
	Version 0.3													06-May-2015
	- Embedding aspect terms is changed. Accordingly, ABSACNode.embedAspectTerms()
	  is renamed to embedAspectTerm().
	- addSentences() is added to ABSASet.
	
	Version 0.2													18-Feb-2015
	- The NP structures in ABSACTree can be modified to avoid term/constituent
	  mismatch.
	
	Version 0.1													17-Feb-2015
	- ABSA, ABSASent, AspectTerm, ABSACTree, ABSACNode are added.
	
"""


from collections import namedtuple
import re, copy
import pickle


class ABSASet:
	'''
	Class for aspect-based sentiment analysis data set
	'''
	
	
	def __init__(self):
		'''
		Constructor
		'''
		
		self.contexts = []
		
		# WordVector object containing the word vectors of the vocabulary in the dataset
		self.wv = None
		
	
	@property
	def size(self):
		'''
		Returns the size of the data set which is the number of its sentences
		'''
		
		return len(self.getSentences())
		
	
	def getContextCount(self):
		'''
		Returns the number of contexts i the data set
		'''
		
		return len(self.contexts)
		
	
	@property
	def tokenLength(self):
		'''
		Returns the number of tokens in the data set
		'''
		
		return sum([s.length for s in self.getSentences()])
		
	
	def getContexts(self):
		'''
		Returns ABSA contexts of the data set 
		'''
		
		return self.contexts
		
	
	def getSentences(self, pSort = ''):
		'''
		Returns the ABSA sentences
		
		The sort options are:
		- None: in document order
		- text: in sentence text order
		'''
		
		vlSentences = [s for c in self.getContexts() for s in c.getSentences()]
		
		if pSort.lower() == "text":
			return [s for s in sorted(vlSentences, key = lambda x: x.getText())]
		else:
			return vlSentences
		
	
	def getSentencesWithNoAT(self):
		'''
		Returns the ABSA sentences which do not have any aspect terms
		'''
		
		return [s for s in self.getSentences() if s.getAspectTermCount() == 0]
		
	
	def addContext(self, pContext):
		'''
		Adds ABSA context to the data set 
		'''
		
		if pContext not in self.getContexts():
			pContext.dataset = self
			self.contexts.append(pContext)
		
	
	def addContexts(self, plContexts):
		'''
		Adds ABSA contexts to existing contexts
		'''
		
		for vContext in plContexts:
			self.addContext(vContext)
		
	
	def delContext(self, pContextIdx):
		'''
		Deletes context at the given index from the dataset 
		'''
		
		del self.contexts[pContextIdx]
		
	
	def mergeWith(self, pABSASet):
		'''
		Merges this data set with the given ABSA data set 
		
		It first checks if both sets to be merged are of the same class derived from ABSASet) 
		'''
		
		if self.__class__ != pABSASet.__class__:
			raise Exception("Two data sets must be of the same ABSA type: %s vs. %s" %(self.__class__ , pABSASet.__class__))
		
		# merging contexts
		self.addContexts(pABSASet.getContexts())
		
		# NOTE: word embeddings should also be merged
		print("Word embeddings were not merged. Reload them for the merged data set.")
		
		# NOTE: take care of other newly added attributes if any
		
	
	def addSentences(self, plSentences):
		'''
		Adds ABSA sentences to existing sentences
		
		Currently, no care is taken regarding ID duplication.
		'''
		
		for vSent in plSentences:
			if vSent.context not in self.getContexts():
				self.addContext(vSent.context)
			
			vSent.context.addSentence(vSent)
		
	
	def getVocabulary(self):
		'''
		Extracts and returns the vocabulary of the dataset 
		'''
		
		return sorted(set([t for s in self.getSentences() for t in s.getTokens()])) 
		
	
	def getAspectTerms(self):
		'''
		Returns all aspect terms objects
		'''
		
		vlAspectTerms = []
		
		for vSent in self.getSentences():
			vlAspectTerms += vSent.getAspectTerms()
		
		return vlAspectTerms
		
	
	def getAspectTermCount(self):
		'''
		Returns the total number of aspect terms in the dataset
		'''
		
		vTotalCount = 0
		
		for vSent in self.getSentences():
			vTotalCount += vSent.getAspectTermCount()
		
		return vTotalCount
		
	
	def extractSentenceForms(self, pSort = None):
		'''
		Returns the surface form of the sentences
		
		The sort options are:
		- None: in document order
		- id:   in sentence ID order
		- text: in sentence text order
		- at: in aspect term order
		'''
		
		if pSort.lower() == "at":
			return [at.sentence.getText() for at in self.getAspectTerms()]
		else:
			return [s.getText() for s in self.getSentences(pSort = pSort)]
		
	
	def extractAspectTerms(self):
		'''
		Collects the aspect term forms of all sentences and groups them
		
		The grouping is delegated to util.groupBy(); according to how the result is
		used in extractATStat(), it maps each distinct form to its count.
		'''
		
		vlForms = [vForm for vSent in self.getSentences() for vForm in vSent.getAspectTermForms()]
		
		return util.groupBy(vlForms)
		
	
	def extractATStat(self):
		'''
		Extracts various aspect term statistics
		
		Returns an ATStat named tuple with the following fields:
		- sentsWithAnAT: number of sentences with at least one aspect term
		- sentsWithNoAT: number of sentences with no aspect term
		- allATCount: number of all aspect terms
		- distATCount: number of distinct aspect terms
		- hapaxATCount: number of aspect terms appearing just once
		- polCounts: dictionary of polarity labels and their counts
		- CTreeATCount: number of aspect terms transferred to the constituency trees
		- ATConstMismatch: number of mismatched aspect terms and constituents
		- ATLenGroups: aspect term token lengths grouped by util.groupBy()
		
		A polarity other than positive, negative, neutral and conflict raises KeyError.
		'''
		
		# data structure to store statistics
		
		ATStat = namedtuple('ATStat', "sentsWithAnAT, sentsWithNoAT, allATCount distATCount hapaxATCount polCounts CTreeATCount, ATConstMismatch, ATLenGroups")
		
		vSentsWithNoAT = len(self.getSentencesWithNoAT())
		vSentsWithAnAT = self.size - vSentsWithNoAT
		
		# aspect term forms and their counts; values() is used instead of itervalues()
		# as it is available in both Python 2 and 3
		vdATs = self.extractAspectTerms()
		vlATCounts = list(vdATs.values())
		
		# allATCount: number of all aspect terms
		vAllATCount = sum(vlATCounts)
		
		# distATCount: number of distinct aspect terms
		vDistATCount = len(vdATs)
		
		# hapaxATCount: number of hapax aspect terms (terms appearing just once)
		vHapaxATCount = sum([c for c in vlATCounts if c == 1])
		
		# polCounts: number of each polarity type
		vdPolCounts = {"positive": 0, "negative": 0, "neutral": 0, "conflict": 0}
		for vAT in self.getAspectTerms():
			vdPolCounts[vAT.getPolarity()] += 1
		
		# CTreeATCount: number of aspect terms transferred to the constituency tree
		vCTreeATCount = sum([len(s.extractATsInCTree()) for s in self.getSentences()])
		
		# ATConstMismatch: number of mismatched aspect terms and constituents
		vATConstMismatch = sum([len(s.extractATermConstMismatch()) for s in self.getSentences()])
		
		# ATLenGroups: grouped AT token lengths in a dictionary
		vdATLenGroups = util.groupBy([len(at.getTokens()) for at in self.getAspectTerms()])
		
		return ATStat(vSentsWithAnAT, vSentsWithNoAT, vAllATCount, vDistATCount, vHapaxATCount, vdPolCounts, vCTreeATCount, vATConstMismatch, vdATLenGroups)
		
	
	def loadConstTrees(self, plConstTrees):
		'''
		Loads the constituency parse trees of the sentences
		
		It assumes that the provided constituency trees are in the order
		in which the sentences are loaded.
		
		The constituency trees can be provided in bracketing format or as
		constparse.ConstTree objects (in a list).
		'''
		
		for vSent, pCTree in zip(self.getSentences(), plConstTrees):
			vSent.loadConstTree(pCTree)
		
	
	def loadDepTrees(self, plDepTrees):
		'''
		Loads the dependency parse trees of the sentences
		
		It assumes that the provided dependency trees are in the order in
		which the sentences are loaded.
		
		The dependency trees are assumed to be provided a list of 
		depparse.DepTree objects.
		'''
		
		for vSent, pDTree in zip(self.getSentences(), plDepTrees):
			vSent.loadDepTree(pDTree)
		
	
	def loadPOSTaggings(self, pPOSTaggingFilename, pSort = ''):
		'''
		Loads POS taggings of the sentences in the dataset
		 
		POS tagging file should be in a columnar format with the sentences separated 
		by an empty line. The order of the sentences in the file should be given in 
		pSort (see getSentences() for the options).
		
		Raises Exception if the number of POS tags of a sentence does not match the
		number of its tokens.
		'''
		
		vlSentences = self.getSentences(pSort = pSort)
		
		# reading the whole file inside a with block so that the file is closed even 
		# if loading fails later
		with open(pPOSTaggingFilename) as vPOSFile:
			vlSentPOSs = vPOSFile.read().strip().split('\n\n')
		
		for vSent, vSentPOS in zip(vlSentences, vlSentPOSs):
			vPOSTagging = pos.POSTagging()
			vPOSTagging.loadFromColumnar(vSentPOS)
			
			if vPOSTagging.length != vSent.length:
				raise Exception("Length of sentence and POS tags don't match:\n\n%s\n%s" % (vSent.getTokens(), vPOSTagging.toLorgInput()))
			
			vSent.loadPOSTagging(vPOSTagging)
		
	
	def loadBIOOpinionExpressions(self, pllBIO):
		'''
		Loads opinion expression annotations based on BIO tagging
		
		The BIO tagging is provided in a 2D list the first dimension of which is supposed to match sentences in the 
		dataset and the second the BIO labels of the tokens in each sentence.
		'''
		
		for vSent, vlOEBIO in zip(self.getSentences(), pllBIO):
			vSent.loadBIOOpinionExpressions(vlOEBIO)
		
	
	def embedATsInCTrees(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'nothing', pflgExtendATSpanToDT = False):
		'''
		Embeds the aspect terms in the constituency trees
		
		pflgModNPStruct specifies whether the NP structures should be edited
		in order to avoid term/constituent mismatch.
		
		pEmbedPosition specifies where the aspect term should be embedded
		in the subtree. The possible values are:
		- (span)ning-constituent: embedding into a node in subtree which 
		                          spans the aspect term tokens. This may 
		                          cause mismatches where the aspect term 
		                          span is not fully covered by the node
		                          span. pOnMismatch can be set to fix the
		                          issue.
		- (pre)-terminals: embedding into each and every pre-terminal node
		                   in the subtree falling in the aspect term span.
		
		pOnMismatch specifies the method for handling aspect term/constituent
		mismatch. The possible values are:
		- nothing (or none): do not handle the mismatches
		- node (or insert): insert a new node covering the aspect term span
		'''
		
		for vSent in self.getSentences():
			vSent.embedATsInCTree(pflgModNPStruct = pflgModNPStruct, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch, pflgExtendATSpanToDT = pflgExtendATSpanToDT)
		
	
	def embedOEsInCTrees(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'nothing'):
		'''
		Embeds the opinion expressions in the constituency trees
		
		pflgModNPStruct specifies whether the NP structures should be edited in order to avoid expression/constituent 
		mismatch.
		
		pEmbedPosition specifies where the opinion expression should be	embedded in the subtree. The possible values are:
		- (span)ning-constituent: embedding into a node in subtree which spans the OE tokens. This may cause mismatches 
		                          where the OE span is not fully covered by the node span. pOnMismatch can be set to fix
		                          the issue.
		- (pre)-terminals: embedding into each and every pre-terminal node in the subtree falling in the OE span.
		
		pOnMismatch specifies the method for handling expression/constituent mismatch. The possible values are:
		- nothing (or none): do not handle the mismatches
		- node (or insert): insert a new node covering the aspect term span
		'''
		
		for vSent in self.getSentences():
			vSent.embedOEsInCTree(pflgModNPStruct = pflgModNPStruct, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch)
		
	
	def embedATsInDTrees(self):
		'''
		Embeds the aspect terms in the dependency trees
		'''
		
		for vSent in self.getSentences():
			vSent.embedATsInDTree()
		
	
	def embedOEsInDTrees(self):
		'''
		Embeds the opinion expressions in the dependency trees
		'''
		
		for vSent in self.getSentences():
			vSent.embedOEsInDTree()
		
	
	def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
		'''
		Loads polarity scores to sentences from a sentiment lexicon which
		is a Sentexicon object
		
		For details about Sentexicon object, see sentexicon.py
		'''
		
		vTotalWordNum = 0          # total number of words in the data set
		vTotalEntryWordNum = 0     # number of words found in the lexicon
		
		for vSent in self.getSentences():
			vWordNum, vEntryWordNum = vSent.loadSentimentScores(pSentexicon, pNeutralScore)
			
			vTotalWordNum += vWordNum
			vTotalEntryWordNum += vEntryWordNum
		
		return vTotalWordNum, vTotalEntryWordNum
		
	
	def getSentimentScores(self):
		'''
		Returns a dictionary of words in the data set and the sentiment 
		scores attached to them 
		'''
		
		vdResult = {}
		
		for vSent in self.getSentences():
			for vWord, vScore in zip(vSent.getTokens(), vSent.getSentimentScores()):
				vdResult[vWord] = vScore
		
		return vdResult
		
	
	def loadSSInDTrees(self):
		'''
		Loads sentiment scores into dependency tree nodes
		'''
		
		for vSent in self.getSentences():
			vSent.loadSSInDTree()
		
	
	def loadSSInCTrees(self, pPropagation = None):
		'''
		Loads sentiment scores into constituency tree nodes
		'''
		
		for vSent in self.getSentences():
			vSent.loadSSInCTree(pPropagation = pPropagation)
		
	
	def loadWordVectors(self, pWordVectors, pflgFilter = True):
		''' 
		Loads word vectors from a file or WordVector object, whichever is given
		
		By default, it filters out the words not in the data vocabulary, which can be changed to not filter (e.g. when 
		the input is already filtered).   
		'''
		
		from ml import wv
		
		self.wv = wv.WordVector()
		
		if type(pWordVectors) == str:
			vWV.load(pWVFilename = vWVFile, plFilterVocab = self.getVocabualry())
		else:
			self.wv = pWordVectors
		
	
	def toBratInput(self):
		'''
		Generates the dataset in Brat annotation input format
		
		There are two input types: a text input containing the raw sentences and a annotation input containing the 
		annotation.
		'''
		
		
		vSentDocOffset = 0                                          # document offset of the current sentence
		vlTxt = []
		vlAnn = []
		
		for i, vAT in enumerate(self.getAspectTerms(), start = 1):
			vlTxt.append(vAT.sentence.getText())
			
			# setting the aspect term's character offset in the document
			
			# part of the sentence before the aspect term token
			vSentUpToAT = ' '.join(vAT.sentence.getTokens()[ : (vAT.getTokenSpan()[0] - 1)])
			
			# character offset of the aspect term in token(s) containing the aspect term (e.g. 5 for built in well-built)
			vATPosInToken = ' '.join(vAT.getTokens()).find(vAT.getForm())
			
			if (vSentUpToAT == ''):
				vATDocOffset = vSentDocOffset + len(vSentUpToAT) + vATPosInToken
			else:
				vATDocOffset = vSentDocOffset + len(vSentUpToAT) + vATPosInToken + 1
			
			vlAnn.append("T%s\tAT-%s %s %s\t%s" % (i, vAT.getPolarity()[:3], vATDocOffset, vATDocOffset + len(vAT.getForm()), vAT.getForm()))
			
			# setting the next sentence's character offset in the document 
			vSentDocOffset += len(vAT.sentence.getText()) + 1       # +1 for newline 
		
		return vlTxt, vlAnn
		
	
	def loadBratOE(self, pBratAnnFilename, pflgVerbos = False, pdNewAnnTypes = None):
		'''
		Loads opinion or sentiment expression annotations from Brat annotator output format
		
		It assumes that the Brat input was provided using toBratInput() method, so the order of the sentences is
		retained: the sentence of every aspect term returned by getAspectTerms() is expected to occupy one line of
		the annotated document, in the same order.
		
		pBratAnnFilename is the path of the Brat annotation file. Only the lines whose annotation field starts with
		"OE" are loaded. Empty lines and lines without a tab-separated annotation field are skipped.
		
		If pflgVerbos is set to true, the aspect terms and the opinion expressions matched to them are printed.
		
		pdNewAnnTypes is a dictionary which translates the annotation type names to new ones. This is primarily added
		to rename OE (opinion expression) to SE (sentiment expression) in the SE annotation internship project. When
		it is given (and not empty), every loaded annotation type must be one of its keys, otherwise a KeyError is
		raised.
		
		The loaded opinion expressions are added to their aspect terms via addOE(); nothing is returned. Annotations
		which do not fall entirely inside the character span of a sentence are not assigned to any aspect term. An
		Exception is raised if the start or the end of an annotation cannot be matched to a token of its sentence.
		'''
		
		# first loading OE from Brat ann file
		
		# the file is closed as soon as it is read instead of being left to the garbage collector
		with open(pBratAnnFilename) as vAnnFile:
			vlAnnLines = vAnnFile.read().strip().split('\n')
		
		vlOEs = []
		
		for vLine in vlAnnLines:
			vlSplit = vLine.split('\t')
			
			# skipping empty lines (e.g. an empty annotation file) and annotations other than OEs
			if len(vlSplit) < 2 or not vlSplit[1].startswith("OE"):
				continue
			
			# annotation field format: type, start offset, end offset
			vlOESplit = vlSplit[1].split()
			
			# renaming the annotation types if new names are given
			if pdNewAnnTypes is not None and pdNewAnnTypes != {}:
				vAnnType = pdNewAnnTypes[vlOESplit[0]]
			else:
				vAnnType = vlOESplit[0]
			
			# the surface text is only used for verbose output, so a missing one is tolerated
			vSurface = vlSplit[2] if len(vlSplit) > 2 else ''
			
			vlOEs.append({"type": vAnnType, "from": int(vlOESplit[1]), "to": int(vlOESplit[2]), "surface": vSurface})
		
		
		# sorting OE list based on their document offset
		vlOEs.sort(key = lambda x: x["from"])
		
		
		# extracting OEs for each aspect term: assumes that the Brat input was provided using toBratInput() method, so the order 
		# of the sentences are retained.
		
		vSentDocOffset = 0
		for vATNo, vAT in enumerate(self.getAspectTerms(), start = 1):
			vSentText = vAT.sentence.getText()
			
			if pflgVerbos:
				print("%s) %s" % (vATNo, vSentText))
				print("\nAT: " + vAT.getForm())
				print("    Polairty: " + vAT.getPolarity())
				print("OEs:")
			
			# calculating document character span of the aspect term's sentence
			vSentDocSpan = (vSentDocOffset, vSentDocOffset + len(vSentText))
			
			# collecting OEs of the current AT based on character offsets: not very efficient
			# (only OEs lying entirely inside the sentence span are kept, so no further range check is needed)
			vlATOEs = [oe for oe in vlOEs if oe["from"] >= vSentDocSpan[0] and oe["to"] <= vSentDocSpan[1]]
			
			if pflgVerbos and len(vlATOEs) == 0:
				print("    No OE annotated for this aspect term (polarity is %s)" % vAT.getPolarity())
			
			vlTokens = vAT.sentence.getTokens()
			
			# adding the OEs to the aspect term 
			for oe in vlATOEs:
				# finding token span of the OE (1-based token indexes; 0 means not found)
				vTokenOffsetStart = vSentDocOffset
				vOETokenSpanStart = 0
				vOETokenSpanEnd = 0
				for vTokenNo, vToken in enumerate(vlTokens, start = 1):
					vTokenOffsetEnd = vTokenOffsetStart + len(vToken)         # end of this token
					
					# start token: sometimes the OE is annotated from the middle of token when the tokenization has problem (e.g. -when) 
					if vTokenOffsetStart <= oe["from"] <= vTokenOffsetEnd:
						vOETokenSpanStart = vTokenNo
					
					# end token: sometimes the OE is annotated until the middle of token when the tokenization has problem (e.g. headphones/mic) 
					if vTokenOffsetStart <= oe["to"] <= vTokenOffsetEnd:
						vOETokenSpanEnd = vTokenNo
					
					vTokenOffsetStart = vTokenOffsetEnd + 1  # start of next token (tokens are separated by one space)
				
				# a span with an unmatched start or end would silently produce wrong token indexes later on
				if vOETokenSpanStart == 0 or vOETokenSpanEnd == 0:
					raise Exception("No token match was found!\n%s" % oe) 
				
				vOE = OpinionExpression()
				vOE.span = (vOETokenSpanStart, vOETokenSpanEnd)
				vOE.type = oe["type"]
				
				vAT.addOE(vOE)
				
				if pflgVerbos:
					print("    %s" % vOE.type)
					print("        Original: %s" % oe["surface"])
					print("        Extracted: %s" % vOE.getForm())
			
			if pflgVerbos:
				print("\n.............................................")
			
			# setting the next sentence's character offset in the document 
			vSentDocOffset += len(vSentText) + 1       # +1 for newline 
		
	
	def oeToBIO(self, pflgPOSTags = False):
		'''
		Builds the BIO representation of the opinion/sentiment expressions of every aspect term
		
		The result is a 2D list with one inner list per aspect term (in getAspectTerms() order). Every inner list
		holds one string per token of the aspect term's sentence, formatted as "token\ttag", or as "token\tPOS\ttag"
		when pflgPOSTags is set to true. The tag is B for the first token of an expression, I for its remaining
		tokens and O for every other token.
		'''
		
		vllSentences = []
		
		for vTerm in self.getAspectTerms():
			vSent = vTerm.sentence
			
			# all tokens are outside of an expression until proven otherwise
			vlTags = ['O'] * vSent.length
			
			for vExpr in vTerm.oes:
				# spans are 1-based and inclusive
				vFirst, vLast = vExpr.getTokenSpan()
				
				vlTags[vFirst - 1] = 'B'
				for vIdx in range(vFirst, vLast):
					vlTags[vIdx] = 'I'
			
			if pflgPOSTags:
				vlRows = ["%s\t%s\t%s" % (vTok, vPOS, vTag) for vTag, vPOS, vTok in zip(vlTags, vSent.getPOSTags(), vSent.getTokens())]
			else:
				vlRows = ["%s\t%s" % (vTok, vTag) for vTag, vTok in zip(vlTags, vSent.getTokens())]
			
			vllSentences.append(vlRows)
		
		return vllSentences
		
	
	def oeToIO(self):
		'''
		Collects the binary IO tagging of the opinion/sentiment expressions of every aspect term
		
		The result has one entry per aspect term (in getAspectTerms() order), each being whatever the aspect term's 
		getOEsIO() returns.
		
		Binary IO tags can be used when there is only one type of sentiment expression annotated and one sentiment expression
		per sentence is possible.
		'''
		
		vllTags = []
		
		for vTerm in self.getAspectTerms():
			vllTags.append(vTerm.getOEsIO())
		
		return vllTags
		
	
	def pickle(self, pFilename):
		'''
		Pickles the object into the given file name
		
		Only the attribute dictionary of the object is stored (with pickle protocol 2), so the file is meant to be 
		read back with loadFromPickle().
		'''
		
		# the with block guarantees that the data is flushed and the file is closed even if pickling fails
		with open(pFilename, "wb") as vFile:
			pickle.dump(self.__dict__, vFile, protocol = 2)
		
	
	def loadFromPickle(self, pFilename):
		'''
		Loads the pickled ABSASet object to this object
		
		The file is expected to contain a pickled attribute dictionary as written by pickle(). The attributes of 
		this object are updated with the loaded ones; attributes not present in the file are left untouched.
		
		NOTE: unpickling can execute arbitrary code, so only files from a trusted source should be loaded.
		'''
		
		# the data is written in binary mode with a binary protocol, so it must be read in binary mode as well
		with open(pFilename, "rb") as vFile:
			self.__dict__.update(pickle.load(vFile))
		
	
	def getPOSTagSet(self):
		'''
		Returns the set of distinct POS tags found in the sentences of the dataset
		
		The tags of every sentence are obtained through its getPOSTags() method.
		'''
		
		vsPOSTags = set()
		
		for vSent in self.getSentences():
			vsPOSTags.update(vSent.getPOSTags())
		
		return vsPOSTags
		
	
class ABSAContext:
	"""
	Class for aspect-based sentiment analysis context (set of sentences) 
	"""
	
	
	def __init__(self, pABSASet):
		'''
		Constructor
		
		pABSASet is stored as the dataset the context belongs to.
		'''
		
		self.dataset = pABSASet
		
		# sentences of the context in the order they are added
		self.sentences = []
		
	
	def getSentences(self, pSort=''):
		'''
		Returns the ABSA sentences
		
		The sort options are:
		- None or empty string (default): in document order
		- text: in sentence text order (the option name is case-insensitive)
		
		In document order, the internal sentence list itself is returned (not a copy), whereas the sorted result is
		a new list.
		'''
		
		# None is a documented option, so it must not be dereferenced
		if pSort is not None and pSort.lower() == "text":
			return sorted(self.getSentences(), key=lambda x: x.getText())
		else:
			return self.sentences
		
	
	def addSentence(self, pSentence):
		'''
		Adds ABSA sentence to the context
		
		A sentence which is already in the context is ignored. Otherwise, the context attribute of the sentence is
		set to this context before it is appended.
		'''
		
		if pSentence not in self.sentences:
			pSentence.context = self
			self.sentences.append(pSentence)
		
	
	def getTokens(self):
		'''
		Returns token list of all sentences in the context
		
		For tokens to make sense, make sure the data is tokenized.
		'''
		
		return [t for s in self.getSentences() for t in s.getTokens()]
		
	
	def getATPolarities(self):
		'''
		Returns list of polarities of all aspect terms in the context 
		'''
		
		return [at.getPolarity() for s in self.getSentences() for at in s.getAspectTerms()]
		
	
	def getOverallPolarity(self):
		'''
		Returns the overall polarity of the context based on the polarity of its aspect terms
		
		The result is one of "positive", "negative" and "neutral":
		- if neither positive nor negative polarity occurs, the context is neutral.
		- if only one of them occurs, the context has that polarity.
		- if both occur, the number of positive or negative polarities should be at least twice as many as the 
		  opposite polarity, or otherwise the polarity will be considered neutral. Negative is checked first.
		
		ToDo: polarity values are fixed here. They should be variable based on the data.
		'''
		
		# NOTE(review): util.groupBy() is presumably returning a polarity-to-count mapping, since its values are
		# compared numerically below; confirm against util.
		vdPolarities = util.groupBy(self.getATPolarities())
		
		vflgNegative = "negative" in vdPolarities
		vflgPositive = "positive" in vdPolarities
		
		if vflgNegative and vflgPositive:
			if vdPolarities["negative"] >= (2 * vdPolarities["positive"]):
				return "negative"
			elif vdPolarities["positive"] >= (2 * vdPolarities["negative"]):
				return "positive"
			else:
				return "neutral"
		elif vflgPositive:
			return "positive"
		elif vflgNegative:
			return "negative"
		else:
			return "neutral"
		
	
class ABSASent:
	'''
	Class for aspect-based sentiment analysis sentence
	'''
	
	
	def __init__(self, pABSAContext):
		'''
		Constructor
		
		pABSAContext is the ABSAContext object the sentence belongs to.
		'''
		
		# ABSAContext the sentence belongs to
		self.context = pABSAContext
		
		self.text = None
		self.aspectTerms = []
		self.cTree = None         # constituency tree, set by loadConstTree()
		self.dTree = None         # dependency tree, set by loadDepTree()
		self.posTagging = None    # POS tagging, set by loadPOSTagging()
		self.sentScores = []      # sentiment score, one per word in the tokenized self.text
		self.oes = []             # opinion expressions in the sentence (not those of aspect terms)
		
	
	def getText(self):
		'''
		Returns sentence text (form)
		'''
		
		return self.text
		
	
	def getTokens(self):
		'''
		Returns the tokenization of the sentence
		
		The sentence text is assumed to be in tokenized format and only 
		splits on space.
		'''
		
		return self.getText().split()
		
	
	@property
	def length(self):
		'''
		Returns the sentence length in tokens
		'''
		
		return len(self.getTokens())
		
	
	def getAspectTerms(self):
		'''
		Returns aspect terms (objects) of the sentence 
		'''
		
		return self.aspectTerms
		
	
	def getOEs(self):
		'''
		Returns opinion expressions (objects) of the sentence 
		'''
		
		return self.oes
		
	
	def getAspectTermCount(self):
		'''
		Returns the number of aspect terms in the sentence 
		'''
		
		return len(self.aspectTerms)
		
	
	def getAspectTermForms(self):
		'''
		Returns aspect term forms (term attributes) of the sentence 
		'''
		
		return [t.getForm() for t in self.getAspectTerms()]
		
	
	def getConstTree(self):
		'''
		Returns the constituency parse tree of the sentence
		
		The returned object is the ABSACTree created by loadConstTree(), or None if no tree is loaded.
		'''
		
		return self.cTree
		
	
	def getPOSTags(self):
		'''
		Returns the list of POS tags which matches the token list
		
		The POS tags are taken from the first available source in this order: the constituency tree, the dependency
		tree, the POS tagging. An empty list is returned if none of them is loaded.
		'''
		
		if self.cTree is not None:
			return self.cTree.getPOSs()
		elif self.dTree is not None:
			return self.dTree.getPOSs()
		elif self.posTagging is not None:
			return self.posTagging.getPOSTags()
		else:
			return []
		
	
	def getDepTree(self):
		'''
		Returns the dependency parse tree of the sentence
		
		The returned object is the ABSADTree created by loadDepTree(), or None if no tree is loaded.
		'''
		
		return self.dTree
		
	
	def loadConstTree(self, pConstTree):
		'''
		Loads the constituency parse tree of the sentence
		
		The constituency tree can be provided in bracketing format or as
		constparse.ConstTree object. In both cases, a new ABSACTree is built from the bracketing.
		'''
		
		# loading the tree
		
		if isinstance(pConstTree, constparse.ConstTree):
			vConstTree = pConstTree.getPTBFormat()
		else:
			vConstTree = pConstTree
		
		self.cTree = ABSACTree()
		self.cTree.loadPTBTree(vConstTree, pflgExpandTerminal = True)
		
		# sanity check; comment out
		#if self.cTree.surface != self.getText():
		#	print "Sentence and tree mismatch:\nSentence: %s\nTree: %s\n" % (self.getText(), self.cTree.getPTBFormat())
		
	
	def loadDepTree(self, pDepTree):
		'''
		Loads the dependency parse tree of the sentence
		
		The dependency tree is assumed to be depparse.DepTree object; an Exception is raised otherwise.
		'''
		
		# loading the tree
		
		if not isinstance(pDepTree, depparse.DepTree):
			raise Exception("A DepTree object is expected!")
		
		self.dTree = ABSADTree()
		self.dTree.loadFromDepTree(pDepTree = pDepTree)
		
		# sanity check; comment out
		#if self.dTree.surface != self.getText():
		#	print "Sentence and tree mismatch:\nSentence: %s\nTree:%s\n" % (self.getText(), self.dTree.surface)
		
	
	def loadPOSTagging(self, pPOSTagging):
		'''
		Loads POS tagging of the sentence
		
		POS tagging is a pos.POSTagging object.
		'''
		
		self.posTagging = pPOSTagging
		
	
	@staticmethod
	def _extractBIOSpans(plBIO):
		'''
		Converts a list of BIO labels to the list of the spans they mark
		
		The spans are (start, end) tuples of 1-based inclusive token indexes in the order of appearance. The labels
		are case-insensitive. A span is opened by B and extended by each following I. It is closed by O, by the B
		of the next span or by the end of the list. An I without an open span and any label other than B, I and O
		are ignored.
		'''
		
		vlSpans = []
		
		# start and end of 1-based span; 0 start means no span is open
		vSpanStart = 0
		vSpanEnd = 0
		
		for i, vLabel in enumerate(plBIO, start = 1):
			vTag = vLabel.lower()
			
			if vTag == 'b' or vTag == 'o':
				# both labels terminate the span which is still open
				if vSpanStart != 0:
					vlSpans.append((vSpanStart, vSpanEnd))
				
				if vTag == 'b':
					vSpanStart = i
					vSpanEnd = i
				else:
					vSpanStart = 0
					vSpanEnd = 0
			elif vTag == 'i' and vSpanStart != 0:
				vSpanEnd += 1
		
		# the last span is not followed by any label when it reaches the end of the sentence
		if vSpanStart != 0:
			vlSpans.append((vSpanStart, vSpanEnd))
		
		return vlSpans
		
	
	def loadBIOOpinionExpressions(self, plBIO):
		'''
		Loads opinion expression annotations of the sentence based on BIO tagging
		
		The BIO labels are provided in a list which is supposed to match the tokens in the sentence. One opinion
		expression is created per span and appended to the opinion expressions of the sentence; the existing ones
		are kept.
		'''
		
		for vSpan in self._extractBIOSpans(plBIO):
			vOE = OpinionExpression()
			vOE.span = vSpan
			vOE.sentence = self
			self.oes.append(vOE)
		
	
	def embedATsInCTree(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'no', pflgExtendATSpanToDT = False):
		'''
		Embeds the aspect terms in the constituency tree
		
		pflgModNPStruct specifies whether the NP structures should be edited
		in order to avoid term/constituent mismatch.
		
		pEmbedPosition specifies where the aspect term should be embedded
		in the subtree. The possible values are:
		- (span)ning-constituent: embedding into a node in subtree which 
		                          spans the aspect term tokens. This may 
		                          cause mismatches where the aspect term 
		                          span is not fully covered by the node
		                          span. pOnMismatch can be set to fix the
		                          issue.
		- (pre)-terminals: embedding into each and every pre-terminal node
		                   in the subtree falling in the aspect term span.
		
		pOnMismatch specifies the method for handling aspect term/constituent
		mismatch. The possible values are:
		- nothing (or none): do not handle the mismatches
		- node (or insert): insert a new node covering the aspect term span
		
		If pflgExtendATSpanToDT is set to true, the span of aspect terms which 
		only exclude the determiner of the NP is extended to cover the 
		determiner to reduce the number of mismatches.
		
		The constituency tree must already be loaded.
		'''
		
		# modifying NP structures
		
		if pflgModNPStruct:
			self.cTree.modifyNPStruct()
		
		# embeding the aspect terms
		
		self.cTree.embedAspectTerms(plAspectTerm = self.getAspectTerms(), pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch, pflgExtendToDT = pflgExtendATSpanToDT)
		
	
	def embedOEsInCTree(self, pflgModNPStruct = False, pEmbedPosition = "spanning-constituent", pOnMismatch = 'no'):
		'''
		Embeds the opinion expressions of the sentence in the constituency tree
		
		pflgModNPStruct specifies whether the NP structures should be edited in order to avoid expression/constituent 
		mismatch.
		
		pEmbedPosition specifies where the opinion expression should be embedded in the subtree. The possible values are:
		- (span)ning-constituent: embedding into a node in subtree which spans the OE tokens. This may cause mismatches 
		                          where the OE span is not fully covered by the node span. pOnMismatch can be set to fix
		                          the issue.
		- (pre)-terminals: embedding into each and every pre-terminal node in the subtree falling in the OE span.
		
		pOnMismatch specifies the method for handling expression/constituent mismatch. The possible values are:
		- nothing (or none): do not handle the mismatches
		- node (or insert): insert a new node covering the opinion expression span
		
		The constituency tree must already be loaded.
		'''
		
		# modifying NP structures
		
		if pflgModNPStruct:
			self.cTree.modifyNPStruct()
		
		# embeding the opinion expressions
		
		self.cTree.embedOpinionExpressions(plOEs = self.getOEs(), pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch)
		
	
	def embedATsInDTree(self):
		'''
		Embeds the aspect terms in the dependency tree
		
		The dependency tree must already be loaded.
		'''
		
		# embeding the aspect terms
		
		self.dTree.embedAspectTerms(plAspectTerms = self.getAspectTerms())
		
	
	def embedOEsInDTree(self):
		'''
		Embeds the opinion expressions in the dependency tree
		
		The dependency tree must already be loaded.
		'''
		
		self.dTree.embedOpinionExpressions(plOEs = self.getOEs())
		
	
	def getPTBConstTree(self):
		'''
		Returns the constituency tree of the sentence in PTB bracketing format
		
		The constituency tree must already be loaded.
		'''
		
		return self.cTree.getPTBFormat()
		
	
	def extractATsInCTree(self):
		'''
		Extracts and returns the list of aspect terms embedded into the
		constituency tree if the tree is already loaded and an empty list
		otherwise
		'''
		
		if self.cTree is None:
			return []
		else:
			return self.cTree.extractAspectTerms()
		
	
	def extractATermConstMismatch(self):
		'''
		Extracts the aspect terms which do not match a constituent node 
		in the tree thus missing in the constituency tree
		
		Aspect term/constituent node mismatches happen because of not being 
		embedded in the tree which occur due to the inconsistency between 
		syntactic phrases and the phrases to which these terms are originally 
		assigned. Parsing errors can be one reason for this but also the 
		annotation scheme, such as flat noun phrase annotation, can also 
		cause this problem.
		
		An empty list is returned if the tree is not loaded. The aspect terms are only compared when the number of
		aspect terms of the sentence differs from the number of those extracted from the tree.
		'''
		
		if self.cTree is None:
			return []
		
		vlSentATerms = self.getAspectTerms()
		vlCTreeATerms = self.cTree.extractAspectTerms()
		
		vlMismatches = []
		
		if len(vlCTreeATerms) != len(vlSentATerms):
			for vSentAT in vlSentATerms:
				if vSentAT not in vlCTreeATerms:
					vlMismatches.append(vSentAT)
		
		return vlMismatches
		
	
	def loadSentimentScores(self, pSentexicon, pNeutralScore = None):
		'''
		Loads sentiment scores to words in the sentence from a sentiment
		lexicon which is a Sentexicon object
		
		It returns a tuple of the total number of words in the sentence 
		and the number of words with a score other than None.
		
		If the word is not found in the lexicon (i.e. the lexicon returns None for it), pNeutralScore will be 
		used as its score, which is None by default. Note that when pNeutralScore is not None, the words not 
		found in the lexicon are counted in the second element of the returned tuple as well.
		
		Previously loaded scores are replaced.
		
		For details about Sentexicon object, see sentexicon.py
		'''
		
		vlWords = self.getTokens()
		self.sentScores = []
		
		for vWord in vlWords:
			vScore = pSentexicon.getScore(vWord)
			if vScore is None and pNeutralScore is not None:
				self.sentScores.append(pNeutralScore)
			else:
				self.sentScores.append(vScore)
		
		return self.length, len([s for s in self.sentScores if s is not None])
		
	
	def getSentimentScores(self):
		'''
		Returns the sentiment scores in a list corresponding to the token list
		'''
		
		return self.sentScores
		
	
	def loadSSInDTree(self):
		'''
		Loads sentiment scores into the dependency tree of the sentence
		'''
		
		self.dTree.loadSentScores(self.sentScores)
		
	
	def loadSSInCTree(self, pPropagation = None):
		'''
		Loads sentiment scores into the constituency tree of the sentence
		
		pPropagation is passed to the tree as is.
		'''
		
		self.cTree.loadSentScores(self.sentScores, pPropagation = pPropagation)
		
	
	def generateNGramKTree(self):
		'''
		Generates and returns the tree representation of the surface form
		of the sentence
		'''
		
		vABSANGramKTree = ABSANGramKTree(pABSASent = self)
		
		return vABSANGramKTree.generateNGramKTree()
		
	
	def extractCaraFXInput(self):
		'''
		Extracts the data in the format required for Cara feature extractor
		 
		Cara feature extractor requires three files: normalized text, POS tagged sentences and aspect term indexes. See
		https://github.com/CNGL-repo/Cara/wiki/Cara pipeline.
		
		This method returns a tuple of the first two for the sentence: the normalized text and the POS tagged 
		sentence. The POS tags are taken from the constituency tree, which must already be loaded.
		'''
		
		# The data must be tokenized at loading time.
		vNormalizedTxt = self.getText()
		vNormalizedTxt = vNormalizedTxt.replace("(", "-LRB-")
		vNormalizedTxt = vNormalizedTxt.replace(")", "-RRB-")
		
		# format: (POS_1 token_1)(POS_2 token_2)...(POS_n token_n)
		vPOSTagged = ''.join(["(%s %s)" % (p, t) for p, t in zip(self.cTree.getPOSs(), self.getTokens())])
		
		# only the brackets following the space, i.e. those starting a token, are normalized in order to keep the
		# brackets of the format itself
		vPOSTagged = vPOSTagged.replace(" (", " -LRB-")
		vPOSTagged = vPOSTagged.replace(" )", " -RRB-")
		
		
		return vNormalizedTxt, vPOSTagged
		
	
	def getAvgOESentScore(self):
		'''
		Calculates and returns the average sentiment score of the opinion expression tokens of the sentence
		
		It is the average of the average scores of the opinion expressions, or 0 if the sentence has none.
		'''
		
		vlOEs = self.getOEs()
		
		if len(vlOEs) == 0:
			return 0
		else:
			return sum([oe.getAvgSentScore() for oe in vlOEs]) / len(vlOEs)
		
	
	def getAvgSentScore(self):
		'''
		Calculates and returns the average sentiment score of the sentence tokens
		
		0 is returned if no sentiment score is loaded. All the loaded scores must be numeric, i.e. a neutral score
		should be provided to loadSentimentScores() if there are words not found in the lexicon.
		'''
		
		vlSentScores = self.getSentimentScores()
		
		# avoiding division by zero when the scores are not loaded or the sentence is empty
		if len(vlSentScores) == 0:
			return 0
		
		return sum(vlSentScores) / len(vlSentScores)
		
	
	def getPolarScores(self, pNeutralScore = None):
		'''
		Returns sentiment scores with non-neutral polarity
		
		Neutral polarity score can be set as parameter. It is None by default meaning that no polarity score is assigned
		to neutral words.
		'''
		
		return [s for s in self.sentScores if s != pNeutralScore]
		
	
	def getWordVectors(self):
		'''
		Returns the word vectors of the sentence tokens
		
		The vectors are looked up in the wv attribute of the dataset.
		'''
		
		# NOTE(review): the sentence itself does not define a dataset attribute in its constructor, so unless it
		# is assigned from outside, the dataset of the context is used; confirm this is the intended dataset.
		vDataset = getattr(self, "dataset", None)
		if vDataset is None:
			vDataset = self.context.dataset
		
		return [vDataset.wv.getVector(t) for t in self.getTokens()]
		
	
class AspectTerm:
	'''
	Class for aspect-based sentiment analysis aspect term
	'''
	
	
	def __init__(self):
		'''
		Constructor
		'''
		
		self.term = None
		self.polarity = None
		
		# the sentence object the aspect term belongs to
		self.sentence = None
		
		# the token span of the term in the text (1-base indexes)
		self.span = None
		
		# opinion/sentiment expressions towards the aspect term (not those of the sentence, i.e. ABSASent.oes)
		self.oes = []
		
	
	def getForm(self):
		'''
		Returns the form of the aspect term
		'''
		
		return self.term
		
	
	def getTokens(self):
		'''
		Returns list of tokens of the aspect term
		
		The tokens are taken from the sentence based on the token span. An empty list is returned if the span 
		is not set.
		'''
		
		if self.span is not None:
			return self.sentence.getTokens()[self.span[0]-1 : self.span[1]]
		else:
			# an empty list (rather than an empty string) keeps the return type consistent
			return []
		
	
	def getPolarity(self):
		'''
		Returns the polarity of the aspect term
		'''
		
		return self.polarity
		
	
	def getTokenSpan(self):
		'''
		Returns the token span of the term
		'''
		
		return self.span
		
	
	def getSentenceMarking(self):
		'''
		Returns the marking of the aspect terms on its sentence 
		
		The marking is a list corresponding to the aspect term's sentence tokens, every element of which is either 1 or 0
		depending on if the corresponding token is in the aspect term span or not. The span must be set.
		'''
		
		return [1 if self.span[0] <= (i + 1) <= self.span[1] else 0 for i in range(self.sentence.length)] 
		
	
	def embedInCTree(self, pEmbedPosition, pOnMismatch = "nothing", pflgExtendToDT = False):
		'''
		Embeds the aspect term in the constituency tree of the sentence
		
		pEmbedPosition specifies where the aspect term should be embedded
		in the subtree. The possible values are:
		- (span)ning-constituent: embedding into a node in subtree which 
		                          spans the aspect term tokens. This may 
		                          cause mismatches where the aspect term 
		                          span is not fully covered by the node
		                          span. pOnMismatch can be set to fix the
		                          issue.
		- (pre)-terminals: embedding into each and every pre-terminal node
		                   in the subtree falling in the aspect term span.
		
		pOnMismatch specifies the method for handling aspect term/constituent
		mismatch. The possible values are:
		- nothing (or none): do not handle the mismatches
		- node (or insert): insert a new node covering the aspect term span
		
		If pflgExtendToDT is set to true, the span of aspect terms which 
		only exclude the determiner of the NP is extended to cover the 
		determiner to reduce the number of mismatches.
		
		The tree of the sentence itself (not a copy) is passed the aspect term.
		'''
		
		self.sentence.getConstTree().embedAspectTerm(pAspectTerm = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch, pflgExtendToDT = pflgExtendToDT)
		
	
	def getCTreeEmbeding(self, pEmbedPosition, pOnMismatch = "nothing", pflgExtendToDT = False, pdATReprOptions = None):
		'''
		Embeds the aspect terms in a copy of the constituency tree of the
		sentence and returns the resulting tree
		
		False is returned instead of the tree if the embedding is reported unsuccessful by the tree.
		
		For pEmbedPosition, pOnMismatch and pflgExtendToDT, see embedInCTree().
		
		The representation of aspect term in the tree is based on the options
		specified in a dictionary (pdATReprOptions). The options involve
		attaching aspect term suffix, inserting aspect term node and attaching
		aspect term polarity, each with several possibilities:
		- {suffix:   [node/subtree/pre-terminal/parents/parents-partial]}
		- {node:     [parent/sister]}
		- {polarity: [node/subtree]}
		If it is not given, an empty dictionary is used.
		
		For details of the options, see ABSACNode.decorateAT().
		'''
		
		# a new dictionary per call is used so that no dictionary is shared between the calls
		if pdATReprOptions is None:
			pdATReprOptions = {}
		
		# copying the tree to keep the original tree intact
		# python deepcopy is used as it did a better job in an experiment than the ConstTree.deepCopy().
		vCTreeCopy = copy.deepcopy(self.sentence.getConstTree())
		
		if vCTreeCopy.embedAspectTerm(pAspectTerm = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch, pflgExtendToDT = pflgExtendToDT):
			vCTreeCopy.decorateAT(pdATReprOptions = pdATReprOptions)
			return vCTreeCopy
		else:
			return False
		
	
	def getDTreeEmbeding(self, pdATReprOptions = None):
		'''
		Embeds the aspect terms in a copy of the dependency tree of the
		sentence and returns the resulting tree
		
		The representation of aspect term in the tree is based on the options
		specified in a dictionary (pdATReprOptions). The options involve
		attaching aspect term suffix and attaching aspect term polarity, 
		each with several possibilities. If it is not given, an empty dictionary
		is used.
		
		NOTE: unlike in constituency tree decoration, AT node insertion 
		is not done for the dependency tree. The reason is that in dependency
		tree, there is a one to one relation between the nodes and sentence
		tokens. Inserting a node causes a mismatch between the surface 
		form of the sentence and its dependency tree. Also, concepts like
		word form, POS tag and dependency relation are not meaningful in
		an inserted AT node. Instead, the aspect term representation using
		inserted AT nodes can be done at the PTB bracketing format representation
		level of the tree. See ABSADTree.generateDepKTree(). Note that
		the current design of this method in terms of the argument passed
		to it may not be coherent and meaningful, because what is expected
		from an aspect term representation parameter (pdATReprOptions) is
		to carry all the setting required for this purpose, not only the 
		part concerned with suffix and polarity attachment. This may be 
		handled in the future.
		
		For details of the options, see ABSADNode.decorateAT().
		'''
		
		# a new dictionary per call is used so that no dictionary is shared between the calls
		if pdATReprOptions is None:
			pdATReprOptions = {}
		
		# copying the tree to keep the original tree intact
		# NOTE: consider testing with python deepcopy (Python deepcopy did a 
		# better job in an experiment than the ConstTree.deepCopy() in the
		# getCTreeEmbeding())
		vDTreeCopy = self.sentence.getDepTree().deepCopy()
		#vDTreeCopy = copy.deepcopy(self.sentence.getDepTree())
		
		vDTreeCopy.embedAspectTerm(pAspectTerm = self)
		vDTreeCopy.decorateAT(pdATReprOptions = pdATReprOptions)
		
		return vDTreeCopy
		
	
	def getNGramKTreeEmbeding(self, pFormat = "unary", pNodeContentType = "word", pdNGramKTreeOptions = None, pdATReprOptions = None, pdOptions = None):
		'''
		Generates and returns the tree representation of the surface form
		of the corresponding sentence in various formats with aspect term 
		optionally embedded in it
		
		Formats include:
		- unary: each word is the child of its previous word 
		- bigram: each bigram forms a parent/child subtree and all of these subtrees
		          are dominated by a root node at the top. These capture unigrams as
		          well if subset tree kernels are used.
		- binary: each node has two children which are both nodes representing the word next to the current node's word.
		          The first child has a dummy terminal child X which helps capture unigrams via subset tree fragments.
		          The second child recursively continues the format by having the next word as its children in the same 
		          way.
		
		pNodeContentType specifies the content of the n-gram tree nodes, such as word forms or POS tags. For details, 
		see the ABSANGramKTree constructor, to which pdNGramKTreeOptions is also passed.
		
		Aspect term representation options include:
		- "node": inserts AT node at:
		    - "x": replacing X node under/above the aspect term token  
		- "suffix": attached AT suffix to:
		    - "token": to aspect term token
		'''
		
		vABSANGramKTree = ABSANGramKTree(pABSASent = self.sentence, pNodeContentType = pNodeContentType, pAspectTerm = self, pdOptions = pdNGramKTreeOptions)
		
		return vABSANGramKTree.generateNGramKTree(pFormat, pdATReprOptions, pdOptions)
		
	
	def addOE(self, pOE):
		'''
		Adds opinion expression object to the aspect term
		
		The sentence and the aspect term of the opinion expression are set to those of this aspect term.
		'''
		
		pOE.sentence = self.sentence
		pOE.aspectTerm = self
		
		self.oes.append(pOE)
		
	
	def getOEsBIO(self, pflgPOSTags=False):
		'''
		Returns BIO tagging of the opinion/sentiment expressions of the aspect term
	
		The returned output is a list of BIO tags corresponding to aspect term sentence tokens: B for the first 
		token of an expression, I for its remaining tokens and O for all the other tokens.
		
		pflgPOSTags is accepted but not used by this method.
		'''
		
		vlAnn = ['O'] * self.sentence.length
		
		for vOE in self.oes:
			# the span is 1-based and inclusive
			vSpanStart, vSpanEnd = vOE.getTokenSpan()
			
			vlAnn[vSpanStart - 1] = 'B'
			
			for i in range(vSpanStart, vSpanEnd):
				vlAnn[i] = 'I'
		
		return vlAnn
		
	
	def getOEsIO(self):
		'''
		Returns the binary IO tagging of the opinion/sentiment expression boundaries of the aspect term
		
		The returned output is a list of tags corresponding to aspect term sentence tokens: I for the tokens of 
		the expressions and O for all the other tokens.
		
		Binary IO tags can be used when there is only one type of sentiment expression annotated and one sentiment expression
		per sentence is possible.
		'''
		
		vlAnn = ['O'] * self.sentence.length
		
		for vOE in self.oes:
			# the span is 1-based and inclusive
			vSpanStart, vSpanEnd = vOE.getTokenSpan()
			
			for i in range(vSpanStart - 1, vSpanEnd):
				vlAnn[i] = 'I'
		
		return vlAnn
		
	
class OpinionExpression():
	'''
	Class for opinion expression annotations
	'''
	
	
	def __init__(self):
		'''
		Constructor
		'''
		
		self.type = None       # type of the opinion expression if there is a categorization 
		                       # e.g. SE (sentiment expression), SE-pcomp (preceding complement of SE)
		
		# the sentence object the opinion expression belongs to
		self.sentence = None
		
		# the aspect term towards which the opinion is expressed.
		self.aspectTerm = None
		
		# the token span of the opinion expression in the sentence (1-based inclusive indexes)
		self.span = None
		
	
	def getTokens(self):
		'''
		Returns list of tokens of the opinion expression
		
		The tokens are taken from the sentence based on the token span, so both must be set.
		'''
		
		return self.sentence.getTokens()[self.span[0]-1 : self.span[1]]
		
	
	def getForm(self):
		'''
		Returns the form of the opinion expression, i.e. its tokens joined by space
		'''
		
		return ' '.join(self.getTokens())
		
	
	def getTokenSpan(self):
		'''
		Returns the token span of the opinion expression
		'''
		
		return self.span
		
	
	@property
	def length(self):
		'''
		Returns the token length of the opinion expression  
		'''
		
		return self.span[1] - self.span[0] + 1
		
	
	def embedInCTree(self, pEmbedPosition, pOnMismatch = "nothing"):
		'''
		Embeds the opinion expression in the constituency tree of the sentence
		
		pEmbedPosition specifies where the opinion expression should be embedded in the subtree. The possible values are:
		- (span)ning-constituent: embedding into a node in subtree which spans the OE tokens. This may cause mismatches 
		                          where the OE span is not fully covered by the node span. pOnMismatch can be set to fix
		                          the issue.
		- (pre)-terminals: embedding into each and every pre-terminal node in the subtree falling in the OE span.
		
		pOnMismatch specifies the method for handling expression/constituent mismatch. The possible values are:
		- nothing (or none): do not handle the mismatches
		- node (or insert): insert a new node covering the opinion expression span
		
		The tree of the sentence itself (not a copy) is passed the opinion expression.
		'''
		
		self.sentence.getConstTree().embedOpinionExpression(pOE = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch)
		
	
	def getCTreeEmbeding(self, pEmbedPosition, pOnMismatch = "nothing", pdOEReprOptions = None):
		'''
		Embeds the opinion expression in a copy of the constituency tree of the sentence and returns the resulting tree
		
		False is returned instead of the tree if the embedding is reported unsuccessful by the tree.
		
		For pEmbedPosition and pOnMismatch, see embedInCTree().
		
		The representation of OE in the tree is based on the options specified in a dictionary (pdOEReprOptions). The 
		options involve attaching OE suffix, inserting OE node and attaching OE polarity, each with several possibilities:
		- {suffix:   [node/subtree/pre-terminal/parents/parents-partial]}
		- {node:     [parent/sister]}
		- {polarity: [node/subtree]}
		If it is not given, an empty dictionary is used.
		
		For details of the options, see ABSACNode.decorateOE().
		'''
		
		# a new dictionary per call is used so that no dictionary is shared between the calls
		if pdOEReprOptions is None:
			pdOEReprOptions = {}
		
		# copying the tree to keep the original tree intact
		# python deepcopy is used as it did a better job in an experiment than the ConstTree.deepCopy().
		vCTreeCopy = copy.deepcopy(self.sentence.getConstTree())
		
		if vCTreeCopy.embedOpinionExpression(pOE = self, pEmbedPosition = pEmbedPosition, pOnMismatch = pOnMismatch):
			vCTreeCopy.decorateOE(pdOEReprOptions = pdOEReprOptions)
			return vCTreeCopy
		else:
			return False
		
	
	def getDTreeEmbeding(self, pdOEReprOptions = None):
		'''
		Embeds the opinion expression in a copy of the dependency tree of the
		sentence and returns the resulting tree
		
		The representation of OE in the tree is based on the options specified 
		in a dictionary (pdOEReprOptions). The options involve attaching OE
		suffix and attaching its polarity, each with several possibilities. If it
		is not given, an empty dictionary is used.
		
		NOTE: see the same method for aspect term for further documentation. 
		
		For details of the options, see ABSADNode.decorateOE().
		'''
		
		# a new dictionary per call is used so that no dictionary is shared between the calls
		if pdOEReprOptions is None:
			pdOEReprOptions = {}
		
		# copying the tree to keep the original tree intact
		# NOTE: consider testing with python deepcopy (Python deepcopy did a 
		# better job in an experiment than the ConstTree.deepCopy() in the
		# getCTreeEmbeding())
		vDTreeCopy = self.sentence.getDepTree().deepCopy()
		#vDTreeCopy = copy.deepcopy(self.sentence.getDepTree())
		
		vDTreeCopy.embedOpinionExpression(pOE = self)
		vDTreeCopy.decorateOE(pdOEReprOptions = pdOEReprOptions)
		
		return vDTreeCopy
		
	
	def getAvgSentScore(self):
		'''
		Calculates and returns the average sentiment score of the opinion expression tokens
		
		The scores are those of the sentence at the token span of the opinion expression. They must be numeric.
		'''
		
		return sum(self.sentence.getSentimentScores()[self.span[0] - 1 : self.span[1]]) / self.length