absafe.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932
  1. __author__ = 'rszk'
  2. #! /usr/bin/python
  3. """
  4. This module provides a class to extract features for aspect-based sentiment analysis.
  5. Version 0.1 (02-Feb-2016 to 02-May-2016)
  6. - ABSAFE is added.
  7. """
  8. from ml import fecp, data
  9. from utils import util
  10. import numpy as np
  11. class ABSAFE:
  12. '''
  13. Feature extractor class for aspect-based sentiment analysis
  14. '''
  15. def __init__(self, pABSADataset):
  16. '''
  17. Constructor
  18. '''
  19. # an absa.ABSASet object
  20. self.absaDS = pABSADataset
  21. ## Generating outputs #####
  22. def generateDataset(self, pConfig = '', pdPolarityNumMap = None, pflgGenRawDataset = False):
  23. '''
  24. Extracts features based on the specified configuration in YAML-compatible format and generates dataset using
  25. them
  26. The generated dataset contains feature values post-processed based on the configuration. Optionally, a dataset
  27. with raw feature values can also be generated and returned. Both datasets are objects of type ml.data.Dataset.
  28. In addition to the datasets, an configuration string will be returned updated based on the feature extraction
  29. and processing outcome. For example, if a feature is binarized or scaled, the binarization (e.g. binarized
  30. feature names) or scaling information will be embedded into the configuration.
  31. The pConfig file contains the feature configuration. See fecp.FSConfig for the description of its format.
  32. pdPolarityNumMap is a dictionary which maps the polarity labels to numbers.
  33. '''
  34. # dataset with processed final feature values
  35. vDataset = data.Dataset()
  36. vDataset.loadTargets(self.extractATPolarities(pdPolarityNumMap))
  37. # dataset with raw feature values
  38. if pflgGenRawDataset:
  39. vRawDataset = data.Dataset()
  40. vRawDataset.loadTargets(self.extractATPolarities(pdPolarityNumMap))
  41. # feature extraction configuration processor
  42. vFSConfig = fecp.FSConfig(pConfig)
  43. for i, vFConfig in enumerate(vFSConfig.getFConfigs(), start = 1):
  44. # extracting feature values
  45. vFValues = self.extractFeature(vFConfig.name, vFConfig.featureParams)
  46. # creating feature based on the feature configuration and extracted values
  47. if vFConfig.normNominal or vFConfig.expandable:
  48. vlFeatures = vFSConfig.createFeature(vFConfig, vFValues)
  49. # adding features to the dataset
  50. vDataset.features.add(vlFeatures)
  51. vFConfig.setIndexes(i, len(vlFeatures))
  52. else:
  53. vFeature = vFSConfig.createFeature(vFConfig, vFValues)
  54. # adding feature to the dataset
  55. vDataset.features.append(vFeature)
  56. vFConfig.setIndexes(i)
  57. if pflgGenRawDataset:
  58. # generating raw-value feature
  59. vRawFeature = data.Feature(pName = vFConfig.name, pDataType = vFConfig.dataType)
  60. vRawFeature.loadValues(vFValues)
  61. # adding feature to the dataset
  62. vRawDataset.features.append(vRawFeature)
  63. # dumping configuration
  64. vConfig = vFSConfig.dumpConfig()
  65. if pflgGenRawDataset:
  66. return vDataset, vRawDataset, vConfig
  67. else:
  68. return vDataset, vConfig
  69. # extracting targets
  70. def extractATPolarities(self, pdPolarityNumMap):
  71. '''
  72. Extracts and returns aspect term polarities
  73. pdPolarityNumMap is a dictionary which maps the polarity labels to numbers.
  74. '''
  75. if pdPolarityNumMap is None:
  76. return [at.getPolarity() for at in self.absaDS.getAspectTerms()]
  77. else:
  78. return [pdPolarityNumMap[at.getPolarity()] for at in self.absaDS.getAspectTerms()]
  79. ## Feature extraction #####
  80. def extractFeature(self, pFeatureName, pdFeatureParams):
  81. '''
  82. Extracts values for the given feature name
  83. pFeatureName is the one specified in configuration file. In order
  84. to be able to have multiple settings of a single feature type, an
  85. entry for each setting is created in the configuration file with
  86. the feature name suffixed by any unique string. The specific feature
  87. parameters then goes under each entry. The reason is that configuration
  88. file is in YAML format which is dictionary-like and requires unique
  89. keys. So, the feature name can appear only once or only one of the
  90. appearances would be considered. For example, an n-gram can be extracted
  91. for different orders (n). To handle this, one entry per required order
  92. is put in the configuration and the feature name (e.g. n-gram) of each
  93. entry is suffixed with the order (e.g. n-gram-1). The specific setting
  94. then goes under each entry.
  95. '''
  96. if pFeatureName.startswith("at-surface") or pFeatureName.startswith("aspect-term-surface-form"):
  97. vlValues = self.extractATSurface(pFeatureName, pdFeatureParams)
  98. elif pFeatureName.startswith("oe-surfaces") or pFeatureName.startswith("opinion-expression-surface-forms"):
  99. vlValues = self.extractOESurface(pFeatureName, pdFeatureParams)
  100. elif pFeatureName.startswith("at-oe-const-path") or pFeatureName.startswith("at-oe-constituency-path"):
  101. vlValues = self.extractAT2OEConstPath(pFeatureName, pdFeatureParams)
  102. elif pFeatureName.startswith("at-oe-dep-path") or pFeatureName.startswith("at-oe-dependency-path"):
  103. vlValues = self.extractAT2OEDepPath(pFeatureName, pdFeatureParams)
  104. elif pFeatureName.startswith("oe-avg-sent-score") or pFeatureName.startswith("oe-average-sentiment-score"):
  105. vlValues = self.extractOEAvgSentScore(pFeatureName, pdFeatureParams)
  106. elif pFeatureName.startswith("avg-sent-score") or pFeatureName.startswith("sentence-average-sentiment-score"):
  107. vlValues = self.extractAvgSentScore(pFeatureName, pdFeatureParams)
  108. elif pFeatureName.startswith("at-avg-word-vector") or pFeatureName.startswith("at-averaged-word-vector"):
  109. vlValues = self.extractATAvgWV(pFeatureName, pdFeatureParams)
  110. elif pFeatureName.startswith("oe-avg-word-vector") or pFeatureName.startswith("oe-averaged-word-vector"):
  111. vlValues = self.extractOEAvgWV(pFeatureName, pdFeatureParams)
  112. elif pFeatureName.startswith("avg-word-vector") or pFeatureName.startswith("sentence-average-word-vector"):
  113. vlValues = self.extractSentAvgWV(pFeatureName, pdFeatureParams)
  114. elif pFeatureName.startswith("at-ngram") or pFeatureName.startswith("aspect-term-ngrams"):
  115. vlValues = self.extractATNgrams(pFeatureName, pdFeatureParams)
  116. elif pFeatureName.startswith("oe-ngram") or pFeatureName.startswith("opinion-expression-ngrams"):
  117. vlValues = self.extractOENgrams(pFeatureName, pdFeatureParams)
  118. elif pFeatureName.startswith("sent-ngram") or pFeatureName.startswith("sentence-ngrams"):
  119. vlValues = self.extractSentNgrams(pFeatureName, pdFeatureParams)
  120. elif pFeatureName.startswith("oe-word-vectors") or pFeatureName.startswith("oe-all-words-vectors"):
  121. vlValues = self.extractOEWV(pFeatureName, pdFeatureParams)
  122. elif pFeatureName.startswith("at-word-vectors") or pFeatureName.startswith("at-all-words-vectors"):
  123. vlValues = self.extractATWV(pFeatureName, pdFeatureParams)
  124. elif pFeatureName.startswith("at-oe-wv-dep-path") or pFeatureName.startswith("at-oe-word-vectors-in-dependency-path"):
  125. vlValues = self.extractAT2OEDepPathWV(pFeatureName, pdFeatureParams)
  126. elif pFeatureName.startswith("top-at-vp-ngrams") or pFeatureName.startswith("topmost-at-verb-phrase-ngrams"):
  127. vlValues = self.extractTopATVPNgrams(pFeatureName, pdFeatureParams)
  128. elif pFeatureName.startswith("polar-word-presence"):
  129. vlValues = self.extractPolarWordPresence(pFeatureName, pdFeatureParams)
  130. elif pFeatureName.startswith("verb-form-in-at-heads") or pFeatureName.startswith("first-verb-form-in-at-head-chain"):
  131. vlValues = self.extractVerbFormInATHeads(pFeatureName, pdFeatureParams)
  132. elif pFeatureName.startswith("sent-score-stat") or pFeatureName.startswith("sentence-sentiment-score-statistic"):
  133. vlValues = self.extractSentScoreStat(pFeatureName, pdFeatureParams)
  134. elif pFeatureName.startswith("pre-extracted") or pFeatureName.startswith("preex"):
  135. vlValues = self.extractPreex(pdFeatureParams)
  136. else:
  137. raise Exception("Feature %s is unknown! Check the spelling." % pFeatureName)
  138. return vlValues
  139. # pre-extracted features
  140. def extractPreex(self, pdFeatureParams):
  141. '''
  142. Extracts a previously-extracted feature from a file
  143. The file contains a column of values for the feature.
  144. '''
  145. vlValues = []
  146. for vParam, vValue in pdFeatureParams.iteritems():
  147. if vParam.lower() in ["source", "file", "input"]:
  148. vlValues = [float(s) for s in open(vValue).read().strip().split('\n')]
  149. else:
  150. raise Exception("'%s' is not a valid feature parameter for pre-extracted features!" % vParam)
  151. return vlValues
  152. # surface features
  153. def extractATSurface(self, pFeatureName, pdFeatureParams):
  154. '''
  155. Extracts the aspect term surfaces
  156. '''
  157. # character used to replace spaces to avoid confusion in data files
  158. vSpaceFiller = '_'
  159. for vParam, vValue in pdFeatureParams.iteritems():
  160. if vParam in ["space-filler"]:
  161. if vValue != None:
  162. vSpaceFiller = vValue
  163. else:
  164. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  165. vlValues = [at.getForm().replace(' ', vSpaceFiller) for at in self.absaDS.getAspectTerms()]
  166. return vlValues
  167. def extractOESurface(self, pFeatureName, pdFeatureParams):
  168. '''
  169. Extracts the opinion expression surfaces
  170. Feature parameters include:
  171. - filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
  172. - closest: picks only closest OE to the AT in hand
  173. '''
  174. # character used to replace spaces to avoid confusion in data files
  175. vSpaceFiller = '_'
  176. vFilter = None
  177. for vParam, vValue in pdFeatureParams.iteritems():
  178. if vParam in ["space-filler"]:
  179. if vValue != None:
  180. vSpaceFiller = vValue
  181. elif vParam in ["filter"]:
  182. if vValue.lower() == "closest":
  183. vFilter = "closest"
  184. else:
  185. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  186. if vFilter == "closest":
  187. vlValues = []
  188. for vAT in self.absaDS.getAspectTerms():
  189. vCOE = self._extractClosestOE(vAT)
  190. if vCOE is None:
  191. vlValues.append('')
  192. else:
  193. vlValues.append(vCOE.getForm().replace(' ', vSpaceFiller))
  194. else:
  195. vlValues = [[oe.getForm().replace(' ', vSpaceFiller) for oe in at.sentence.getOEs()] for at in self.absaDS.getAspectTerms()]
  196. return vlValues
  197. def extractAT2OEConstPath(self, pFeatureName, pdFeatureParams):
  198. '''
  199. Extracts the constituency path between the aspect term and objective expressions
  200. Feature parameters include:
  201. - filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
  202. - closest: picks only closest OE to the AT in hand
  203. '''
  204. # character used to replace spaces to avoid confusion in data files
  205. vSpaceFiller = '_'
  206. vFilter = None
  207. for vParam, vValue in pdFeatureParams.iteritems():
  208. if vParam in ["space-filler"]:
  209. if vValue != None:
  210. vSpaceFiller = vValue
  211. elif vParam in ["filter"]:
  212. if vValue.lower() == "closest":
  213. vFilter = "closest"
  214. else:
  215. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  216. if vFilter == "closest":
  217. vlValues = []
  218. for vAT in self.absaDS.getAspectTerms():
  219. vCOE = self._extractClosestOE(vAT)
  220. if vCOE is None:
  221. vlValues.append('')
  222. else:
  223. vlValues.append(vAT.sentence.getConstTree().extractAT2OEPath(vAT, vCOE))
  224. else:
  225. vlValues = [[at.sentence.getConstTree().extractAT2OEPath(at, oe) for oe in at.sentence.getOEs()] for at in self.absaDS.getAspectTerms()]
  226. return vlValues
  227. def extractAT2OEDepPath(self, pFeatureName, pdFeatureParams):
  228. '''
  229. Extracts the dependency path between the aspect term and objective expressions
  230. Feature parameters include:
  231. - filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
  232. - closest: picks only closest OE to the AT in hand
  233. '''
  234. # character used to replace spaces to avoid confusion in data files
  235. vSpaceFiller = '_'
  236. vFilter = None
  237. for vParam, vValue in pdFeatureParams.iteritems():
  238. if vParam in ["space-filler"]:
  239. if vValue != None:
  240. vSpaceFiller = vValue
  241. elif vParam in ["filter"]:
  242. if vValue.lower() == "closest":
  243. vFilter = "closest"
  244. else:
  245. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  246. if vFilter == "closest":
  247. vlValues = []
  248. for vAT in self.absaDS.getAspectTerms():
  249. vCOE = self._extractClosestOE(vAT)
  250. if vCOE is None:
  251. vlValues.append('')
  252. else:
  253. vlValues.append(vAT.sentence.getDepTree().extractAT2OEDepRelPath(vAT, vCOE))
  254. else:
  255. vlValues = [[at.sentence.getDepTree().extractAT2OEDepRelPath(at, oe) for oe in at.sentence.getOEs()] for at in self.absaDS.getAspectTerms()]
  256. return vlValues
  257. def extractOEAvgSentScore(self, pFeatureName, pdFeatureParams):
  258. '''
  259. Extracts the average sentiment score of the opinion expression(s) tokens
  260. Feature parameters include:
  261. - filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
  262. - closest: picks only closest OE to the AT in hand
  263. '''
  264. vFilter = None
  265. for vParam, vValue in pdFeatureParams.iteritems():
  266. if vParam in ["filter"]:
  267. if vValue.lower() == "closest":
  268. vFilter = "closest"
  269. else:
  270. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  271. if vFilter == "closest":
  272. vlValues = []
  273. for vAT in self.absaDS.getAspectTerms():
  274. vCOE = self._extractClosestOE(vAT)
  275. if vCOE is None:
  276. vlValues.append(0.0)
  277. else:
  278. vlValues.append(vCOE.getAvgSentScore())
  279. else:
  280. vlValues = [at.sentence.getAvgOESentScore() for at in self.absaDS.getAspectTerms()]
  281. return vlValues
  282. def extractAvgSentScore(self, pFeatureName, pdFeatureParams):
  283. '''
  284. Extracts the average sentiment score of the sentence
  285. '''
  286. vlValues = [at.sentence.getAvgSentScore() for at in self.absaDS.getAspectTerms()]
  287. return vlValues
  288. def extractATAvgWV(self, pFeatureName, pdFeatureParams):
  289. '''
  290. Extracts the averaged word vectors of the aspect term tokens
  291. Feature parameters include:
  292. - wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
  293. supported by ml/wv.WordVector.
  294. '''
  295. vWVFile = ''
  296. for vParam, vValue in pdFeatureParams.iteritems():
  297. if vParam in ["wv-file", "word-vectors"]:
  298. vWVFile = vValue
  299. else:
  300. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  301. # loading the word vectors
  302. from ml import wv
  303. vWV = wv.WordVector()
  304. vWV.load(pWVFilename = vWVFile)
  305. # extracting word vectors
  306. vlValues = [vWV.getAvgVector(at.getTokens()) for at in self.absaDS.getAspectTerms()]
  307. return vlValues
  308. def extractOEAvgWV(self, pFeatureName, pdFeatureParams):
  309. '''
  310. Extracts the averaged word vectors of the opinion expression(s) tokens
  311. Feature parameters include:
  312. - filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
  313. - closest: picks only closest OE to the AT in hand
  314. - wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
  315. supported by ml/wv.WordVector.
  316. '''
  317. vFilter = None
  318. vWVFile = ''
  319. for vParam, vValue in pdFeatureParams.iteritems():
  320. if vParam in ["filter"]:
  321. if vValue.lower() == "closest":
  322. vFilter = "closest"
  323. elif vParam in ["wv-file", "word-vectors"]:
  324. vWVFile = vValue
  325. else:
  326. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  327. # loading the word vectors
  328. from ml import wv
  329. vWV = wv.WordVector()
  330. vWV.load(pWVFilename = vWVFile)
  331. # extracting word vectors
  332. if vFilter == "closest":
  333. vlValues = []
  334. for vAT in self.absaDS.getAspectTerms():
  335. vCOE = self._extractClosestOE(vAT)
  336. if vCOE is None:
  337. vlValues.append([0.0] * vWV.dimension)
  338. else:
  339. vlValues.append(vWV.getAvgVector(vCOE.getTokens()))
  340. else:
  341. vlValues = [vWV.getAvgVector([t for oe in at.sentence.getOEs() for t in oe.getTokens()]) for at in self.absaDS.getAspectTerms()]
  342. return vlValues
  343. def extractSentAvgWV(self, pFeatureName, pdFeatureParams):
  344. '''
  345. Extracts the averaged word vector of the sentence words
  346. '''
  347. vWVFile = ''
  348. for vParam, vValue in pdFeatureParams.iteritems():
  349. if vParam in ["wv-file", "word-vectors"]:
  350. vWVFile = vValue
  351. else:
  352. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  353. # loading the word vectors
  354. from ml import wv
  355. vWV = wv.WordVector()
  356. vWV.load(pWVFilename = vWVFile)
  357. vlValues = [vWV.getAvgVector(at.sentence.getTokens()) for at in self.absaDS.getAspectTerms()]
  358. return vlValues
  359. def extractATNgrams(self, pFeatureName, pdFeatureParams):
  360. '''
  361. Extracts the aspect term n-grams
  362. '''
  363. vOrder = 1
  364. for vParam, vValue in pdFeatureParams.iteritems():
  365. if vParam in ["order", 'n']:
  366. if vValue != None:
  367. vOrder = vValue
  368. else:
  369. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  370. from nlp import nlp
  371. vlValues = [['-'.join(ngrams) for ngrams in nlp.extractNGrams(at.getTokens(), vOrder)] for at in self.absaDS.getAspectTerms()]
  372. return vlValues
  373. def extractOENgrams(self, pFeatureName, pdFeatureParams):
  374. '''
  375. Extracts the opinion expression n-grams
  376. '''
  377. vFilter = None
  378. vOrder = 1
  379. for vParam, vValue in pdFeatureParams.iteritems():
  380. if vParam in ["order", 'n']:
  381. if vValue != None:
  382. vOrder = vValue
  383. elif vParam in ["filter"]:
  384. if vValue.lower() == "closest":
  385. vFilter = "closest"
  386. else:
  387. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  388. from nlp import nlp
  389. vlValues = []
  390. if vFilter == "closest":
  391. for vAT in self.absaDS.getAspectTerms():
  392. vCOE = self._extractClosestOE(vAT)
  393. if vCOE is None:
  394. vlValues.append([])
  395. else:
  396. vlValues.append(['-'.join(ngrams) for ngrams in nlp.extractNGrams(vCOE.getTokens(), vOrder)])
  397. else:
  398. for vAT in self.absaDS.getAspectTerms():
  399. vlOENgramsOfAT = []
  400. for vOE in vAT.sentence.getOEs():
  401. vlOENgramsOfAT += ['-'.join(ngrams) for ngrams in nlp.extractNGrams(vOE.getTokens(), vOrder)]
  402. vlValues.append(vlOENgramsOfAT)
  403. return vlValues
  404. def extractSentNgrams(self, pFeatureName, pdFeatureParams):
  405. '''
  406. Extracts the sentence n-grams
  407. '''
  408. vOrder = 1
  409. for vParam, vValue in pdFeatureParams.iteritems():
  410. if vParam in ["order", 'n']:
  411. if vValue != None:
  412. vOrder = vValue
  413. else:
  414. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  415. from nlp import nlp
  416. vlValues = [['-'.join(ngrams) for ngrams in nlp.extractNGrams(at.sentence.getTokens(), vOrder)] for at in self.absaDS.getAspectTerms()]
  417. return vlValues
  418. def extractOEWV(self, pFeatureName, pdFeatureParams):
  419. '''
  420. Extracts the concatenated word vectors of the opinion expression(s) tokens
  421. Feature parameters include:
  422. - filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
  423. - closest: picks only closest OE to the AT in hand
  424. - wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
  425. supported by ml/wv.WordVector.
  426. - vector-count: number of words from the beginning of opinion expression to extract the vector for. Since the length
  427. of opinion expressions are varying, a fixed number should be considered to be able to use in learning
  428. algorithm. For shorter OE than this count, zero vectors will be padded at the end.
  429. '''
  430. vFilter = None
  431. vWVFile = ''
  432. vWVCnt = 1
  433. for vParam, vValue in pdFeatureParams.iteritems():
  434. if vParam in ["filter"]:
  435. if vValue.lower() == "closest":
  436. vFilter = "closest"
  437. elif vParam in ["wv-file", "word-vectors"]:
  438. vWVFile = vValue
  439. elif vParam in ["vector-count", "word-count"]:
  440. vWVCnt = int(vValue)
  441. else:
  442. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  443. # loading the word vectors
  444. from ml import wv
  445. vWV = wv.WordVector()
  446. vWV.load(pWVFilename = vWVFile)
  447. # calculating the concatenated vector size
  448. vConcatVectorSize = vWV.dimension * vWVCnt
  449. # extracting word vectors
  450. vlValues = []
  451. if vFilter == "closest":
  452. for vAT in self.absaDS.getAspectTerms():
  453. vCOE = self._extractClosestOE(vAT)
  454. if vCOE is None:
  455. vlValues.append([np.float64(0.0)] * vWV.dimension * vWVCnt)
  456. else:
  457. # extracting concatenated vectors of all OEs in aspect term's sentence
  458. vlOEConcatVectors = [e for t in vCOE.getTokens()[:vWVCnt] for e in vWV.getVector(t)]
  459. # padding for short OEs
  460. vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
  461. if vPadSize > 0:
  462. vlOEConcatVectors += [np.float64(0.0)] * vPadSize
  463. vlValues.append(vlOEConcatVectors)
  464. else:
  465. for vAT in self.absaDS.getAspectTerms():
  466. # extracting concatenated vectors of all OEs in aspect term's sentence
  467. vlOEConcatVectors = []
  468. for vOE in vAT.sentence.getOEs():
  469. vlOEConcatVectors += [e for t in vOE.getTokens()[:vWVCnt] for e in vWV.getVector(t)]
  470. # padding for short OEs
  471. vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
  472. if vPadSize > 0:
  473. vlOEConcatVectors += [np.float64(0.0)] * vPadSize
  474. vlValues.append(vlOEConcatVectors)
  475. return vlValues
  476. def extractATWV(self, pFeatureName, pdFeatureParams):
  477. '''
  478. Extracts the concatenated word vectors of the aspect term tokens
  479. Feature parameters include:
  480. - wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
  481. supported by ml/wv.WordVector.
  482. - vector-count: number of words from the beginning of aspect term to extract the vector for. Since the length
  483. of aspect terms are varying, a fixed number should be considered to be able to use in learning
  484. algorithm. For shorter AT than this count, zero vectors will be padded at the end.
  485. '''
  486. vWVFile = ''
  487. vWVCnt = 1
  488. for vParam, vValue in pdFeatureParams.iteritems():
  489. if vParam in ["wv-file", "word-vectors"]:
  490. vWVFile = vValue
  491. elif vParam in ["vector-count", "word-count"]:
  492. vWVCnt = int(vValue)
  493. else:
  494. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  495. # loading the word vectors
  496. from ml import wv
  497. vWV = wv.WordVector()
  498. vWV.load(pWVFilename = vWVFile)
  499. # calculating the concatenated vector size
  500. vConcatVectorSize = vWV.dimension * vWVCnt
  501. # extracting word vectors
  502. vlValues = []
  503. for vAT in self.absaDS.getAspectTerms():
  504. # extracting concatenated vectors of all ATs in aspect term's sentence
  505. vlATConcatVectors = [e for t in vAT.getTokens()[:vWVCnt] for e in vWV.getVector(t)]
  506. # padding for short ATs
  507. vPadSize = vConcatVectorSize - len(vlATConcatVectors)
  508. if vPadSize > 0:
  509. vlATConcatVectors += [np.float64(0.0)] * vPadSize
  510. vlValues.append(vlATConcatVectors)
  511. return vlValues
  512. def extractAT2OEDepPathWV(self, pFeatureName, pdFeatureParams):
  513. '''
  514. Extracts the word vectors of the words in the dependency path between the aspect term and objective expressions
  515. Feature parameters include:
  516. - filter: filters the opinion expressions in a sentence. The following ar ethe possible values:
  517. - closest: picks only closest OE to the AT in hand
  518. - wv-file: name of the file in which the word vectors are stored. The file should be in general word vector format
  519. supported by ml/wv.WordVector.
  520. - vector-count: number of words from the beginning of the path to extract the vector for. Since the length of the
  521. path is varying, a fixed number should be considered to be able to use in learning algorithm. For
  522. shorter path than this count, zero vectors will be padded at the end.
  523. '''
  524. vFilter = None
  525. vWVFile = ''
  526. vWVCnt = 1
  527. for vParam, vValue in pdFeatureParams.iteritems():
  528. if vParam in ["filter"]:
  529. if vValue.lower() == "closest":
  530. vFilter = "closest"
  531. elif vParam in ["wv-file", "word-vectors"]:
  532. vWVFile = vValue
  533. elif vParam in ["vector-count", "word-count"]:
  534. vWVCnt = int(vValue)
  535. else:
  536. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  537. # loading the word vectors
  538. from ml import wv
  539. vWV = wv.WordVector()
  540. vWV.load(pWVFilename = vWVFile)
  541. # calculating the concatenated vector size
  542. vConcatVectorSize = vWV.dimension * vWVCnt
  543. # extracting word vectors
  544. vlValues = []
  545. if vFilter == "closest":
  546. for vAT in self.absaDS.getAspectTerms():
  547. vCOE = self._extractClosestOE(vAT)
  548. if vCOE is None:
  549. vlValues.append([np.float64(0.0)] * vWV.dimension * vWVCnt)
  550. else:
  551. # extracting concatenated vectors of all OEs in aspect term's sentence
  552. vlOEConcatVectors = [e for t in vAT.sentence.getDepTree().extractAT2OEDepPathWords(vAT, vCOE)[:vWVCnt] for e in vWV.getVector(t)]
  553. # padding for short OEs
  554. vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
  555. if vPadSize > 0:
  556. vlOEConcatVectors += [np.float64(0.0)] * vPadSize
  557. vlValues.append(vlOEConcatVectors)
  558. else:
  559. for vAT in self.absaDS.getAspectTerms():
  560. # extracting concatenated vectors of all OEs in aspect term's sentence
  561. vlOEConcatVectors = []
  562. for vOE in vAT.sentence.getOEs():
  563. vlOEConcatVectors += [e for t in vAT.sentence.getDepTree().extractAT2OEDepPathWords(vAT, vOE)[:vWVCnt] for e in vWV.getVector(t)]
  564. # padding for short OEs
  565. vPadSize = vConcatVectorSize - len(vlOEConcatVectors)
  566. if vPadSize > 0:
  567. vlOEConcatVectors += [np.float64(0.0)] * vPadSize
  568. vlValues.append(vlOEConcatVectors)
  569. return vlValues
  570. def extractTopATVPNgrams(self, pFeatureName, pdFeatureParams):
  571. '''
  572. Extracts the n-grams under the topmost VP node dominating the aspect term
  573. '''
  574. vOrder = 1
  575. for vParam, vValue in pdFeatureParams.iteritems():
  576. if vParam in ["order", 'n']:
  577. if vValue != None:
  578. vOrder = vValue
  579. else:
  580. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  581. from nlp import nlp
  582. vlValues = []
  583. for vAT in self.absaDS.getAspectTerms():
  584. vTopVPNode = vAT.sentence.getConstTree().extractTopmostVP(vAT.getTokenSpan())
  585. if vTopVPNode is not None:
  586. vlValues.append(['-'.join(ngrams) for ngrams in nlp.extractNGrams(vTopVPNode.getTokens(), vOrder)])
  587. else:
  588. vlValues.append([])
  589. return vlValues
  590. def extractPolarWordPresence(self, pFeatureName, pdFeatureParams):
  591. '''
  592. Extracts the presence of
  593. '''
  594. vNeutralScore = 0
  595. for vParam, vValue in pdFeatureParams.iteritems():
  596. if vParam in ["neutral", 'neutral-score']:
  597. if vValue != None:
  598. vNeutralScore = vValue
  599. else:
  600. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  601. vlValues = [0 if len(at.sentence.getPolarScores(pNeutralScore = vNeutralScore)) == 0 else 1 for at in self.absaDS.getAspectTerms()]
  602. return vlValues
  603. def extractVerbFormInATHeads(self, pFeatureName, pdFeatureParams):
  604. '''
  605. Extracts the word form of the first verb in the head chain of the aspect term
  606. The following are the parameters:
  607. - sign: if true, the negation of the verb will be reflected by inserting a - before the verb form. For example,
  608. "does not have" will be represented as -have.
  609. '''
  610. vflgSign = False
  611. for vParam, vValue in pdFeatureParams.iteritems():
  612. if vParam in ["sign", "negation"]:
  613. if vValue != None:
  614. vflgSign = vValue
  615. else:
  616. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  617. vlValues = []
  618. for vAT in self.absaDS.getAspectTerms():
  619. vATNode = vAT.sentence.getDepTree().getNode(vAT.getTokenSpan()[0])
  620. # finding the verb in the head chain
  621. vVerbNode = None
  622. for vNode in vATNode.getHeadChain():
  623. if vNode.isVerb():
  624. vVerbNode = vNode
  625. break
  626. if vVerbNode is not None:
  627. if vVerbNode.isNegated():
  628. vlValues.append("-%s" % vVerbNode.getForm())
  629. else:
  630. vlValues.append(vVerbNode.getForm())
  631. else:
  632. vlValues.append('')
  633. return vlValues
  634. def extractSentScoreStat(self, pFeatureName, pdFeatureParams):
  635. '''
  636. Extracts statistic about sentiment score of the sentence
  637. Statistics include counts or percentages of positive/negative/neutral words.
  638. '''
  639. vPolarity = "positive"
  640. vStat = "count"
  641. for vParam, vValue in pdFeatureParams.iteritems():
  642. if vParam == "stat":
  643. if vValue != None:
  644. vStat = vValue
  645. elif vParam in ["polarity", "class", "category"]:
  646. if vValue != None:
  647. vPolarity = vValue
  648. else:
  649. raise Exception("'%s' is not a valid feature parameter for %s!" % (vParam, pFeatureName))
  650. ## 1. creating a list of sentiment scores per instance where the target sentiment is replaced by 1 and others
  651. ## by 0, e.g. in [0, +1, -1, 0, 0, -1] with target being negative polarity the outcome is [0, 0, 1, 0, 0, 1]
  652. if vPolarity.lower().startswith("pos"):
  653. vlMap = [[1 if s > 0 else 0 for s in at.sentence.getSentimentScores()] for at in self.absaDS.getAspectTerms()]
  654. elif vPolarity.lower().startswith("neg"):
  655. vlMap = [[1 if s < 0 else 0 for s in at.sentence.getSentimentScores()] for at in self.absaDS.getAspectTerms()]
  656. elif vPolarity.lower().startswith("neu"):
  657. vlMap = [[1 if s == 0 else 0 for s in at.sentence.getSentimentScores()] for at in self.absaDS.getAspectTerms()]
  658. # 2. creating the statistic
  659. if vStat.lower().startswith("count"):
  660. vlValues = [sum(m) for m in vlMap]
  661. elif vStat.lower().startswith("percent"):
  662. vlValues = [sum(m) * 1.0 / len(m) for m in vlMap]
  663. else:
  664. raise Exception("'%s' is not a valid statistic for %s!" % (vStat, pFeatureName))
  665. return vlValues
  666. # auxiliary methods
  667. def _extractClosestOE(self, pAspectTerm):
  668. '''
  669. Extracts and returns the closest opinion expression to the given aspect term in the sentence in terms of token
  670. number
  671. '''
  672. vdDists = {oe: util.getSpanDistance(pAspectTerm.getTokenSpan(), oe.getTokenSpan()) for oe in pAspectTerm.sentence.getOEs()}
  673. if vdDists == {}:
  674. return None
  675. else:
  676. return min(vdDists, key = lambda x: abs(vdDists[x]))