123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186 |
- #! /usr/bin/python
- # -*- coding: utf-8 -*-
- """
- This module defines classes to load and manipulate sentiment lexicons.
-
- Version 0.2 (28-Sep-2015)
- - collapseScores() is added to Sentexicon.
- - collapseScore() is added to SentexEntry.
-
- Version 0.1 (03-Sep-2015 to 04-Sep-2015)
- - Sentexicon and SentexEntry are added.
- """
class Sentexicon(object):
    '''
    Sentiment lexicon class

    Holds a dictionary of lexicon entries keyed by word and provides lookup
    of their sentiment scores, honouring the character case of the lexicon.
    '''

    def __init__(self):
        '''
        Constructor

        Creates an empty lexicon whose case defaults to "lower".
        '''

        ## lexicon entry dictionary: keys are words and values are SentexEntry
        ## objects; a dictionary is used for search efficiency
        ## NOTE: words are stored both in the keys and in the objects, which
        ## may sound odd. However, being in the dictionary keys makes the
        ## search efficient and being in the entry objects makes them
        ## self-contained.
        self.entries = {}

        # character case of entries: (lower)cased, (upper)cased or
        # (true)cased (i.e. original case).
        self.case = "lower"

    def loadFromCaraCSV(self, pFilename, pCase, pflgAppend = False):
        '''
        Loads the lexicon from file in Cara CSV format

        pFilename is the path of the lexicon file. Its first line is treated
        as a header and skipped; every following non-blank line is parsed
        into one entry by SentexEntry.loadFromCaraCSV().

        pCase specifies the case used in the lexicon. That is whether the
        entries are (lower)cased, (upper)cased or (true)cased (i.e. original
        case). An unknown value raises ValueError (see setCase()).

        pflgAppend, when true, keeps the entries already loaded and adds the
        new ones to them (an existing word is overwritten); otherwise the
        lexicon is emptied first.

        For format details, see a lexicon sample in https://github.com/CNGL-repo/Cara/tree/master/src/feature_extraction/lexicon
        For more description, see https://github.com/CNGL-repo/Cara/wiki/Lexicon augmentation.
        '''

        # the context manager closes the file even if reading fails
        with open(pFilename) as vFile:
            vLines = vFile.read().strip().split('\n')

        self.setCase(pCase)

        if not pflgAppend:
            self.entries = {}

        for vLine in vLines[1:]:
            # blank lines inside the file carry no entry; skip them instead
            # of failing while parsing them
            if not vLine.strip():
                continue

            vEntry = SentexEntry()
            vEntry.loadFromCaraCSV(vLine)
            self.entries[vEntry.word] = vEntry

    def setCase(self, pCase):
        '''
        Sets the character case of the lexicon entries

        Values are (matched case-insensitively, by prefix):
        - (lower)cased
        - (upper)cased
        - (true)cased (i.e. original case)

        Raises ValueError for any other value; the current case is then left
        unchanged.
        '''

        vCase = pCase.lower()

        for vKnownCase in ("lower", "upper", "true"):
            if vCase.startswith(vKnownCase):
                self.case = vKnownCase
                return

        raise ValueError("Unknown character case: %s" % pCase)

    def getScore(self, pWord):
        '''
        Returns the sentiment score of the given word if it is found in the
        lexicon, otherwise None

        The word is converted to the case of the lexicon before the lookup.
        '''

        if self.case == "lower":
            vWord = pWord.lower()
        elif self.case == "upper":
            vWord = pWord.upper()
        else:
            # "true" case: look the word up as it is. This branch also covers
            # an unexpected value assigned directly to self.case, so that the
            # lookup never fails on an undefined variable.
            vWord = pWord

        try:
            return self.entries[vWord].score
        except KeyError:
            return None

    def collapseScores(self):
        '''
        Collapses the sentiment scores to a coarse scale comprising three
        scores for negative (-1), neutral(0) and positive (+1) sentiments

        Every entry is modified in place through its collapseScore() method.
        '''

        for vEntry in self.entries.values():
            vEntry.collapseScore()
-
-
-
class SentexEntry(object):
    '''
    Sentiment lexicon entry class

    Holds one word together with its sentiment score. Both are exposed as
    read-only properties.
    '''

    def __init__(self):
        '''
        Constructor

        Creates an empty entry: the word is the empty string and the score is
        None until the entry is loaded.
        '''

        self._word = ''
        self._score = None      # sentiment score

    def loadFromCaraCSV(self, pLine):
        '''
        Loads the lexicon entry from a line in Cara CSV format

        The line is split on commas; the first field is taken as the word and
        the second as the sentiment score. Any further fields are ignored.

        Raises ValueError if the line has no second field or if that field is
        not a number. The entry is left unchanged in that case.

        For details, see a lexicon sample in https://github.com/CNGL-repo/Cara/tree/master/src/feature_extraction/lexicon
        For more description, see https://github.com/CNGL-repo/Cara/wiki/Lexicon augmentation.
        '''

        vLineSplit = pLine.split(',')

        # a missing score field (IndexError) is reported the same way as a
        # non-numeric one (ValueError)
        try:
            vScore = float(vLineSplit[1])
        except (IndexError, ValueError):
            raise ValueError("The sentiment score in not valid in: %s" % pLine)

        # assign only after a successful parse so that a bad line does not
        # leave the entry half-updated
        self._word = vLineSplit[0]
        self._score = vScore

    @property
    def word(self):
        '''
        Returns the entry word
        '''

        return self._word

    @property
    def score(self):
        '''
        Returns the sentiment score of the entry
        '''

        return self._score

    def collapseScore(self):
        '''
        Collapses the sentiment score to a coarse scale comprising three
        scores for negative (-1), neutral(0) and positive (+1) sentiments

        A zero score is kept as it is. An entry which has no score yet (None)
        is left untouched.
        '''

        if self._score is None:
            return

        # write to the backing attribute: 'score' is a read-only property
        if self._score > 0:
            self._score = 1
        elif self._score < 0:
            self._score = -1
-
-
|