sentexicon.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. #! /usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. """
  4. This module defines classes to load and manipulate sentiment lexicons.
  5. Version 0.2 (28-Sep-2015)
  6. - collapseScores() is added to Sentexicon.
  7. - collapseScore() is added to SentexEntry.
  8. Version 0.1 (03-Sep-2015 to 04-Sep-2015)
  9. - Sentexicon and SentexEntry are added.
  10. """
  11. class Sentexicon:
  12. '''
  13. Sentiment lexicon class
  14. '''
  15. def __init__(self):
  16. '''
  17. Constructor
  18. '''
  19. ## lexicon entry dictionary: keys are words and values SentexEntry
  20. ## objects; dictionary is used for search efficiency
  21. ## NOTE: words is stored both in the keys and the objects, which
  22. ## may sound odd. However, being in dictionary keys makes the search
  23. ## efficient and being in the entry objects make them self-contained.
  24. self.entries = {}
  25. # character case of entries: (lower)cased, (upper)cased or (true)cased (i.e. original case).
  26. self.case = "lower"
  27. def loadFromCaraCSV(self, pFilename, pCase, pflgAppend = False):
  28. '''
  29. Loads the lexicon from file in Cara CSV format
  30. pCase specifies the case used in the lexicon. That is whether the
  31. entries are (lower)cased, (upper)cased or (true)cased (i.e. original
  32. case).
  33. For format details, see a lexicon sample in https://github.com/CNGL-repo/Cara/tree/master/src/feature_extraction/lexicon
  34. For more description, see https://github.com/CNGL-repo/Cara/wiki/Lexicon augmentation.
  35. '''
  36. vLines = open(pFilename).read().strip().split('\n')
  37. self.setCase(pCase)
  38. if not pflgAppend:
  39. self.entries = {}
  40. for vLine in vLines[1:]:
  41. vEntry = SentexEntry()
  42. vEntry.loadFromCaraCSV(vLine)
  43. self.entries[vEntry.word] = vEntry
  44. def setCase(self, pCase):
  45. '''
  46. Sets the character case of the lexicon entries
  47. Values are:
  48. - (lower)cased
  49. - (upper)cased
  50. - (true)cased (i.e. original case)
  51. '''
  52. if pCase.lower().startswith("lower"):
  53. self.case = "lower"
  54. elif pCase.lower().startswith("upper"):
  55. self.case = "upper"
  56. elif pCase.lower().startswith("true"):
  57. self.case = "true"
  58. else:
  59. raise Exception("Unknown character case: %s" % pCase)
  60. def getScore(self, pWord):
  61. '''
  62. Returns the sentiment score of the given word if it is found in the
  63. lexicon, otherwise None
  64. '''
  65. if self.case == "lower":
  66. vWord = pWord.lower()
  67. elif self.case == "upper":
  68. vWord = pWord.upper()
  69. elif self.case == "true":
  70. vWord = pWord
  71. try:
  72. return self.entries[vWord].score
  73. except KeyError:
  74. return None
  75. def collapseScores(self):
  76. '''
  77. Collapses the sentiment scores to a coarse scale comprising three
  78. scores for negative (-1), neutral(0) and positive (+1) sentiments
  79. '''
  80. for vEntry in self.entries:
  81. self.entries[vEntry].collapseScore()
  82. class SentexEntry:
  83. '''
  84. Sentiment lexicon entry class
  85. '''
  86. def __init__(self):
  87. '''
  88. Constructor
  89. '''
  90. self._word = ''
  91. self._score = None # sentiment score
  92. def loadFromCaraCSV(self, pLine):
  93. '''
  94. Loads the lexicon entry from a line in Cara CSV format
  95. For details, see a lexicon sample in https://github.com/CNGL-repo/Cara/tree/master/src/feature_extraction/lexicon
  96. For more description, see https://github.com/CNGL-repo/Cara/wiki/Lexicon augmentation.
  97. '''
  98. vLineSplit = pLine.split(',')
  99. self._word = vLineSplit[0]
  100. try:
  101. self._score = float(vLineSplit[1])
  102. except ValueError:
  103. raise Exception("The sentiment score in not valid in: %s" % pLine)
  104. @property
  105. def word(self):
  106. '''
  107. Returns the entry word
  108. '''
  109. return self._word
  110. @property
  111. def score(self):
  112. '''
  113. Returns the sentiment score of the entry
  114. '''
  115. return self._score
  116. def collapseScore(self):
  117. '''
  118. Collapses the sentiment score to a coarse scale comprising three
  119. scores for negative (-1), neutral(0) and positive (+1) sentiments
  120. '''
  121. if self._score > 0:
  122. self._score = 1
  123. elif self._score < 0:
  124. self.score = -1