lm.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222
  1. #! /usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. """
  4. This module defines classes for language model processing.
  5. Version 0.2 (27-Jun-2013)
  6. - SRILM.readOutput(), which was used to read the output of various
  7. commands, is removed and its work is distributed to functions reading a
  8. specific output type, such as language models and n-gram counts.
  9. - SRILM._readNGramOutput() which was only reading the output of LM
  10. evaluation (scoring) is renamed to loadEval().
  11. - NGramCounts is added
  12. Version 0.1 (03-Jun-2013)
  13. - SRILM and SRILMEval are added.
  14. """
  15. import re
  16. ## Start of SRILM ######################################################
  17. class SRILM:
  18. '''
  19. Wrapper around SRILM toolkit.
  20. NOTE: preliminary yet (only reading output)
  21. '''
  22. def __init__(self):
  23. '''
  24. Constructor
  25. '''
  26. # each elemement is of SRILMEval type
  27. self.segmentEvals = []
  28. self.documentEval = None
  29. def loadEval(self, pOutput, pDebug):
  30. '''
  31. Loads the evaluation (scoring) output
  32. '''
  33. self.segmentEvals = []
  34. self.documentEval = SRILMEval()
  35. vflgDocSumStarted = False
  36. for vLine in pOutput.strip().split('\n'):
  37. if re.match("^file [^ ]+: [^ ]+ sentences, [^ ]+ words, [^ ]+ OOVs", vLine):
  38. vflgDocSumStarted = True
  39. elif re.match("[^ ]+ sentences, [^ ]+ words, [^ ]+ OOVs", vLine):
  40. if vflgDocSumStarted:
  41. self.documentEval.loadOOV(vLine)
  42. else:
  43. self.segmentEvals.append(SRILMEval())
  44. self.segmentEvals[-1].loadOOV(vLine)
  45. elif re.match("[^ ]+ zeroprobs, logprob= [^ ]+ ppl= [^ ]+ ppl1= [^ ]+", vLine):
  46. if vflgDocSumStarted:
  47. self.documentEval.loadScores(vLine)
  48. else:
  49. self.segmentEvals[-1].loadScores(vLine)
  50. class SRILMEval:
  51. '''
  52. Class for evaluating text against SRILM language model.
  53. NOTE: preliminary yet (only reading output)
  54. '''
  55. def __init__(self):
  56. '''
  57. Constructor
  58. '''
  59. self.oov = 0
  60. self.zeroprobs = 0
  61. self.logprob = 0.0
  62. self.ppl = 0.0
  63. self.ppl1 = 0.0
  64. def loadOOV(self, pSumLine):
  65. '''
  66. Loads OOV count from the summary line in the output of ngram command.
  67. '''
  68. vlSum = re.findall("[^ ]+ sentences, [^ ]+ words, ([^ ]+) OOVs", pSumLine)
  69. if len(vlSum) == 0:
  70. raise Exception("Not a valid summary line containing OOV: %s " % pSumLine)
  71. else:
  72. self.zeroprobs = int(vlSum[0][0])
  73. def loadScores(self, pScoreLine):
  74. '''
  75. Loads scores from score line in the output of ngram command.
  76. '''
  77. vlScores = re.findall("([^ ]+) zeroprobs, logprob= ([^ ]+) ppl= ([^ ]+) ppl1= ([^ ]+)", pScoreLine)
  78. if len(vlScores) == 0:
  79. raise Exception("Not a valid score line: %s " % pScoreLine)
  80. else:
  81. self.zeroprobs = int(vlScores[0][0])
  82. self.logprob = float(vlScores[0][1])
  83. self.prob = 10**self.logprob
  84. self.ppl = float(vlScores[0][2])
  85. self.ppl1 = float(vlScores[0][3])
  86. ## End of SRILM ########################################################
  87. ## Start of NGramCounts ################################################
  88. class NGramCounts:
  89. '''
  90. Class for n-gram counts of a corpus
  91. '''
  92. def __init__(self, pOrder):
  93. '''
  94. Constructor
  95. '''
  96. self.order = pOrder
  97. # a list of 2-tuples of n-grams and their counts
  98. self.counts = []
  99. def loadNGramCounts(self, pCounts, pFormat):
  100. '''
  101. Loads n-gram counts from pCounts in pFormat.
  102. It only loads n-grams of order self.order and ignore other orders.
  103. SRILM format is supported:
  104. '''
  105. self.counts = []
  106. if pFormat.lower() == "srilm":
  107. self._loadSRILMNGramCounts(pCounts)
  108. else:
  109. raise Exception("%s is not a supported format" % pFormat)
  110. if len(self.counts) == 0:
  111. raise Exception("No %s-gram was loaded!" % self.order)
  112. def _loadSRILMNGramCounts(self, pCounts):
  113. '''
  114. Loads n-gram counts from pCounts in SRILM n-gram count output
  115. format (-write option of ngram-count command).
  116. It only loads n-grams of order self.order and ignore other orders.
  117. The format is:
  118. <N-GRAM>\t<COUNT>
  119. where N-GRAM is:
  120. <TOKEN 1> <TOKEN 2> ... <TOKEN n>
  121. '''
  122. vlCountLines = pCounts.strip().split('\n')
  123. for vLine in vlCountLines:
  124. vNGram, vCount = vLine.split("\t")
  125. if len(vNGram.split()) != self.order:
  126. continue
  127. self.counts.append((vNGram, int(vCount)))
  128. def getQuantileSubset(self, pQuantileType, pSubsetNo):
  129. '''
  130. Returns the subset number pSubsetNo of the n-gram counts sliced
  131. by quantiles pQuantileType.
  132. For example, for pQuantileType = 4 (4-quantile or quartile) and
  133. pSubsetNo = 1, it sorts the n-gram counts, partitions is into 4
  134. quarters and returns the subset in quarter 1 (the lowest frequency
  135. subset). For pSubsetNo = 4, it would return the 4th quarter which
  136. (the highest frequency subset).
  137. Subset n starts at the item after quantile n-1 and ends at quantile
  138. n, where a quantile r (rank of quartile in fact) is
  139. computed as:
  140. len(counts) * r / quantile type
  141. '''
  142. vlSortedCounts = sorted(self.counts, key = lambda x: x[1])
  143. vLen = len(vlSortedCounts)
  144. vQuantileNminus1 = round(vLen * (pSubsetNo - 1) * 1.0 / pQuantileType)
  145. vQuantileN = round(vLen * pSubsetNo * 1.0 / pQuantileType)
  146. return vlSortedCounts[int(vQuantileNminus1) : int(vQuantileN)]
  147. ## End of NGramCounts ##################################################