123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222 |
- #! /usr/bin/python
- # -*- coding: utf-8 -*-
- """
- This module defines classes for language model processing.
-
- Version 0.2 (27-Jun-2013)
- - SRILM.readOutput(), which was used to read the output of various
- commands, is removed and its work is distributed to functions reading
- specific output types such as language model and n-gram counts.
- - SRILM._readNGramOutput() which was only reading the output of LM
- evaluation (scoring) is renamed to loadEval().
- - NGramCounts is added
-
-
- Version 0.1 (03-Jun-2013)
- - SRILM and SRILMEval are added.
-
- """
- import re
- ## Start of SRILM ######################################################
class SRILM:
    '''
    Wrapper around SRILM toolkit.

    NOTE: preliminary yet (only reading output)
    '''

    def __init__(self):
        '''
        Constructor
        '''

        # each element is of SRILMEval type
        self.segmentEvals = []
        # SRILMEval holding the document-level summary; set by loadEval()
        self.documentEval = None

    def loadEval(self, pOutput, pDebug):
        '''
        Loads the evaluation (scoring) output.

        pOutput is the text of the scoring output. It is scanned line by
        line for three kinds of lines:

        - "file <name>: <n> sentences, <n> words, <n> OOVs" marks the start
          of the document summary; its OOV count is loaded into
          self.documentEval.
        - "<n> sentences, <n> words, <n> OOVs" starts a new entry in
          self.segmentEvals (or, once the document summary has started, is
          loaded into self.documentEval).
        - "<n> zeroprobs, logprob= <x> ppl= <x> ppl1= <x>" supplies the
          scores of the most recent segment, or of the document once the
          document summary has started.

        All other lines are ignored. Both self.segmentEvals and
        self.documentEval are reset on every call.

        pDebug is accepted for interface compatibility but is not used.

        Raises IndexError if a score line appears before any segment
        summary line and before the document summary.
        '''

        self.segmentEvals = []
        self.documentEval = SRILMEval()
        vflgDocSumStarted = False

        for vLine in pOutput.strip().split('\n'):
            if re.match(r"^file [^ ]+: [^ ]+ sentences, [^ ]+ words, [^ ]+ OOVs", vLine):
                vflgDocSumStarted = True
                # The document OOV count is on this same line; the plain
                # summary pattern below cannot match it because of the
                # "file <name>:" prefix, so load it here.
                self.documentEval.loadOOV(vLine)
            elif re.match(r"[^ ]+ sentences, [^ ]+ words, [^ ]+ OOVs", vLine):
                if vflgDocSumStarted:
                    self.documentEval.loadOOV(vLine)
                else:
                    self.segmentEvals.append(SRILMEval())
                    self.segmentEvals[-1].loadOOV(vLine)
            elif re.match(r"[^ ]+ zeroprobs, logprob= [^ ]+ ppl= [^ ]+ ppl1= [^ ]+", vLine):
                if vflgDocSumStarted:
                    self.documentEval.loadScores(vLine)
                else:
                    self.segmentEvals[-1].loadScores(vLine)
-
-
-
class SRILMEval:
    '''
    Class for evaluating text against SRILM language model.

    Holds the figures parsed from one summary/score line pair: OOV count,
    zeroprob count, log probability, probability and the two perplexities.

    NOTE: preliminary yet (only reading output)
    '''

    def __init__(self):
        '''
        Constructor
        '''

        self.oov = 0
        self.zeroprobs = 0
        self.logprob = 0.0
        # kept consistent with logprob (10 ** 0.0) so the attribute always
        # exists, even before loadScores() is called
        self.prob = 10 ** self.logprob
        self.ppl = 0.0
        self.ppl1 = 0.0

    def loadOOV(self, pSumLine):
        '''
        Loads OOV count from the summary line in the output of ngram command.

        The line must contain "<n> sentences, <n> words, <n> OOVs"; the
        pattern is searched anywhere in the line, so a leading
        "file <name>: " prefix is accepted. The count is stored in self.oov.

        Raises Exception if the line does not contain such a summary, and
        ValueError if the OOV field is not an integer.
        '''

        vMatch = re.search(r"[^ ]+ sentences, [^ ]+ words, ([^ ]+) OOVs", pSumLine)

        if vMatch is None:
            raise Exception("Not a valid summary line containing OOV: %s " % pSumLine)

        # Use the whole captured field (not just its first character) and
        # store it as the OOV count rather than the zeroprob count.
        self.oov = int(vMatch.group(1))

    def loadScores(self, pScoreLine):
        '''
        Loads scores from score line in the output of ngram command.

        The line must contain
        "<n> zeroprobs, logprob= <x> ppl= <x> ppl1= <x>". Sets
        self.zeroprobs, self.logprob, self.prob (10 ** logprob), self.ppl
        and self.ppl1.

        Raises Exception if the line does not contain such scores, and
        ValueError if a field cannot be converted to a number.
        '''

        vMatch = re.search(r"([^ ]+) zeroprobs, logprob= ([^ ]+) ppl= ([^ ]+) ppl1= ([^ ]+)", pScoreLine)

        if vMatch is None:
            raise Exception("Not a valid score line: %s " % pScoreLine)

        vZeroprobs, vLogprob, vPpl, vPpl1 = vMatch.groups()

        self.zeroprobs = int(vZeroprobs)
        self.logprob = float(vLogprob)
        # logprob is a base-10 logarithm
        self.prob = 10 ** self.logprob
        self.ppl = float(vPpl)
        self.ppl1 = float(vPpl1)
-
-
- ## End of SRILM ########################################################
- ## Start of NGramCounts ################################################
class NGramCounts:
    '''
    Class for n-gram counts of a corpus
    '''

    def __init__(self, pOrder):
        '''
        Constructor

        pOrder is the n-gram order (number of tokens per n-gram) to keep
        when loading counts.
        '''

        self.order = pOrder

        # a list of 2-tuples of n-grams and their counts
        self.counts = []

    def loadNGramCounts(self, pCounts, pFormat):
        '''
        Loads n-gram counts from pCounts in pFormat.

        It only loads n-grams of order self.order and ignores other orders.
        Any previously loaded counts are discarded.

        Only the SRILM format ("srilm", case-insensitive) is supported.

        Raises Exception if pFormat is not supported or if no n-gram of
        order self.order was found in pCounts.
        '''

        self.counts = []

        if pFormat.lower() == "srilm":
            self._loadSRILMNGramCounts(pCounts)
        else:
            raise Exception("%s is not a supported format" % pFormat)

        if not self.counts:
            raise Exception("No %s-gram was loaded!" % self.order)

    def _loadSRILMNGramCounts(self, pCounts):
        '''
        Loads n-gram counts from pCounts in SRILM n-gram count output
        format (-write option of ngram-count command).

        It only loads n-grams of order self.order and ignores other orders.
        Loaded entries are appended to self.counts as (n-gram, count)
        tuples, the n-gram being kept as the original space-separated
        string.

        The format is:
            <N-GRAM><TAB><COUNT>
        where N-GRAM is:
            <TOKEN 1> <TOKEN 2> ... <TOKEN n>

        Blank lines are skipped. Raises ValueError if a non-blank line does
        not consist of exactly two tab-separated fields or if the count is
        not an integer.
        '''

        for vLine in pCounts.strip().split('\n'):
            # Skip blank lines (including the single empty "line" produced
            # by empty input) so they do not break the two-field unpacking
            # below.
            if not vLine.strip():
                continue

            vNGram, vCount = vLine.split("\t")

            if len(vNGram.split()) != self.order:
                continue

            self.counts.append((vNGram, int(vCount)))

    def getQuantileSubset(self, pQuantileType, pSubsetNo):
        '''
        Returns the subset number pSubsetNo of the n-gram counts sliced
        by quantiles pQuantileType.

        For example, for pQuantileType = 4 (4-quantile or quartile) and
        pSubsetNo = 1, it sorts the n-gram counts in ascending order of
        count, partitions them into 4 quarters and returns the subset in
        quarter 1 (the lowest frequency subset). For pSubsetNo = 4, it
        would return the 4th quarter (the highest frequency subset).

        Subset n starts at the item after quantile n-1 and ends at quantile
        n, where a quantile r (the rank of the quantile, in fact) is
        computed as:

            len(counts) * r / quantile type

        rounded to an integer with the built-in round().

        Returns a new list of (n-gram, count) tuples; self.counts is not
        modified.
        '''

        vlSortedCounts = sorted(self.counts, key=lambda x: x[1])
        vLen = len(vlSortedCounts)

        # "* 1.0" forces float division regardless of the Python version
        vQuantileNminus1 = round(vLen * (pSubsetNo - 1) * 1.0 / pQuantileType)
        vQuantileN = round(vLen * pSubsetNo * 1.0 / pQuantileType)

        return vlSortedCounts[int(vQuantileNminus1):int(vQuantileN)]
-
-
- ## End of NGramCounts ##################################################
|