#! /usr/bin/python
# -*- coding: utf-8 -*-

"""
This module defines classes for language model processing.

Version 0.2 (27-Jun-2013)
- SRILM.readOutput(), which was used to read the output of various commands,
  is removed and distributed to functions reading specific output types such
  as language model and n-gram counts.
- SRILM._readNGramOutput(), which was only reading the output of LM
  evaluation (scoring), is renamed to loadEval().
- NGramCounts is added

Version 0.1 (03-Jun-2013)
- SRILM and SRILMEval are added.
"""

import re


# Line patterns used by SRILM.loadEval(), compiled once instead of on every
# line of the output.
# Document-level summary, e.g. "file test.txt: 3 sentences, 20 words, 1 OOVs"
_FILE_SUMMARY_RE = re.compile("^file [^ ]+: [^ ]+ sentences, [^ ]+ words, [^ ]+ OOVs")
# Summary without the "file ...:" prefix, e.g. "1 sentences, 5 words, 0 OOVs"
_SUMMARY_RE = re.compile("[^ ]+ sentences, [^ ]+ words, [^ ]+ OOVs")
# Score line, e.g. "0 zeroprobs, logprob= -10.5 ppl= 56.2 ppl1= 125.9"
_SCORE_RE = re.compile("[^ ]+ zeroprobs, logprob= [^ ]+ ppl= [^ ]+ ppl1= [^ ]+")


def _parseScore(pText):
    '''
    Converts a perplexity token of a score line to float.

    NOTE(review): SRILM is understood to print "undefined" in place of a
    perplexity it cannot compute (e.g. a segment whose only word is OOV);
    such a token is mapped to NaN instead of letting float() raise.
    Confirm against the SRILM version in use.
    '''

    vText = pText.strip()

    if vText == "undefined":
        return float("nan")

    return float(vText)


## Start of SRILM ######################################################

class SRILM:
    '''
    Wrapper around SRILM toolkit.

    NOTE: preliminary yet (only reading output)
    '''

    def __init__(self):
        '''
        Constructor
        '''

        # each element is of SRILMEval type
        self.segmentEvals = []
        self.documentEval = None

    def loadEval(self, pOutput, pDebug):
        '''
        Loads the evaluation (scoring) output.

        pOutput is the text of the scoring output. Every summary/score line
        pair seen before the "file ...:" line is stored as a new item of
        self.segmentEvals; the "file ...:" line and the score line after it
        are stored in self.documentEval. Any previously loaded evaluation is
        discarded.

        pDebug is currently not used; it is kept so that existing callers
        keep working.
        '''

        self.segmentEvals = []
        self.documentEval = SRILMEval()

        vflgDocSumStarted = False

        for vLine in pOutput.strip().split('\n'):
            if _FILE_SUMMARY_RE.match(vLine):
                vflgDocSumStarted = True
                # The document-level OOV count is on the "file ...:" line
                # itself; no later line carries it, so it is loaded here.
                self.documentEval.loadOOV(vLine)
            elif _SUMMARY_RE.match(vLine):
                if vflgDocSumStarted:
                    self.documentEval.loadOOV(vLine)
                else:
                    self.segmentEvals.append(SRILMEval())
                    self.segmentEvals[-1].loadOOV(vLine)
            elif _SCORE_RE.match(vLine):
                if vflgDocSumStarted:
                    self.documentEval.loadScores(vLine)
                else:
                    self.segmentEvals[-1].loadScores(vLine)


class SRILMEval:
    '''
    Class for evaluating text against SRILM language model.

    NOTE: preliminary yet (only reading output)
    '''

    def __init__(self):
        '''
        Constructor
        '''

        self.oov = 0
        self.zeroprobs = 0
        self.logprob = 0.0
        # 10 ** logprob; initialised so that the attribute always exists
        self.prob = 1.0
        self.ppl = 0.0
        self.ppl1 = 0.0

    def loadOOV(self, pSumLine):
        '''
        Loads OOV count from the summary line in the output of ngram command.

        The line must contain "<n> sentences, <n> words, <n> OOVs"; the OOV
        number is stored in self.oov. Raises Exception if the line does not
        contain such a summary.
        '''

        vMatch = re.search("[^ ]+ sentences, [^ ]+ words, ([^ ]+) OOVs", pSumLine)

        if vMatch is None:
            raise Exception("Not a valid summary line containing OOV: %s " % pSumLine)

        # The whole captured token is the count (not only its first
        # character), and it belongs to oov rather than zeroprobs.
        self.oov = int(vMatch.group(1))

    def loadScores(self, pScoreLine):
        '''
        Loads scores from score line in the output of ngram command.

        The line must contain
        "<n> zeroprobs, logprob= <x> ppl= <x> ppl1= <x>". It sets zeroprobs,
        logprob, prob (10 ** logprob), ppl and ppl1. Raises Exception if the
        line does not contain such scores.
        '''

        vMatch = re.search("([^ ]+) zeroprobs, logprob= ([^ ]+) ppl= ([^ ]+) ppl1= ([^ ]+)", pScoreLine)

        if vMatch is None:
            raise Exception("Not a valid score line: %s " % pScoreLine)

        vZeroprobs, vLogprob, vPpl, vPpl1 = vMatch.groups()

        self.zeroprobs = int(vZeroprobs)
        self.logprob = float(vLogprob)
        self.prob = 10 ** self.logprob
        self.ppl = _parseScore(vPpl)
        self.ppl1 = _parseScore(vPpl1)

## End of SRILM ########################################################


## Start of NGramCounts ################################################

class NGramCounts:
    '''
    Class for n-gram counts of a corpus
    '''

    def __init__(self, pOrder):
        '''
        Constructor
        '''

        self.order = pOrder
        # a list of 2-tuples of n-grams and their counts
        self.counts = []

    def loadNGramCounts(self, pCounts, pFormat):
        '''
        Loads n-gram counts from pCounts in pFormat. It only loads n-grams of
        order self.order and ignores other orders.

        Only the SRILM format is supported (pFormat = "srilm", case
        insensitive). Raises Exception for any other format, or when no
        n-gram of order self.order was loaded.
        '''

        self.counts = []

        if pFormat.lower() == "srilm":
            self._loadSRILMNGramCounts(pCounts)
        else:
            raise Exception("%s is not a supported format" % pFormat)

        if len(self.counts) == 0:
            raise Exception("No %s-gram was loaded!" % self.order)

    def _loadSRILMNGramCounts(self, pCounts):
        '''
        Loads n-gram counts from pCounts in SRILM n-gram count output format
        (-write option of ngram-count command). It only loads n-grams of
        order self.order and ignores other orders.

        The format of each line is:

            N-GRAM<TAB>COUNT

        where N-GRAM is the whitespace-separated words of the n-gram.

        Blank lines are skipped. Raises ValueError for a line which does not
        consist of exactly two tab-separated fields or whose count is not an
        integer.
        '''

        for vLine in pCounts.strip().split('\n'):
            # Blank lines (including completely empty input) carry no count.
            if not vLine.strip():
                continue

            vlFields = vLine.split("\t")

            if len(vlFields) != 2:
                raise ValueError("Not a valid SRILM n-gram count line: %s" % vLine)

            vNGram, vCount = vlFields

            if len(vNGram.split()) != self.order:
                continue

            self.counts.append((vNGram, int(vCount)))

    def getQuantileSubset(self, pQuantileType, pSubsetNo):
        '''
        Returns the subset number pSubsetNo of the n-gram counts sliced by
        quantiles pQuantileType.

        For example, for pQuantileType = 4 (4-quantile or quartile) and
        pSubsetNo = 1, it sorts the n-gram counts, partitions them into 4
        quarters and returns the subset in quarter 1 (the lowest frequency
        subset). For pSubsetNo = 4, it would return the 4th quarter (the
        highest frequency subset).

        Subset n starts at the item after quantile n-1 and ends at quantile
        n, where a quantile r (rank of quantile in fact) is computed as:

            len(counts) * r / quantile type

        rounded to an integer.
        '''

        vlSortedCounts = sorted(self.counts, key = lambda x: x[1])
        vLen = len(vlSortedCounts)

        # "* 1.0" forces floating point division before rounding.
        vQuantileNminus1 = round(vLen * (pSubsetNo - 1) * 1.0 / pQuantileType)
        vQuantileN = round(vLen * pSubsetNo * 1.0 / pQuantileType)

        return vlSortedCounts[int(vQuantileNminus1) : int(vQuantileN)]

## End of NGramCounts ##################################################