rszk
/
scripts


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222
							#! /usr/bin/python
# -*- coding: utf-8 -*-

"""	
	This module defines classes for language model processing.
	
	Version 0.2												(27-Jun-2013)
	- SRILM.readOutput(), which was used to read the output of various
	  commands, is removed and distributed to functions reading specific
	  output types such as language model and n-gram counts.
	- SRILM._readNGramOutput() which was only reading the output of LM
	  evaluation (scoring) is renamed to loadEval().
	- NGramCounts is added
	
	
	Version 0.1												(03-Jun-2013)
	- SRILM and SRILMEval are added.
	
"""


import re

## Start of SRILM ######################################################

class SRILM:
	'''
	Wrapper around SRILM toolkit.
	
	NOTE: preliminary yet (only reading output)
	
	Attributes:
	  segmentEvals  list of SRILMEval objects, one per segment summary
	                found in the output last passed to loadEval()
	  documentEval  SRILMEval holding the file-level summary; None until
	                loadEval() has been called
	'''
	
	# File-level summary line, e.g.
	#   file test.txt: 3 sentences, 20 words, 1 OOVs
	_DOC_SUMMARY = re.compile("^file [^ ]+: [^ ]+ sentences, [^ ]+ words, [^ ]+ OOVs")
	
	# Segment-level summary line, e.g.
	#   1 sentences, 6 words, 0 OOVs
	_SEG_SUMMARY = re.compile("[^ ]+ sentences, [^ ]+ words, [^ ]+ OOVs")
	
	# Score line following either kind of summary line, e.g.
	#   0 zeroprobs, logprob= -12.3 ppl= 56.7 ppl1= 112.9
	_SCORES = re.compile("[^ ]+ zeroprobs, logprob= [^ ]+ ppl= [^ ]+ ppl1= [^ ]+")
	
	def __init__(self):
		'''
		Constructor
		'''
		
		# each element is of SRILMEval type
		self.segmentEvals = []
		self.documentEval = None
		
	
	def loadEval(self, pOutput, pDebug):
		'''
		Loads the evaluation (scoring) output
		
		pOutput is the text of the scoring output. Any previously loaded
		evaluation is discarded. Summary and score lines seen before the
		"file ..." line are stored as segment evaluations; the "file ..."
		line and the score line after it are stored in documentEval.
		
		pDebug is currently not used; it is kept so that existing callers
		keep working.
		
		Raises an IndexError when a score line appears before any summary
		line, and whatever SRILMEval raises on an unparsable line.
		'''
		
		self.segmentEvals = []
		self.documentEval = SRILMEval()
		vflgDocSumStarted = False
		
		for vLine in pOutput.strip().split('\n'):
			if self._DOC_SUMMARY.match(vLine):
				vflgDocSumStarted = True
				# The document totals are printed on the "file ..." line
				# itself, so its OOV count has to be read here; the bare
				# summary pattern below cannot match a line with this prefix.
				self.documentEval.loadOOV(vLine)
			elif self._SEG_SUMMARY.match(vLine):
				if vflgDocSumStarted:
					self.documentEval.loadOOV(vLine)
				else:
					self.segmentEvals.append(SRILMEval())
					self.segmentEvals[-1].loadOOV(vLine)
			elif self._SCORES.match(vLine):
				if vflgDocSumStarted:
					self.documentEval.loadScores(vLine)
				else:
					# scores belong to the most recently opened segment
					self.segmentEvals[-1].loadScores(vLine)
		
	
class SRILMEval:
	'''
	Class for evaluating text against SRILM language model.
	
	NOTE: preliminary yet (only reading output)
	
	Attributes:
	  oov        OOV count read by loadOOV()
	  zeroprobs  zeroprobs count read by loadScores()
	  logprob    logprob value read by loadScores()
	  prob       10 ** logprob
	  ppl        ppl value read by loadScores()
	  ppl1       ppl1 value read by loadScores()
	'''
	
	def __init__(self):
		'''
		Constructor
		'''
		
		self.oov = 0
		self.zeroprobs = 0
		self.logprob = 0.0
		# kept consistent with logprob: 10 ** 0.0
		self.prob = 1.0
		self.ppl = 0.0
		self.ppl1 = 0.0
		
	
	def loadOOV(self, pSumLine):
		'''
		Loads OOV count from the summary line in the output of ngram command.
		
		The summary pattern is searched anywhere in pSumLine, so text before
		it on the same line is tolerated.
		
		Raises an Exception if pSumLine does not contain a summary.
		'''
		
		# with a single capture group, findall returns a list of strings
		vlSum = re.findall("[^ ]+ sentences, [^ ]+ words, ([^ ]+) OOVs", pSumLine)
		
		if len(vlSum) == 0:
			raise Exception("Not a valid summary line containing OOV: %s " % pSumLine)
		
		# take the whole captured number (not its first character) and store
		# it in oov; zeroprobs is owned by loadScores()
		self.oov = int(vlSum[0])
		
	
	def loadScores(self, pScoreLine):
		'''
		Loads scores from score line in the output of ngram command.
		
		Sets zeroprobs, logprob, prob (10 ** logprob), ppl and ppl1.
		
		Raises an Exception if pScoreLine does not contain a score line, and
		a ValueError if one of the captured fields is not a number.
		'''
		
		# with four capture groups, findall returns a list of 4-tuples
		vlScores = re.findall("([^ ]+) zeroprobs, logprob= ([^ ]+) ppl= ([^ ]+) ppl1= ([^ ]+)", pScoreLine)
		
		if len(vlScores) == 0:
			raise Exception("Not a valid score line: %s " % pScoreLine)
		
		vZeroprobs, vLogprob, vPpl, vPpl1 = vlScores[0]
		
		self.zeroprobs = int(vZeroprobs)
		self.logprob = float(vLogprob)
		self.prob = 10**self.logprob
		self.ppl = float(vPpl)
		self.ppl1 = float(vPpl1)
			
	
## End of SRILM ########################################################

## Start of NGramCounts ################################################

class NGramCounts:
	'''
	Class for n-gram counts of a corpus
	
	Attributes:
	  order   the n-gram order (number of tokens) this object keeps
	  counts  list of (n-gram, count) 2-tuples, where n-gram is the
	          space-separated token string and count is an int
	'''
	
	def __init__(self, pOrder):
		'''
		Constructor
		
		pOrder is the order of the n-grams to be kept when loading.
		'''
		
		self.order = pOrder
		
		# a list of 2-tuples of n-grams and their counts
		self.counts = []
		
	
	def loadNGramCounts(self, pCounts, pFormat):
		'''
		Loads n-gram counts from pCounts in pFormat.
		
		It only loads n-grams of order self.order and ignores other orders.
		Previously loaded counts are discarded.
		
		Only the SRILM format is supported (pFormat is matched
		case-insensitively against "srilm").
		
		Raises an Exception if pFormat is not supported or if no n-gram of
		order self.order was found in pCounts.
		'''
		
		self.counts = []
		
		if pFormat.lower() == "srilm":
			self._loadSRILMNGramCounts(pCounts)
		else:
			raise Exception("%s is not a supported format" % pFormat)
		
		if len(self.counts) == 0:
			raise Exception("No %s-gram was loaded!" % self.order)
		
	
	def _loadSRILMNGramCounts(self, pCounts):
		'''
		Loads n-gram counts from pCounts in SRILM n-gram count output 
		format (-write option of ngram-count command).
		
		It only loads n-grams of order self.order and ignores other orders.
		Blank lines are skipped.
		
		The format is:
		  <N-GRAM><TAB><COUNT>
		where N-GRAM is:
		  <TOKEN 1> <TOKEN 2> ... <TOKEN n>
		
		Raises a ValueError if a non-blank line does not consist of exactly
		two tab-separated fields or if the count is not an integer.
		'''
		
		for vLine in pCounts.strip().split('\n'):
			# Blank lines (including completely empty input) carry no
			# n-gram; skipping them lets the caller report "nothing loaded"
			# instead of failing on the tuple unpacking below.
			if not vLine.strip():
				continue
			
			vNGram, vCount = vLine.split("\t")
			
			if len(vNGram.split()) != self.order:
				continue
			
			self.counts.append((vNGram, int(vCount)))
		
	
	def getQuantileSubset(self, pQuantileType, pSubsetNo):
		'''
		Returns the subset number pSubsetNo of the n-gram counts sliced
		by quantiles pQuantileType.
		
		For example, for pQuantileType = 4 (4-quantile or quartile) and 
		pSubsetNo = 1, it sorts the n-gram counts, partitions them into 4 
		quarters and returns the subset in quarter 1 (the lowest frequency
		subset). For pSubsetNo = 4, it would return the 4th quarter (the 
		highest frequency subset).
		
		Subset n starts at the item after quantile n-1 and ends at quantile
		n, where a quantile r (rank of the quantile in fact) is computed as:
		
		  len(counts) * r / quantile type 
		
		rounded to an integer. The result is a new list of (n-gram, count)
		tuples in ascending order of count; self.counts is not modified.
		'''
		
		vlSortedCounts = sorted(self.counts, key = lambda x: x[1])
		vLen = len(vlSortedCounts)
		
		# The end rank of subset n and the start rank of subset n+1 come
		# from the same expression, so adjacent subsets never overlap.
		# The 1.0 factor forces float division under Python 2 as well.
		vQuantileNminus1 = round(vLen * (pSubsetNo - 1) * 1.0 / pQuantileType)
		vQuantileN = round(vLen * pSubsetNo * 1.0 / pQuantileType)
		
		return vlSortedCounts[int(vQuantileNminus1) : int(vQuantileN)]
		
	
## End of NGramCounts ##################################################