#! /usr/bin/python

## This script gets parser accuracy scores for a set of sentences, along
## with their individual translation scores by various systems (e.g. TER
## scores) and outputs statistics collected from the input.

## These statistics include:
## - the number of sentences on which each system performed the best in 
##   the ranges of specified intervals of parser accuracy
## - the percentage share of each system in each range
## - the average score of each system in each range
##
## Systems are identified automatically from the input file, by considering
## the first column being the parser accuracy and the following N, where 
## N is supplied as argument, being scores of each system for each sentence.
## The first row is supposed to be the title row containing system names.
##
## Current version: 1.0
##

from collections import namedtuple
from operator import itemgetter
from decimal import *
from random import choice
import sys, optparse


##----------------------------------------------------------------------
## Returns the index of the best score in the provided list of scores.
## For tiebreaking, when more than one score is equally the best, it
## either chooses the first one in the ascending order of the score list
## or selects one at random.
def getBestScoreIdx(pScores, pTieBreak):
 """Return the index of the best score in pScores.

 The best score is the minimum value of the list.

 pScores   -- list of comparable scores, one per system
 pTieBreak -- 'r' picks one of the tied best scores at random; any other
              value (e.g. 'a') picks the first of them in list order

 Raises ValueError (from min) when pScores is empty.
 """
 vLowest = min(pScores)

 ## every position holding the best value is a candidate, so the whole
 ## list has to be scanned
 vlCandidates = [vIdx for vIdx, vScore in enumerate(pScores)
                 if vScore == vLowest]

 ## the random pick is used only for a real tie and only when requested
 if pTieBreak == 'r' and len(vlCandidates) > 1:
  return choice(vlCandidates)

 ## a single winner, or the first of the tied ones in ascending order
 return vlCandidates[0]


##----------------------------------------------------------------------
## Writes the sorted input data into file
def writeSortedData(pSortedFileName, pData, pSystems, pDelim):
 """Write the (already sorted) score rows into a delimited text file.

 pSortedFileName -- path of the file to create (overwritten if it exists)
 pData           -- iterable of rows shaped as
                    (sentence no, pa, score of system 1, ..., best system idx)
 pSystems        -- list of system names used for the header row
 pDelim          -- string placed between columns in the header and in
                    every data row

 Exits the program with an error message if the file cannot be created.
 """
 try:
  vfSortFile = open(pSortedFileName, 'w')
 except IOError:
  sys.exit('Can\'t create output sorted file: ' + pSortedFileName)

 ## the with block closes the file even when a write fails
 with vfSortFile:
  # writing header (the delimiter comes from pDelim; there is no global
  # options object this function could read it from)
  vfSortFile.write(pDelim.join(["PA",
                                "sentence",
                                pDelim.join(pSystems),
                                "best system"]) + '\n')

  # writing data
  for vRow in pData:
   ## pa comes first and the sentence number second, to match the header;
   ## the values are numbers, so they are converted before joining, and
   ## the same delimiter as the header is used to keep columns aligned
   vlFields = [str(vRow[1]), str(vRow[0])] + [str(vItem) for vItem in vRow[2:]]
   vfSortFile.write(pDelim.join(vlFields) + '\n')


##======================================================================
## main
def main(argv=None):
 """Read a scores file and print best-system statistics per accuracy range.

 argv -- full argument vector including the program name; defaults to
         sys.argv. Positional arguments are <SCORES FILE>,
         <NUMBER OF SYSTEMS> and <PARSER ACCURACY INTERVAL>.

 The first row of the scores file is the title row; in every other row the
 first column is the parser accuracy and the next N columns are the scores
 of the N systems. For each accuracy range the script prints the number of
 sentences on which each system scored best, the percentage share of each
 system and the average score of each system.

 Exits through parser.error / sys.exit on invalid arguments, an unreadable
 scores file, malformed rows or an input without data rows.
 Parser accuracies are assumed to be non-negative.
 """
 if argv is None:
  argv = sys.argv

 parser = optparse.OptionParser(usage="%prog <SCORES FILE> <NUMBER OF SYSTEMS> <PARSER ACCURACY INTERVAL> [options]" +
                                      "\nThis script extracts statistics for parser-accuracy-based system combination.", version="%prog 1.0")

 parser.add_option("-d", "--delimiter", help="the delimiter string separating score columns", metavar="DELIMITER", dest="delim", default=" ", action="store")
 parser.add_option("-e", "--emptyrange", help="include empty ranges (ranges with no data)", metavar="EMPTY RANGES", dest="emptyRange", action="store_true")
 parser.add_option("-t", "--tiebreak", help="the tie breaking method (a: ascending system order, r: random)", metavar="TIEBREAK", dest="tieBreak", default="a", action="store")
 parser.add_option("-s", "--sortfile", help="the file name to output the input sorted by parser accuracy", metavar="SORTFILE", dest="sortedFileName", action="store")

 ## parsing the vector handed to main (minus the program name), so that a
 ## caller-supplied argv is actually honoured
 (opts, posArgs) = parser.parse_args(argv[1:])

 # checking arguments

 if len(posArgs) < 3:
  parser.error("At least 3 arguments are required!")

 if not (posArgs[1].isdigit() and posArgs[2].isdigit()):
  parser.error("The second and third arguments should be a number!")

 vSysNum = int(posArgs[1])
 vPAInterval = int(posArgs[2])

 ## isdigit() accepts "0", which would later cause a division by zero or a
 ## zero-step range
 if vSysNum < 1 or vPAInterval < 1:
  parser.error("The number of systems and the parser accuracy interval should be greater than zero!")

 # opening scores file

 vScoresFileName = posArgs[0]
 try:
  vfScores = open(vScoresFileName, 'r')
 except IOError:
  sys.exit('Can\'t open scores file: ' + vScoresFileName)

 ## loading data into a list of tuples (no, pa, score of system 1, ...,
 ## best system)
 vlData = []
 with vfScores:
  ## extracting system names from the first row of scores file; the line
  ## ending is removed so that it does not end up in the last name
  vlSystems = vfScores.readline().rstrip('\r\n').split(opts.delim)[1:vSysNum + 1]
  if len(vlSystems) != vSysNum:
   sys.exit('The title row of scores file should contain ' + str(vSysNum) +
            ' system names: ' + vScoresFileName)

  vSentenceCntr = 1
  for line in vfScores:
   line = line.rstrip('\r\n')
   if not line.strip():
    continue                      ## blank lines carry no sentence
   vlLine = line.split(opts.delim)
   if len(vlLine) < vSysNum + 1:
    sys.exit('Too few columns in data row ' + str(vSentenceCntr) +
             ' of scores file: ' + vScoresFileName)
   vlSysScores = [Decimal(x) for x in vlLine[1:vSysNum + 1]]
   vBestSysIdx = int(getBestScoreIdx(vlSysScores, opts.tieBreak))
   vlData.append(tuple([vSentenceCntr] +
                       [Decimal(vlLine[0])] +
                       vlSysScores +
                       [vBestSysIdx]))
   vSentenceCntr += 1

 if not vlData:
  sys.exit('No data found in scores file: ' + vScoresFileName)

 # sorting data based on pa
 vlData.sort(key=itemgetter(1))

 # writing the sorted data into a file if requested
 if opts.sortedFileName is not None:
  writeSortedData(opts.sortedFileName, vlData, vlSystems, opts.delim)

 # creating data structure to store pa interval statistics of systems
 PAIvalSysStat = namedtuple('PAIvalSysStat', 'rLower, rUpper, sysCounts, sysPercents, avgScores')
 vlPAIvalSysStat = []

 # creating ranges

 ## calculating the upper bound of last parser accuracy range based on
 ## interval and maximum accuracy in data; it is kept an int because it is
 ## passed to range()
 vMaxPA = vlData[-1][1]
 vLRUpper = int(vMaxPA / vPAInterval) * vPAInterval
 if vMaxPA % vPAInterval != 0:
  vLRUpper += vPAInterval
 ## at least one range is needed, otherwise data whose accuracies are all
 ## 0 would be left out
 vLRUpper = max(vLRUpper, vPAInterval)

 # computing interval statistics
 vDataIdx = 0
 ## looping through possible ranges, creating them, and computing statistics
 ## for each range
 for vRange in range(0, vLRUpper, vPAInterval):
  ## 0 is treated as special case since it's included in the range as
  ## opposed to the other range lower bounds (i.e. [0, 5], (5,10], ...)
  if vRange == 0:
   vRLower = -1
  else:
   vRLower = vRange
  vRUpper = vRange + vPAInterval

  vRangeItemsCntr = 0
  vlRangeBSysCounts = [0] * vSysNum
  vlRangeSysScoreSum = [Decimal(0)] * vSysNum

  ## looping through data to compute the best system counts and score sums;
  ## data is sorted by pa, so the items of a range are consecutive and
  ## vDataIdx carries over from one range to the next
  while vDataIdx < len(vlData) and vRLower < vlData[vDataIdx][1] <= vRUpper:
   vDatum = vlData[vDataIdx]

   ## adding one to the statistics of the best system (its index is in
   ## vDatum[-1])
   vlRangeBSysCounts[vDatum[-1]] += 1

   ## computing the sum of scores for each system (this will be divided
   ## by the number of sentences in range later when it is known)
   for sysIdx in range(vSysNum):
    vlRangeSysScoreSum[sysIdx] += vDatum[sysIdx + 2]

   vDataIdx += 1
   vRangeItemsCntr += 1

  ## finalizing computing the average scores by dividing collected sums
  if vRangeItemsCntr == 0:
   vlRangeAvgScores = [0] * vSysNum
  else:
   vlRangeAvgScores = [(score / vRangeItemsCntr).quantize(Decimal('0.01'))
                       for score in vlRangeSysScoreSum]

  ## computing ratios of counts of each system with respect to the sum of
  ## counts of all systems
  ## Note that the last one is not directly calculated. Instead it's
  ## calculated by subtracting the percentages so far from 100, so that the
  ## rounded values of a row add up to 100.
  vRangeTotal = sum(vlRangeBSysCounts)
  if vRangeTotal == 0:
   vlRangePercents = [0] * vSysNum
  else:
   vlRangePercents = [(Decimal(count) * 100 / vRangeTotal).quantize(Decimal('0.1'))
                      for count in vlRangeBSysCounts[:-1]]
   ## Decimal(100) keeps this a Decimal even when there is one system only
   vlRangePercents.append((Decimal(100) - sum(vlRangePercents)).quantize(Decimal('0.1')))

  # returning back the vRLower to 0 after treating 0 special case
  if vRLower == -1:
   vRLower = 0

  ## inserting the statistics for the range
  ## optionally, empty ranges (range with no data) will also be inserted
  if opts.emptyRange or vRangeTotal > 0:
   vlPAIvalSysStat.append(PAIvalSysStat(vRLower, vRUpper, vlRangeBSysCounts,
                                        vlRangePercents, vlRangeAvgScores))

 ## printing the statistics
 ## The same delimiter in input file is used to format output.

 # printing header
 print ("range" + opts.delim +
        opts.delim.join(vlSystems) + opts.delim +
        opts.delim.join('%' + vSysName for vSysName in vlSystems) + opts.delim +
        opts.delim.join('avg(' + vSysName + ')' for vSysName in vlSystems))
 # printing data
 for vStat in vlPAIvalSysStat:
  print (str(vStat.rLower) + '-' +
         str(vStat.rUpper) + opts.delim +
         opts.delim.join(str(x) for x in vStat.sysCounts) + opts.delim +
         opts.delim.join(str(x) for x in vStat.sysPercents) + opts.delim +
         opts.delim.join(str(x) for x in vStat.avgScores))

##======================================================================
## calling main
if __name__ == "__main__":
 ## run the script and hand whatever main() returns to the interpreter as
 ## the exit status
 vExitStatus = main()
 sys.exit(vExitStatus)