123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692 |
- #! /usr/bin/python
- # -*- coding: utf-8 -*-
- """
- This module contains POS tag corrector code.
-
- Version 0.1 (07-May-2015 to 18-May-2015)
- - POSAmender, POSAmenderFE, POSAmenderTrainer are added.
- """
- import pos
- from ml import fvg, eval
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # POSAmender
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
class POSAmender:
    '''
    POS tag corrector

    Correction is done on a set of POS tagged sentences: the original POS
    taggings and the per-token suffixes are turned into feature vectors, on
    which a scikit-learn MaxEnt (logit) model is trained with the gold POS
    tags as labels, or used to predict the corrected POS tags.
    '''


    def __init__(self):
        '''
        Constructor

        No data is loaded here; the input files are passed to the training,
        prediction and testing methods.
        '''

        ## feature config dump after extracting for training set (to be used
        ## later in prediction)
        self.dumpFConfig = ''

        ## original POS taggings most recently loaded by training or
        ## prediction (what the size property reports on)
        self.orgPOSTaggings = []



    @property
    def size(self):
        '''
        Return the size of the POS tagged data loaded

        This is the number of original POS tagged sentences loaded by the
        last training or prediction call (0 before any data is loaded).
        '''

        return len(self.orgPOSTaggings)



    def trainSKLMaxEnt(self, pOrgPOSTaggingFilename, pOPTFormat, pOPTSeparator,
                       pGoldPOSTaggingFilename, pGPTFormat, pGPTSeparator,
                       pSuffixFilename, pFConfig):
        '''
        Trains and returns a MaxEnt (logit) model using scikit-learn

        The original POS taggings, the gold POS taggings and the suffixes
        are loaded from the given files (see loadPOSTaggings and loadSuffixes
        for the formats). The feature config produced by the extraction is
        kept in self.dumpFConfig so that prediction can reuse it.

        Returns a tuple of the feature vectors, the raw feature vectors, the
        training data dump in scikit-learn input format and the trained model.

        Raises an Exception if the three inputs do not contain the same
        number of sentences.
        '''

        vlOrgPOSTaggings = self.loadPOSTaggings(pFilename = pOrgPOSTaggingFilename, pFormat = pOPTFormat, pSeparator = pOPTSeparator)
        vllSuffixes = self.loadSuffixes(pFilename = pSuffixFilename)
        vllGoldPOSTags = self.extractPOSTags(pFilename = pGoldPOSTaggingFilename, pFormat = pGPTFormat, pSeparator = pGPTSeparator)

        # remembered so that the size property has something to report
        self.orgPOSTaggings = vlOrgPOSTaggings

        # sanity check
        if len(vlOrgPOSTaggings) != len(vllSuffixes):
            raise Exception("The number of original POS tagged sentences does not match the number of suffixed sentences: %s vs. %s" % (len(vlOrgPOSTaggings), len(vllSuffixes)))
        if len(vlOrgPOSTaggings) != len(vllGoldPOSTags):
            raise Exception("The number of original POS tagged sentences does not match the number of gold POS tagged sentence: %s vs. %s" % (len(vlOrgPOSTaggings), len(vllGoldPOSTags)))

        # feature extraction
        vldFVs, vldRawFVs, self.dumpFConfig = self.genFeatureVectors(pFConfig = pFConfig, plOrgPOSTaggings = vlOrgPOSTaggings, pllSuffixes = vllSuffixes)

        # training
        vTrainer = POSAmenderTrainer(pldFVs = vldFVs, pllGoldPOSTags = vllGoldPOSTags)
        vlSKLInputDump, vModel = vTrainer.trainSKLMaxEnt()

        return vldFVs, vldRawFVs, vlSKLInputDump, vModel



    def predictWithSKLMaxEnt(self, pOrgPOSTaggingFilename, pOPTFormat, pOPTSeparator,
                             pSuffixFilename, pModel, pFConfig = ''):
        '''
        Predicts the correct POS tags using a scikit-learn MaxEnt model

        The feature config given in pFConfig is used if it is not empty;
        otherwise the one dumped by a previous training on this object
        (self.dumpFConfig) is used.

        Returns a tuple of the feature vectors, the raw feature vectors, the
        input data dump in scikit-learn format and the list of predicted POS
        tags (one per token).

        Raises an Exception if the number of POS tagged and suffixed
        sentences differ or if no feature config is available.
        '''

        vlOrgPOSTaggings = self.loadPOSTaggings(pFilename = pOrgPOSTaggingFilename, pFormat = pOPTFormat, pSeparator = pOPTSeparator)
        vllSuffixes = self.loadSuffixes(pFilename = pSuffixFilename)

        # remembered so that the size property has something to report
        self.orgPOSTaggings = vlOrgPOSTaggings

        # sanity check
        if len(vlOrgPOSTaggings) != len(vllSuffixes):
            raise Exception("The number of original POS tagged sentences does not match the number of suffixed sentences: %s vs. %s" % (len(vlOrgPOSTaggings), len(vllSuffixes)))

        # feature extraction: the given config takes precedence over the
        # one dumped at training time
        vFConfig = pFConfig if pFConfig != '' else self.dumpFConfig
        if vFConfig == '':
            raise Exception("Both given and dumped feature configs are empty!")

        vldFVs, vldRawFVs, _ = self.genFeatureVectors(pFConfig = vFConfig, plOrgPOSTaggings = vlOrgPOSTaggings, pllSuffixes = vllSuffixes)

        # prediction
        vPredicter = POSAmenderPreder(pldFVs = vldFVs)
        vlSKLInputDump, vnpaPreds = vPredicter.predictWithSKLMaxEnt(pModel = pModel)

        return vldFVs, vldRawFVs, vlSKLInputDump, vnpaPreds.tolist()



    def testWithSKLMaxEnt(self, pOrgPOSTaggingFilename, pOPTFormat, pOPTSeparator,
                          pGoldPOSTaggingFilename, pGPTFormat, pGPTSeparator,
                          pSuffixFilename, pModel, pFConfig = ''):
        '''
        Tests the scikit-learn MaxEnt model performance and returns the
        feature vectors, prediction results and various scores

        Returns a 5-tuple:
         - (feature vectors, raw feature vectors, input data dump in
           scikit-learn format prefixed by the gold POS tag)
         - list of predicted POS tags (one per token)
         - (token, sentence) accuracy of the predictions against gold
         - (token, sentence) accuracy of the original tags against gold
         - (token, sentence) accuracy of the predictions against the
           original tags

        Raises an Exception if the number of samples does not match the
        number of gold POS tags.
        '''

        # 1. prediction

        vldFVs, vldRawFVs, vlSKLInputDump, vlPrePOSTags = self.predictWithSKLMaxEnt(pOrgPOSTaggingFilename, pOPTFormat, pOPTSeparator,
                                                                                     pSuffixFilename, pModel, pFConfig)

        # 2. evaluation

        vllGoldPOSTags = self.extractPOSTags(pFilename = pGoldPOSTaggingFilename, pFormat = pGPTFormat, pSeparator = pGPTSeparator)
        vlGoldPOSTags = [t for ts in vllGoldPOSTags for t in ts]

        # sanity check
        if len(vldFVs) != len(vlGoldPOSTags):
            raise Exception("The number of samples does not match the number of gold POS tagged sentence: %s vs. %s" % (len(vldFVs), len(vlGoldPOSTags)))

        # sentence-level predictions (sentence boundaries are taken from gold)
        vllPredPOSTags = self.toSentPOSTagging(plPOSTags = vlPrePOSTags, plSentLens = [len(st) for st in vllGoldPOSTags])

        # sentence-level original POS tags
        vlOrgPOSTaggings = self.loadPOSTaggings(pFilename = pOrgPOSTaggingFilename, pFormat = pOPTFormat, pSeparator = pOPTSeparator)
        vllOrgPOSTags = [pt.getPOSTags() for pt in vlOrgPOSTaggings]

        # predictions against gold
        vGvPTAcc, vGvPSAcc = self.eval(pllGoldPOSTags = vllGoldPOSTags, pllPredPOSTags = vllPredPOSTags)
        # original against gold
        vGvOTAcc, vGvOSAcc = self.eval(pllGoldPOSTags = vllGoldPOSTags, pllPredPOSTags = vllOrgPOSTags)
        # predictions against original
        vOvPTAcc, vOvPSAcc = self.eval(pllGoldPOSTags = vllOrgPOSTags, pllPredPOSTags = vllPredPOSTags)

        # 3. generating feature vectors for dumping

        vlSKLInputDumpG = ["%s\t%s" % (g, v) for g, v in zip(vlGoldPOSTags, vlSKLInputDump)]

        return (vldFVs, vldRawFVs, vlSKLInputDumpG), vlPrePOSTags, (vGvPTAcc, vGvPSAcc), (vGvOTAcc, vGvOSAcc), (vOvPTAcc, vOvPSAcc)



    def eval(self, pllGoldPOSTags, pllPredPOSTags):
        '''
        Evaluates the performance of the correction

        Returns the token-level and sentence-level accuracy of
        pllPredPOSTags against pllGoldPOSTags as computed by
        POSAmenderEvaler.
        '''

        vEvaler = POSAmenderEvaler(pllGoldPOSTags = pllGoldPOSTags)

        return vEvaler.eval(pllPredPOSTags = pllPredPOSTags)



    def loadPOSTaggings(self, pFilename, pFormat, pSeparator):
        '''
        Loads POS taggings from a file in the given format

        3 formats are supported (matched case-insensitively by prefix):
         - slashtag: e.g. I/PRN
         - columnar: e.g. Stanford TSV
         - ptb: Penn treebank tree format

        The value of pSeparator specifies the slashtag tag separator or
        column separator for columnar format depending on what format is
        chosen. It is not used for the ptb format.

        Returns the tagged sentences of the loader. Raises ValueError if
        pFormat is none of the supported formats.
        '''

        vFormat = pFormat.lower()

        # an unknown format used to load nothing without any complaint;
        # fail early instead of handing back an unloaded result
        if not vFormat.startswith(("slash", "column", "ptb")):
            raise ValueError("'%s' is not a supported POS tagging format! Use slashtag, columnar or ptb." % pFormat)

        vPOSTagLoader = pos.POSTagLoader(pos.POSTagging)

        if vFormat.startswith("slash"):
            vPOSTagLoader.loadFromSlashTag(pPOSTaggedFilename = pFilename, pSeparator = pSeparator)
        elif vFormat.startswith("column"):
            vPOSTagLoader.loadFromColumnar(pColFilename = pFilename, pSeparator = pSeparator)
        else:
            vPOSTagLoader.loadFromPTB(pPTBFilename = pFilename)

        return vPOSTagLoader.taggedSentences



    def loadSuffixes(self, pFilename):
        '''
        Loads Foreebank POS tag suffixes

        The input file format must be one sentence per line, where each
        token suffix is wrapped inside '' and separated by a space from
        neighbouring tokens. Tokens without suffix are represented by ''.
        e.g. '' '' '_W' '' '' '_X' (sentence with 6 tokens where only 3rd
        and 6th tokens have suffixes).

        Returns a list with one list of suffixes (quotes removed) per
        sentence. An empty file yields an empty list.
        '''

        # the with block makes sure the file is closed even on read errors
        with open(pFilename) as vFile:
            vContent = vFile.read().strip()

        # splitting an empty string would yield one bogus empty sentence
        if not vContent:
            return []

        return [[s.strip("'") for s in vLine.split()] for vLine in vContent.split('\n')]



    def extractPOSTags(self, pFilename, pFormat, pSeparator):
        '''
        Extracts and returns the POS tags from sentence pos taggings

        The file is loaded with loadPOSTaggings and the result is a list
        with the POS tags of each sentence.
        '''

        vlPOSTaggings = self.loadPOSTaggings(pFilename = pFilename, pFormat = pFormat, pSeparator = pSeparator)

        return [pt.getPOSTags() for pt in vlPOSTaggings]



    def genFeatureVectors(self, pFConfig, plOrgPOSTaggings, pllSuffixes):
        '''
        Generates feature vectors based on the specified configuration
        in YAML-compatible format.

        The feature configuration is given in pFConfig. See fvg.FVGen
        for the description of its format.

        Returns a tuple of the feature vectors, the raw feature vectors and
        the feature config dumped by the extraction. Note that the order
        differs from the one returned by POSAmenderFE.genFeatureVectors.
        '''

        vFE = POSAmenderFE(plPOSTaggings = plOrgPOSTaggings, pllSuffixes = pllSuffixes)

        vldFVs, vDumpFConfig, vldRawFVs = vFE.genFeatureVectors(pFConfig = pFConfig)

        return vldFVs, vldRawFVs, vDumpFConfig



    def toSentPOSTagging(self, plPOSTags, plSentLens):
        '''
        Splits the one-token-per-line POS tags to one-sentence-per-line

        plPOSTags is the flat list of POS tags and plSentLens the number of
        tokens of each sentence in order. Returns a list with one slice of
        plPOSTags per sentence.
        '''

        vlSentPOSTags = []

        vFrom = 0
        for vSLen in plSentLens:
            vlSentPOSTags.append(plPOSTags[vFrom : vFrom + vSLen])
            vFrom += vSLen

        return vlSentPOSTags
-
-
-
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # POSAmenderFE
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
class POSAmenderFE:
    '''
    Class for extracting POSAmender features

    Features are extracted per token from the sentence POS taggings and the
    sentence suffixes given to the constructor; every extractor returns one
    flat list with a value for each token of each sentence in order.
    '''

    ## parameter names accepted for the window position of window features
    _WINDOW_POSITION_PARAMS = ("window position", "position", "distance")

    ## value used for window positions falling outside the sentence
    _WINDOW_PADDING = "NULL"


    def __init__(self, plPOSTaggings, pllSuffixes):
        '''
        Constructor
        '''

        ## list of original sentence POS taggings (pos.POSTagging) to be
        ## amended
        self.posTaggings = plPOSTaggings

        ## list of sentence suffixes each represented by a list of suffixes
        ## per token (empty strings for no suffix)
        self.suffixes = pllSuffixes

        ## feature config generated during extraction (not the one given;
        ## this is a more complete version with binarization results)
        self.config = None



    def genFeatureVectors(self, pFConfig = '', pflgReturnRawFVs = True):
        '''
        Generates feature vectors based on the specified configuration
        in YAML-compatible format.

        The feature configuration is given in pFConfig. See fvg.FVGen
        for the description of its format.

        Returns the feature vectors and the feature config generated during
        the extraction (also kept in self.config). If pflgReturnRawFVs is
        set, the raw feature vectors are returned as a third element.
        '''

        vFVG = fvg.FVGen(pFConfig)

        for vFeature in vFVG.getFeatures():
            vlFeatureValues = self._extractFeature(vFeature.name, vFeature.featureParams)
            vFVG.loadFeatureValues(vFeature.name, vlFeatureValues)

        vldFVectors, self.config = vFVG.genFeatureVectors()

        if pflgReturnRawFVs:
            ## list of dictionaries with key being the feature name and values
            ## the feature values
            vldRawFVectors = vFVG.getRawFVectors()

            return vldFVectors, self.config, vldRawFVectors
        else:
            return vldFVectors, self.config



    def _extractFeature(self, pFeatureName, pdFeatureParams):
        '''
        Extracts values for pFeatureName

        pFeatureName is the one specified in configuration file. In order
        to be able to have multiple settings of a single feature type, an
        entry for each setting is created in the configuration file with
        the feature name suffixed by any unique string. The specific feature
        parameters then go under each entry. The reason is that configuration
        file is in YAML format which is dictionary-like and requires unique
        keys. So, the feature name can appear only once or only one of the
        appearances would be considered.

        The feature type is therefore recognized by the prefix of
        pFeatureName. Raises an Exception for an unknown feature name.
        '''

        if pFeatureName.startswith(("token-word-form", "form")):
            vlValues = self.extractWordForm()
        elif pFeatureName.startswith(("token-pos-tag", "pos")):
            vlValues = self.extractPOS()
        elif pFeatureName.startswith(("token-pos-suffix", "suffix")):
            vlValues = self.extractSuffix()
        elif pFeatureName.startswith(("token-window-word-form", "window-form")):
            vlValues = self.extractWindowWordForm(pdFeatureParams)
        elif pFeatureName.startswith(("token-window-pos-tag", "window-pos")):
            vlValues = self.extractWindowPOS(pdFeatureParams)
        elif pFeatureName.startswith(("token-window-pos-suffix", "window-suffix")):
            vlValues = self.extractWindowSuffix(pdFeatureParams)
        else:
            raise Exception("Feature %s is unknown! Check the spelling." % pFeatureName)

        return vlValues



    def _getWindowPosition(self, pdFeatureParams):
        '''
        Reads the window position from the parameters of a window feature

        The position defaults to 0 (the main token itself) when it is not
        given. Raises an Exception for any other parameter name.
        '''

        vWindowPosition = 0

        # items() instead of iteritems() so that it runs on Python 2 and 3
        for vParam, vValue in (pdFeatureParams or {}).items():
            if vParam in self._WINDOW_POSITION_PARAMS:
                vWindowPosition = vValue
            else:
                raise Exception("'%s' is not a valid feature parameter!" % vParam)

        return vWindowPosition



    def _shiftWindow(self, plSentValues, pWindowPosition):
        '''
        Returns, for every token of one sentence, the value of the token
        located pWindowPosition away from it or "NULL" outside the sentence
        '''

        vSentLen = len(plSentValues)

        vlValues = []
        for i in range(vSentLen):
            vTarget = i + pWindowPosition
            # the bounds are checked explicitly: a negative index must not
            # wrap around to the end of the sentence
            if 0 <= vTarget < vSentLen:
                vlValues.append(plSentValues[vTarget])
            else:
                vlValues.append(self._WINDOW_PADDING)

        return vlValues



    def extractWordForm(self):
        '''
        Extracts token word forms
        '''

        return [vToken.form for vSentPOSTagging in self.posTaggings for vToken in vSentPOSTagging]



    def extractPOS(self):
        '''
        Extracts token POS tags
        '''

        return [vToken.tag for vSentPOSTagging in self.posTaggings for vToken in vSentPOSTagging]



    def extractSuffix(self):
        '''
        Extracts token POS suffix
        '''

        return [vSuffix for vSentSuffixes in self.suffixes for vSuffix in vSentSuffixes]



    def extractWindowWordForm(self, pdFeatureParams):
        '''
        Extracts word forms of tokens located in a given window distance
        to the main token

        It returns "NULL" for the positions falling outside the sentence.

        pdFeatureParams contains the following parameters:
         - "window position": the relative position (distance) of the token
                              to the main token (negative for left and positive
                              for right hand side tokens). "position" and
                              "distance" are accepted as aliases.
        '''

        vWindowPosition = self._getWindowPosition(pdFeatureParams)

        vlValues = []

        for vSentPOSTagging in self.posTaggings:
            # the sentence is materialized so that the token at the window
            # position (not the main token) can be looked up
            vlForms = [vToken.form for vToken in vSentPOSTagging]
            vlValues.extend(self._shiftWindow(vlForms, vWindowPosition))

        return vlValues



    def extractWindowPOS(self, pdFeatureParams):
        '''
        Extracts POS tags of tokens located in a given window distance
        to the main token

        It returns "NULL" for the positions falling outside the sentence.

        pdFeatureParams contains the following parameters:
         - "window position": the relative position (distance) of the token
                              to the main token (negative for left and positive
                              for right hand side tokens). "position" and
                              "distance" are accepted as aliases.
        '''

        vWindowPosition = self._getWindowPosition(pdFeatureParams)

        vlValues = []

        for vSentPOSTagging in self.posTaggings:
            # the sentence is materialized so that the token at the window
            # position (not the main token) can be looked up
            vlTags = [vToken.tag for vToken in vSentPOSTagging]
            vlValues.extend(self._shiftWindow(vlTags, vWindowPosition))

        return vlValues



    def extractWindowSuffix(self, pdFeatureParams):
        '''
        Extracts POS tag suffixes of tokens located in a given window distance
        to the main token

        It returns "NULL" for the positions falling outside the sentence.

        pdFeatureParams contains the following parameters:
         - "window position": the relative position (distance) of the token
                              to the main token (negative for left and positive
                              for right hand side tokens). "position" and
                              "distance" are accepted as aliases.
        '''

        vWindowPosition = self._getWindowPosition(pdFeatureParams)

        vlValues = []

        for vSentSuffixes in self.suffixes:
            vlValues.extend(self._shiftWindow(list(vSentSuffixes), vWindowPosition))

        return vlValues
-
-
-
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # POSAmenderTrainer
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
class POSAmenderTrainer:
    '''
    Class for training a POSAmender model

    It is given one feature vector per token and the gold POS tags per
    sentence; the flattened gold POS tags are the training labels.
    '''


    def __init__(self, pldFVs, pllGoldPOSTags):
        '''
        Constructor
        '''

        # feature vector list
        self.fvs = pldFVs

        # list of gold POS tags of sentences
        self.goldPOSTags = pllGoldPOSTags



    def trainSKLMaxEnt(self):
        '''
        Trains and returns a MaxEnt (logit) model using scikit-learn

        Returns the training data dump, one "<gold label>\\t<space separated
        feature values>" string per sample, and the fitted
        LogisticRegression model.

        Raises ValueError if the number of feature vectors differs from the
        number of gold labels.
        '''

        vllFVs = self.getFVsInSKLFormat()
        vlGoldLabels = self.extractGoldLabels()

        # sanity check (reported here with both counts rather than left to
        # the fitting to complain about)
        if len(vllFVs) != len(vlGoldLabels):
            raise ValueError("The number of feature vectors does not match the number of gold labels: %s vs. %s" % (len(vllFVs), len(vlGoldLabels)))

        # imported here so that scikit-learn is only needed for training
        from sklearn import linear_model

        vModel = linear_model.LogisticRegression(verbose = 1)
        vModel.fit(X = vllFVs, y = vlGoldLabels)

        # generating the input data in SciKitLearn format for dumping
        vlSKLInputDump = ["%s\t%s" % (g, ' '.join([str(v) for v in fv])) for g, fv in zip(vlGoldLabels, vllFVs)]

        return vlSKLInputDump, vModel



    def getFVsInSKLFormat(self):
        '''
        Transforms the feature vectors to the input format of ScikitLearn

        The ScikitLearn format is a 2D array of shape (#_of_samples, #_of_features)
        '''

        # every value of a feature vector dictionary is unpacked as a pair of
        # which the second element is kept (presumably a (name, value) pair
        # produced by fvg.FVGen -- TODO confirm)
        # values() instead of itervalues() so that it runs on Python 2 and 3
        return [[v for n, v in vdFV.values()] for vdFV in self.fvs]



    def extractGoldLabels(self):
        '''
        Extracts and returns the gold labels (POS tags) for training instances

        The sentence-level gold POS tags are flattened to one list.
        '''

        return [t for ts in self.goldPOSTags for t in ts]
-
-
-
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # POSAmenderPreder
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
class POSAmenderPreder:
    '''
    Class for predicting the correct POS tag

    It is given one feature vector per token and a trained model to
    predict the POS tags with.
    '''


    def __init__(self, pldFVs):
        '''
        Constructor
        '''

        # feature vector list
        self.fvs = pldFVs



    def predictWithSKLMaxEnt(self, pModel):
        '''
        Predicts and returns correct POS tags using a scikit-learn MaxEnt
        (logit) model

        pModel only needs to provide a predict method accepting the feature
        vectors in ScikitLearn format.

        Returns the input data dump, one string of space separated feature
        values per sample, and whatever pModel.predict returns for the
        samples.
        '''

        vllFVs = self.getFVsInSKLFormat()

        vlPredLabels = pModel.predict(vllFVs)

        # generating the input data in SciKitLearn format for dumping
        vlSKLInputDump = [' '.join([str(v) for v in fv]) for fv in vllFVs]

        return vlSKLInputDump, vlPredLabels



    def getFVsInSKLFormat(self):
        '''
        Transforms the feature vectors to the input format of ScikitLearn

        The ScikitLearn format is a 2D array of shape (#_of_samples, #_of_features)
        '''

        # every value of a feature vector dictionary is unpacked as a pair of
        # which the second element is kept (presumably a (name, value) pair
        # produced by fvg.FVGen -- TODO confirm)
        # values() instead of itervalues() so that it runs on Python 2 and 3
        return [[v for n, v in vdFV.values()] for vdFV in self.fvs]
-
-
-
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
- # POSAmenderEvaler
- #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
class POSAmenderEvaler:
    '''
    Scorer for the POS tag correction

    Keeps the gold POS tags and measures how well a set of predicted POS
    tags agrees with them.
    '''


    def __init__(self, pllGoldPOSTags):
        '''
        Stores the reference POS tags
        '''

        # gold POS tags, one list per sentence
        self.goldPOSTags = pllGoldPOSTags



    def eval(self, pllPredPOSTags):
        '''
        Scores the predicted POS tags against the gold ones

        Returns the accuracy at token level followed by the accuracy at
        sentence level, both computed by the acc method of eval.PredEval.
        '''

        def _flatten(pllSentTags):
            # one flat list with the tags of all sentences in order
            vlFlat = []
            for vlSentTags in pllSentTags:
                vlFlat.extend(vlSentTags)
            return vlFlat

        vScorer = eval.PredEval()

        # sentence level: the per-sentence tag lists are compared as units
        vSentAcc = vScorer.acc(plPrediction = pllPredPOSTags, plReference = self.goldPOSTags)

        # token level: the same comparison on the flattened tag lists
        vlFlatGold = _flatten(self.goldPOSTags)
        vlFlatPred = _flatten(pllPredPOSTags)
        vTokenAcc = vScorer.acc(plPrediction = vlFlatPred, plReference = vlFlatGold)

        return vTokenAcc, vSentAcc
-
-
|