#! /usr/bin/env python
# -*- coding: utf-8 -*-

"""
Wrapper around any-gram kernel

Author: Rasoul Kaljahi

See LICENSE file.
"""

from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier

import pickle
import numpy as np


class AnyGramWrapper:
    '''
    Wrapper class around the any-gram kernel to be used by the scikit-learn SVC
    '''


    def __init__(self, pAnyGram):
        '''
        Constructor
        '''
        self.trainXs = None
        self.trainYs = None
        self.trainAuxs = None        # auxiliary training input

        self.testXs = None
        self.testYs = None
        self.testAuxs = None         # auxiliary test input

        self.ag = pAnyGram
        self._pcTrainKernel = None   # precomputed kernel from the preloaded training data
        self._pcTestKernel = None    # precomputed kernel from the preloaded test data

        self._model = None

    def loadTrainSet(self, pXs, pYs):
        '''
        Loads the training data

        The data set is provided via two lists, one containing the text instances (preferably tokenized)
        and one containing the labels matching the text instances.
        '''

        self.trainXs = self.ag.formatData(pXs)
        self.trainYs = [int(y) for y in pYs]

        self._pcTrainKernel = None

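    # A minimal illustrative sketch of the expected input format (not executed; the
    # texts and labels below are made-up examples, and `ag` stands for an already
    # constructed any-gram kernel object passed to the constructor):
    #
    #   wrapper = AnyGramWrapper(ag)
    #   wrapper.loadTrainSet(pXs = ["a very good movie", "a rather dull film"],
    #                        pYs = ["1", "0"])
    #
    # Labels are cast to int internally, so they can be given as strings or integers.
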
    def loadTrainAux(self, pAux):
        '''
        Loads auxiliary training input

        The input must be a 3D array/list of shape (#train_instances, #max_text_length, #auxiliary_dimension).
        '''

        if self.trainXs is None or len(self.trainXs) == 0:
            raise Exception("Training dataset should be loaded first!")

        self.trainAuxs = self.formatAux(pAux)

    def loadTestSet(self, pXs, pYs):
        '''
        Loads the test data

        The data set is provided via two lists, one containing the text instances (preferably tokenized)
        and one containing the labels matching the text instances.
        '''

        self.testXs = self.ag.formatData(pXs)
        self.testYs = [int(y) for y in pYs]

        self._pcTestKernel = None

    def loadTestAux(self, pAux):
        '''
        Loads auxiliary test input

        The input must be a 3D array/list of shape (#test_instances, #max_text_length, #auxiliary_dimension).
        '''

        if self.testXs is None or len(self.testXs) == 0:
            raise Exception("Test dataset should be loaded first!")

        self.testAuxs = self.formatAux(pAux)

    def formatAux(self, pAux):
        '''
        Formats the auxiliary data

        The input is a 3D list of shape (#instances, #max_text_length, #auxiliary_dimension). Instances
        longer than the maximum text length are truncated and shorter ones are padded with zero vectors.
        '''

        if isinstance(pAux, list):
            # truncating/padding the 2nd dimension of the list (#max_text_length)
            vFixedDimList = [s[:self.ag.maxTxtLen] + [[0] * len(s[0])] * (self.ag.maxTxtLen - len(s)) for s in pAux]
            return np.array(vFixedDimList, dtype=np.float64)
        else:
            return pAux

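    # An illustrative sketch of the padding/truncation behaviour (not executed).
    # Assuming a maximum text length of 4 and a single auxiliary dimension, one
    # instance of the auxiliary input is reshaped as follows:
    #
    #   [[1], [2]]                  ->  [[1], [2], [0], [0]]    (padded with zero vectors)
    #   [[1], [2], [3], [4], [5]]   ->  [[1], [2], [3], [4]]    (truncated to the maximum length)
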
    def loadEmbeddings(self, pWEFilename, pIsLowerCase):
        '''
        Loads word embeddings from the given file

        Embeddings should be loaded before the data, because the vocabulary used for formatting data should
        be extracted from the word embeddings (when the word embeddings are used).

        pIsLowerCase specifies whether the vocabulary of the word embedding is lowercased.
        '''

        self.ag.loadEmbeddings(pWEFilename, pIsLowerCase)

    def precomputeTrainKernel(self):
        '''
        Precomputes the kernel from the training data
        '''

        self._pcTrainKernel = self._preComputeKernel(self.trainXs, self.trainXs, self.trainAuxs, self.trainAuxs)


    def _getPrecomputedTrainKernel(self):
        '''
        Returns the precomputed kernel from the training data

        If the kernel has not been precomputed yet, it will be computed first.
        '''

        if self._pcTrainKernel is None:
            self.precomputeTrainKernel()

        return self._pcTrainKernel


    def precomputeTestKernel(self):
        '''
        Precomputes the kernel between the test and training data
        '''

        self._pcTestKernel = self._preComputeKernel(self.testXs, self.trainXs, self.testAuxs, self.trainAuxs)


    def _getPrecomputedTestKernel(self):
        '''
        Returns the precomputed kernel from the test data

        If the kernel has not been precomputed yet, it will be computed first.
        '''

        if self._pcTestKernel is None:
            self.precomputeTestKernel()

        return self._pcTestKernel

    def _preComputeKernel(self, pX1, pX2, pAux1 = None, pAux2 = None):
        '''
        Computes and returns the kernel matrix for the given data

        The data should be formatted by AnyGram.formatData().
        '''

        if self.trainXs is None or len(self.trainXs) == 0:
            raise Exception("Training dataset is empty!")

        if pAux1 is not None and pAux2 is not None:
            return self.ag.computeKernel(pX1.astype(np.float64),
                                         pX2.astype(np.float64),
                                         pAux1.astype(np.float64),
                                         pAux2.astype(np.float64))
        else:
            return self.ag.computeKernel(pX1.astype(np.float64),
                                         pX2.astype(np.float64))

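    # Shape sketch (illustrative figures only): with, say, 100 training and 20 test
    # instances, scikit-learn's kernel="precomputed" convention expects fit() to
    # receive a (100, 100) train-vs-train matrix and predict() a (20, 100)
    # test-vs-train matrix; precomputeTrainKernel() and precomputeTestKernel()
    # above call this method with exactly those argument pairs.
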
    def combinePrecomputedTrainKernel(self, paKernelMatrix, pCombMethod):
        '''
        Combines the any-gram kernel computed on the training set with the given kernel matrix using the specified method

        The given kernel matrix should match the computed any-gram kernel matrix in shape.

        The combination methods supported here are:
        + or add: add corresponding elements of the two kernel matrices
        * or multiply: multiply corresponding elements of the two kernel matrices
        arith: arithmetic mean of the corresponding elements of the two kernel matrices
        geo: geometric mean of the corresponding elements of the two kernel matrices
        '''

        if self._pcTrainKernel is None:
            raise Exception("Kernel is not precomputed yet. Run precomputeTrainKernel() first.")

        self._pcTrainKernel = self._combineKernels(self._pcTrainKernel, paKernelMatrix, pCombMethod)

    def combinePrecomputedTestKernel(self, paKernelMatrix, pCombMethod):
        '''
        Combines the any-gram kernel computed on the test set with the given kernel matrix using the specified method

        The given kernel matrix should match the computed any-gram kernel matrix in shape.

        The combination methods supported here are:
        + or add: add corresponding elements of the two kernel matrices
        * or multiply: multiply corresponding elements of the two kernel matrices
        arith: arithmetic mean of the corresponding elements of the two kernel matrices
        geo: geometric mean of the corresponding elements of the two kernel matrices
        '''

        if self._pcTestKernel is None:
            raise Exception("Kernel is not precomputed yet. Run precomputeTestKernel() first.")

        self._pcTestKernel = self._combineKernels(self._pcTestKernel, paKernelMatrix, pCombMethod)

    def _combineKernels(self, paKernelMatrix1, paKernelMatrix2, pCombMethod):
        '''
        Combines and returns the two given kernel matrices using the given method

        The given kernel matrices should have the same shape.

        The combination methods supported here are:
        + or add: add corresponding elements of the two kernel matrices
        * or multiply: multiply corresponding elements of the two kernel matrices
        arith: arithmetic mean of the corresponding elements of the two kernel matrices
        geo: geometric mean of the corresponding elements of the two kernel matrices
        '''

        if paKernelMatrix1.shape != paKernelMatrix2.shape:
            raise Exception("The shape of the given kernel matrix %s does not match that of the any-gram kernel matrix %s" % (paKernelMatrix2.shape, paKernelMatrix1.shape))

        if pCombMethod.lower() in ['+', "add"]:
            return np.add(paKernelMatrix1, paKernelMatrix2)
        elif pCombMethod.lower() in ['*', "multiply"]:
            return np.multiply(paKernelMatrix1, paKernelMatrix2)
        elif pCombMethod.lower().startswith("arith"):
            return np.add(paKernelMatrix1, paKernelMatrix2) / 2
        elif pCombMethod.lower().startswith("geo"):
            return np.sqrt(np.multiply(paKernelMatrix1, paKernelMatrix2))
        else:
            raise Exception("Unknown kernel combination method: %s" % pCombMethod)

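    # A small worked example of the combination methods (illustrative numbers only):
    # for K1 = [[1.0, 0.5], [0.5, 1.0]] and K2 = [[0.2, 0.4], [0.4, 0.2]],
    #
    #   add      -> [[1.2, 0.9], [0.9, 1.2]]
    #   multiply -> [[0.2, 0.2], [0.2, 0.2]]
    #   arith    -> [[0.6, 0.45], [0.45, 0.6]]
    #   geo      -> element-wise square root of the product, i.e. about 0.447 in every cell
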
    def train(self, pflgUsePrecompKernel = False, pMCMethod = None, C = 1, class_weight = None):
        '''
        Trains an SVC model with the any-gram kernel; the trained model is accessible through the model property

        If pflgUsePrecompKernel is set to True, SVC will use the precomputed kernel. This can save time when the
        data or the kernel computation parameters remain the same in repeated trainings (e.g. in tuning).

        pMCMethod specifies the multiclass classification method: "ovo" sets the decision_function_shape parameter
        of sklearn.svm.SVC, while "ova"/"ovr" wraps the SVC in a OneVsRestClassifier. The other parameters are
        those of sklearn.svm.SVC.
        '''

        if self.trainXs is None or len(self.trainXs) == 0:
            raise Exception("Training dataset is empty!")

        if pflgUsePrecompKernel:
            vKernel = "precomputed"
            X = self._getPrecomputedTrainKernel()
        else:
            vKernel = self.ag
            X = self.trainXs

        if pMCMethod is None:
            vSVC = svm.SVC(kernel = vKernel, C = C, class_weight = class_weight)
        elif pMCMethod.lower() == "ovo":
            vSVC = svm.SVC(kernel = vKernel, decision_function_shape = "ovo", C = C, class_weight = class_weight)
        elif pMCMethod.lower() in ["ova", "ovr"]:
            vSVC = OneVsRestClassifier(svm.SVC(kernel = vKernel, C = C, class_weight = class_weight))
        else:
            raise Exception("Unknown multiclass classification method: %s" % pMCMethod)

        # training
        self._model = vSVC.fit(X, self.trainYs)

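    # Illustrative call patterns (not executed; `wrapper` stands for an AnyGramWrapper
    # instance whose training data has already been loaded):
    #
    #   wrapper.train()                                     # plain SVC with the any-gram kernel as a callable
    #   wrapper.train(pMCMethod = "ovo")                    # one-vs-one multiclass SVC
    #   wrapper.train(pMCMethod = "ovr")                    # one-vs-rest wrapper around SVC
    #   wrapper.train(pflgUsePrecompKernel = True, C = 10)  # reuse the precomputed training kernel
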
    @property
    def model(self):
        '''
        Trained scikit-learn SVC model
        '''

        return self._model

    def saveModel(self, pFilename):
        '''
        Saves the SVC model by pickling it to the given file
        '''

        ## ToDo: when saving the model, the vocabulary and the embeddings (for WESS) must also be saved

        with open(pFilename, 'wb') as fModel:
            pickle.dump(self._model, fModel)


    def loadModel(self, pModelPickle):
        '''
        Loads a pickled SVC model
        '''

        ## ToDo: the model should have been saved with a vocabulary and embeddings, which must also be loaded here

        with open(pModelPickle, 'rb') as fModel:
            self._model = pickle.load(fModel)

    def test(self, pTestXs = None, pTestYs = None):
        '''
        Tests the trained model on the loaded test set

        If pTestXs and pTestYs are given, they are loaded as the test set first.
        '''

        if pTestXs is not None and pTestYs is not None:
            self.loadTestSet(pXs = pTestXs, pYs = pTestYs)

        if isinstance(self._model, OneVsRestClassifier):
            vKernel = self._model.estimators_[0].kernel
        elif isinstance(self._model, svm.SVC):
            vKernel = self._model.kernel
        else:
            raise Exception("No trained model is available. Train or load a model first.")

        # prediction
        if vKernel == "precomputed":
            vaPreds = self._model.predict(self._getPrecomputedTestKernel())
        else:
            vaPreds = self._model.predict(self.testXs)

        # scoring
        vScore = self._score(vaPreds, self.testYs)

        return vaPreds, vScore

    def predict(self, pXs, pAux = None):
        '''
        Predicts the labels of the given data
        '''

        vaXs = self.ag.formatData(pXs)
        vaAuxs = self.formatAux(pAux)

        if isinstance(self._model, OneVsRestClassifier):
            vKernel = self._model.estimators_[0].kernel
        elif isinstance(self._model, svm.SVC):
            vKernel = self._model.kernel
        else:
            raise Exception("No trained model is available. Train or load a model first.")

        if vKernel == "precomputed":
            return self._model.predict(self._preComputeKernel(vaXs, self.trainXs, vaAuxs, self.trainAuxs))
        else:
            return self._model.predict(vaXs)

    def _score(self, plPreds, plGolds):
        '''
        Scores the given predictions against the gold labels (accuracy)
        '''

        vCorrect = 0

        for p, g in zip(plPreds, plGolds):
            if p == g:
                vCorrect += 1

        return vCorrect * 1.0 / len(plPreds)
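
# A minimal end-to-end usage sketch (illustrative only; nothing below is executed on
# import). It assumes an AnyGram object `ag` constructed elsewhere -- this module does
# not define or import the AnyGram class itself -- together with made-up file names
# and data variables:
#
#   wrapper = AnyGramWrapper(ag)
#   wrapper.loadEmbeddings("embeddings.txt", pIsLowerCase = True)  # before loading the data, if embeddings are used
#   wrapper.loadTrainSet(trainTexts, trainLabels)
#   wrapper.loadTestSet(testTexts, testLabels)
#   wrapper.train(pflgUsePrecompKernel = True)
#   preds, accuracy = wrapper.test()
#   newPreds = wrapper.predict(["an unseen text instance"])
#   wrapper.saveModel("anygram_svc.pkl")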