rszk 6 years ago
parent
commit
8526464aa3
4 changed files with 1426 additions and 0 deletions
  1. 371 0
      anygramkernel.py
  2. 405 0
      anygramwrapper.py
  3. 11 0
      glove.we
  4. 639 0
      we.py

+ 371 - 0
anygramkernel.py

@@ -0,0 +1,371 @@
+from collections import Counter
+import numpy as np
+import ctypes, os
+
+import we
+
+
+class AnyGram:
+    '''
+    Class to prepare and compute any-gram kernel
+    
+    ToDo: extend to multiple sentences per instance
+    '''
+
+    def __init__(self, pMethod, pMaxTxtLen, pWESTThreshold = 0, pflgNormalizeKernel = True):
+        '''
+        Constructor
+
+        pMaxTxtLen is the maximum text length of the instances in terms of tokens. Longer instances will be cut off and
+        shorter ones will be padded. The same length should also be used at prediction time if a new AnyGram object is
+        created; otherwise scikit-learn will complain.
+        '''
+        
+        self.method = pMethod                             # method to be used in comparing tokens:
+                                                          # - sm: string match
+                                                          # - wess: word embedding similarity score
+                                                          # - west: word embedding similarity threshold
+        
+        self._westThreshold = pWESTThreshold              # threshold for WEST method
+        
+        self.maxTxtLen = pMaxTxtLen                       # Maximum text length of the instances in terms of tokens: longer instances
+                                                          # will be cut off and shorter ones will be padded.
+        
+        self.embeddings = None
+        self.vocabulary = None                            # dictionary of tokens and their indices
+        self._iVocabulary = None                          # dictionary of token indices and their forms (inverse vocabulary)
+        
+        self.l = 0.4                                      # lambda
+        
+        self._normalizeKernel = pflgNormalizeKernel       # whether or not to normalize the kernel values
+        
+    
+    
+    def setWESTThreshold(self, pThreshold):
+        '''
+        Sets the word embedding similarity threshold for WEST method
+        
+        The value must be between 0 and 1.
+        '''
+        
+        self._westThreshold = pThreshold
+        
+    
+    
+    @property
+    def vocabSize(self):
+        '''
+        Returns the size of the vocabulary
+        '''
+        
+        return len(self.vocabulary)
+        
+    
+    
+    @property
+    def iVocabulary(self):
+        '''
+        Returns the inverse vocabulary (dictionary of token indices and their forms)
+        
+        The inverse vocabulary is created on demand.
+        '''
+        
+        if self._iVocabulary is None:
+            self._iVocabulary = {v: k for k, v in self.vocabulary.iteritems()}
+            
+            # adding an entry for the padding index
+            self._iVocabulary[0] = '_PAD_'
+        
+        return self._iVocabulary
+        
+    
+    
+    def __call__(self, pX, pTrainX):
+        '''
+        Call method
+        '''
+        
+        return self.computeKernel(pX, pTrainX)
+        
+    
+    
+    def computeKernel(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
+        '''
+        Computes and returns the kernel values
+         
+        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and pTrainX
+        respectively, but they can have a custom number of columns (equal for both, though). At the moment, these can
+        only be used with a precomputed kernel.
+        '''
+        
+        if self.method.lower() == "sm":
+            return self._computeKernelSM(pX, pTrainX, pXAux, pTrainXAux)
+        elif self.method.lower() == "west":
+            return self._computeKernelWEST(pX, pTrainX, pXAux, pTrainXAux)
+        elif self.method.lower() == "wess":
+            return self._computeKernelWESS(pX, pTrainX, pXAux, pTrainXAux)
+        
+    
+    
+    def _computeKernelSM(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
+        '''
+        Computes and returns the kernel using string match method
+        
+        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and pTrainX
+        respectively, but they can have a custom number of columns (equal for both, though). At the moment, these can
+        only be used with a precomputed kernel.
+        
+        For every pair of tokens, every pair of peer elements in the auxiliary input will be compared against each other.
+        '''
+        
+        if pXAux is None:
+            pXAux = np.empty(shape = (0, 0, 0))
+            pTrainXAux = np.empty(shape=(0, 0, 0))
+        
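+        # the kernel itself is implemented in C; agk.so is assumed to have been compiled
+        # separately and placed next to this module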
+        vLib = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + "/agk.so")
+        
+        vLib.compKernelMatrixSM.argtypes = [np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                            ctypes.c_int,
+                                            ctypes.c_int,
+                                            np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                            ctypes.c_int,
+                                            ctypes.c_int,
+                                            np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                            np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                            ctypes.c_int,
+                                            ctypes.c_int,
+                                            ctypes.c_int]
+        vLib.compKernelMatrixSM.restype = np.ctypeslib.ndpointer(dtype=np.float64,
+                                                                 shape=(pX.shape[0], pTrainX.shape[0]),
+                                                                 flags="C_CONTIGUOUS")
+        
+        if id(pX) == id(pTrainX):
+            vPhase = 0
+        else:
+            vPhase = 1
+        
+        vK = vLib.compKernelMatrixSM(pX.flatten(), pX.shape[0], pX.shape[1],
+                                     pTrainX.flatten(), pTrainX.shape[0], pTrainX.shape[1],
+                                     pXAux.flatten(), pTrainXAux.flatten(), pXAux.shape[2],
+                                     vPhase, self._normalizeKernel)
+        
+        return vK
+        
+    
+    
+    def _computeKernelWEST(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
+        '''
+        Computes and returns the kernel using word embedding similarity threshold method
+        
+        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and pTrainX
+        respectively, but they can have a custom number of columns (equal for both, though). At the moment, these can
+        only be used with a precomputed kernel.
+        
+        For every token, the auxiliary input will be concatenated to the end of its embedding vector.
+        '''
+        
+        if pXAux is None:
+            pXAux = np.empty(shape = (0, 0, 0))
+            pTrainXAux = np.empty(shape=(0, 0, 0))
+
+        vLib = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + "/agk.so")
+        
+        vLib.compKernelMatrixWEST.argtypes = [np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                              ctypes.c_int,
+                                              ctypes.c_int,
+                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                              ctypes.c_int,
+                                              ctypes.c_int,
+                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                              ctypes.c_int,
+                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                              ctypes.c_int,
+                                              ctypes.c_int,
+                                              ctypes.c_double,
+                                              ctypes.c_int,
+                                              ctypes.c_int]
+        vLib.compKernelMatrixWEST.restype = np.ctypeslib.ndpointer(dtype=np.float64,
+                                                                   shape=(pX.shape[0], pTrainX.shape[0]),
+                                                                   flags="C_CONTIGUOUS")
+        
+        if id(pX) == id(pTrainX):
+            vPhase = 0
+        else:
+            vPhase = 1
+        
+        vK = vLib.compKernelMatrixWEST(pX.flatten(), pX.shape[0], pX.shape[1],
+                                       pTrainX.flatten(), pTrainX.shape[0], pTrainX.shape[1],
+                                       pXAux.flatten(), pTrainXAux.flatten(), pXAux.shape[2],
+                                       self.embeddings.flatten(), self.embeddings.shape[0], self.embeddings.shape[1], ctypes.c_double(self._westThreshold),
+                                       vPhase, self._normalizeKernel)
+        
+        return vK
+        
+    
+    
+    def _computeKernelWESS(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
+        '''
+        Computes and returns the kernel using word embedding similarity score method
+        
+        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and pTrainX
+        respectively, but they can have a custom number of columns (equal for both, though). At the moment, these can
+        only be used with a precomputed kernel.
+        
+        For every token, the auxiliary input will be concatenated to the end of its embedding vector.
+        '''
+        
+        if pXAux is None:
+            pXAux = np.empty(shape = (0, 0, 0))
+            pTrainXAux = np.empty(shape=(0, 0, 0))
+
+        vLib = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + "/agk.so")
+        
+        vLib.compKernelMatrixWESS.argtypes = [np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                              ctypes.c_int,
+                                              ctypes.c_int,
+                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                              ctypes.c_int,
+                                              ctypes.c_int,
+                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                              ctypes.c_int,
+                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
+                                              ctypes.c_int,
+                                              ctypes.c_int,
+                                              ctypes.c_int,
+                                              ctypes.c_int]
+        vLib.compKernelMatrixWESS.restype = np.ctypeslib.ndpointer(dtype=np.float64,
+                                                                   shape=(pX.shape[0], pTrainX.shape[0]),
+                                                                   flags="C_CONTIGUOUS")
+
+        if id(pX) == id(pTrainX):
+            vPhase = 0
+        else:
+            vPhase = 1
+        
+        vK = vLib.compKernelMatrixWESS(pX.flatten(), pX.shape[0], pX.shape[1],
+                                       pTrainX.flatten(), pTrainX.shape[0], pTrainX.shape[1],
+                                       pXAux.flatten(), pTrainXAux.flatten(), pXAux.shape[2],
+                                       self.embeddings.flatten(), self.embeddings.shape[0], self.embeddings.shape[1],
+                                       vPhase, self._normalizeKernel)
+         
+        return vK
+        
+    
+    
+    def loadVocabulary(self, pdVocabulary):
+        '''
+        Loads vocabulary to be used for formatting the input data to learning algorithm
+        
+        The vocabulary should be a dictionary of tokens and their indices.
+        '''
+        
+        self.vocabulary = dict(pdVocabulary)
+        
+    
+    
+    def loadEmbeddings(self, pWEFilename, pIsLowerCase):
+        '''
+        Loads word embeddings from the given file
+        
+        Word embeddings should be loaded before formatting data, because the vocabulary used for formatting data should
+        be extracted from the word embeddings (when the word embeddings are used).
+        
+        pIsLowerCase specifies whether the vocabulary of the word embedding is lowercased.
+        '''
+        
+        # loading embeddings
+        vWE = we.WordEmbedding()
+        vWE.loadEmbeddings(pWVFilename=pWEFilename, pIsLowerCase=pIsLowerCase)
+        
+        # generating the vocabulary from the embeddings' vocabulary; 0 needs to be reserved for padding
+        self.vocabulary = {k: v + 1 for k, v in vWE.wordIDs.iteritems()}
+        
+        # adding the 0 vector (for padding index) at the beginning
+        self.embeddings = np.concatenate((np.zeros((1, vWE.dimension)),
+                                          vWE.embeddings))
+        
+    
+    
+    def _generateVocabulary(self, pllCorpus):
+        '''
+        Extracts the vocabulary from the input corpus and stores it in a dictionary of tokens and their indices
+
+        Tokens are indexed in decreasing order of frequency. 0 is reserved for padding the input when
+        formatting it for the learning algorithm, so the starting index is 1.
+        '''
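+        
+        # e.g. the corpus [["a", "b", "a"]] yields the vocabulary {"a": 1, "b": 2}
+        # (most frequent token first; index 0 is kept for padding)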
+        
+        # 0 is reserved for padding
+        vStartingIdx = 1
+        
+        # flattening corpus
+        vlTokens = [t for s in pllCorpus for t in s]
+        
+        # computing frequencies
+        vdTokenFreqs = Counter(vlTokens)
+        
+        # sorting based on token frequency
+        vlSortedTokens = [t for t, f in
+                          sorted(vdTokenFreqs.iteritems(), key=lambda x: x[1], reverse=True)]
+        
+        # creating token-index map
+        
+        self.vocabulary = {}
+        
+        # filling the vocabulary
+        for idx, vToken in enumerate(vlSortedTokens, start=vStartingIdx):
+            self.vocabulary[vToken] = idx
+        
+    
+    
+    def formatData(self, pCorpus, pflgLowercase = False):
+        '''
+        Converts the format of the input corpus from list of texts into arrays of token indices to be used as input
+        to the learning algorithm
+
+        It needs a vocabulary mapping tokens to indices for conversion. If a vocabulary is already loaded, it will be
+        used. Otherwise, the vocabulary will be generated from the corpus.
+
+        Use this function before feeding the input to the learning algorithm. For example, with scikit SVC, given X as 
+        a list of tokenized sentences (e.g. ["This is sentence 1 .", "This is sentence 2 ."]), and Y the labels:
+
+        any_gram = AnyGram(pMethod = "sm", pMaxTxtLen = 100)
+        X = any_gram.formatData(X)
+
+        clf = svm.SVC(kernel = any_gram)
+        clf.fit(X, Y)
+        
+        If word embedding vectors are used, they should be loaded before formatting the data, so that the vocabulary is
+        created based on the word indices of the word embeddings. Otherwise, a vocabulary will be generated from the
+        corpus by this method, and loading embeddings afterwards will replace it and invalidate the indices.
+        
+        If a token is not in the vocabulary, it will be appended to its end. This can only happen when the vocabulary
+        was not generated from the corpus at hand (e.g. when it comes from word embeddings).
+        
+        Since the output is a 2D array, the sentences are truncated or padded to maxTxtLen. 0 is used as the padding
+        index.
+        '''
+        
+        # splitting the input sentences into tokens on whitespace (NOTE: the input is supposed to be already tokenized)
+        if pflgLowercase:
+            vllCorpus = [[t.lower() for t in s.split()] for s in pCorpus]
+        else:
+            vllCorpus = [[t for t in s.split()] for s in pCorpus]
+
+        if self.vocabulary is None:
+            self._generateVocabulary(vllCorpus)
+
+
+        vaOutput = np.zeros((len(pCorpus), self.maxTxtLen), dtype=np.int32)
+        for i in range(vaOutput.shape[0]):
+            for j in range(min(len(vllCorpus[i]), self.maxTxtLen)):
+                if vllCorpus[i][j] not in self.vocabulary:
+                    self.vocabulary[vllCorpus[i][j]] = self.vocabSize + 1
+                vaOutput[i][j] = self.vocabulary[vllCorpus[i][j]]
+        
+        return vaOutput
+        
+    
+    
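+
+
+# A minimal usage sketch, assuming agk.so has been compiled next to this module; the method
+# name "sm" and pMaxTxtLen = 5 are illustrative values.
+if __name__ == '__main__':
+    vAnyGram = AnyGram(pMethod = "sm", pMaxTxtLen = 5)
+    
+    # formatData() builds the vocabulary and maps each sentence to a padded row of token indices
+    vX = vAnyGram.formatData(["This is sentence 1 .", "This is sentence 2 ."]).astype(np.float64)
+    
+    # passing the same array twice puts the computation in training phase (phase 0)
+    print vAnyGram.computeKernel(vX, vX)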

+ 405 - 0
anygramwrapper.py

@@ -0,0 +1,405 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Wrapper around any-gram kernel
+
+Version 0.1                                                                                 (25-May-2017 to 21-Jun-2017)
+- AnyGramWrapper is created.
+
+"""
+
+
+from sklearn import svm
+from sklearn.multiclass import OneVsRestClassifier
+import pickle
+import numpy as np
+
+
+class AnyGramWrapper:
+    '''
+    Wrapper class around the any-gram kernel to be used by scikit-learn's SVC
+    '''
+    
+    
+    def __init__(self, pAnyGram):
+        '''
+        Constructor
+        '''
+
+        self.trainXs = None
+        self.trainYs = None
+        self.trainAuxs = None                                # auxiliary training input
+        
+        self.testXs = None
+        self.testYs = None
+        self.testAuxs = None                                # auxiliary test input
+        
+        self.ag = pAnyGram
+        self._pcTrainKernel = None                           # precomputed kernel from the preloaded training data
+        self._pcTestKernel = None                            # precomputed kernel from the preloaded test data
+
+        self._model = None
+        
+    
+    
+    def loadTrainSet(self, pXs, pYs):
+        '''
+        Loads the training data
+        
+        The data set is provided via two lists: one containing the text instances (preferably tokenized) and one
+        containing the labels matching the text instances.
+        '''
+        
+        self.trainXs = self.ag.formatData(pXs)
+        self.trainYs = [int(y) for y in pYs]
+        
+        self._pcTrainKernel = None
+        
+    
+    
+    def loadTrainAux(self, pAux):
+        '''
+        Loads auxiliary training input
+        
+        The input must be a 3D array/list of shape (#train_instances, #max_text_length, #auxiliary_dimension)
+        '''
+
+        if self.trainXs is None or len(self.trainXs) == 0:
+            raise Exception("Training dataset should be loaded first!")
+
+        self.trainAuxs = self.formatAux(pAux)
+        
+    
+    
+    def loadTestSet(self, pXs, pYs):
+        '''
+        Loads the test data
+        
+        The data set is provided via two lists: one containing the text instances (preferably tokenized) and one
+        containing the labels matching the text instances.
+        '''
+        
+        self.testXs = self.ag.formatData(pXs)
+        self.testYs = [int(y) for y in pYs]
+        
+        self._pcTestKernel = None
+        
+    
+    
+    def loadTestAux(self, pAux):
+        '''
+        Loads auxiliary test input
+        
+        The input must be a 3D array/list of shape (#test_instances, #max_text_length, #auxiliary_dimension)
+        '''
+        
+        if self.testXs is None or len(self.testXs) == 0:
+            raise Exception("Test dataset should be loaded first!")
+
+        self.testAuxs = self.formatAux(pAux)
+        
+    
+    
+    def formatAux(self, pAux):
+        '''
+        Formats the auxiliary data
+        
+        The input is a 3D list of shape (#instances, #max_text_length, #auxiliary_dimension)
+        '''
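+        
+        # e.g. with maxTxtLen = 3 and a 1-dimensional auxiliary feature, [[[1], [2]]] becomes
+        # array([[[1.], [2.], [0.]]]): short sentences are padded with zero rows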
+        
+        if isinstance(pAux, list):
+            # truncating/padding the 2nd dimension (#max_text_length) to maxTxtLen with zero rows
+            vFixedDimList = [s[:self.ag.maxTxtLen] + [[0] * len(s[0])] * (self.ag.maxTxtLen - len(s)) for s in pAux]
+            return np.array(vFixedDimList, dtype=np.float64)
+        else:
+            return pAux
+        
+    
+    
+    def loadEmbeddings(self, pWEFilename, pIsLowerCase):
+        '''
+        Loads word embeddings from the given file
+        
+        Embeddings should be loaded before the data, because the vocabulary used for formatting data should
+        be extracted from the word embeddings (when the word embeddings are used).
+        
+        pIsLowerCase specifies whether the vocabulary of the word embedding is lowercased.
+        '''
+        
+        self.ag.loadEmbeddings(pWEFilename, pIsLowerCase)
+        
+    
+    
+    def precomputeTrainKernel(self):
+        '''
+        Precomputes the kernel from the training data
+        '''
+        
+        self._pcTrainKernel = self._preComputeKernel(self.trainXs, self.trainXs, self.trainAuxs, self.trainAuxs)
+        
+    
+    
+    def _getPrecomputedTrainKernel(self):
+        '''
+        Returns the precomputed kernel from the training data
+        
+        If the kernel is not precomputed yet, it will do so first.
+        '''
+        
+        if self._pcTrainKernel is None:
+            self.precomputeTrainKernel()
+        
+        return self._pcTrainKernel
+        
+    
+    
+    def precomputeTestKernel(self):
+        '''
+        Precomputes the kernel from the test data
+        '''
+        
+        self._pcTestKernel = self._preComputeKernel(self.testXs, self.trainXs, self.testAuxs, self.trainAuxs)
+        
+    
+    
+    def _getPrecomputedTestKernel(self):
+        '''
+        Returns the precomputed kernel from the test data
+        
+        If the kernel is not precomputed yet, it will do so first.
+        '''
+        
+        if self._pcTestKernel is None:
+            self.precomputeTestKernel()
+        
+        return self._pcTestKernel
+        
+    
+    
+    def _preComputeKernel(self, pX1, pX2, pAux1 = None, pAux2 = None):
+        '''
+        Computes and returns kernel with the given data
+        
+        The data should be formatted by AnyGram.formatData().
+        '''
+        
+        if self.trainXs is None or len(self.trainXs) == 0:
+            raise Exception("Training dataset is empty!")
+        
+        if pAux1 is not None and pAux2 is not None:
+            return self.ag.computeKernel(pX1.astype(np.float64),
+                                         pX2.astype(np.float64),
+                                         pAux1.astype(np.float64),
+                                         pAux2.astype(np.float64))
+        else:
+            return self.ag.computeKernel(pX1.astype(np.float64),
+                                         pX2.astype(np.float64))
+        
+    
+    
+    def combinePrecomputedTrainKernel(self, paKernelMatrix, pCombMethod):
+        '''
+        Combines the anygram kernel computed on the training set here with any given kernel matrix using the specified method
+         
+        The given kernel matrix should match the computed any-gram kernel matrix in shape.
+        
+        The combination methods supported here are:
+         + or add: add corresponding elements in the two kernel matrices
+         * or multiply: multiply corresponding elements in the two kernel matrices
+         arith: arithmetic mean of the corresponding elements in the two kernel matrices
+         geo: geometric mean
+        '''
+        
+        if self._pcTrainKernel is None:
+            raise Exception("Kernel is not precomputed yet. Run precomputeTrainKernel() first.")
+        
+        self._pcTrainKernel = self._combineKernels(self._pcTrainKernel, paKernelMatrix, pCombMethod)
+        
+    
+    
+    def combinePrecomputedTestKernel(self, paKernelMatrix, pCombMethod):
+        '''
+        Combines the anygram kernel computed on the test set here with any given kernel matrix using the specified method
+        
+        The given kernel matrix should match the computed any-gram kernel matrix in shape.
+        
+        The combination methods supported here are:
+         + or add: add corresponding elements in the two kernel matrices
+         * or multiply: multiply corresponding elements in the two kernel matrices
+         arith: arithmetic mean of the corresponding elements in the two kernel matrices
+         geo: geometric mean
+        '''
+        
+        if self._pcTestKernel is None:
+            raise Exception("Kernel is not precomputed yet. Run precomputeTestKernel() first.")
+        
+        self._pcTestKernel = self._combineKernels(self._pcTestKernel, paKernelMatrix, pCombMethod)
+        
+        
+    
+    
+    def _combineKernels(self, paKernelMatrix1, paKernelMatrix2, pCombMethod):
+        '''
+        Combines and returns the two given kernel matrices with the given method
+        
+        The given kernel matrices should have the same shapes.
+        
+        The combination methods supported here are:
+         + or add: add corresponding elements in the two kernel matrices
+         * or multiply: multiply corresponding elements in the two kernel matrices
+         arith: arithmetic mean of the corresponding elements in the two kernel matrices
+         geo: geometric mean
+        '''
+        
+        if paKernelMatrix1.shape != paKernelMatrix2.shape:
+            raise Exception("The shape of the given kernel matrix is not valid: " % paKernelMatrix1.shape)
+        
+        if pCombMethod.lower() in ['+', "add"]:
+            return np.add(paKernelMatrix1, paKernelMatrix1)
+        elif pCombMethod.lower() in ['*', "multiply"]:
+            return np.multiply(paKernelMatrix1, paKernelMatrix1)
+        elif pCombMethod.lower().startswith("arith"):
+            return np.add(paKernelMatrix1, paKernelMatrix2) / 2
+        elif pCombMethod.lower().startswith("geo"):
+            return np.sqrt(np.multiply(paKernelMatrix1, paKernelMatrix2)) 
+        
+    
+    
+    def train(self, pflgUsePrecompKernel = False, pMCMethod = None, C = 1, class_weight = None):
+        '''
+        Trains and returns anygram model
+        
+        If pflgUsePrecompKernel is set to true, SVC will use a precomputed kernel. This can save time when the data and
+        kernel computation parameters remain the same in repeated trainings (e.g. in tuning).
+        
+        pMCMethod is the decision_function_shape parameter of sklearn.svm.SVC and specifies the multiclass classification
+        method. The other parameters are those of sklearn.svm.SVC.
+        '''
+        
+        if self.trainXs is None or len(self.trainXs) == 0:
+            raise Exception("Training dataset is empty!")
+        
+        
+        if  pflgUsePrecompKernel:
+            vKernel = "precomputed"
+            X = self._getPrecomputedTrainKernel()
+        else:
+            vKernel = self.ag
+            X = self.trainXs
+
+        if pMCMethod is None:
+            vSVC = svm.SVC(kernel = vKernel, C = C, class_weight = class_weight)
+        elif pMCMethod.lower() == "ovo":
+            vSVC = svm.SVC(kernel = vKernel, decision_function_shape = "ovo", C = C, class_weight = class_weight)
+        elif pMCMethod.lower() in ["ova", "ovr"]:
+            vSVC = OneVsRestClassifier(svm.SVC(kernel = vKernel, decision_function_shape = pMCMethod, C = C, class_weight = class_weight))
+        else:
+            raise Exception("Unknown multiclass classification method: %s", pMCMethod)
+        
+        # training
+        self._model = vSVC.fit(X, self.trainYs)
+        
+    
+    
+    @property
+    def model(self):
+        '''
+        Trained scikit SVC model
+        '''
+        
+        return self._model
+        
+    
+    
+    def saveModel(self, pFilename):
+        '''
+        Saves the SVC model by pickling it to a given file  
+        '''
+        
+        ## ToDo: when saving the model, the vocabulary and the embeddings (for WESS) must also be saved 
+        
+        pickle.dump(self._model, open(pFilename, 'wb'))
+        
+    
+    
+    def loadModel(self, pModelPickle):
+        '''
+        Loads the pickled SVC model  
+        '''
+
+        ## ToDo: model should have been saved with a vocabulary and embedding which must also be loaded here
+        self._model = pickle.load(open(pModelPickle, 'rb'))
+        
+    
+    
+    def test(self, pTestXs = None, pTestYs = None):
+        '''
+        Tests the trained model on the loaded test set
+        '''
+        
+        if pTestXs is not None and pTestYs is not None:
+            self.loadTestSet(pXs = pTestXs, pYs = pTestYs)
+        
+        if isinstance(self._model, OneVsRestClassifier):
+            vKernel = self._model.estimators_[0].kernel
+        elif isinstance(self._model, svm.SVC):
+            vKernel = self._model.kernel
+        
+        # prediction
+        if  vKernel == "precomputed":
+            vaPreds = self._model.predict(self._getPrecomputedTestKernel())
+        else:
+            vaPreds = self._model.predict(self.testXs)
+        
+        # scoring
+        vScore = self._score(vaPreds, self.testYs)
+        
+        return vaPreds, vScore
+        
+    
+    
+    def predict(self, pXs, pAux = None):
+        '''
+        Predicts the labels of the given data
+        '''
+
+        vaXs = self.ag.formatData(pXs)
+        vaAuxs = self.formatAux(pAux)
+        
+        if isinstance(self._model, OneVsRestClassifier):
+            vKernel = self._model.estimators_[0].kernel
+        elif isinstance(self._model, svm.SVC):
+            vKernel = self._model.kernel
+        
+        if  vKernel == "precomputed":
+            return self._model.predict(self._preComputeKernel(vaXs, self.trainXs, vaAuxs, self.trainAuxs))
+        else:
+            return self._model.predict(vaXs)
+        
+    
+    
+    def _score(self, plPreds, plGolds):
+        '''
+        Scores the given predictions 
+        '''
+        
+        vCorrect = 0
+        
+        for p, g in zip(plPreds, plGolds):
+            if p == g:
+                vCorrect += 1
+        
+        return vCorrect * 1.0 / len(plPreds)
+        
+    
+    
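+
+
+# A minimal end-to-end sketch, assuming anygramkernel.py and the compiled agk.so sit next to
+# this module; the corpus, labels, method name and pMaxTxtLen are illustrative values.
+if __name__ == '__main__':
+    from anygramkernel import AnyGram
+    
+    vWrapper = AnyGramWrapper(AnyGram(pMethod = "sm", pMaxTxtLen = 10))
+    vWrapper.loadTrainSet(["a tiny sentence .", "another tiny sentence ."], ["0", "1"])
+    
+    # with a precomputed kernel, repeated trainings (e.g. while tuning C) reuse the same matrix
+    vWrapper.train(pflgUsePrecompKernel = True, C = 1)
+    
+    vPreds, vScore = vWrapper.test(pTestXs = ["a tiny sentence ."], pTestYs = ["0"])
+    print vPreds, vScore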

File diff suppressed because it is too large
+ 11 - 0
glove.we


+ 639 - 0
we.py

@@ -0,0 +1,639 @@
+#! /usr/bin/python
+# -*- coding: utf-8 -*-
+
+"""	
+	This module defines classes for (predicted) word embedding vectors.
+	
+	Version 0.5																								(06-Feb-2017)
+	- getVector() now supports optional return value for unknown tokens. 
+	
+	Version 0.4																								(21-Oct-2016)
+	- Unknown token support is added.
+	
+	Version 0.3																				(06-Oct-2016 to 10-Oct-2016)
+	- The module is renamed from wv.py.
+	- loadData() and other functions are added.
+	- Normalization of vectors is added.
+	- extractSimilarWords is added.
+	- Problems with the filter vocabulary in the loader methods were found and fixed.
+	
+	Version 0.2																				(15-Jun-2016 to 29-Jun-2016)
+	- calcSimMatrix() is added to WordVector.
+	- loadW2VTxtVectors() is added.
+	
+	Version 0.1																				(27-Feb-2016 to 01-Mar-2016)
+	- WordEmbedding is added.
+	
+"""
+
+import sys
+import zipfile, collections
+import numpy as np
+
+
+class WordEmbedding:
+	'''
+	Class for managing word embeddings
+	'''
+	
+	def __init__(self, pVerbosity = 1):
+		'''
+		Constructor 
+		'''
+		
+		# verbosity of processes
+		self.verbosity = pVerbosity
+		
+		self.embeddings = None  # numpy array containing final word embeddings
+		self.normalized = None  # whether the embeddings are normalized
+		
+		self.wordIDs = None  # vocab words and their IDs (indexes)
+		self.IDWords = None  # vocab IDs and their words
+		
+		# training data attributes
+		
+		self.corpus = None  # original corpus as list of sentences
+		self.wordCounts = None  # vocab words and their counts
+		self.data = None  # the data: list of word IDs (not word forms) in their 
+		# original order in the corpus
+		self._lowercase = False
+		
+		# unknown token embedidng vector
+		self._unknown = None
+		
+	
+	
+	@property
+	def dimension(self):
+		'''
+		Returns the dimension of the loaded vectors
+		'''
+		
+		if self.embeddings is None:
+			return 0
+		else:
+			return len(self.embeddings[0])
+		
+	
+	
+	@property
+	def vocabSize(self):
+		'''
+		Returns the vocabulary size of the embedding vectors
+		'''
+		
+		if self.wordIDs is None:
+			return 0
+		else:
+			return len(self.wordIDs)
+		
+	
+	
+	def loadData(self, pCorpusFilename, pVocabSize, pflgKeepOrgCorpus=False, pflgKeepVocabCounts=False):
+		'''
+		Loads data from an input corpus
+		
+		The original corpus can be optionally stored for later uses. The default is to delete it after loading 
+		required data.
+		'''
+		
+		# reading data
+		
+		if zipfile.is_zipfile(pCorpusFilename):
+			with zipfile.ZipFile(pCorpusFilename) as f:
+				vlCorpusLines = f.read(f.namelist()[0]).strip().split('\n')
+		else:
+			vlCorpusLines = open(pCorpusFilename).read().strip().split('\n')
+		
+		vlWords = [w for l in vlCorpusLines for w in l.split()]
+		
+		# loading data
+		
+		if pflgKeepVocabCounts:
+			self.wordCounts = {"UNK": 0}  # initializing vocab words/counts by UNK; to be counted later
+		self.wordIDs = {"UNK": 0}  # initializing vocab words/IDs by UNK
+		self.IDWords = {0: "UNK"}  # initializing vocab IDs/words by UNK
+		self.data = []
+		
+		# getting word IDs in word frequency order (only the most common words, up to the vocab size)
+		for i, (word, count) in enumerate(collections.Counter(vlWords).most_common(pVocabSize - 1), start=1):
+			if pflgKeepVocabCounts:
+				self.wordCounts[word] = count
+			self.wordIDs[word] = i
+		
+		for word in vlWords:
+			if word in self.wordIDs:
+				id = self.wordIDs[word]
+				self.IDWords[id] = word
+			else:
+				id = 0
+				if pflgKeepVocabCounts:
+					self.wordCounts["UNK"] += 1
+			
+			self.data.append(id)
+		
+		if self.verbosity > 0:
+			print("Data size:")
+			print("\t%d (%dM) lines" % (len(vlCorpusLines), len(vlCorpusLines) / 1000000))
+			print("\t%d (%dM) words\n" % (len(vlWords), len(vlWords) / 1000000))
+		
+		if self.verbosity > 1:
+			if pflgKeepVocabCounts:
+				print('Most common words:\n\t%s\n' % '\n\t'.join(["%s: %d" % (w, c) for w, c in
+																  sorted(self.wordCounts.iteritems(),
+																		 key=lambda x: x[1], reverse=True)[:10]]))
+			print(
+				'Sample word indexes:\n\t%s\n' % '\n\t'.join(
+					["%s: %s" % (w, self.wordIDs[w]) for w in self.wordIDs][:10]))
+			print('Sample index words:\n\t%s\n' % '\n\t'.join(
+				["%s: %s" % (id, self.IDWords[id]) for id in self.IDWords][:10]))
+			print(
+				'Sample data:\n\t%s\n\n\t%s\n' % (
+				self.data[:53], ' '.join([self.IDWords[id] for id in self.data[:53]])))
+		
+		# keeping the original corpus only if asked; the local references are deleted either way
+		if pflgKeepOrgCorpus:
+			self.corpus = vlCorpusLines
+		del vlCorpusLines
+		del vlWords
+		
+	
+	
+	def getWordFreq(self, pWord):
+		'''
+		Returns the frequency of a given word in the vocabulary 
+		'''
+		
+		if self.wordCounts is None:
+			print("Word counts are not stored or no data is loaded. Data should be loaded with pflgKeepVocabCounts set to True.")
+		elif pWord in self.wordCounts:
+			return self.wordCounts[pWord]
+		else:
+			print('Word "%s" not found' % pWord)
+		
+	
+	
+	def loadEmbeddings(self, pWVFilename, pIsLowerCase=False,
+					   plFilterVocab=None, pflgCaseSensitiveFilter=True,
+					   pflgNormalized=False, pUnknownToken = None):
+		'''
+		Loads pre-trained word embeddings from file in general format
+		
+		In the general format, each word is represented in a line which contains the word and the vector all separated 
+		by space/tab. 
+		
+		pIsLowerCase determines the case of the embeddings' vocabulary.
+		
+		Optionally, a vocabulary list can be provided to filter the vectors being loaded into the memory. This can reduce
+		the memory usage in scenarios where the list of words in use are known in advance. 
+		
+		If the word vectors are lowercase, the filter word lookup will be case insensitive. Otherwise, another argument 
+		is used to determine the case-sensitivity of the lookup, the default value of which is true, i.e. case sensitive.
+		
+		The loaded vectors may or may not already be normalized. For example, the Google News pre-trained vectors are not
+		normalized. When the vectors are normalized, cosine similarity can be computed simply as a dot product, without
+		the need for normalization by the product of the vector norms (which has no effect on already-normalized
+		vectors). This information is passed through pflgNormalized and is needed for computing similarities.
+		'''
+		
+		self._lowercase = pIsLowerCase
+		
+		# filter vocabulary case
+		if plFilterVocab is not None and (self._lowercase or not pflgCaseSensitiveFilter):
+			vlFilterVocab = [w.lower() for w in plFilterVocab]
+		else:
+			vlFilterVocab = plFilterVocab
+		
+		self.embeddings = []
+		self.wordIDs = {}
+		self.IDWords = {}
+		vWordID = 0
+		for i, vLine in enumerate(open(pWVFilename), start=1):
+			vlLSplit = vLine.split()
+			
+			# vector word case 
+			if not pflgCaseSensitiveFilter and not self._lowercase:
+				vWord = vlLSplit[0].lower()
+			else:
+				vWord = vlLSplit[0]
+			
+			if vlFilterVocab is None or vWord in vlFilterVocab:
+				self.embeddings.append(np.array([float(n) for n in vlLSplit[1:]]))
+				self.wordIDs[vlLSplit[0]] = vWordID
+				self.IDWords[vWordID] = vlLSplit[0]
+				vWordID += 1
+			
+			if pUnknownToken is not None and vWord ==  pUnknownToken:
+				self._unknown = np.array([float(n) for n in vlLSplit[1:]])
+			
+			if i % 1000 == 0:
+				sys.stdout.write('.')
+				sys.stdout.flush()
+		
+		self.embeddings = np.array(self.embeddings)
+		
+		self.normalized = pflgNormalized
+		
+		sys.stdout.write('\n')
+		
+	
+	
+	def loadGloVeVectors(self, pWVFilename, pIsLowerCase=False,
+						 plFilterVocab=None, pflgCaseSensitiveFilter=True,
+						 pflgNormalized=False, pUnknownToken = None):
+		'''
+		Loads word vectors from GloVe word embedding file
+		
+		Both .zip and text files are accepted.
+		
+		See loadEmbeddings() for parameters.
+		'''
+		
+		if pWVFilename[-3:].lower() == "zip":
+			self._loadGloVeVectorsZip(pWVFilename, pIsLowerCase, plFilterVocab, pflgCaseSensitiveFilter, pflgNormalized, pUnknownToken)
+		else:
+			self.loadEmbeddings(pWVFilename, pIsLowerCase, plFilterVocab, pflgCaseSensitiveFilter, pflgNormalized, pUnknownToken)
+		
+	
+	
+	def _loadGloVeVectorsZip(self, pWVZipFilename, pIsLowerCase=False,
+							 plFilterVocab=None, pflgCaseSensitiveFilter=True,
+							 pflgNormalized=False, pUnknownToken = None):
+		'''
+		Loads word vectors from GloVe zip file
+		'''
+		
+		self._lowercase = pIsLowerCase
+		
+		import zipfile as zp
+		
+		vZipFile = zp.ZipFile(pWVZipFilename)
+		
+		# filter vocabulary case
+		if plFilterVocab is not None and (self._lowercase or not pflgCaseSensitiveFilter):
+			vlFilterVocab = [w.lower() for w in plFilterVocab]
+		else:
+			vlFilterVocab = plFilterVocab
+		
+		self.embeddings = []
+		self.wordIDs = {}
+		self.IDWords = {}
+		vWordID = 0
+		for i, vLine in enumerate(vZipFile.open(vZipFile.namelist()[0]), start=1):
+			vlLSplit = vLine.split()
+			
+			# vector word case 
+			if not pflgCaseSensitiveFilter and not self._lowercase:
+				vWord = vlLSplit[0].lower()
+			else:
+				vWord = vlLSplit[0]
+			
+			if vlFilterVocab is None or vWord in vlFilterVocab:
+				self.embeddings.append(np.array([float(n) for n in vlLSplit[1:]]))
+				self.wordIDs[vlLSplit[0]] = vWordID
+				self.IDWords[vWordID] = vlLSplit[0]
+				vWordID += 1
+			
+			if pUnknownToken is not None and vWord == pUnknownToken:
+				self._unknown = np.array([float(n) for n in vlLSplit[1:]])
+			
+			if i % 1000 == 0:
+				sys.stdout.write('.')
+				sys.stdout.flush()
+		
+		self.embeddings = np.array(self.embeddings)
+		
+		self.normalized = pflgNormalized
+		
+		sys.stdout.write('\n')
+		
+	
+	
+	def loadW2VBinVectors(self, pWVBinFilename, pIsLowerCase=False,
+						  plFilterVocab=None, pflgCaseSensitiveFilter=True,
+						  pflgNormalized=False, pUnknownToken = None):
+		'''
+		Loads word vectors from word2vec file in binary format
+		
+		The method uses gensim to load the vectors. Parameters are the same as in the loadEmbeddings() method.
+		
+		pUnknownToken specifies the token in the input embedding vectors which represents unknown tokens (e.g. UNK)
+		if there is one.
+		'''
+		
+		import gensim
+		
+		self._lowercase = pIsLowerCase
+		
+		# filter vocabulary case
+		if plFilterVocab is not None and (self._lowercase or not pflgCaseSensitiveFilter):
+			vlFilterVocab = [w.lower() for w in plFilterVocab]
+		else:
+			vlFilterVocab = plFilterVocab
+		
+		# loading word vectors using gensim
+		vGensimModel = gensim.models.Word2Vec.load_word2vec_format(pWVBinFilename, binary=True)
+		
+		self.embeddings = []
+		self.wordIDs = {}
+		self.IDWords = {}
+		vWordID = 0
+		
+		if vlFilterVocab is not None:
+			for vWord in vlFilterVocab:
+				try:
+					self.embeddings.append(vGensimModel[vWord])
+					self.wordIDs[vWord] = vWordID
+					self.IDWords[vWordID] = vWord
+					vWordID += 1
+				except KeyError:
+					continue
+				
+				sys.stdout.write('.')
+				sys.stdout.flush()
+		else:
+			sys.stdout.write('Reading the entire vocabulary...')
+			sys.stdout.flush()
+			
+			for vWord in vGensimModel.vocab:
+				self.embeddings.append(vGensimModel[vWord])
+				self.wordIDs[vWord] = vWordID
+				self.IDWords[vWordID] = vWord
+				vWordID += 1
+			
+			sys.stdout.write(' done.')
+			sys.stdout.flush()
+		
+		self.embeddings = np.array(self.embeddings)
+		
+		if pUnknownToken is not None:
+			self._unknown = vGensimModel[pUnknownToken]
+		
+		self.normalized = pflgNormalized
+		
+		sys.stdout.write('\n')
+		
+	
+	
+	def loadW2VTxtVectors(self, pWVTxtFilename, pIsLowerCase=False,
+						  plFilterVocab=None, pflgCaseSensitiveFilter=True,
+						  pflgNormalized=False, pUnknownToken = None):
+		'''
+		Loads word vectors from word2vec file in text format
+		'''
+		
+		self._lowercase = pIsLowerCase
+		
+		# filter vocabulary case
+		if plFilterVocab is not None and (self._lowercase or not pflgCaseSensitiveFilter):
+			vlFilterVocab = [w.lower() for w in plFilterVocab]
+		else:
+			vlFilterVocab = plFilterVocab
+		
+		vfWV = open(pWVTxtFilename)
+		
+		# skipping the header
+		vfWV.readline()
+		
+		# loading word vectors
+		
+		self.embeddings = []
+		self.wordIDs = {}
+		self.IDWords = {}
+		vWordID = 0
+		for i, vLine in enumerate(vfWV, start=1):
+			vlLSplit = vLine.split()
+			
+			# vector word case 
+			if not pflgCaseSensitiveFilter and not self._lowercase:
+				vWord = vlLSplit[0].lower()
+			else:
+				vWord = vlLSplit[0]
+			
+			if vlFilterVocab is None or vWord in vlFilterVocab:
+				self.embeddings.append(np.array([float(n) for n in vlLSplit[1:]]))
+				self.wordIDs[vlLSplit[0]] = vWordID
+				self.IDWords[vWordID] = vlLSplit[0]
+				vWordID += 1
+			
+			if pUnknownToken is not None and vWord == pUnknownToken:
+				self._unknown = np.array([float(n) for n in vlLSplit[1:]])
+			
+			if i % 1000 == 0:
+				sys.stdout.write('.')
+				sys.stdout.flush()
+		
+		self.embeddings = np.array(self.embeddings)
+		
+		self.normalized = pflgNormalized
+		
+		sys.stdout.write('\n')
+		
+	
+	
+	def normalizeEmbeddings(self):
+		'''
+		Normalizes embeddings
+		'''
+		
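+		# dividing each row (word vector) by its L2 norm, so that cosine similarity reduces to a dot product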
+		self.embeddings = self.embeddings / np.sqrt(np.sum((np.square(self.embeddings)), axis=1, keepdims=True))
+		
+		self.normalized = True
+		
+	
+	
+	def getVector(self, pWord, pUnknown = "empty"):
+		'''
+		Returns the embedding vector of the given word if it exists, and otherwise a fallback value selected by pUnknown
+		
+		pUnknown specifies what should be returned in case the given word is not found. It can be one of the following:
+		- empty: an empty list is returned
+		- zero: an array of zeros is returned
+		- unknown: the vector of the unknown token is returned (see self._unknown)
+		'''
+		
+		try:
+			if self._lowercase:
+				return self.embeddings[self.wordIDs[pWord.lower()]]
+			else:
+				return self.embeddings[self.wordIDs[pWord]]
+		except KeyError:
+			if pUnknown.lower() == "empty":
+				return []
+			elif pUnknown.lower() == "zero":
+				return np.zeros(self.dimension)
+			elif pUnknown.lower() == "unknown":
+				return self.unknown 
+		
+	
+	
+	def getWordEmbedding(self, pWord):
+		'''
+		Returns the embedding vector of the given word (an alias for getVector())
+		'''
+		
+		return self.getVector(pWord)
+		
+	
+	@property
+	def unknown(self):
+		'''
+		Returns the embedding vector of unknown words
+		'''
+		
+		return self._unknown
+		
+	
+	
+	def getAvgVector(self, plWords):
+		'''
+		Returns the average of the vectors of the given words
+		
+		Unknown words will be ignored and not included in the averaging. If all words are unknown, a vector of zeros 
+		will be returned.
+		'''
+		
+		vlSum = np.zeros(self.dimension)
+		vCount = 0
+		
+		for vWord in plWords:
+			try:
+				if self._lowercase:
+					vlSum = np.add(vlSum, self.embeddings[self.wordIDs[vWord.lower()]])
+				else:
+					vlSum = np.add(vlSum, self.embeddings[self.wordIDs[vWord]])
+				vCount += 1
+			except KeyError:
+				continue
+		
+		if vCount > 0:
+			return vlSum / vCount
+		else:
+			return vlSum
+		
+	
+	
+	def calcSimMatrix(self, plWords=None, pSimMeasure="cosine", pflgReturn=False):
+		'''
+		Calculates the similarities between every pair of words from the given list of words or the entire	vocabulary
+		using the specified similarity measure
+		
+		It prints the results but can also optionally return them in a matrix, which is implemented using a dictionary: 
+		  {"word1": {"word2": 0.02, "word3": 0.91},
+		   "word2": {"word1": 0.02, "word3": 0.59},
+		   "word3": {"word1": 0.91, "word2": 0.59}}
+		
+		For access efficiency, duplicates are allowed, but there are no entries for the similarity of a word with itself.
+		
+		If no word list is given, the entire vocabulary of the loaded vectors will be used.  
+		'''
+		
+		## ToDo: change to matrix operations for efficiency
+		
+		if plWords is None:
+			vlWords = self.wordIDs
+		else:
+			vlWords = list(set(plWords))
+		
+		vdSimMatrix = {}
+		
+		for vWord1 in vlWords:
+			vdSimMatrix[vWord1] = {}
+			for vWord2 in vlWords:
+				if vWord1 != vWord2:
+					if pSimMeasure.lower().startswith("cos"):
+						vSim = self.calcCosSim(vWord1, vWord2)
+					elif pSimMeasure.lower().startswith("euc"):
+						vSim = self.calcEuclideanSim(vWord1, vWord2)
+					else:
+						raise Exception("Unknown similarity measure: %s" % pSimMeasure)
+					
+					print "%s\t%s\t%s" % (vWord1, vWord2, vSim)
+					if pflgReturn:
+						vdSimMatrix[vWord1][vWord2] = vSim
+		
+		if pflgReturn:
+			return vdSimMatrix
+		
+	
+	
+	def extractSimilarWords(self, pWord, pSimWordNum):
+		'''
+		Extracts and returns similar words to a given word
+		
+		It returns the pSimWordNum most similar words, and only if the given word exists in the vocabulary.
+		'''
+		
+		vaWV = self.getVector(pWord)
+		
+		if len(vaWV) == 0:
+			print "Word %s not found" % pWord
+		else:
+			if not self.normalized:
+				print "Embedding vectors are not normalized. Use normalizeEmbeddings() to normalize them first."
+				return
+			
+			vaSimilarities = np.dot(vaWV, np.transpose(self.embeddings))
+			
+			vlNearestIDs = (-vaSimilarities).argsort()[1: pSimWordNum + 1]
+			return [(self.IDWords[id], vaSimilarities[id]) for id in vlNearestIDs]
+		
+	
+	
+	def calcCosSim(self, pWord1, pWord2):
+		'''
+		Calculates and returns the cosine similarity of the given words
+		'''
+		
+		vlWV1 = self.getVector(pWord1)
+		vlWV2 = self.getVector(pWord2)
+		
+		if len(vlWV1) == 0:
+			raise Exception("Word %s not found" % pWord1)
+		if len(vlWV2) == 0:
+			raise Exception("Word %s not found" % pWord2)
+		
+		if self.normalized:
+			return np.dot(vlWV1, vlWV2)
+		else:
+			return np.dot(vlWV1, vlWV2) / (np.linalg.norm(vlWV1) * np.linalg.norm(vlWV2))
+		
+	
+	
+	def calcEuclideanSim(self, pWord1, pWord2):
+		'''
+		Calculates and returns the euclidean similarity of the given words
+
+		Euclidean similarity is a function of Euclidean distance calculated as:
+
+		 ES = 1 / (1 + ED)
+		'''
+		
+		return 1.0 / (1 + self.calcEuclideanDist(pWord1, pWord2))
+		
+	
+	
+	def calcEuclideanDist(self, pWord1, pWord2):
+		'''
+		Calculates and returns the euclidean distance of the given words
+		'''
+		
+		vlWV1 = self.getVector(pWord1)
+		vlWV2 = self.getVector(pWord2)
+		
+		if len(vlWV1) == 0:
+			raise Exception("Word %s not found" % pWord1)
+		if len(vlWV2) == 0:
+			raise Exception("Word %s not found" % pWord2)
+		
+		return np.linalg.norm(vlWV1 - vlWV2)
+		
+	
+	
+	def scaleEmbeddings(self, pRange):
+		'''
+		Scales embedding values (vector elements) into the given range 
+		'''
+		
+		vMin = np.min(self.embeddings)
+		vMax = np.max(self.embeddings)
+		
+		self.embeddings = pRange[0] + (self.embeddings - vMin) * (pRange[1] - pRange[0]) * 1.0 / (vMax - vMin)
+		
+	
+
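+
+
+# A minimal usage sketch; glove.we is the small sample file added in this commit, and the probe
+# word and the number of neighbours are illustrative.
+if __name__ == '__main__':
+	vWE = WordEmbedding()
+	vWE.loadEmbeddings("glove.we", pIsLowerCase = True)
+	
+	# similar-word extraction requires normalized vectors (dot product == cosine similarity)
+	vWE.normalizeEmbeddings()
+	print vWE.extractSimilarWords("the", 5)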