#! /usr/bin/python
# -*- coding: utf-8 -*-

"""
Any-gram kernels

Author: Rasoul Kaljahi

See LICENSE file.
"""

from collections import Counter

import numpy as np
import ctypes
import os

import we


class AnyGram:
    '''
    Class to prepare and compute the any-gram kernel

    ToDo: extend to multiple sentences per instance
    '''
    def __init__(self, pMethod, pMaxTxtLen, pWESTThreshold = 0, pflgNormalizeKernel = True):
        '''
        Constructor

        pMaxTxtLen is the maximum text length of the instances in terms of tokens. Longer instances will be cut off
        and shorter ones will be padded. The same length should also be used at prediction time if a new AnyGram
        object is created; otherwise scikit-learn will complain.
        '''

        self.method = pMethod                 # method to be used in comparing tokens:
                                              #   - sm:   string match
                                              #   - wess: word embedding similarity score
                                              #   - west: word embedding similarity threshold

        self._westThreshold = pWESTThreshold  # threshold for the WEST method

        self.maxTxtLen = pMaxTxtLen           # maximum text length of the instances in terms of tokens: longer
                                              # instances will be cut off and shorter ones will be padded

        self.embeddings = None
        self.vocabulary = None                # dictionary of tokens and their indices
        self._iVocabulary = None              # dictionary of token indices and their forms (inverse vocabulary)

        self.l = 0.4                          # lambda

        self._normalizeKernel = pflgNormalizeKernel  # whether or not to normalize the kernel values

    def setWESTThreshold(self, pThreshold):
        '''
        Sets the word embedding similarity threshold for the WEST method

        The value must be between 0 and 1.
        '''

        self._westThreshold = pThreshold

    @property
    def vocabSize(self):
        '''
        Returns the size of the vocabulary
        '''

        return len(self.vocabulary)

    @property
    def iVocabulary(self):
        '''
        Returns the inverse vocabulary (dictionary of token indices and their forms)

        The inverse vocabulary is created on demand.
        '''

        if self._iVocabulary is None:
            self._iVocabulary = {v: k for k, v in self.vocabulary.items()}

            # adding an entry for the padding index
            self._iVocabulary[0] = '_PAD_'

        return self._iVocabulary

    def __call__(self, pX, pTrainX):
        '''
        Call method, making the object usable as a kernel function (e.g. by scikit-learn)
        '''

        return self.computeKernel(pX, pTrainX)

    def computeKernel(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
        '''
        Computes and returns the kernel values

        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and
        pTrainX respectively, but they can have a custom number of columns (equal for both though). At the moment,
        these can only be used with a precomputed kernel.
        '''

        if self.method.lower() == "sm":
            return self._computeKernelSM(pX, pTrainX, pXAux, pTrainXAux)
        elif self.method.lower() == "west":
            return self._computeKernelWEST(pX, pTrainX, pXAux, pTrainXAux)
        elif self.method.lower() == "wess":
            return self._computeKernelWESS(pX, pTrainX, pXAux, pTrainXAux)
        else:
            raise ValueError("Unknown method: %s" % self.method)

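    # A minimal usage sketch ("corpus" and the concrete shapes below are assumptions
    # for illustration, not part of this module): X and trainX are the 2D arrays
    # produced by formatData(), and the optional auxiliary inputs are 3D float64
    # arrays of shape (nInstances, maxTxtLen, nAuxFeatures), with nAuxFeatures equal
    # on both sides.
    #
    #     ag = AnyGram(pMethod="sm", pMaxTxtLen=50)
    #     X = ag.formatData(corpus)            # shape: (len(corpus), 50)
    #     K = ag.computeKernel(X, X)           # training-phase Gram matrix
    #     KTest = ag.computeKernel(XTest, X)   # prediction-phase kernel values
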
    def _computeKernelSM(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
        '''
        Computes and returns the kernel using the string match method

        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and
        pTrainX respectively, but they can have a custom number of columns (equal for both though). At the moment,
        these can only be used with a precomputed kernel.

        For every pair of tokens, every pair of peer elements in the auxiliary input will be compared against each
        other.
        '''

        if pXAux is None:
            pXAux = np.empty(shape=(0, 0, 0))
            pTrainXAux = np.empty(shape=(0, 0, 0))

        # loading the C implementation of the kernel computation
        vLib = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + "/agk.so")

        vLib.compKernelMatrixSM.argtypes = [np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                            ctypes.c_int,
                                            ctypes.c_int,
                                            np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                            ctypes.c_int,
                                            ctypes.c_int,
                                            np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                            np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                            ctypes.c_int,
                                            ctypes.c_int,
                                            ctypes.c_int]
        vLib.compKernelMatrixSM.restype = np.ctypeslib.ndpointer(dtype=np.float64,
                                                                 shape=(pX.shape[0], pTrainX.shape[0]),
                                                                 flags="C_CONTIGUOUS")

        # phase 0: training (pX and pTrainX are the same object); phase 1: prediction
        if id(pX) == id(pTrainX):
            vPhase = 0
        else:
            vPhase = 1

        vK = vLib.compKernelMatrixSM(pX.flatten(), pX.shape[0], pX.shape[1],
                                     pTrainX.flatten(), pTrainX.shape[0], pTrainX.shape[1],
                                     pXAux.flatten(), pTrainXAux.flatten(), pXAux.shape[2],
                                     vPhase, self._normalizeKernel)

        return vK

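    # For reference, the C entry point assumed by the argtypes/restype setup above
    # would have a prototype along these lines (a sketch inferred from the Python
    # side, not copied from the agk source):
    #
    #     double *compKernelMatrixSM(double *pX, int pXRows, int pXCols,
    #                                double *pTrainX, int pTrainXRows, int pTrainXCols,
    #                                double *pXAux, double *pTrainXAux, int pAuxDim,
    #                                int pPhase, int pNormalize);
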
    def _computeKernelWEST(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
        '''
        Computes and returns the kernel using the word embedding similarity threshold (WEST) method

        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and
        pTrainX respectively, but they can have a custom number of columns (equal for both though). At the moment,
        these can only be used with a precomputed kernel.

        For every token, the auxiliary input will be concatenated to the end of its embedding vector.
        '''

        if pXAux is None:
            pXAux = np.empty(shape=(0, 0, 0))
            pTrainXAux = np.empty(shape=(0, 0, 0))

        vLib = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + "/agk.so")

        vLib.compKernelMatrixWEST.argtypes = [np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              ctypes.c_double,
                                              ctypes.c_int,
                                              ctypes.c_int]
        vLib.compKernelMatrixWEST.restype = np.ctypeslib.ndpointer(dtype=np.float64,
                                                                   shape=(pX.shape[0], pTrainX.shape[0]),
                                                                   flags="C_CONTIGUOUS")

        # phase 0: training; phase 1: prediction
        if id(pX) == id(pTrainX):
            vPhase = 0
        else:
            vPhase = 1

        vK = vLib.compKernelMatrixWEST(pX.flatten(), pX.shape[0], pX.shape[1],
                                       pTrainX.flatten(), pTrainX.shape[0], pTrainX.shape[1],
                                       pXAux.flatten(), pTrainXAux.flatten(), pXAux.shape[2],
                                       self.embeddings.flatten(), self.embeddings.shape[0], self.embeddings.shape[1],
                                       ctypes.c_double(self._westThreshold),
                                       vPhase, self._normalizeKernel)

        return vK

    def _computeKernelWESS(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
        '''
        Computes and returns the kernel using the word embedding similarity score (WESS) method

        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and
        pTrainX respectively, but they can have a custom number of columns (equal for both though). At the moment,
        these can only be used with a precomputed kernel.

        For every token, the auxiliary input will be concatenated to the end of its embedding vector.
        '''

        if pXAux is None:
            pXAux = np.empty(shape=(0, 0, 0))
            pTrainXAux = np.empty(shape=(0, 0, 0))

        vLib = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + "/agk.so")

        vLib.compKernelMatrixWESS.argtypes = [np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              ctypes.c_int]
        vLib.compKernelMatrixWESS.restype = np.ctypeslib.ndpointer(dtype=np.float64,
                                                                   shape=(pX.shape[0], pTrainX.shape[0]),
                                                                   flags="C_CONTIGUOUS")

        # phase 0: training; phase 1: prediction
        if id(pX) == id(pTrainX):
            vPhase = 0
        else:
            vPhase = 1

        vK = vLib.compKernelMatrixWESS(pX.flatten(), pX.shape[0], pX.shape[1],
                                       pTrainX.flatten(), pTrainX.shape[0], pTrainX.shape[1],
                                       pXAux.flatten(), pTrainXAux.flatten(), pXAux.shape[2],
                                       self.embeddings.flatten(), self.embeddings.shape[0], self.embeddings.shape[1],
                                       vPhase, self._normalizeKernel)

        return vK

    def loadVocabulary(self, pdVocabulary):
        '''
        Loads the vocabulary to be used for formatting the input data to the learning algorithm

        The vocabulary should be a dictionary of tokens and their indices.
        '''

        self.vocabulary = dict(pdVocabulary)

    def loadEmbeddings(self, pWEFilename, pIsLowerCase):
        '''
        Loads word embeddings from the given file

        Word embeddings should be loaded before formatting the data, because the vocabulary used for formatting
        should be extracted from the word embeddings (when word embeddings are used).

        pIsLowerCase specifies whether the vocabulary of the word embeddings is lowercased.
        '''

        # loading embeddings
        vWE = we.WordEmbedding()
        vWE.loadEmbeddings(pWVFilename=pWEFilename, pIsLowerCase=pIsLowerCase)

        # generating the vocabulary from the embeddings' vocabulary: 0 needs to be reserved for padding
        self.vocabulary = {k: v + 1 for k, v in vWE.wordIDs.items()}

        # adding the zero vector (for the padding index) at the beginning
        self.embeddings = np.concatenate((np.zeros((1, vWE.dimension)),
                                          vWE.embeddings))

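    # A minimal sketch of the intended call order when embeddings are used (the file
    # name "vectors.txt" is a placeholder, not part of this project):
    #
    #     ag = AnyGram(pMethod="west", pMaxTxtLen=50, pWESTThreshold=0.8)
    #     ag.loadEmbeddings("vectors.txt", pIsLowerCase=True)  # builds the vocabulary
    #     X = ag.formatData(corpus)                            # indices match the embeddings
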
    def _generateVocabulary(self, pllCorpus):
        '''
        Extracts the vocabulary from the input corpus and stores it in a dictionary of tokens and their indices

        Token frequencies are used as the order in generating the indices. 0 is reserved for padding the input when
        formatting it for the learning algorithm, so the starting index is 1.
        '''

        # 0 is reserved for padding
        vStartingIdx = 1

        # flattening the corpus
        vlTokens = [t for s in pllCorpus for t in s]

        # computing the token frequencies
        vdTokenFreqs = Counter(vlTokens)

        # sorting the tokens based on their frequencies
        vlSortedTokens = [t for t, f in sorted(vdTokenFreqs.items(), key=lambda x: x[1], reverse=True)]

        # creating and filling the token-index map
        self.vocabulary = {}

        for idx, vToken in enumerate(vlSortedTokens, start=vStartingIdx):
            self.vocabulary[vToken] = idx

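    # For example, for the toy corpus [["a", "b", "a"], ["b", "a", "c"]], the token
    # frequencies are a:3, b:2, c:1, so the generated vocabulary is
    # {"a": 1, "b": 2, "c": 3}, with index 0 left for padding.
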
    def formatData(self, pCorpus, pflgLowercase = False):
        '''
        Converts the input corpus from a list of texts into an array of token indices to be used as input to the
        learning algorithm

        It needs a vocabulary mapping tokens to indices for the conversion. If a vocabulary is already loaded, it
        will be used. Otherwise, the vocabulary will be generated from the corpus.

        Use this function before feeding the input to the learning algorithm. For example, with scikit-learn's SVC,
        given X as a list of tokenized sentences (e.g. ["This is sentence 1 .", "This is sentence 2 ."]) and Y the
        labels:

            any_gram = AnyGram("sm", 50)   # method: string match; maximum text length: 50 tokens
            X = any_gram.formatData(X)
            clf = svm.SVC(kernel = any_gram)
            clf.fit(X, Y)

        If word embedding vectors are used, they should be loaded before formatting the data, so that the vocabulary
        is created based on the word indices of the word embeddings. Otherwise, a vocabulary will be created by this
        method, and its indices will not match those of embeddings loaded afterwards.

        If a token is not in the vocabulary, it will be appended to its end (assigned the next available index).
        This can only happen when a pre-built vocabulary (e.g. one extracted from word embeddings) is used.

        Since the output is a 2D array, the sentences are truncated or padded to maxTxtLen. 0 is used as the padding
        index.
        '''

        # splitting the input sentences into tokens (NOTE: the input is supposed to be already tokenized)
        if pflgLowercase:
            vllCorpus = [[t.lower() for t in s.split()] for s in pCorpus]
        else:
            vllCorpus = [[t for t in s.split()] for s in pCorpus]

        if self.vocabulary is None:
            self._generateVocabulary(vllCorpus)

        vaOutput = np.zeros((len(pCorpus), self.maxTxtLen), dtype=np.int32)

        for i in range(vaOutput.shape[0]):
            for j in range(min(len(vllCorpus[i]), self.maxTxtLen)):
                # unseen tokens are appended to the end of the vocabulary
                if vllCorpus[i][j] not in self.vocabulary:
                    self.vocabulary[vllCorpus[i][j]] = self.vocabSize + 1
                vaOutput[i][j] = self.vocabulary[vllCorpus[i][j]]

        return vaOutput
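

# A minimal end-to-end sketch, not part of the original module: it assumes
# scikit-learn is installed, that the compiled agk.so sits next to this file, and
# that the corpus and labels below are toy placeholders.
if __name__ == "__main__":
    from sklearn import svm

    vlCorpus = ["This is sentence 1 .", "This is sentence 2 ."]
    vlLabels = [0, 1]

    # string-match any-gram kernel over texts padded/truncated to 10 tokens
    vAnyGram = AnyGram(pMethod="sm", pMaxTxtLen=10)

    # cast to float64 to match the float64 pointers expected by the C interface
    vaX = np.float64(vAnyGram.formatData(vlCorpus))

    # the AnyGram object itself is passed to scikit-learn as a callable kernel
    vClf = svm.SVC(kernel=vAnyGram)
    vClf.fit(vaX, vlLabels)
    print(vClf.predict(vaX))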