#! /usr/bin/python
# -*- coding: utf-8 -*-

"""
Any-gram kernels

Author: Rasoul Kaljahi

See LICENSE file.
"""

from collections import Counter
import numpy as np
import ctypes, os

import we


class AnyGram:
    '''
    Class to prepare and compute the any-gram kernel

    ToDo: extend to multiple sentences per instance
    '''

    def __init__(self, pMethod, pMaxTxtLen, pWESTThreshold = 0, pflgNormalizeKernel = True):
        '''
        Constructor

        pMaxTxtLen is the maximum text length of the instances in terms of tokens. Longer instances will be cut off
        and shorter ones will be padded. The same length should also be used at prediction time if a new AnyGram
        object is created. Otherwise scikit-learn will complain.
        '''
        self.method = pMethod                        # method to be used in comparing tokens:
                                                     #   - sm: string match
                                                     #   - wess: word embedding similarity score
                                                     #   - west: word embedding similarity threshold
        self._westThreshold = pWESTThreshold         # threshold for the WEST method
        self.maxTxtLen = pMaxTxtLen                  # maximum text length of the instances in terms of tokens: longer
                                                     # instances will be cut off and shorter ones will be padded
        self.embeddings = None
        self.vocabulary = None                       # dictionary of tokens and their indices
        self._iVocabulary = None                     # dictionary of token indices and their forms (inverse vocabulary)
        self.l = 0.4                                 # lambda
        self._normalizeKernel = pflgNormalizeKernel  # whether or not to normalize the kernel values
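
    # A minimal construction sketch (the argument values are hypothetical): a
    # string-match any-gram kernel over instances padded/cut to 100 tokens.
    #
    #   ag = AnyGram(pMethod='sm', pMaxTxtLen=100)
    #
    # For the embedding-based methods ('west'/'wess'), loadEmbeddings() should
    # be called before formatting data.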

    def setWESTThreshold(self, pThreshold):
        '''
        Sets the word embedding similarity threshold for the WEST method

        The value must be between 0 and 1.
        '''
        self._westThreshold = pThreshold

    @property
    def vocabSize(self):
        '''
        Returns the size of the vocabulary
        '''
        return len(self.vocabulary)

    @property
    def iVocabulary(self):
        '''
        Returns the inverse vocabulary (dictionary of token indices and their forms)

        The inverse vocabulary is created on demand.
        '''
        if self._iVocabulary is None:
            self._iVocabulary = {v: k for k, v in self.vocabulary.iteritems()}

            # adding an entry for the padding index
            self._iVocabulary[0] = '_PAD_'

        return self._iVocabulary

    def __call__(self, pX, pTrainX):
        '''
        Call method

        Makes the object callable, so it can be passed to scikit-learn as a custom kernel.
        '''
        return self.computeKernel(pX, pTrainX)

    def computeKernel(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
        '''
        Computes and returns the kernel values

        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and
        pTrainX respectively, but they can have a custom number of columns (equal for both though). At the moment,
        these can only be used with a precomputed kernel.
        '''
        if self.method.lower() == "sm":
            return self._computeKernelSM(pX, pTrainX, pXAux, pTrainXAux)
        elif self.method.lower() == "west":
            return self._computeKernelWEST(pX, pTrainX, pXAux, pTrainXAux)
        elif self.method.lower() == "wess":
            return self._computeKernelWESS(pX, pTrainX, pXAux, pTrainXAux)
        else:
            raise ValueError("Unknown method: %s" % self.method)

    def _computeKernelSM(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
        '''
        Computes and returns the kernel using the string match method

        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and
        pTrainX respectively, but they can have a custom number of columns (equal for both though). At the moment,
        these can only be used with a precomputed kernel.

        For every pair of tokens, every pair of peer elements in the auxiliary input will be compared against each
        other.
        '''
        if pXAux is None:
            pXAux = np.empty(shape=(0, 0, 0))
            pTrainXAux = np.empty(shape=(0, 0, 0))

        vLib = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + "/agk.so")

        vLib.compKernelMatrixSM.argtypes = [np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                            ctypes.c_int,
                                            ctypes.c_int,
                                            np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                            ctypes.c_int,
                                            ctypes.c_int,
                                            np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                            np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                            ctypes.c_int,
                                            ctypes.c_int,
                                            ctypes.c_int]
        vLib.compKernelMatrixSM.restype = np.ctypeslib.ndpointer(dtype=np.float64,
                                                                 shape=(pX.shape[0], pTrainX.shape[0]),
                                                                 flags="C_CONTIGUOUS")

        # phase 0: pX and pTrainX are the same matrix (training); phase 1: they differ (prediction)
        if id(pX) == id(pTrainX):
            vPhase = 0
        else:
            vPhase = 1

        vK = vLib.compKernelMatrixSM(pX.flatten(), pX.shape[0], pX.shape[1],
                                     pTrainX.flatten(), pTrainX.shape[0], pTrainX.shape[1],
                                     pXAux.flatten(), pTrainXAux.flatten(), pXAux.shape[2],
                                     vPhase, self._normalizeKernel)

        return vK
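
    # Shape note for the auxiliary inputs (an inference from the flattening and
    # the shape[2] argument above, not documented by the author): pXAux is a 3D
    # float64 array of shape (pX.shape[0], pX.shape[1], d), one length-d feature
    # vector per token, and pTrainXAux likewise for pTrainX, e.g.:
    #
    #   vXAux = np.zeros((vX.shape[0], vX.shape[1], 3), dtype=np.float64)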

    def _computeKernelWEST(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
        '''
        Computes and returns the kernel using the word embedding similarity threshold (WEST) method

        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and
        pTrainX respectively, but they can have a custom number of columns (equal for both though). At the moment,
        these can only be used with a precomputed kernel.

        For every token, the auxiliary input will be concatenated to the end of its embedding vector.
        '''
        if pXAux is None:
            pXAux = np.empty(shape=(0, 0, 0))
            pTrainXAux = np.empty(shape=(0, 0, 0))

        vLib = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + "/agk.so")

        vLib.compKernelMatrixWEST.argtypes = [np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              ctypes.c_double,
                                              ctypes.c_int,
                                              ctypes.c_int]
        vLib.compKernelMatrixWEST.restype = np.ctypeslib.ndpointer(dtype=np.float64,
                                                                   shape=(pX.shape[0], pTrainX.shape[0]),
                                                                   flags="C_CONTIGUOUS")

        # phase 0: pX and pTrainX are the same matrix (training); phase 1: they differ (prediction)
        if id(pX) == id(pTrainX):
            vPhase = 0
        else:
            vPhase = 1

        vK = vLib.compKernelMatrixWEST(pX.flatten(), pX.shape[0], pX.shape[1],
                                       pTrainX.flatten(), pTrainX.shape[0], pTrainX.shape[1],
                                       pXAux.flatten(), pTrainXAux.flatten(), pXAux.shape[2],
                                       self.embeddings.flatten(), self.embeddings.shape[0], self.embeddings.shape[1],
                                       ctypes.c_double(self._westThreshold),
                                       vPhase, self._normalizeKernel)

        return vK

    def _computeKernelWESS(self, pX, pTrainX, pXAux = None, pTrainXAux = None):
        '''
        Computes and returns the kernel using the word embedding similarity score (WESS) method

        pXAux and pTrainXAux are the auxiliary matrix inputs. They should have the same number of rows as pX and
        pTrainX respectively, but they can have a custom number of columns (equal for both though). At the moment,
        these can only be used with a precomputed kernel.

        For every token, the auxiliary input will be concatenated to the end of its embedding vector.
        '''
        if pXAux is None:
            pXAux = np.empty(shape=(0, 0, 0))
            pTrainXAux = np.empty(shape=(0, 0, 0))

        vLib = ctypes.cdll.LoadLibrary(os.path.dirname(os.path.realpath(__file__)) + "/agk.so")

        vLib.compKernelMatrixWESS.argtypes = [np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              np.ctypeslib.ndpointer(dtype=np.float64, ndim=1, flags="C_CONTIGUOUS"),
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              ctypes.c_int,
                                              ctypes.c_int]
        vLib.compKernelMatrixWESS.restype = np.ctypeslib.ndpointer(dtype=np.float64,
                                                                   shape=(pX.shape[0], pTrainX.shape[0]),
                                                                   flags="C_CONTIGUOUS")

        # phase 0: pX and pTrainX are the same matrix (training); phase 1: they differ (prediction)
        if id(pX) == id(pTrainX):
            vPhase = 0
        else:
            vPhase = 1

        vK = vLib.compKernelMatrixWESS(pX.flatten(), pX.shape[0], pX.shape[1],
                                       pTrainX.flatten(), pTrainX.shape[0], pTrainX.shape[1],
                                       pXAux.flatten(), pTrainXAux.flatten(), pXAux.shape[2],
                                       self.embeddings.flatten(), self.embeddings.shape[0], self.embeddings.shape[1],
                                       vPhase, self._normalizeKernel)

        return vK

    def loadVocabulary(self, pdVocabulary):
        '''
        Loads the vocabulary to be used for formatting the input data to the learning algorithm

        The vocabulary should be a dictionary of tokens and their indices.
        '''
        self.vocabulary = dict(pdVocabulary)

    def loadEmbeddings(self, pWEFilename, pIsLowerCase):
        '''
        Loads word embeddings from the given file

        Word embeddings should be loaded before formatting data, because the vocabulary used for formatting data
        should be extracted from the word embeddings (when the word embeddings are used).

        pIsLowerCase specifies whether the vocabulary of the word embeddings is lowercased.
        '''
        # loading embeddings
        vWE = we.WordEmbedding()
        vWE.loadEmbeddings(pWVFilename=pWEFilename, pIsLowerCase=pIsLowerCase)

        # generating the vocabulary from the embeddings' vocabulary: 0 needs to be reserved for padding
        self.vocabulary = {k: v + 1 for k, v in vWE.wordIDs.iteritems()}

        # adding the zero vector (for the padding index) at the beginning
        self.embeddings = np.concatenate((np.zeros((1, vWE.dimension)),
                                          vWE.embeddings))
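
    # A minimal call-order sketch for the embedding-based methods; the file
    # name is hypothetical and its format is whatever we.WordEmbedding expects:
    #
    #   ag = AnyGram(pMethod='west', pMaxTxtLen=100, pWESTThreshold=0.8)
    #   ag.loadEmbeddings('vectors.txt', pIsLowerCase=True)  # before formatData()
    #   aX = ag.formatData(lSentences)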

    def _generateVocabulary(self, pllCorpus):
        '''
        Extracts the vocabulary from the input corpus and stores it in a dictionary of tokens and their indices

        Token frequencies are used as the order in generating the indices. 0 is reserved for padding the input when
        formatting it for the learning algorithm, so the starting index is 1.
        '''
        # 0 is reserved for padding
        vStartingIdx = 1

        # flattening the corpus
        vlTokens = [t for s in pllCorpus for t in s]

        # computing frequencies
        vdTokenFreqs = Counter(vlTokens)

        # sorting based on token frequency
        vlSortedTokens = [t for t, f in sorted(vdTokenFreqs.iteritems(), key=lambda x: x[1], reverse=True)]

        # creating and filling the token-index map
        self.vocabulary = {}
        for idx, vToken in enumerate(vlSortedTokens, start=vStartingIdx):
            self.vocabulary[vToken] = idx
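
    # Worked example of the index assignment: for the corpus
    # [['a', 'b', 'a'], ['a', 'c']], the frequencies are a:3, b:1, c:1, so 'a'
    # gets index 1 and 'b'/'c' get 2 and 3 (ties broken arbitrarily), while 0
    # stays reserved for padding.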

    def formatData(self, pCorpus, pflgLowercase = False):
        '''
        Converts the format of the input corpus from a list of texts into arrays of token indices to be used as
        input to the learning algorithm

        It needs a vocabulary mapping tokens to indices for the conversion. If a vocabulary is already loaded, it
        will be used. Otherwise, the vocabulary will be generated from the corpus.

        Use this function before feeding the input to the learning algorithm. For example, with scikit-learn's SVC,
        given X as a list of tokenized sentences (e.g. ["This is sentence 1 .", "This is sentence 2 ."]), and Y the
        labels:

            any_gram = AnyGram('sm', 100)  # e.g. string match method with a maximum length of 100 tokens
            X = any_gram.formatData(X)
            clf = svm.SVC(kernel = any_gram)
            clf.fit(X, Y)

        If word embedding vectors are used, they should be loaded before formatting the data, so that the vocabulary
        is created based on the word indices of the word embeddings. Otherwise, a vocabulary will be created by this
        method, and the indices of embeddings loaded afterwards will not match it.

        If a token is not in the vocabulary, it will be added to its end. This only happens when word embeddings are
        used.

        Since the output is a 2D array, the sentences are padded or cut off to self.maxTxtLen tokens. 0 is used as
        the padding index.
        '''
        # splitting the input sentences into tokens (NOTE: the input is supposed to be already tokenized)
        if pflgLowercase:
            vllCorpus = [[t.lower() for t in s.split()] for s in pCorpus]
        else:
            vllCorpus = [[t for t in s.split()] for s in pCorpus]

        if self.vocabulary is None:
            self._generateVocabulary(vllCorpus)

        vaOutput = np.zeros((len(pCorpus), self.maxTxtLen), dtype=np.int32)

        for i in range(vaOutput.shape[0]):
            for j in range(min(len(vllCorpus[i]), self.maxTxtLen)):
                if vllCorpus[i][j] not in self.vocabulary:
                    # out-of-vocabulary token: append it to the end of the vocabulary
                    self.vocabulary[vllCorpus[i][j]] = self.vocabSize + 1
                vaOutput[i][j] = self.vocabulary[vllCorpus[i][j]]

        return vaOutput
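

# A minimal end-to-end sketch (not part of the library): it assumes agk.so has
# been compiled next to this module and that scikit-learn is installed; the
# data and labels are toy values. scikit-learn converts X to float64 and calls
# vAnyGram(X, X_train) to obtain the kernel matrix.
if __name__ == '__main__':
    from sklearn import svm

    vlSentences = ["this is sentence 1 .", "this is sentence 2 ."]
    vlLabels = [0, 1]

    # string-match any-gram kernel over instances padded/cut to 10 tokens
    vAnyGram = AnyGram(pMethod='sm', pMaxTxtLen=10)
    vaX = vAnyGram.formatData(vlSentences)

    vClf = svm.SVC(kernel=vAnyGram)
    vClf.fit(vaX, vlLabels)
    print(vClf.predict(vaX))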