#! /usr/bin/python
# -*- coding: utf-8 -*-

"""
Word embedding vectors
Author: Rasoul Kaljahi
See LICENSE file.
"""

import sys
import zipfile
import collections

import numpy as np


class WordEmbedding:
    '''
    Class for managing word embeddings
    '''

    def __init__(self, pVerbosity=1):
        '''
        Constructor
        '''
        # verbosity of processes
        self.verbosity = pVerbosity

        self.embeddings = None   # numpy array containing final word embeddings
        self.normalized = None   # whether the embeddings are normalized
        self.wordIDs = None      # vocab words and their IDs (indexes)
        self.IDWords = None      # vocab IDs and their words

        # training data attributes
        self.corpus = None       # original corpus as list of sentences
        self.wordCounts = None   # vocab words and their counts
        self.data = None         # the data: list of word IDs (not word forms) in their
                                 # original order in the corpus

        self._lowercase = False

        # unknown token embedding vector
        self._unknown = None

    @property
    def dimension(self):
        '''
        Returns the dimension of the loaded vectors
        '''
        if self.embeddings is None:
            return 0
        else:
            return len(self.embeddings[0])

    @property
    def vocabSize(self):
        '''
        Returns the vocabulary size of the embedding vectors
        '''
        if self.wordIDs is None:
            return 0
        else:
            return len(self.wordIDs)

    def loadData(self, pCorpusFilename, pVocabSize, pflgKeepOrgCorpus=False, pflgKeepVocabCounts=False):
        '''
        Loads data from an input corpus

        The original corpus can optionally be stored for later use. The default is to delete it after loading
        the required data.
        '''
        # reading data
        if zipfile.is_zipfile(pCorpusFilename):
            with zipfile.ZipFile(pCorpusFilename) as f:
                vlCorpusLines = f.read(f.namelist()[0]).decode("utf-8").strip().split('\n')
        else:
            vlCorpusLines = open(pCorpusFilename).read().strip().split('\n')

        vlWords = [w for l in vlCorpusLines for w in l.split()]

        # loading data
        if pflgKeepVocabCounts:
            self.wordCounts = {"UNK": 0}   # initializing vocab words/counts with UNK; to be counted later
        self.wordIDs = {"UNK": 0}          # initializing vocab words/IDs with UNK
        self.IDWords = {0: "UNK"}          # initializing vocab IDs/words with UNK
        self.data = []

        # assigning word IDs based on word frequency order (only the most common words specified by vocab size)
        for i, (word, count) in enumerate(collections.Counter(vlWords).most_common(pVocabSize - 1), start=1):
            if pflgKeepVocabCounts:
                self.wordCounts[word] = count
            self.wordIDs[word] = i

        for word in vlWords:
            if word in self.wordIDs:
                vID = self.wordIDs[word]
                self.IDWords[vID] = word
            else:
                vID = 0
                if pflgKeepVocabCounts:
                    self.wordCounts["UNK"] += 1
            self.data.append(vID)

        if self.verbosity > 0:
            print("Data size:")
            print("\t%d (%dM) lines" % (len(vlCorpusLines), len(vlCorpusLines) // 1000000))
            print("\t%d (%dM) words\n" % (len(vlWords), len(vlWords) // 1000000))

        if self.verbosity > 1:
            if pflgKeepVocabCounts:
                print('Most common words:\n\t%s\n' % '\n\t'.join(
                    ["%s: %d" % (w, c) for w, c in
                     sorted(self.wordCounts.items(), key=lambda x: x[1], reverse=True)[:10]]))
            print('Sample word indexes:\n\t%s\n' % '\n\t'.join(
                ["%s: %s" % (w, self.wordIDs[w]) for w in self.wordIDs][:10]))
            print('Sample index words:\n\t%s\n' % '\n\t'.join(
                ["%s: %s" % (vID, self.IDWords[vID]) for vID in self.IDWords][:10]))
            print('Sample data:\n\t%s\n\n\t%s\n' % (
                self.data[:53], ' '.join([self.IDWords[vID] for vID in self.data[:53]])))

        # keeping the original corpus only if asked; the local copies are deleted either way
        if pflgKeepOrgCorpus:
            self.corpus = vlCorpusLines
        del vlCorpusLines
        del vlWords
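
    # Illustrative sketch only (not part of the original module): the expected data layout
    # after loadData(), assuming a hypothetical plain-text corpus file "corpus.txt" with
    # one sentence per line:
    #
    #   we = WordEmbedding()
    #   we.loadData("corpus.txt", pVocabSize=10000, pflgKeepVocabCounts=True)
    #   we.data[:5]           # e.g. [12, 4, 0, 873, 2]: word IDs in corpus order, 0 = UNK
    #   we.getWordFreq("UNK") # count of out-of-vocabulary tokens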

    def getWordFreq(self, pWord):
        '''
        Returns the frequency of a given word in the vocabulary
        '''
        if self.wordCounts is None:
            print("Word counts are not stored or no data is loaded. "
                  "Data should be loaded with the pflgKeepVocabCounts parameter set to True.")
        elif pWord in self.wordCounts:
            return self.wordCounts[pWord]
        else:
            print('Word "%s" not found' % pWord)

    def loadEmbeddings(self, pWVFilename, pIsLowerCase=False,
                       plFilterVocab=None, pflgCaseSensitiveFilter=True,
                       pflgNormalized=False, pUnknownToken=None):
        '''
        Loads pre-trained word embeddings from a file in the general format

        In the general format, each word is represented in a line which contains the word and the vector, all
        separated by spaces/tabs.

        pIsLowerCase determines the case of the embeddings' vocabulary.

        Optionally, a vocabulary list can be provided to filter the vectors being loaded into memory. This can
        reduce memory usage in scenarios where the list of words in use is known in advance.

        If the word vectors are lowercase, the filter word lookup will be case-insensitive. Otherwise, another
        argument is used to determine the case-sensitivity of the lookup, the default value of which is True,
        i.e. case-sensitive.

        The loaded vectors may or may not already be normalized. For example, Google News pre-trained vectors are
        not normalized. When the vectors are normalized, cosine similarity can be computed simply as a dot product,
        without the need for normalization by the product of the vector norms. Normalizing already-normalized
        vectors has no effect. This information is passed through pflgNormalized and is needed for computing
        similarities.
        '''
        self._lowercase = pIsLowerCase

        # filter vocabulary case
        if plFilterVocab is not None and (self._lowercase or not pflgCaseSensitiveFilter):
            vlFilterVocab = [w.lower() for w in plFilterVocab]
        else:
            vlFilterVocab = plFilterVocab

        self.embeddings = []
        self.wordIDs = {}
        self.IDWords = {}

        vWordID = 0
        for i, vLine in enumerate(open(pWVFilename), start=1):
            vlLSplit = vLine.split()

            # vector word case
            if not pflgCaseSensitiveFilter and not self._lowercase:
                vWord = vlLSplit[0].lower()
            else:
                vWord = vlLSplit[0]

            if vlFilterVocab is None or vWord in vlFilterVocab:
                self.embeddings.append(np.array([float(n) for n in vlLSplit[1:]]))
                self.wordIDs[vlLSplit[0]] = vWordID
                self.IDWords[vWordID] = vlLSplit[0]
                vWordID += 1

            if pUnknownToken is not None and vWord == pUnknownToken:
                self._unknown = np.array([float(n) for n in vlLSplit[1:]])

            if i % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

        self.embeddings = np.array(self.embeddings)
        self.normalized = pflgNormalized
        sys.stdout.write('\n')
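
    # Illustrative only: a line in the "general format" expected by loadEmbeddings() (and in
    # GloVe text files) consists of the word followed by its vector components, e.g.
    #
    #   the 0.418 0.24968 -0.41242 0.1217 ...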

    def loadGloVeVectors(self, pWVFilename, pIsLowerCase=False,
                         plFilterVocab=None, pflgCaseSensitiveFilter=True,
                         pflgNormalized=False, pUnknownToken=None):
        '''
        Loads word vectors from a GloVe word embedding file

        Both .zip and text files are accepted.

        See loadEmbeddings() for the parameters.
        '''
        if pWVFilename[-3:].lower() == "zip":
            self._loadGloVeVectorsZip(pWVFilename, pIsLowerCase, plFilterVocab, pflgCaseSensitiveFilter, pflgNormalized, pUnknownToken)
        else:
            self.loadEmbeddings(pWVFilename, pIsLowerCase, plFilterVocab, pflgCaseSensitiveFilter, pflgNormalized, pUnknownToken)

    def _loadGloVeVectorsZip(self, pWVZipFilename, pIsLowerCase=False,
                             plFilterVocab=None, pflgCaseSensitiveFilter=True,
                             pflgNormalized=False, pUnknownToken=None):
        '''
        Loads word vectors from a GloVe zip file
        '''
        self._lowercase = pIsLowerCase

        vZipFile = zipfile.ZipFile(pWVZipFilename)

        # filter vocabulary case
        if plFilterVocab is not None and (self._lowercase or not pflgCaseSensitiveFilter):
            vlFilterVocab = [w.lower() for w in plFilterVocab]
        else:
            vlFilterVocab = plFilterVocab

        self.embeddings = []
        self.wordIDs = {}
        self.IDWords = {}

        vWordID = 0
        for i, vLine in enumerate(vZipFile.open(vZipFile.namelist()[0]), start=1):
            vlLSplit = vLine.decode("utf-8").split()

            # vector word case
            if not pflgCaseSensitiveFilter and not self._lowercase:
                vWord = vlLSplit[0].lower()
            else:
                vWord = vlLSplit[0]

            if vlFilterVocab is None or vWord in vlFilterVocab:
                self.embeddings.append(np.array([float(n) for n in vlLSplit[1:]]))
                self.wordIDs[vlLSplit[0]] = vWordID
                self.IDWords[vWordID] = vlLSplit[0]
                vWordID += 1

            if pUnknownToken is not None and vWord == pUnknownToken:
                self._unknown = np.array([float(n) for n in vlLSplit[1:]])

            if i % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

        self.embeddings = np.array(self.embeddings)
        self.normalized = pflgNormalized
        sys.stdout.write('\n')

    def loadW2VBinVectors(self, pWVBinFilename, pIsLowerCase=False,
                          plFilterVocab=None, pflgCaseSensitiveFilter=True,
                          pflgNormalized=False, pUnknownToken=None):
        '''
        Loads word vectors from a word2vec file in binary format

        The method uses gensim to load the vectors. The parameters are the same as in loadEmbeddings().

        pUnknownToken specifies the token in the input embedding vectors which represents unknown tokens
        (e.g. UNK), if there is one.
        '''
        import gensim

        self._lowercase = pIsLowerCase

        # filter vocabulary case
        if plFilterVocab is not None and (self._lowercase or not pflgCaseSensitiveFilter):
            vlFilterVocab = [w.lower() for w in plFilterVocab]
        else:
            vlFilterVocab = plFilterVocab

        # loading word vectors using gensim
        # NOTE: this call targets older gensim releases; recent versions expose the equivalent
        # gensim.models.KeyedVectors.load_word2vec_format(pWVBinFilename, binary=True)
        vGensimModel = gensim.models.Word2Vec.load_word2vec_format(pWVBinFilename, binary=True)

        self.embeddings = []
        self.wordIDs = {}
        self.IDWords = {}

        vWordID = 0
        if vlFilterVocab is not None:
            for vWord in vlFilterVocab:
                try:
                    self.embeddings.append(vGensimModel[vWord])
                    self.wordIDs[vWord] = vWordID
                    self.IDWords[vWordID] = vWord
                    vWordID += 1
                except KeyError:
                    continue

                sys.stdout.write('.')
                sys.stdout.flush()
        else:
            sys.stdout.write('Reading the entire vocabulary...')
            sys.stdout.flush()

            for vWord in vGensimModel.vocab:
                self.embeddings.append(vGensimModel[vWord])
                self.wordIDs[vWord] = vWordID
                self.IDWords[vWordID] = vWord
                vWordID += 1

            sys.stdout.write(' done.')
            sys.stdout.flush()

        self.embeddings = np.array(self.embeddings)

        if pUnknownToken is not None:
            self._unknown = vGensimModel[pUnknownToken]

        self.normalized = pflgNormalized
        sys.stdout.write('\n')

    def loadW2VTxtVectors(self, pWVTxtFilename, pIsLowerCase=False,
                          plFilterVocab=None, pflgCaseSensitiveFilter=True,
                          pflgNormalized=False, pUnknownToken=None):
        '''
        Loads word vectors from a word2vec file in text format
        '''
        self._lowercase = pIsLowerCase

        # filter vocabulary case
        if plFilterVocab is not None and (self._lowercase or not pflgCaseSensitiveFilter):
            vlFilterVocab = [w.lower() for w in plFilterVocab]
        else:
            vlFilterVocab = plFilterVocab

        vfWV = open(pWVTxtFilename)

        # skipping the header
        vfWV.readline()

        # loading word vectors
        self.embeddings = []
        self.wordIDs = {}
        self.IDWords = {}

        vWordID = 0
        for i, vLine in enumerate(vfWV, start=1):
            vlLSplit = vLine.split()

            # vector word case
            if not pflgCaseSensitiveFilter and not self._lowercase:
                vWord = vlLSplit[0].lower()
            else:
                vWord = vlLSplit[0]

            if vlFilterVocab is None or vWord in vlFilterVocab:
                self.embeddings.append(np.array([float(n) for n in vlLSplit[1:]]))
                self.wordIDs[vlLSplit[0]] = vWordID
                self.IDWords[vWordID] = vlLSplit[0]
                vWordID += 1

            if pUnknownToken is not None and vWord == pUnknownToken:
                self._unknown = np.array([float(n) for n in vlLSplit[1:]])

            if i % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

        self.embeddings = np.array(self.embeddings)
        self.normalized = pflgNormalized
        sys.stdout.write('\n')

    def normalizeEmbeddings(self):
        '''
        Normalizes embeddings
        '''
        self.embeddings = self.embeddings / np.sqrt(np.sum(np.square(self.embeddings), axis=1, keepdims=True))
        self.normalized = True
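
    # Note (illustrative): after normalizeEmbeddings() every row of self.embeddings has unit
    # L2 norm, so the cosine similarity of two in-vocabulary words reduces to a plain dot
    # product, e.g. np.dot(self.getVector("w1"), self.getVector("w2")) for hypothetical
    # vocabulary words "w1" and "w2".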

    def getVector(self, pWord, pUnknown="empty"):
        '''
        Returns the embedding vector of the given word if it exists

        pUnknown specifies what should be returned in case the given word is not found. It can be one of the
        following:
        - empty: an empty list is returned
        - zero: an array of zeros is returned
        - unknown: the vector for the unknown word is returned (see self._unknown)
        '''
        try:
            if self._lowercase:
                return self.embeddings[self.wordIDs[pWord.lower()]]
            else:
                return self.embeddings[self.wordIDs[pWord]]
        except KeyError:
            if pUnknown.lower() == "empty":
                return []
            elif pUnknown.lower() == "zero":
                return np.zeros(self.dimension)
            elif pUnknown.lower() == "unknown":
                return self.unknown

    def getWordEmbedding(self, pWord):
        '''
        Returns the embedding vector of the given word if it exists and an empty list otherwise
        '''
        return self.getVector(pWord)

    @property
    def unknown(self):
        '''
        Returns the embedding vector of unknown words
        '''
        return self._unknown

    def getAvgVector(self, plWords):
        '''
        Returns the average of the vectors of the given words

        Unknown words will be ignored and not included in the averaging. If all words are unknown, a vector of
        zeros will be returned.
        '''
        vlSum = np.zeros(self.dimension)
        vKnownWordCount = 0

        for vWord in plWords:
            try:
                if self._lowercase:
                    vlSum = np.add(vlSum, self.embeddings[self.wordIDs[vWord.lower()]])
                else:
                    vlSum = np.add(vlSum, self.embeddings[self.wordIDs[vWord]])
                vKnownWordCount += 1
            except KeyError:
                continue

        if vKnownWordCount == 0:
            return vlSum

        return vlSum / vKnownWordCount
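
    # Illustrative: a common use of getAvgVector() is a crude sentence representation, e.g.
    #   we.getAvgVector("the cat sat on the mat".split())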

    def calcSimMatrix(self, plWords=None, pSimMeasure="cosine", pflgReturn=False):
        '''
        Calculates the similarities between every pair of words from the given list of words or the entire
        vocabulary using the specified similarity measure

        It prints the results but can also optionally return them in a matrix, which is implemented using a
        dictionary:
        {"word1": {"word2": 0.02, "word3": 0.91},
         "word2": {"word1": 0.02, "word3": 0.59},
         "word3": {"word1": 0.91, "word2": 0.59}}

        For access efficiency, duplicates are allowed, but there are no entries for the similarity of a word with
        itself.

        If no word list is given, the entire vocabulary of the loaded vectors will be used.
        '''
        ########## change to matrix operations for efficiency
        if plWords is None:
            vlWords = self.wordIDs
        else:
            vlWords = list(set(plWords))

        vdSimMatrix = {}

        for vWord1 in vlWords:
            vdSimMatrix[vWord1] = {}
            for vWord2 in vlWords:
                if vWord1 != vWord2:
                    if pSimMeasure.lower().startswith("cos"):
                        vSim = self.calcCosSim(vWord1, vWord2)
                    elif pSimMeasure.lower().startswith("euc"):
                        vSim = self.calcEuclideanSim(vWord1, vWord2)
                    else:
                        raise Exception("Unknown similarity measure: %s" % pSimMeasure)

                    print("%s\t%s\t%s" % (vWord1, vWord2, vSim))

                    if pflgReturn:
                        vdSimMatrix[vWord1][vWord2] = vSim

        if pflgReturn:
            return vdSimMatrix

    def extractSimilarWords(self, pWord, pSimWordNum):
        '''
        Extracts and returns words similar to a given word

        It returns pSimWordNum similar words, and only if the given word exists in the vocabulary.
        '''
        vaWV = self.getVector(pWord)

        if len(vaWV) == 0:
            print("Word %s not found" % pWord)
        else:
            if not self.normalized:
                print("Embedding vectors are not normalized. Use normalizeEmbeddings() to normalize them first.")
                return

            vaSimilarities = np.dot(vaWV, np.transpose(self.embeddings))
            vlNearestIDs = (-vaSimilarities[:]).argsort()[1: pSimWordNum + 1]
            return [(self.IDWords[vID], vaSimilarities[vID]) for vID in vlNearestIDs]

    def calcCosSim(self, pWord1, pWord2):
        '''
        Calculates and returns the cosine similarity of the given words
        '''
        vlWV1 = self.getVector(pWord1)
        vlWV2 = self.getVector(pWord2)

        if len(vlWV1) == 0:
            raise Exception("Word %s not found" % pWord1)
        if len(vlWV2) == 0:
            raise Exception("Word %s not found" % pWord2)

        if self.normalized:
            return np.dot(vlWV1, vlWV2)
        else:
            return np.dot(vlWV1, vlWV2) / (np.linalg.norm(vlWV1) * np.linalg.norm(vlWV2))

    def calcEuclideanSim(self, pWord1, pWord2):
        '''
        Calculates and returns the Euclidean similarity of the given words

        Euclidean similarity is a function of Euclidean distance, calculated as:

            ES = 1 / (1 + ED)
        '''
        return 1.0 / (1 + self.calcEuclideanDist(pWord1, pWord2))

    def calcEuclideanDist(self, pWord1, pWord2):
        '''
        Calculates and returns the Euclidean distance of the given words
        '''
        vlWV1 = self.getVector(pWord1)
        vlWV2 = self.getVector(pWord2)

        if len(vlWV1) == 0:
            raise Exception("Word %s not found" % pWord1)
        if len(vlWV2) == 0:
            raise Exception("Word %s not found" % pWord2)

        return np.linalg.norm(vlWV1 - vlWV2)

    def scaleEmbeddings(self, pRange):
        '''
        Scales embedding values (vector elements) into the given range
        '''
        vMin = np.min(self.embeddings)
        vMax = np.max(self.embeddings)

        self.embeddings = pRange[0] + (self.embeddings - vMin) * (pRange[1] - pRange[0]) * 1.0 / (vMax - vMin)
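

# Minimal usage sketch (illustrative, not part of the original module). The file path below
# is a hypothetical placeholder; any GloVe-style text file with one word and its vector per
# line works with loadGloVeVectors()/loadEmbeddings().
if __name__ == "__main__":
    we = WordEmbedding(pVerbosity=1)

    # load pre-trained vectors, optionally restricted to a known vocabulary
    we.loadGloVeVectors("glove.6B.100d.txt",      # hypothetical path
                        pIsLowerCase=True,
                        plFilterVocab=["king", "queen", "man", "woman"],
                        pflgNormalized=False)

    # cosine similarity works on raw vectors; nearest-neighbour extraction
    # requires normalized vectors
    print(we.calcCosSim("king", "queen"))

    we.normalizeEmbeddings()
    print(we.extractSimilarWords("king", 3))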