posamender.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692
  1. #! /usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. """
  4. This module contains POS tag corrector code.
  5. Version 0.1 (07-May-2015 to 18-May-2015)
  6. - POSAmender, POSAmenderFE, POSAmenderTrainer are added.
  7. """
  8. import pos
  9. from ml import fvg, eval
  10. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  11. # POSAmender
  12. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  class POSAmender:
      '''
      POS tag corrector

      Correction is done on a set of POS tagged sentences. This class is the
      entry point that wires together loading (original taggings, gold
      taggings, suffixes), feature extraction (POSAmenderFE), training
      (POSAmenderTrainer), prediction (POSAmenderPreder) and evaluation
      (POSAmenderEvaler).
      '''

      def __init__(self):
          '''
          Constructor
          '''
          ## feature config dump after extracting for training set (to be used
          ## later in prediction)
          self.dumpFConfig = ''
          ## original POS taggings loaded by the most recent training or
          ## prediction call (empty until one of them has run); backs `size`
          self.orgPOSTaggings = []

      @property
      def size(self):
          '''
          Return the size of the POS tagged data loaded, i.e. the number of
          original POS tagged sentences read by the last training or
          prediction call (0 if nothing has been loaded yet)
          '''
          return len(self.orgPOSTaggings)

      def trainSKLMaxEnt(self, pOrgPOSTaggingFilename, pOPTFormat, pOPTSeparator,
                         pGoldPOSTaggingFilename, pGPTFormat, pGPTSeparator,
                         pSuffixFilename, pFConfig):
          '''
          Trains and returns a MaxEnt (logit) model using scikit-learn

          The original taggings, the suffixes and the gold tags are loaded from
          the given files (see loadPOSTaggings for the format/separator
          arguments), features are generated according to pFConfig and the
          resulting feature config is kept in self.dumpFConfig so that a later
          prediction can reuse it.

          Returns a 4-tuple: feature vectors, raw feature vectors, the training
          input dump (one "<gold label>\\t<values>" string per sample) and the
          trained model.

          Raises Exception if the number of sentences differs between the
          original taggings and the suffixes or the gold taggings.
          '''
          vlOrgPOSTaggings = self.loadPOSTaggings(pFilename = pOrgPOSTaggingFilename, pFormat = pOPTFormat, pSeparator = pOPTSeparator)
          self.orgPOSTaggings = vlOrgPOSTaggings
          vllSuffixes = self.loadSuffixes(pFilename = pSuffixFilename)
          vllGoldPOSTags = self.extractPOSTags(pFilename = pGoldPOSTaggingFilename, pFormat = pGPTFormat, pSeparator = pGPTSeparator)

          # sanity check
          if len(vlOrgPOSTaggings) != len(vllSuffixes):
              raise Exception("The number of original POS tagged sentences does not match the number of suffixed sentences: %s vs. %s" % (len(vlOrgPOSTaggings), len(vllSuffixes)))
          if len(vlOrgPOSTaggings) != len(vllGoldPOSTags):
              raise Exception("The number of original POS tagged sentences does not match the number of gold POS tagged sentence: %s vs. %s" % (len(vlOrgPOSTaggings), len(vllGoldPOSTags)))

          # feature extraction
          vldFVs, vldRawFVs, self.dumpFConfig = self.genFeatureVectors(pFConfig = pFConfig, plOrgPOSTaggings = vlOrgPOSTaggings, pllSuffixes = vllSuffixes)

          # training
          vTrainer = POSAmenderTrainer(pldFVs = vldFVs, pllGoldPOSTags = vllGoldPOSTags)
          vlSKLInputDump, vModel = vTrainer.trainSKLMaxEnt()

          return vldFVs, vldRawFVs, vlSKLInputDump, vModel

      def predictWithSKLMaxEnt(self, pOrgPOSTaggingFilename, pOPTFormat, pOPTSeparator,
                               pSuffixFilename, pModel, pFConfig = ''):
          '''
          Predicts the correct POS tags using a scikit-learn MaxEnt model

          The feature config used is pFConfig when it is non-empty, otherwise
          the config dumped by a previous trainSKLMaxEnt call on this object.

          Returns a 4-tuple: feature vectors, raw feature vectors, the
          prediction input dump (one string of values per sample) and the
          predicted labels as a plain list (one per token).

          Raises Exception if the sentence counts of taggings and suffixes
          differ, or if neither a given nor a dumped feature config exists.
          '''
          vlOrgPOSTaggings = self.loadPOSTaggings(pFilename = pOrgPOSTaggingFilename, pFormat = pOPTFormat, pSeparator = pOPTSeparator)
          self.orgPOSTaggings = vlOrgPOSTaggings
          vllSuffixes = self.loadSuffixes(pFilename = pSuffixFilename)

          # sanity check
          if len(vlOrgPOSTaggings) != len(vllSuffixes):
              raise Exception("The number of original POS tagged sentences does not match the number of suffixed sentences: %s vs. %s" % (len(vlOrgPOSTaggings), len(vllSuffixes)))

          # feature extraction: an explicitly given config wins over the dumped one
          if pFConfig != '':
              vFConfigToUse = pFConfig
          elif self.dumpFConfig != '':
              vFConfigToUse = self.dumpFConfig
          else:
              raise Exception("Both given and dumped feature configs are empty!")
          vldFVs, vldRawFVs, _ = self.genFeatureVectors(pFConfig = vFConfigToUse, plOrgPOSTaggings = vlOrgPOSTaggings, pllSuffixes = vllSuffixes)

          # prediction
          vPredicter = POSAmenderPreder(pldFVs = vldFVs)
          vlSKLInputDump, vnpaPreds = vPredicter.predictWithSKLMaxEnt(pModel = pModel)

          return vldFVs, vldRawFVs, vlSKLInputDump, vnpaPreds.tolist()

      def testWithSKLMaxEnt(self, pOrgPOSTaggingFilename, pOPTFormat, pOPTSeparator,
                            pGoldPOSTaggingFilename, pGPTFormat, pGPTSeparator,
                            pSuffixFilename, pModel, pFConfig = ''):
          '''
          Tests the scikit-learn MaxEnt model performance and returns the
          feature vectors, prediction results and various scores

          Returns a 5-tuple:
           - (feature vectors, raw feature vectors, input dump prefixed with
             the gold label of each sample)
           - predicted POS tags (flat, one per token)
           - (token accuracy, sentence accuracy) of predictions against gold
           - (token accuracy, sentence accuracy) of original tags against gold
           - (token accuracy, sentence accuracy) of predictions against
             original tags

          Raises Exception if the number of samples differs from the number of
          gold POS tags.
          '''
          # 1. prediction
          vldFVs, vldRawFVs, vlSKLInputDump, vlPrePOSTags = self.predictWithSKLMaxEnt(pOrgPOSTaggingFilename, pOPTFormat, pOPTSeparator,
                                                                                      pSuffixFilename, pModel, pFConfig)

          # 2. evaluation
          vllGoldPOSTags = self.extractPOSTags(pFilename = pGoldPOSTaggingFilename, pFormat = pGPTFormat, pSeparator = pGPTSeparator)
          vlGoldPOSTags = [t for ts in vllGoldPOSTags for t in ts]

          # sanity check
          if len(vldFVs) != len(vlGoldPOSTags):
              raise Exception("The number of samples does not match the number of gold POS tagged sentence: %s vs. %s" % (len(vldFVs), len(vlGoldPOSTags)))

          # sentence-level predictions (sentence boundaries are taken from gold)
          vllPredPOSTags = self.toSentPOSTagging(plPOSTags = vlPrePOSTags, plSentLens = [len(st) for st in vllGoldPOSTags])

          # sentence-level original POS tags; the prediction step above has
          # already loaded this very file, so it is not read a second time
          vllOrgPOSTags = [pt.getPOSTags() for pt in self.orgPOSTaggings]

          # predictions against gold
          vGvPTAcc, vGvPSAcc = self.eval(pllGoldPOSTags = vllGoldPOSTags, pllPredPOSTags = vllPredPOSTags)
          # original against gold
          vGvOTAcc, vGvOSAcc = self.eval(pllGoldPOSTags = vllGoldPOSTags, pllPredPOSTags = vllOrgPOSTags)
          # predictions against original
          vOvPTAcc, vOvPSAcc = self.eval(pllGoldPOSTags = vllOrgPOSTags, pllPredPOSTags = vllPredPOSTags)

          # 3. generating feature vectors for dumping
          vlSKLInputDumpG = ["%s\t%s" % (g, v) for g, v in zip(vlGoldPOSTags, vlSKLInputDump)]

          return (vldFVs, vldRawFVs, vlSKLInputDumpG), vlPrePOSTags, (vGvPTAcc, vGvPSAcc), (vGvOTAcc, vGvOSAcc), (vOvPTAcc, vOvPSAcc)

      def eval(self, pllGoldPOSTags, pllPredPOSTags):
          '''
          Evaluates the performance of the correction

          Delegates to POSAmenderEvaler and returns its result, a
          (token accuracy, sentence accuracy) pair.
          '''
          vEvaler = POSAmenderEvaler(pllGoldPOSTags = pllGoldPOSTags)
          return vEvaler.eval(pllPredPOSTags = pllPredPOSTags)

      def loadPOSTaggings(self, pFilename, pFormat, pSeparator):
          '''
          Loads POS taggings from a file in the given format

          3 formats are supported (matched case-insensitively by prefix):
           - slashtag: e.g. I/PRN
           - columnar: e.g. Stanford TSV
           - ptb: Penn treebank tree format

          The value of pSeparator specifies the slashtag tag separator or
          column separator for columnar format depending on what format is
          chosen. It is not used for the ptb format.

          Returns the tagged sentences held by the loader.
          Raises ValueError if pFormat is none of the supported formats.
          '''
          vFormat = pFormat.lower()

          # the format is validated before the loader is created, so a
          # misspelled format fails loudly instead of yielding an unloaded set
          if not vFormat.startswith(("slash", "column", "ptb")):
              raise ValueError("'%s' is not a supported POS tagging format! Use slashtag, columnar or ptb." % pFormat)

          vPOSTagLoader = pos.POSTagLoader(pos.POSTagging)

          if vFormat.startswith("slash"):
              vPOSTagLoader.loadFromSlashTag(pPOSTaggedFilename = pFilename, pSeparator = pSeparator)
          elif vFormat.startswith("column"):
              vPOSTagLoader.loadFromColumnar(pColFilename = pFilename, pSeparator = pSeparator)
          else:
              vPOSTagLoader.loadFromPTB(pPTBFilename = pFilename)

          return vPOSTagLoader.taggedSentences

      def loadSuffixes(self, pFilename):
          '''
          Loads Foreebank POS tag suffixes

          The input file format must be one sentence per line, where each
          token suffix is wrapped inside '' and separated by a space from
          neighbouring tokens. Tokens without suffix are represented by ''.

          e.g. '' '' '_W' '' '' '_X' (sentence with 6 tokens where only 3rd
               and 6th tokens have suffixes).

          Returns a list with one entry per line, each a list of suffix
          strings with the wrapping quotes removed.
          '''
          # the context manager closes the file even if reading fails
          with open(pFilename) as vFile:
              vlLines = vFile.read().strip().split('\n')

          return [[s.strip("'") for s in vLine.split()] for vLine in vlLines]

      def extractPOSTags(self, pFilename, pFormat, pSeparator):
          '''
          Extracts and returns the POS tags from sentence pos taggings

          The result has one entry per loaded sentence, each being whatever
          getPOSTags() of that sentence tagging returns.
          '''
          vlPOSTaggings = self.loadPOSTaggings(pFilename = pFilename, pFormat = pFormat, pSeparator = pSeparator)
          return [pt.getPOSTags() for pt in vlPOSTaggings]

      def genFeatureVectors(self, pFConfig, plOrgPOSTaggings, pllSuffixes):
          '''
          Generates feature vectors based on the specified configuration
          in YAML-compatible format.

          The feature configuration is given in pFConfig. See fvg.FVGen
          for the description of its format.

          Returns a 3-tuple: feature vectors, raw feature vectors and the
          feature config produced during extraction. Note that the order
          differs from POSAmenderFE.genFeatureVectors, which returns the config
          second.
          '''
          vFE = POSAmenderFE(plPOSTaggings = plOrgPOSTaggings, pllSuffixes = pllSuffixes)
          vldFVs, vDumpFConfig, vldRawFVs = vFE.genFeatureVectors(pFConfig = pFConfig)

          return vldFVs, vldRawFVs, vDumpFConfig

      def toSentPOSTagging(self, plPOSTags, plSentLens):
          '''
          Splits the one-token-per-line POS tags to one-sentence-per-line

          plPOSTags is the flat tag list and plSentLens the number of tokens of
          each sentence in order; consecutive slices of those lengths are
          returned as a list of lists.
          '''
          vlSentPOSTags = []
          vFrom = 0
          for vSLen in plSentLens:
              vlSentPOSTags.append(plPOSTags[vFrom : vFrom + vSLen])
              vFrom += vSLen

          return vlSentPOSTags
  173. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  174. # POSAmenderFE
  175. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  class POSAmenderFE:
      '''
      Class for extracting POSAmender features

      Every extract* method returns one flat list holding one value per token,
      walking the sentences in order.
      '''

      def __init__(self, plPOSTaggings, pllSuffixes):
          '''
          Constructor
          '''
          ## list of original sentence POS taggings (pos.POSTagging) to be
          ## amended
          self.posTaggings = plPOSTaggings
          ## list of sentence suffixes each represented by a list of suffixes
          ## per token (empty strings for no suffix)
          self.suffixes = pllSuffixes
          ## feature config generated during extraction (not the one given;
          ## this is a more complete version with binarization results)
          self.config = None

      def genFeatureVectors(self, pFConfig = '', pflgReturnRawFVs = True):
          '''
          Generates feature vectors based on the specified configuration
          in YAML-compatible format.

          The feature configuration is given in pFConfig. See fvg.FVGen
          for the description of its format.

          It can optionally output raw feature and value pairs in the following
          format:

          <FEATURE 1 NAME=FEATURE 1 VALUE> <FEATURE 2 NAME=FEATURE 2 VALUE> ...

          Returns (feature vectors, generated config, raw feature vectors) when
          pflgReturnRawFVs is true, otherwise (feature vectors, generated
          config). The generated config is also stored in self.config.
          '''
          vFVG = fvg.FVGen(pFConfig)

          # extract the values of every configured feature and hand them to the
          # generator before asking it for the final vectors
          for vFeature in vFVG.getFeatures():
              vlFeatureValues = self._extractFeature(vFeature.name, vFeature.featureParams)
              vFVG.loadFeatureValues(vFeature.name, vlFeatureValues)

          vldFVectors, self.config = vFVG.genFeatureVectors()

          if not pflgReturnRawFVs:
              return vldFVectors, self.config

          ## list of dictionaries with key being the feature name and values
          ## the feature values
          vldRawFVectors = vFVG.getRawFVectors()
          return vldFVectors, self.config, vldRawFVectors

      def _extractFeature(self, pFeatureName, pdFeatureParams):
          '''
          Extracts values for pFeatureName

          pFeatureName is the one specified in configuration file. In order
          to be able to have multiple settings of a single feature type, an
          entry for each setting is created in the configuration file with
          the feature name suffixed by any unique string. The specific feature
          parameters then go under each entry. The reason is that configuration
          file is in YAML format which is dictionary-like and requires unique
          keys. So, the feature name can appear only once or only one of the
          appearances would be considered.

          Feature types are therefore recognised by name prefix. Raises
          Exception for a name matching no known prefix.
          '''
          if pFeatureName.startswith(("token-word-form", "form")):
              return self.extractWordForm()
          if pFeatureName.startswith(("token-pos-tag", "pos")):
              return self.extractPOS()
          if pFeatureName.startswith(("token-pos-suffix", "suffix")):
              return self.extractSuffix()
          if pFeatureName.startswith(("token-window-word-form", "window-form")):
              return self.extractWindowWordForm(pdFeatureParams)
          if pFeatureName.startswith(("token-window-pos-tag", "window-pos")):
              return self.extractWindowPOS(pdFeatureParams)
          if pFeatureName.startswith(("token-window-pos-suffix", "window-suffix")):
              return self.extractWindowSuffix(pdFeatureParams)

          raise Exception("Feature %s is unknown! Check the spelling." % pFeatureName)

      def extractWordForm(self):
          '''
          Extracts token word forms
          '''
          return [vToken.form for vSentPOSTagging in self.posTaggings for vToken in vSentPOSTagging]

      def extractPOS(self):
          '''
          Extracts token POS tags
          '''
          return [vToken.tag for vSentPOSTagging in self.posTaggings for vToken in vSentPOSTagging]

      def extractSuffix(self):
          '''
          Extracts token POS suffix
          '''
          return [vSuffix for vSentSuffixes in self.suffixes for vSuffix in vSentSuffixes]

      def extractWindowWordForm(self, pdFeatureParams):
          '''
          Extracts word forms of tokens located in a given window distance
          to the main token

          It returns "NULL" for the positions falling outside the sentence.

          pdFeatureParams contains the following parameters:
           - "window position": the relative position (distance) of the token
                                to the main token (negative for left and positive
                                for right hand side tokens).
                                "position" and "distance" are accepted as
                                aliases; the default is 0.

          Raises Exception for any other parameter name.
          '''
          vWindowPosition = self._getWindowPosition(pdFeatureParams)
          vllForms = [[vToken.form for vToken in vSentPOSTagging] for vSentPOSTagging in self.posTaggings]

          return self._extractWindowValues(vllForms, vWindowPosition)

      def extractWindowPOS(self, pdFeatureParams):
          '''
          Extracts POS tags of tokens located in a given window distance
          to the main token

          It returns "NULL" for the positions falling outside the sentence.

          pdFeatureParams contains the following parameters:
           - "window position": the relative position (distance) of the token
                                to the main token (negative for left and positive
                                for right hand side tokens).
                                "position" and "distance" are accepted as
                                aliases; the default is 0.

          Raises Exception for any other parameter name.
          '''
          vWindowPosition = self._getWindowPosition(pdFeatureParams)
          vllTags = [[vToken.tag for vToken in vSentPOSTagging] for vSentPOSTagging in self.posTaggings]

          return self._extractWindowValues(vllTags, vWindowPosition)

      def extractWindowSuffix(self, pdFeatureParams):
          '''
          Extracts POS tag suffixes of tokens located in a given window distance
          to the main token

          It returns "NULL" for the positions falling outside the sentence.

          pdFeatureParams contains the following parameters:
           - "window position": the relative position (distance) of the token
                                to the main token (negative for left and positive
                                for right hand side tokens).
                                "position" and "distance" are accepted as
                                aliases; the default is 0.

          Raises Exception for any other parameter name.
          '''
          vWindowPosition = self._getWindowPosition(pdFeatureParams)

          return self._extractWindowValues(self.suffixes, vWindowPosition)

      def _getWindowPosition(self, pdFeatureParams):
          '''
          Reads the window position out of the feature parameters (0 if absent)
          '''
          vWindowPosition = 0
          # items() rather than iteritems() so this runs on Python 2 and 3; a
          # missing parameter dictionary is treated as an empty one
          for vParam, vValue in (pdFeatureParams or {}).items():
              if vParam in ("window position", "position", "distance"):
                  vWindowPosition = vValue
              else:
                  raise Exception("'%s' is not a valid feature parameter!" % vParam)

          return vWindowPosition

      def _extractWindowValues(self, pllSentValues, pWindowPosition):
          '''
          Returns, per token, the value found pWindowPosition tokens away in the
          same sentence, or "NULL" when that position is outside the sentence
          '''
          vlValues = []
          for vlSentValues in pllSentValues:
              vSentLen = len(vlSentValues)
              for i in range(vSentLen):
                  vTarget = i + pWindowPosition
                  if 0 <= vTarget < vSentLen:
                      # the value of the neighbouring token, not of token i
                      vlValues.append(vlSentValues[vTarget])
                  else:
                      vlValues.append("NULL")

          return vlValues
  354. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  355. # POSAmenderTrainer
  356. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  class POSAmenderTrainer:
      '''
      Class for training a POSAmender model
      '''

      def __init__(self, pldFVs, pllGoldPOSTags):
          '''
          Constructor
          '''
          # feature vector list (one dictionary per training sample)
          self.fvs = pldFVs
          # list of gold POS tags of sentences (one list of tags per sentence)
          self.goldPOSTags = pllGoldPOSTags

      def trainSKLMaxEnt(self):
          '''
          Trains and returns a MaxEnt (logit) model using scikit-learn

          Returns a pair: the training input dump (one
          "<gold label>\\t<space separated values>" string per sample) and the
          fitted LogisticRegression model.
          '''
          vllFVs = self.getFVsInSKLFormat()
          vlGoldLabels = self.extractGoldLabels()

          # imported here so that scikit-learn is only needed when training
          from sklearn import linear_model
          vModel = linear_model.LogisticRegression(verbose = 1)
          vModel.fit(X = vllFVs, y = vlGoldLabels)

          # generating the input data in SciKitLearn format for dumping
          vlSKLInputDump = ["%s\t%s" % (g, ' '.join([str(v) for v in fv])) for g, fv in zip(vlGoldLabels, vllFVs)]

          return vlSKLInputDump, vModel

      def getFVsInSKLFormat(self):
          '''
          Transforms the feature vectors to the input format of ScikitLearn

          The ScikitLearn format is a 2D array of shape (#_of_samples, #_of_features)

          Each value of a feature vector dictionary is unpacked as a pair and
          only its second element is kept.
          NOTE(review): presumably the pairs are (name, value) as produced by
          fvg.FVGen - confirm against that module.
          '''
          # values() instead of the Python 2 only itervalues(), so this runs on
          # both Python 2 and Python 3
          return [[v for n, v in vdFV.values()] for vdFV in self.fvs]

      def extractGoldLabels(self):
          '''
          Extracts and returns the gold labels (POS tags) for training instances

          The per-sentence tag lists are flattened into one list in order.
          '''
          return [t for ts in self.goldPOSTags for t in ts]
  395. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  396. # POSAmenderPreder
  397. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  class POSAmenderPreder:
      '''
      Class for predicting the correct POS tag
      '''

      def __init__(self, pldFVs):
          '''
          Constructor
          '''
          # feature vector list (one dictionary per sample to be predicted)
          self.fvs = pldFVs

      def predictWithSKLMaxEnt(self, pModel):
          '''
          Predicts and returns correct POS tags using a scikit-learn MaxEnt
          (logit) model

          pModel only needs to offer predict(); nothing is imported from
          scikit-learn here.

          Returns a pair: the prediction input dump (one string of space
          separated values per sample) and whatever pModel.predict() returns
          for the feature vectors.
          '''
          vllFVs = self.getFVsInSKLFormat()
          vlPredLabels = pModel.predict(vllFVs)

          # generating the input data in SciKitLearn format for dumping
          vlSKLInputDump = [' '.join([str(v) for v in fv]) for fv in vllFVs]

          return vlSKLInputDump, vlPredLabels

      def getFVsInSKLFormat(self):
          '''
          Transforms the feature vectors to the input format of ScikitLearn

          The ScikitLearn format is a 2D array of shape (#_of_samples, #_of_features)

          Each value of a feature vector dictionary is unpacked as a pair and
          only its second element is kept.
          NOTE(review): presumably the pairs are (name, value) as produced by
          fvg.FVGen - confirm against that module.
          '''
          # values() instead of the Python 2 only itervalues(), so this runs on
          # both Python 2 and Python 3
          return [[v for n, v in vdFV.values()] for vdFV in self.fvs]
  428. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  429. # POSAmenderEvaler
  430. #¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬¬
  class POSAmenderEvaler:
      '''
      Scores POS tag corrections against a fixed set of gold POS tags
      '''

      def __init__(self, pllGoldPOSTags):
          '''
          Keeps the gold POS tags (one list of tags per sentence) that every
          later evaluation is compared with
          '''
          self.goldPOSTags = pllGoldPOSTags

      def eval(self, pllPredPOSTags):
          '''
          Scores pllPredPOSTags (one list of tags per sentence) against the
          stored gold tags

          The score is obtained from eval.PredEval().acc twice: first with the
          per-sentence tag lists as items, then with the tags of all sentences
          flattened into one list. The result is the pair
          (token-level score, sentence-level score).
          '''
          vScorer = eval.PredEval()

          # whole sentences as the unit of comparison
          vSentAcc = vScorer.acc(plPrediction = pllPredPOSTags, plReference = self.goldPOSTags)

          # single tokens as the unit of comparison
          vlFlatGold = []
          for vlSentTags in self.goldPOSTags:
              vlFlatGold.extend(vlSentTags)
          vlFlatPred = []
          for vlSentTags in pllPredPOSTags:
              vlFlatPred.extend(vlSentTags)
          vTokenAcc = vScorer.acc(plPrediction = vlFlatPred, plReference = vlFlatGold)

          return vTokenAcc, vSentAcc