generate-sea.py 3.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. # coding: utf-8
  2. # Generates the SemEval2016 SEA dataset (Sentiment Expression Annotation) by extracting the source data from
  3. # the given original SemEval2016 ABSA dataset and merging them with the SEA annotations.
  4. from lib.semeval_absa import *
  5. import sys, optparse
  6. def loadATs(pSemEval2016XML):
  7. '''
  8. Loads SemEval2016 ABSA annotations from the input XML file and returns aspect term objects
  9. '''
  10. absa2016_set = ABSA2016Set()
  11. absa2016_set.load(pSemEval2016XML, pflgTokenize = True, pflgUTF8 = True, pSort = "id")
  12. return [at for at in absa2016_set.getAspectTerms()]
  13. def loadIO(pIOFilename):
  14. '''
  15. Loads IO tagging from file in a columnar format
  16. '''
  17. return [s.split('\n') for s in open(pIOFilename).read().strip('\n').split('\n\n')]
  18. def mergeTxtSEA(pSentTokens, pSentIO):
  19. '''
  20. Given a sentence tokens and its matching IPS tags, merges them in a columnar format
  21. '''
  22. # sanity check
  23. if len(pSentTokens) != len(pSentIO):
  24. raise Exception("Number of tokens in the sentence ({}) does not match the number of token IO tags ({})".format(len(pSentTokens), len(pSentIO)))
  25. return '\n'.join(["{}\t{}".format(t, io) for t, io in zip(pSentTokens, pSentIO)])
  26. def generateOutput(pXMLFilename, pIOTagInFilename, pATOutFilename, pIOTagOutFilename):
  27. '''
  28. Generates annotation output for the given SemEval2016 XML file with the given IO tag file
  29. '''
  30. ats = loadATs(pXMLFilename)
  31. print("{} aspect terms loaded.".format(len(ats)))
  32. io_tags = loadIO(pIOTagInFilename)
  33. print("{} IO taggings loaded.".format(len(io_tags)))
  34. print("\nWriting output files ...")
  35. open(pATOutFilename, 'w').write('\n'.join(["{}\t\t{}".format(at.getCategory(), at.getPolarity()) for at in ats]) + '\n')
  36. print(" apsect terms are saved in {} (one per line)".format(pATOutFilename))
  37. open(pIOTagOutFilename, 'w').write('\n\n'.join([mergeTxtSEA(at.sentence.getTokens(), sent_io) for at, sent_io in zip(ats, io_tags)]))
  38. print(" IO taggings are saved in {} (columnar format; one per aspect term)".format(pIOTagOutFilename) + '\n')
  39. def main(argv=None):
  40. if argv is None:
  41. argv = sys.argv
  42. parser = optparse.OptionParser(usage="%prog <OPTIONS>" +
  43. "\nGenerates the SemEval2016 SEA dataset by extracting the source data from the original SemEval2016 XML file and merging them with the SEA annotations.", version="%prog 1.0")
  44. parser.add_option("-x", "--xml", help="SemEval2018 XML file", dest="xml", action="store")
  45. parser.add_option("-d", "--domain", help="laptop or restaurant", dest="domain", action="store")
  46. parser.add_option("-s", "--subset", help="train or test", dest="subset", action="store")
  47. (opts, posargs) = parser.parse_args()
  48. io_input_filename = "ann/{}.{}.io".format(opts.subset, opts.domain)
  49. io_output_filename = "{}.{}.aio".format(opts.subset, opts.domain)
  50. at_output_filename = "{}.{}.at".format(opts.subset, opts.domain)
  51. print("Loading {} {} data from {} ...\n".format(opts.domain, opts.subset, opts.xml))
  52. generateOutput(opts.xml, io_input_filename, at_output_filename, io_output_filename)
  53. if __name__ == "__main__":
  54. sys.exit(main())