12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
- # coding: utf-8
# Generates the SemEval2016 SEA (Sentiment Expression Annotation) dataset by extracting the source data from
# the given original SemEval2016 ABSA dataset and merging it with the SEA annotations.
- from lib.semeval_absa import *
- import sys, optparse
def loadATs(pSemEval2016XML):
    '''
    Reads the SemEval2016 ABSA annotations from the given XML file and
    returns the aspect term objects found in it as a list.
    '''

    dataset = ABSA2016Set()
    # NOTE(review): the flags presumably request tokenization, UTF-8 handling
    # and ordering by id -- confirm against ABSA2016Set.load
    dataset.load(pSemEval2016XML, pflgTokenize = True, pflgUTF8 = True, pSort = "id")

    # materialize whatever iterable getAspectTerms() yields into a fresh list
    return list(dataset.getAspectTerms())
def loadIO(pIOFilename):
    '''
    Loads IO tagging from file in a columnar format.

    The file content is stripped of leading/trailing newlines, split into
    blocks on blank lines (two consecutive newlines) and every block is split
    into its individual lines.

    pIOFilename: path of the IO tag file to read.

    Returns a list of blocks, each block being the list of its lines. Note
    that an empty file yields [['']], i.e. one block holding one empty line.
    '''

    # a context manager closes the handle deterministically instead of
    # relying on the garbage collector to do it
    with open(pIOFilename) as io_file:
        content = io_file.read()

    return [block.split('\n') for block in content.strip('\n').split('\n\n')]
-
def mergeTxtSEA(pSentTokens, pSentIO):
    '''
    Pairs every token of a sentence with its IO tag and renders the pairs in a
    columnar format: one "token<TAB>tag" line per token, joined by newlines.

    Raises an Exception when the number of tokens and the number of tags differ.
    '''

    token_count = len(pSentTokens)
    tag_count = len(pSentIO)

    # the two sequences must line up one-to-one before they can be paired
    if token_count != tag_count:
        raise Exception("Number of tokens in the sentence ({}) does not match the number of token IO tags ({})".format(token_count, tag_count))

    rows = []
    for token, tag in zip(pSentTokens, pSentIO):
        rows.append("{}\t{}".format(token, tag))

    return '\n'.join(rows)
-
def generateOutput(pXMLFilename, pIOTagInFilename, pATOutFilename, pIOTagOutFilename):
    '''
    Generates annotation output for the given SemEval2016 XML file with the given IO tag file.

    pXMLFilename:      XML file the aspect terms are loaded from (via loadATs)
    pIOTagInFilename:  columnar IO tag file (read via loadIO)
    pATOutFilename:    output file receiving one "category<TAB><TAB>polarity" line per aspect term
    pIOTagOutFilename: output file receiving the token/tag columns of each aspect term's
                       sentence (via mergeTxtSEA), with a blank line between aspect terms

    Progress messages are printed to standard output. Any exception raised by
    mergeTxtSEA (token/tag count mismatch) propagates to the caller.
    '''

    ats = loadATs(pXMLFilename)
    print("{} aspect terms loaded.".format(len(ats)))
    io_tags = loadIO(pIOTagInFilename)
    print("{} IO taggings loaded.".format(len(io_tags)))
    print("\nWriting output files ...")

    # Build the content first and write it inside a context manager, so the
    # file is flushed and closed deterministically and a failure while
    # formatting does not leave a truncated output file behind.
    at_lines = ["{}\t\t{}".format(at.getCategory(), at.getPolarity()) for at in ats]
    with open(pATOutFilename, 'w') as at_file:
        at_file.write('\n'.join(at_lines) + '\n')
    print(" apsect terms are saved in {} (one per line)".format(pATOutFilename))

    # NOTE: zip() stops at the shorter of the two sequences, so surplus aspect
    # terms or surplus IO taggings are silently left out of the output.
    merged = [mergeTxtSEA(at.sentence.getTokens(), sent_io) for at, sent_io in zip(ats, io_tags)]
    with open(pIOTagOutFilename, 'w') as io_file:
        io_file.write('\n\n'.join(merged))
    print(" IO taggings are saved in {} (columnar format; one per aspect term)".format(pIOTagOutFilename) + '\n')
def main(argv=None):
    '''
    Command-line entry point.

    argv: full argument vector including the program name at index 0;
          defaults to sys.argv when None.

    Parses the --xml, --domain and --subset options, derives the input/output
    file names from the subset and domain and calls generateOutput. Exits
    through the option parser (SystemExit) on a usage error, including when
    one of the three options is missing.
    '''
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="%prog <OPTIONS>" +
        "\nGenerates the SemEval2016 SEA dataset by extracting the source data from the original SemEval2016 XML file and merging them with the SEA annotations.", version="%prog 1.0")

    parser.add_option("-x", "--xml", help="SemEval2016 XML file", dest="xml", action="store")
    parser.add_option("-d", "--domain", help="laptop or restaurant", dest="domain", action="store")
    parser.add_option("-s", "--subset", help="train or test", dest="subset", action="store")

    # Parse the vector handed to main() (minus the program name); a bare
    # parse_args() would always read sys.argv and ignore the argv parameter.
    opts, _ = parser.parse_args(list(argv[1:]))

    # Without this check a missing option ends up as the text "None" inside
    # the derived file names.
    required = (("--xml", opts.xml), ("--domain", opts.domain), ("--subset", opts.subset))
    missing = [flag for flag, value in required if not value]
    if missing:
        parser.error("missing required option(s): {}".format(", ".join(missing)))

    io_input_filename = "ann/{}.{}.io".format(opts.subset, opts.domain)
    io_output_filename = "{}.{}.aio".format(opts.subset, opts.domain)
    at_output_filename = "{}.{}.at".format(opts.subset, opts.domain)

    print("Loading {} {} data from {} ...\n".format(opts.domain, opts.subset, opts.xml))
    generateOutput(opts.xml, io_input_filename, at_output_filename, io_output_filename)
# Run the command-line entry point only when executed as a script and hand
# its return value to the interpreter as the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)
|