|
@@ -0,0 +1,88 @@
|
|
|
+# coding: utf-8
|
|
|
+
|
|
|
+# Generates the SemEval2016 SEA dataset (Sentiment Expression Annotation) by extracting the source data from
|
|
|
+# the given original SemEval2016 ABSA dataset and merging them with the SEA annotations.
|
|
|
+
|
|
|
+from lib.semeval_absa import *
|
|
|
+import sys, optparse
|
|
|
+
|
|
|
+
|
|
|
def loadATs(pSemEval2016XML):
    '''
    Reads the SemEval2016 ABSA XML file at the given path and returns its aspect terms as a list.

    The set is loaded with the tokenize and UTF-8 flags switched on and with "id" as the sort key.
    '''

    dataset = ABSA2016Set()
    dataset.load(pSemEval2016XML, pflgTokenize=True, pflgUTF8=True, pSort="id")

    # materialize whatever iterable getAspectTerms() yields into a plain list
    aspect_terms = list(dataset.getAspectTerms())
    return aspect_terms
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def loadIO(pIOFilename):
    '''
    Loads IO tagging from a file in a columnar format.

    The file content is split into blocks on empty lines, and every block is returned
    as the list of its lines. The result is therefore a list of lists of strings.
    '''

    # read inside a context manager so the file handle is closed deterministically
    with open(pIOFilename) as io_file:
        content = io_file.read()

    # drop leading/trailing newlines first so no spurious empty block shows up at the edges
    blocks = content.strip('\n').split('\n\n')
    return [block.split('\n') for block in blocks]
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def mergeTxtSEA(pSentTokens, pSentIO):
    '''
    Pairs every token of a sentence with its IO tag and renders the pairs in a columnar
    format: one "token<TAB>tag" line per token, lines joined by a newline.

    Raises an Exception when the number of tokens differs from the number of tags.
    '''

    token_count = len(pSentTokens)
    tag_count = len(pSentIO)

    # sanity check: both sequences must describe the same sentence
    if token_count != tag_count:
        raise Exception("Number of tokens in the sentence ({}) does not match the number of token IO tags ({})".format(token_count, tag_count))

    rows = []
    for token, tag in zip(pSentTokens, pSentIO):
        rows.append("{}\t{}".format(token, tag))

    return '\n'.join(rows)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def generateOutput(pXMLFilename, pIOTagInFilename, pATOutFilename, pIOTagOutFilename):
    '''
    Generates annotation output for the given SemEval2016 XML file with the given IO tag file.

    Two files are written:
      - pATOutFilename: one line per aspect term holding its category and polarity,
        separated by two tabs
      - pIOTagOutFilename: the tokens of each aspect term's sentence merged with the
        matching IO tags in columnar format, entries separated by an empty line

    Aspect terms and IO taggings are paired by position. If their counts differ, a warning
    is written to stderr and only the pairs available on both sides are merged.
    '''

    ats = loadATs(pXMLFilename)
    print("{} aspect terms loaded.".format(len(ats)))

    io_tags = loadIO(pIOTagInFilename)
    print("{} IO taggings loaded.".format(len(io_tags)))

    # zip() below stops at the shorter input; surface the mismatch instead of hiding it
    if len(ats) != len(io_tags):
        sys.stderr.write("Warning: number of aspect terms ({}) does not match the number of IO taggings ({}); unmatched entries are skipped\n".format(len(ats), len(io_tags)))

    print("\nWriting output files ...")

    # build the content before opening the file so a failure does not leave a truncated file behind
    at_lines = ["{}\t\t{}".format(at.getCategory(), at.getPolarity()) for at in ats]
    with open(pATOutFilename, 'w') as at_file:
        at_file.write('\n'.join(at_lines) + '\n')
    print(" apsect terms are saved in {} (one per line)".format(pATOutFilename))

    merged = [mergeTxtSEA(at.sentence.getTokens(), sent_io) for at, sent_io in zip(ats, io_tags)]
    with open(pIOTagOutFilename, 'w') as io_file:
        io_file.write('\n\n'.join(merged))
    print(" IO taggings are saved in {} (columnar format; one per aspect term)".format(pIOTagOutFilename) + '\n')
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def main(argv=None):
    '''
    Command-line entry point.

    Parses the --xml, --domain and --subset options from argv (sys.argv when argv is None;
    the first element is taken to be the program name), derives the annotation input and
    the output file names from the subset and domain, and runs generateOutput.

    Exits through the option parser's error handling when a required option is missing.
    '''

    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="%prog <OPTIONS>" +
                                   "\nGenerates the SemEval2016 SEA dataset by extracting the source data from the original SemEval2016 XML file and merging them with the SEA annotations.", version="%prog 1.0")

    parser.add_option("-x", "--xml", help="SemEval2016 XML file", dest="xml", action="store")
    parser.add_option("-d", "--domain", help="laptop or restaurant", dest="domain", action="store")
    parser.add_option("-s", "--subset", help="train or test", dest="subset", action="store")

    # honor the argv handed in by the caller; argv[0] is the program name, not an option
    opts, _ = parser.parse_args(argv[1:])

    # every option feeds a file name below, so refuse to continue with "None" in a path
    required = (("--xml", opts.xml), ("--domain", opts.domain), ("--subset", opts.subset))
    missing = [flag for flag, value in required if not value]
    if missing:
        parser.error("missing required option(s): {}".format(", ".join(missing)))

    io_input_filename = "ann/{}.{}.io".format(opts.subset, opts.domain)
    io_output_filename = "{}.{}.aio".format(opts.subset, opts.domain)
    at_output_filename = "{}.{}.at".format(opts.subset, opts.domain)

    print("Loading {} {} data from {} ...\n".format(opts.domain, opts.subset, opts.xml))
    generateOutput(opts.xml, io_input_filename, at_output_filename, io_output_filename)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # run the command-line entry point and hand its return value to the interpreter as exit status
    exit_status = main()
    sys.exit(exit_status)
|