Browse Source

Script and annotations added

Rasoul Kaljahi 9 months ago
commit
0bafafb402
9 changed files with 126264 additions and 0 deletions
  1. 15324 0
      ann/test.laptop.io
  2. 16308 0
      ann/test.restaurant.io
  3. 50062 0
      ann/train.laptop.io
  4. 43404 0
      ann/train.restaurant.io
  5. 88 0
      generate-sea.py
  6. 1 0
      lib/__init__.py
  7. BIN
      lib/__init__.pyc
  8. 1077 0
      lib/semeval_absa.py
  9. BIN
      lib/semeval_absa.pyc

File diff suppressed because it is too large
+ 15324 - 0
ann/test.laptop.io


File diff suppressed because it is too large
+ 16308 - 0
ann/test.restaurant.io


File diff suppressed because it is too large
+ 50062 - 0
ann/train.laptop.io


File diff suppressed because it is too large
+ 43404 - 0
ann/train.restaurant.io


+ 88 - 0
generate-sea.py

@@ -0,0 +1,88 @@
+# coding: utf-8
+
+# Generates the SemEval2016 SEA dataset (Sentiment Expression Annotation) by extracting the source data from 
+# the given original SemEval2016 ABSA dataset and merging them with the SEA annotations.
+
+from lib.semeval_absa import *
+import sys, optparse
+
+
+def loadATs(pSemEval2016XML):
+    '''
+    Reads the given SemEval2016 ABSA XML file and returns its aspect term objects as a list.
+
+    The set is loaded with pflgTokenize and pflgUTF8 switched on and with pSort set to "id".
+    '''
+
+    dataset = ABSA2016Set()
+    dataset.load(pSemEval2016XML, pflgTokenize = True, pflgUTF8 = True, pSort = "id")
+
+    # materialise whatever iterable getAspectTerms() hands back into a plain list
+    return list(dataset.getAspectTerms())
+
+
+
+def loadIO(pIOFilename):
+    '''
+    Loads IO tagging from file in a columnar format
+
+    The file content is split into blocks at blank lines (after stripping leading and
+    trailing newlines), and each block is split into its lines.
+
+    Returns a list with one entry per block, each entry being the list of that block's lines.
+    '''
+
+    # read inside a context manager so the file handle is closed as soon as the content is in memory
+    with open(pIOFilename) as io_file:
+        content = io_file.read()
+
+    return [block.split('\n') for block in content.strip('\n').split('\n\n')]
+    
+
+
+def mergeTxtSEA(pSentTokens, pSentIO):
+    '''
+    Pairs every token of a sentence with its matching IO tag and returns them in a columnar
+    format: one "token<TAB>tag" line per token, lines joined by newlines.
+
+    Raises an Exception when the number of tokens differs from the number of IO tags.
+    '''
+
+    token_count = len(pSentTokens)
+    tag_count = len(pSentIO)
+
+    # sanity check: the two columns must line up one-to-one
+    if token_count != tag_count:
+        raise Exception("Number of tokens in the sentence ({}) does not match the number of token IO tags ({})".format(token_count, tag_count))
+
+    return '\n'.join("{}\t{}".format(*pair) for pair in zip(pSentTokens, pSentIO))
+    
+
+
+def generateOutput(pXMLFilename, pIOTagInFilename, pATOutFilename, pIOTagOutFilename):
+    '''
+    Generates annotation output for the given SemEval2016 XML file with the given IO tag file
+
+    pXMLFilename:      SemEval2016 XML file the aspect terms are loaded from
+    pIOTagInFilename:  columnar IO tag file, one block per aspect term
+    pATOutFilename:    output file receiving one "category<TAB><TAB>polarity" line per aspect term
+    pIOTagOutFilename: output file receiving the merged token/IO-tag blocks, separated by blank lines
+
+    Raises an Exception when the number of aspect terms differs from the number of IO taggings,
+    or (via mergeTxtSEA) when a sentence and its tagging differ in length.
+    '''
+
+    ats = loadATs(pXMLFilename)
+    print("{} aspect terms loaded.".format(len(ats)))
+
+    io_tags = loadIO(pIOTagInFilename)
+    print("{} IO taggings loaded.".format(len(io_tags)))
+
+    # sanity check: zip() below would silently drop the surplus items and write a truncated dataset
+    if len(ats) != len(io_tags):
+        raise Exception("Number of aspect terms ({}) does not match the number of IO taggings ({})".format(len(ats), len(io_tags)))
+
+    # build both outputs before touching the disk, so a failed sanity check leaves no empty file behind
+    at_output = '\n'.join(["{}\t\t{}".format(at.getCategory(), at.getPolarity()) for at in ats]) + '\n'
+    io_output = '\n\n'.join([mergeTxtSEA(at.sentence.getTokens(), sent_io) for at, sent_io in zip(ats, io_tags)])
+
+    print("\nWriting output files ...")
+
+    # context managers make sure the data is flushed and the handles are closed
+    with open(pATOutFilename, 'w') as at_file:
+        at_file.write(at_output)
+    print("  aspect terms are saved in {} (one per line)".format(pATOutFilename))
+
+    with open(pIOTagOutFilename, 'w') as io_file:
+        io_file.write(io_output)
+    print("  IO taggings are saved in {} (columnar format; one per aspect term)".format(pIOTagOutFilename) + '\n')
+
+
+
+def main(argv=None):
+    '''
+    Command line entry point.
+
+    argv: full argument vector including the program name at index 0; defaults to sys.argv.
+
+    Derives the file names from the --subset and --domain options:
+      input IO tags:  ann/<subset>.<domain>.io
+      output IO tags: <subset>.<domain>.aio
+      output terms:   <subset>.<domain>.at
+
+    Exits through the option parser's error handling when a required option is missing.
+    '''
+    if argv is None:
+        argv = sys.argv
+
+    parser = optparse.OptionParser(usage="%prog <OPTIONS>" +
+                                         "\nGenerates the SemEval2016 SEA dataset by extracting the source data from the original SemEval2016 XML file and merging them with the SEA annotations.", version="%prog 1.0")
+
+    parser.add_option("-x", "--xml", help="SemEval2016 XML file", dest="xml", action="store")
+    parser.add_option("-d", "--domain", help="laptop or restaurant", dest="domain", action="store")
+    parser.add_option("-s", "--subset", help="train or test", dest="subset", action="store")
+
+    # parse the vector handed to main() (minus the program name) instead of always reading sys.argv
+    (opts, posargs) = parser.parse_args(argv[1:])
+
+    # every option feeds a file name below; a missing one would otherwise yield names like "ann/None.None.io"
+    if not (opts.xml and opts.domain and opts.subset):
+        parser.error("options --xml, --domain and --subset are all required")
+
+    io_input_filename = "ann/{}.{}.io".format(opts.subset, opts.domain)
+    io_output_filename = "{}.{}.aio".format(opts.subset, opts.domain)
+    at_output_filename = "{}.{}.at".format(opts.subset, opts.domain)
+
+    print("Loading {} {} data from {} ...\n".format(opts.domain, opts.subset, opts.xml))
+    generateOutput(opts.xml, io_input_filename, at_output_filename, io_output_filename)
+
+
+
+
+if __name__ == "__main__":
+    # hand main()'s return value to the interpreter as the process exit status
+    raise SystemExit(main())

+ 1 - 0
lib/__init__.py

@@ -0,0 +1 @@
+ 

BIN
lib/__init__.pyc


File diff suppressed because it is too large
+ 1077 - 0
lib/semeval_absa.py


BIN
lib/semeval_absa.pyc