# rszk / sea
# coding: utf-8

# Generates the SemEval2016 SEA (Sentiment Expression Annotation) dataset by extracting the source data from
# the given original SemEval2016 ABSA dataset and merging it with the SEA annotations.

from lib.semeval_absa import *
import sys, optparse


def loadATs(pSemEval2016XML):
    '''
    Reads the SemEval2016 ABSA XML file at the given path and returns its aspect terms as a list

    The set is loaded with pflgTokenize = True, pflgUTF8 = True and pSort = "id".
    '''

    dataset = ABSA2016Set()
    dataset.load(pSemEval2016XML, pflgTokenize = True, pflgUTF8 = True, pSort = "id")

    # materialise whatever iterable getAspectTerms() yields into a plain list
    aspect_terms = list(dataset.getAspectTerms())
    return aspect_terms


def loadIO(pIOFilename):
    '''
    Loads IO tagging from file in a columnar format

    Sentences in the file are separated by one blank line. Returns a list with
    one entry per sentence, each entry being the list of that sentence's lines.
    An empty file yields a single sentence holding one empty string.
    '''

    # the context manager closes the file handle deterministically instead of
    # leaving it to the garbage collector
    with open(pIOFilename) as io_file:
        content = io_file.read()

    # newlines at both ends are stripped so they do not produce spurious empty lines or sentences
    return [sentence.split('\n') for sentence in content.strip('\n').split('\n\n')]
    

def mergeTxtSEA(pSentTokens, pSentIO):
    '''
    Given a sentence tokens and its matching IO tags, merges them in a columnar format

    Returns a string with one "token<TAB>tag" line per token, joined by newlines
    (no trailing newline). Raises ValueError when the two sequences differ in length.
    '''

    # sanity check: zip() below would otherwise silently drop the surplus items
    if len(pSentTokens) != len(pSentIO):
        raise ValueError("Number of tokens in the sentence ({}) does not match the number of token IO tags ({})".format(len(pSentTokens), len(pSentIO)))

    return '\n'.join("{}\t{}".format(token, tag) for token, tag in zip(pSentTokens, pSentIO))
    

def generateOutput(pXMLFilename, pIOTagInFilename, pATOutFilename, pIOTagOutFilename):
    '''
    Generates annotation output for the given SemEval2016 XML file with the given IO tag file

    pXMLFilename:      SemEval2016 ABSA XML file the aspect terms are loaded from
    pIOTagInFilename:  columnar IO tag file, expected to hold one tagging per aspect term
    pATOutFilename:    output file receiving one "category<TAB><TAB>polarity" line per aspect term
    pIOTagOutFilename: output file receiving the token/IO-tag columns, one blank-line
                       separated group per aspect term

    Raises ValueError when the number of IO taggings differs from the number of aspect terms.
    '''

    ats = loadATs(pXMLFilename)
    print("{} aspect terms loaded.".format(len(ats)))

    io_tags = loadIO(pIOTagInFilename)
    print("{} IO taggings loaded.".format(len(io_tags)))

    # zip() below would silently truncate to the shorter list, leaving the two output
    # files out of step with each other, so refuse to continue on a count mismatch
    if len(ats) != len(io_tags):
        raise ValueError("Number of aspect terms ({}) does not match the number of IO taggings ({})".format(len(ats), len(io_tags)))

    print("\nWriting output files ...")

    # context managers guarantee the outputs are flushed and closed before returning
    with open(pATOutFilename, 'w') as at_file:
        at_file.write('\n'.join(["{}\t\t{}".format(at.getCategory(), at.getPolarity()) for at in ats]) + '\n')
    print("  apsect terms are saved in {} (one per line)".format(pATOutFilename))

    with open(pIOTagOutFilename, 'w') as io_file:
        io_file.write('\n\n'.join([mergeTxtSEA(at.sentence.getTokens(), sent_io) for at, sent_io in zip(ats, io_tags)]))
    print("  IO taggings are saved in {} (columnar format; one per aspect term)".format(pIOTagOutFilename) + '\n')


def main(argv=None):
    '''
    Command-line entry point: parses the options and generates the output files

    argv: full argument vector including the program name at index 0;
          defaults to sys.argv when omitted.

    The IO tags are read from "ann/<subset>.<domain>.io"; the outputs are written to
    "<subset>.<domain>.aio" and "<subset>.<domain>.at" in the current directory.
    Exits through parser.error() when any of --xml, --domain or --subset is missing.
    '''
    if argv is None:
        argv = sys.argv

    parser = optparse.OptionParser(usage="%prog <OPTIONS>" +
                                         "\nGenerates the SemEval2016 SEA dataset by extracting the source data from the original SemEval2016 XML file and merging them with the SEA annotations.", version="%prog 1.0")

    parser.add_option("-x", "--xml", help="SemEval2016 XML file", dest="xml", action="store")
    parser.add_option("-d", "--domain", help="laptop or restaurant", dest="domain", action="store")
    parser.add_option("-s", "--subset", help="train or test", dest="subset", action="store")

    # parse the argument vector we were given (minus the program name); calling
    # parse_args() without arguments would ignore argv and always read sys.argv
    (opts, _posargs) = parser.parse_args(argv[1:])

    # all three options feed the file names below; without this check a missing
    # one would end up as the literal text "None" in those names
    missing = [flag for flag, value in (("--xml", opts.xml), ("--domain", opts.domain), ("--subset", opts.subset)) if not value]
    if missing:
        parser.error("missing required option(s): {}".format(", ".join(missing)))

    io_input_filename = "ann/{}.{}.io".format(opts.subset, opts.domain)
    io_output_filename = "{}.{}.aio".format(opts.subset, opts.domain)
    at_output_filename = "{}.{}.at".format(opts.subset, opts.domain)

    print("Loading {} {} data from {} ...\n".format(opts.domain, opts.subset, opts.xml))
    generateOutput(opts.xml, io_input_filename, at_output_filename, io_output_filename)


# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())