#!/usr/bin/env python3
# harsh / GDPRtEXT

# author: Harshvardhan Pandit

# Generates RDF graph for GDPR text from JSON (gdpr.json)
# The JSON file is in ../deliverables/gdpr.json
# The algorithm is roughly as follows:

# For every chapter, section, article, point, subpoint, etc. it defines the
# following attributes using the ELI vocabulary
# @prefix ELI http://data.europa.eu/eli/ontology#
#   (ontology file: https://publications.europa.eu/en/mdr/resource/eli/eli.owl)
#
# - ELI:title (xsd:string) title of the item
# - ELI:number (xsd:string) number of the item in text
# - ELI:title_alternative (xsd:string) an alternative representation of the
#       item as <item_type> followed by <item_number>; e.g. Chapter 1
# - ELI:description (xsd:string) text of the item
# - ELI:is_part_of (ELI:LegalResource) gdpr:GDPR
#       here, this is an instance of a LegalResource created to represent
#       the GDPR, and is present in another OWL file, which is being edited
#       in Protege. The reference does not need to be resolved.
# - ELI:id_local (xsd:string) the ID of the resource in the HTML file
#       This is a bungling of the correct use of the vocabulary, but it keeps
#       all terms within the intended usage. The same ID can be used to
#       lookup the resource in the HTML file (so it is the ID attribute of
#       the resource).
#
# This script generates the RDF triples for the text of the GDPR. The bulk
# of the work, and the part where the GDPR itself is referenced, is actually
# done by hand in Protege and does not need a script such as this one.

##############################################################################
import json

from rdflib import Graph, RDF, RDFS, XSD, Literal, URIRef, BNode
from rdflib import Namespace

# Load the JSON from file. The path is resolved against the current working
# directory. The encoding is passed explicitly so that decoding the text does
# not depend on the platform's default locale encoding.
with open('../deliverables/gdpr.json', encoding='utf-8') as fd:
    gdpr_json = json.load(fd)
# This will be the graph used to hold the triples as they are being generated
graph = Graph()
# Namespace of the GDPRtEXT ontology (classes and properties)
GDPRtEXT_URI = URIRef(
        'http://purl.org/adaptcentre/openscience/ontologies/GDPRtEXT#')
GDPRtEXT = Namespace(GDPRtEXT_URI)
graph.namespace_manager.bind('GDPRtext', GDPRtEXT)
DCTERMS = Namespace('http://purl.org/dc/terms/')
graph.namespace_manager.bind('dcterms', DCTERMS)
# This is the ELI namespace, used as the legal vocabulary for EU text
ELI = Namespace("http://data.europa.eu/eli/ontology#")
graph.namespace_manager.bind('eli', ELI)
# bind ELI items to names that are easier to access (argumentative)
LRS = ELI.LegalResourceSubdivision
TITLE = ELI.title
NOS = ELI.number
TITLE_ALT = ELI.title_alternative
PART_OF = ELI.is_part_of
# graph.add((PART_OF, RDF.type, OWL.TransitiveProperty))
DESC = ELI.description
HAS_RECITAL = GDPRtEXT['hasRecital']

##############################################################################
# GDPR as named individual
# base URI
GDPR_URI = URIRef(
        'http://purl.org/adaptcentre/openscience/resources/GDPRtEXT#')
GDPR = Namespace(GDPR_URI)
graph.namespace_manager.bind('gdpr', GDPR)
gdpr = GDPR.GDPR
# graph.add((gdpr, RDF.type, OWL.NamedIndividual))
# The regulation as a whole is typed with the ELI class LegalResource; the
# abbreviation `LR` is not a term of that vocabulary.
graph.add((gdpr, RDF.type, ELI.LegalResource))
graph.add((gdpr, RDF.type, DCTERMS.Policy))
graph.add((gdpr, DCTERMS.identifier, Literal(
    '2016/679', datatype=XSD.string)))
graph.add((gdpr, DCTERMS.language, Literal(
    'English', datatype=XSD.string)))
graph.add((gdpr, DCTERMS.publisher, Literal(
    'Official Journal of the European Union', datatype=XSD.string)))
graph.add((gdpr, DCTERMS.source, Literal(
    'http://eur-lex.europa.eu/eli/reg/2016/679/oj', datatype=XSD.string)))
graph.add((gdpr, DCTERMS.title, Literal(
    'General Data Protection Regulation', datatype=XSD.string)))
# Dublin Core calls the alternative-title property `alternative`;
# `title_alternative` is the ELI spelling and does not exist in DCTERMS.
graph.add((gdpr, DCTERMS.alternative, Literal(
    'GDPR', datatype=XSD.string)))
graph.add((gdpr, DCTERMS.alternative, Literal(
    'REGULATION (EU) 2016/679', datatype=XSD.string)))
graph.add((gdpr, DCTERMS.creator, Literal(
    'European Parliament', datatype=XSD.string)))
graph.add((gdpr, DCTERMS.creator, Literal(
    'Council of the European Union', datatype=XSD.string)))
graph.add((gdpr, DCTERMS.abstract, Literal((
    'REGULATION (EU) 2016/679 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL '
    'of 27 April 2016 '
    'on the protection of natural persons with regard to the processing of '
    'personal data and on the free movement of such data, '
    'and repealing Directive 95/46/EC '
    '(General Data Protection Regulation)'), datatype=XSD.string)))
# The preamble gets its own resource so that it can cite the first three
# citations; it is typed with the same subdivision class (LRS) that every
# other part of the text uses.
gdpr_description = GDPR['description']
graph.add((gdpr_description, RDF.type, LRS))
graph.add((gdpr_description, DCTERMS.description, Literal((
    'THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION, '
    'Having regard to the Treaty on the Functioning of the European Union, '
    'and in particular Article 16 thereof, Having regard to the proposal from '
    'the European Commission, After transmission of the draft legislative act '
    'to the national parliaments, '
    'Having regard to the opinion of the European Economic '
    'and Social Committee, '
    'Having regard to the opinion of the Committee of the Regions, '
    'Acting in accordance with the ordinary legislative procedure,'
    ),  datatype=XSD.string)))
graph.add((gdpr_description, ELI.cites, GDPR['citation1']))
graph.add((gdpr_description, ELI.cites, GDPR['citation2']))
graph.add((gdpr_description, ELI.cites, GDPR['citation3']))
graph.add((gdpr, DCTERMS.description, gdpr_description))
# Dates are recorded twice where both vocabularies have a matching property
graph.add((gdpr, ELI.date_document, Literal('2016-04-27', datatype=XSD.date)))
graph.add((gdpr, DCTERMS.date, Literal('2016-04-27', datatype=XSD.date)))
graph.add((gdpr, ELI.date_publication, Literal(
    '2016-05-04', datatype=XSD.date)))
graph.add((gdpr, DCTERMS.issued, Literal('2016-05-04', datatype=XSD.date)))
# NOTE(review): ELI appears to model in_force as a status concept and to
# offer first_date_entry_in_force for the date itself -- confirm which
# predicate is intended before relying on this triple.
graph.add((gdpr, ELI.in_force, Literal(
    '2016-05-24', datatype=XSD.date)))
graph.add((gdpr, ELI.date_applicability, Literal(
    '2018-05-25', datatype=XSD.date)))


# Terms of the GDPRtEXT namespace used as rdf:type for each kind of resource
class_Chapter = GDPRtEXT.Chapter
class_Section = GDPRtEXT.Section
class_Article = GDPRtEXT.Article
class_Point = GDPRtEXT.Point
class_SubPoint = GDPRtEXT.SubPoint
class_Recital = GDPRtEXT.Recital
class_Citation = GDPRtEXT.Citation
# Properties pointing from a resource up to the container it belongs to
property_partof_chapter = GDPRtEXT.isPartOfChapter
property_partof_section = GDPRtEXT.isPartOfSection
property_partof_article = GDPRtEXT.isPartOfArticle
property_partof_point = GDPRtEXT.isPartOfPoint
# Properties pointing from a container down to the resources it holds
property_has_chapter = GDPRtEXT.hasChapter
property_has_section = GDPRtEXT.hasSection
property_has_article = GDPRtEXT.hasArticle
property_has_point = GDPRtEXT.hasPoint
property_has_subpoint = GDPRtEXT.hasSubPoint

#############################################################################

# The chapters are in json.chapters as an array of dicts
# { number, title, contents }
# A chapter may contain sections, and if it does, then each section has the
# same structure as well.
# If a chapter has sections, then each section, else the chapter itself, has
# several articles with the same structure.
# Each article has several points, which may or may not be numbered.
# Each point may have several subpoints, which may or may not be numbered.


def graph_subpoint(
        subpoint, article_number, point_number,
        point, article, section=None, chapter=None):
    '''Adds a subpoint to the graph and links it to all of its containers.

    subpoint: dict read through its 'number' and 'text' keys
    article_number: number of the enclosing article; it is concatenated to a
        string below, so it has to be a string itself
    point_number: number of the enclosing point
    point: node of the enclosing point
    article: node of the enclosing article
    section: node of the enclosing section, or None when there is none
    chapter: node of the enclosing chapter, or None when there is none
    '''
    # print('SP', subpoint['number'])
    node_subpoint = GDPR['article{}-{}-{}'.format(
        article_number, point_number, subpoint['number'])]
    graph.add((node_subpoint, NOS, Literal(
        subpoint['number'], datatype=XSD.string)))
    # NOTE(review): unlike the article title ('Article ' + number) there is
    # no space after 'Article' here -- confirm whether that is intended.
    graph.add((node_subpoint, TITLE_ALT, Literal(
        'Article' + article_number + '({})({})'.format(
            point_number, subpoint['number']),
        datatype=XSD.string)))
    graph.add((node_subpoint, RDF.type, LRS))
    graph.add((node_subpoint, RDF.type, class_SubPoint))
    # chapter is optional in the signature, and None is not a valid term of
    # a triple, so the chapter links are only written when one was given
    if chapter is not None:
        graph.add((node_subpoint, PART_OF, chapter))
        graph.add((node_subpoint, property_partof_chapter, chapter))
    if section is not None:
        graph.add((node_subpoint, PART_OF, section))
        graph.add((node_subpoint, property_partof_section, section))
    graph.add((node_subpoint, PART_OF, article))
    graph.add((node_subpoint, property_partof_article, article))
    graph.add((node_subpoint, PART_OF, point))
    graph.add((node_subpoint, property_partof_point, point))
    graph.add((point, property_has_subpoint, node_subpoint))
    graph.add((node_subpoint, PART_OF, gdpr))
    graph.add((node_subpoint, DESC, Literal(
        subpoint['text'], datatype=XSD.string)))


def graph_point(point, article_number, article, section=None, chapter=None):
    '''Adds a point, and every subpoint nested in it, to the graph.

    point: dict read through its 'number', 'text' and 'subpoints' keys
    article_number: number of the enclosing article; it is concatenated to a
        string below, so it has to be a string itself
    article: node of the enclosing article
    section: node of the enclosing section, or None when there is none
    chapter: node of the enclosing chapter, or None when there is none

    Subpoints whose 'number' is None are numbered in place (the dict is
    modified) before they are handed to graph_subpoint.
    '''
    # print('P', point['number'])
    node_point = GDPR['article{}-{}'.format(
        article_number, point['number'])]
    graph.add((node_point, NOS, Literal(
        point['number'], datatype=XSD.string)))
    # NOTE(review): unlike the article title ('Article ' + number) there is
    # no space after 'Article' here -- confirm whether that is intended.
    graph.add((node_point, TITLE_ALT, Literal(
        'Article' + article_number + '({})'.format(point['number']),
        datatype=XSD.string)))
    graph.add((node_point, RDF.type, LRS))
    graph.add((node_point, RDF.type, class_Point))
    # chapter is optional in the signature, and None is not a valid term of
    # a triple, so the chapter links are only written when one was given
    if chapter is not None:
        graph.add((node_point, PART_OF, chapter))
        graph.add((node_point, property_partof_chapter, chapter))
    if section is not None:
        graph.add((node_point, PART_OF, section))
        graph.add((node_point, property_partof_section, section))
    graph.add((node_point, PART_OF, article))
    graph.add((node_point, property_partof_article, article))
    graph.add((article, property_has_point, node_point))
    graph.add((node_point, PART_OF, gdpr))
    graph.add((node_point, DESC, Literal(
        point['text'], datatype=XSD.string)))
    # subpoint number to be used only when they are un-numbered
    # NOTE(review): the counter ignores explicit numbers, so an un-numbered
    # subpoint could be given the number of a numbered sibling -- confirm
    # that the input never mixes the two within one point.
    subpoint_nos = 1
    for subpoint in point['subpoints']:
        if subpoint['number'] is None:
            subpoint['number'] = subpoint_nos
            subpoint_nos += 1
        graph_subpoint(
                subpoint, article_number, point['number'],
                node_point, article, section, chapter)


def graph_article(article, section=None, chapter=None):
    '''Adds an article, and every point nested in it, to the graph.

    article: dict read through its 'number' and 'contents' keys; 'number' is
        concatenated to a string below, so it has to be a string itself
    section: node of the enclosing section, or None when there is none
    chapter: node of the enclosing chapter, or None when there is none

    Points whose 'number' is None are numbered in place (the dict is
    modified) before they are handed to graph_point.
    '''
    # print('A', article['number'])
    node_article = GDPR['article{}'.format(article['number'])]
    graph.add((node_article, RDF.type, LRS))
    graph.add((node_article, RDF.type, class_Article))
    graph.add((node_article, NOS, Literal(
        article['number'], datatype=XSD.string)))
    graph.add((node_article, TITLE_ALT, Literal(
        'Article ' + article['number'], datatype=XSD.string)))
    # chapter is optional in the signature, and None is not a valid term of
    # a triple, so the chapter links are only written when one was given
    if chapter is not None:
        graph.add((node_article, PART_OF, chapter))
        graph.add((node_article, property_partof_chapter, chapter))
        graph.add((chapter, property_has_article, node_article))
    graph.add((gdpr, property_has_article, node_article))
    graph.add((node_article, PART_OF, gdpr))
    if section is not None:
        graph.add((node_article, PART_OF, section))
        graph.add((node_article, property_partof_section, section))
        graph.add((section, property_has_article, node_article))
    # point number for when points are unnumbered
    # NOTE(review): the counter ignores explicit numbers, so an unnumbered
    # point could be given the number of a numbered sibling -- confirm that
    # the input never mixes the two within one article.
    point_nos = 1
    for point in article['contents']:
        if point['number'] is None:
            point['number'] = point_nos
            point_nos += 1
        graph_point(point, article['number'], node_article, section, chapter)


def graph_section(section, chapter, chapter_number):
    '''Registers one section of a chapter, then descends into its articles.

    section: dict read through its 'number', 'title' and 'contents' keys
    chapter: node of the chapter that holds the section
    chapter_number: chapter number, used as a prefix of the section's name
    '''
    section_number = section['number']
    section_node = GDPR['chapter{}-{}'.format(chapter_number, section_number)]

    def as_string(value):
        # every literal written for a section is typed as xsd:string
        return Literal(value, datatype=XSD.string)

    emit = graph.add
    # what the section is
    emit((section_node, RDF.type, LRS))
    emit((section_node, RDF.type, class_Section))
    emit((section_node, TITLE, as_string(section['title'])))
    emit((section_node, NOS, as_string(section_number)))
    emit((section_node, TITLE_ALT, as_string('Section ' + section_number)))
    # where the section sits
    emit((section_node, PART_OF, chapter))
    emit((section_node, PART_OF, gdpr))
    emit((section_node, property_partof_chapter, chapter))
    emit((chapter, property_has_section, section_node))
    # everything listed in a section is handled as an article
    for entry in section['contents']:
        graph_article(entry, section_node, chapter)


def graph_chapter(chapter):
    '''Adds a chapter, and everything nested below it, to the graph.

    chapter: dict read through its 'number', 'title' and 'contents' keys;
        'number' is concatenated to a string below, so it has to be a
        string itself. The items of 'contents' are read through their
        'type' key to tell sections from articles.
    '''
    # print('C', chapter['number'], chapter['title'])
    node_chapter = GDPR['chapter{}'.format(chapter['number'])]
    graph.add((node_chapter, RDF.type, LRS))
    graph.add((node_chapter, RDF.type, class_Chapter))
    graph.add((node_chapter, TITLE, Literal(
        chapter['title'], datatype=XSD.string)))
    graph.add((node_chapter, NOS, Literal(
        chapter['number'], datatype=XSD.string)))
    graph.add((node_chapter, TITLE_ALT, Literal(
        'Chapter ' + chapter['number'], datatype=XSD.string)))
    graph.add((node_chapter, PART_OF, gdpr))
    graph.add((gdpr, property_has_chapter, node_chapter))

    contents = chapter['contents']
    # Section (if any)
    # The first item decides how the whole list is read. A chapter without
    # contents has no first item to inspect, so it is checked for first and
    # then simply contributes nothing further.
    if contents and contents[0]['type'] == 'section':
        for item in contents:
            graph_section(item, node_chapter, chapter['number'])
    else:
        for item in contents:
            graph_article(item, None, node_chapter)


# Chapters, together with whatever graph_chapter nests below them
for chapter in gdpr_json['chapters']:
    graph_chapter(chapter)


# Recitals: typed, numbered, given their text and tied to the regulation
for recital in gdpr_json['recitals']:
    node_recital = GDPR['recital{}'.format(recital['number'])]
    for _triple in (
            (node_recital, RDF.type, LRS),
            (node_recital, RDF.type, class_Recital),
            (node_recital, NOS,
                Literal(recital['number'], datatype=XSD.string)),
            (node_recital, DESC,
                Literal(recital['text'], datatype=XSD.string)),
            (node_recital, PART_OF, gdpr),
            (gdpr, HAS_RECITAL, node_recital)):
        graph.add(_triple)


# Citations: only the values of the mapping are used; every citation is
# recorded as cited by the regulation itself
for citation in gdpr_json['citations'].values():
    node_citation = GDPR['citation{}'.format(citation['number'])]
    for _triple in (
            (node_citation, RDF.type, LRS),
            (node_citation, RDF.type, class_Citation),
            (node_citation, NOS,
                Literal(citation['number'], datatype=XSD.string)),
            (node_citation, DESC,
                Literal(citation['text'], datatype=XSD.string)),
            (node_citation, PART_OF, gdpr),
            (gdpr, ELI.cites, node_citation)):
        graph.add(_triple)


# Add citations
# Hand-maintained table: (resource that cites, citation being cited), both
# given as local names in the gdpr namespace
_CITATION_LINKS = (
    ('recital3', 'citation4'),
    ('recital13', 'citation5'),
    ('recital17', 'citation6'),
    ('recital19', 'citation7'),
    ('recital21', 'citation8'),
    ('recital35', 'citation9'),
    ('recital42', 'citation10'),
    ('recital54', 'citation11'),
    ('recital106', 'citation12'),
    ('recital147', 'citation13'),
    ('recital154', 'citation14'),
    ('recital161', 'citation15'),
    ('recital163', 'citation16'),
    ('recital172', 'citation17'),
    ('recital173', 'citation18'),
    ('article4-25', 'citation19'),
    ('article43-1-2', 'citation20'),
    ('article76-2', 'citation21'),
)
for _citing, _cited in _CITATION_LINKS:
    graph.add((GDPR[_citing], ELI.cites, GDPR[_cited]))

# Serialize
# One output file per format, written in the order listed
_OUTPUTS = (
    ('../deliverables/gdpr.ttl', 'turtle'),
    ('../deliverables/gdpr.rdf', 'pretty-xml'),
    ('../deliverables/gdpr.n3', 'n3'),
    ('../deliverables/gdpr.nt', 'nt'),
    ('../deliverables/gdpr.jsonld', 'json-ld'),
)
for _destination, _rdf_format in _OUTPUTS:
    graph.serialize(destination=_destination, format=_rdf_format)