#!/usr/bin/env python3
# harsh/GDPRtEXT

# author: Harshvardhan Pandit

# Generates RDF graph for GDPR text from JSON (gdpr.json)
# The JSON file is in ../deliverables/gdpr.json
# The algorithm is roughly as follows:

# For every chapter, section, article, point and subpoint, it defines the
# following attributes using the ELI vocabulary
# @prefix ELI https://publications.europa.eu/en/mdr/resource/eli/eli.owl
#
# - ELI:title (xsd:string) title of the item
# - ELI:number (xsd:string) number of the item in text
# - ELI:title_alternative (xsd:string) an alternative representation of the
#       item as <item_type> followed by <item_number>; e.g. Chapter 1
# - ELI:description (xsd:string) text of the item
# - ELI:is_part_of (ELI:LegalResource) gdpr:GDPR
#       here, this is an instance of a LegalResource created to represent
#       the GDPR, and is present in another OWL file, which is being edited
#       in Protege. The reference does not need to be resolved.
# - ELI:id_local (xsd:string) the ID of the resource in the HTML file
#       This is a bungling of the proper use of the vocabulary, but it keeps
#       all terms within the intended usage. The same ID can be used to
#       lookup the resource in the HTML file (so it is the ID attribute of
#       the resource).
#
# This script generates the RDF pairings for the text of the GDPR. The bulk
# of the work and the part where the GDPR itself is referenced is actually
# done using Protege, and does not need the use of a script, such as this,
# since it is manual labour.

##############################################################################

# Load the JSON from file
import json

# Open in binary mode so that the json module decodes the bytes itself
# (it detects UTF-8/16/32) instead of relying on the platform's default
# text encoding, which is not UTF-8 on every system.
with open('../deliverables/gdpr.json', 'rb') as fd:
    gdpr_json = json.load(fd)

from rdflib import BNode, Graph, Literal, Namespace, RDF, XSD

# Single graph that collects the triples as they are being generated
graph = Graph()

# ELI namespace: the legal vocabulary used for EU text
ELI = Namespace("http://data.europa.eu/eli/ontology#")
# Project namespace for the GDPR resources
# NOTE: this is temporary
GDPR = Namespace("http://www.semanticweb.org/harsh/ontologies/GDPR#")

# Class assigned to every generated node
LRS = ELI.LegalResourceSubdivision

# ELI properties, bound to short names for the graph_* functions below
title = ELI.title
title_alternative = ELI.title_alternative
number = ELI.number
description = ELI.description
is_part_of = ELI.is_part_of

# Resource that every generated node is declared to be a part of
node_gdpr = GDPR.GDPR

##############################################################################

# The chapters are in json.chapters as an array of dicts
# { number, title, contents }
# A chapter may contain sections, and if it does, then it has the same
# structure as well.
# If a chapter has sections, then the section, else the chapter itself, has
# several articles with the same structure.
# Each article has several points, which may or may not be numbered.
# Each point may have several subpoints, which may or may not be numbered.


def graph_subpoint(
        subpoint, article_number, point_number,
        point, article, section=None, chapter=None):
    '''Adds a subpoint to the graph.

    subpoint: dict read for its 'number' (may be None) and 'text' keys
    article_number: number of the enclosing article
    point_number: number of the enclosing point, or None when that point
        is unnumbered
    point, article: graph nodes of the enclosing point and article
    section, chapter: graph nodes of the enclosing section and chapter;
        each one is linked only when it is not None

    A numbered subpoint becomes a named resource in the GDPR namespace with
    its number and an alternative title; an unnumbered one becomes a blank
    node. Either way the node is typed, linked to its parents and given the
    subpoint text as description.
    '''
    if subpoint['number'] is not None:
        # An unnumbered parent point contributes nothing to the identifier;
        # formatting None directly would put the text "None" in the IRI.
        point_prefix = '' if point_number is None else point_number
        local_id = '{}{}'.format(point_prefix, subpoint['number'])
        node_subpoint = GDPR['article{}-{}'.format(article_number, local_id)]
        graph.add((node_subpoint, number, Literal(
            subpoint['number'], datatype=XSD.string)))
        # Same "Article <n>" spelling (with a space) as graph_article uses
        graph.add((node_subpoint, title_alternative, Literal(
            'Article {}({})'.format(article_number, local_id),
            datatype=XSD.string)))
    else:
        node_subpoint = BNode()
    graph.add((node_subpoint, RDF.type, LRS))
    graph.add((node_subpoint, is_part_of, node_gdpr))
    # chapter and section both default to None, which is not a valid triple
    # object, so only link the ones that were actually supplied
    if chapter is not None:
        graph.add((node_subpoint, is_part_of, chapter))
    if section is not None:
        graph.add((node_subpoint, is_part_of, section))
    graph.add((node_subpoint, is_part_of, article))
    graph.add((node_subpoint, is_part_of, point))
    graph.add((node_subpoint, description, Literal(
        subpoint['text'], datatype=XSD.string)))


def graph_point(point, article_number, article, section=None, chapter=None):
    '''Adds a point of an article, and all of its subpoints, to the graph.

    point: dict read for its 'number' (may be None), 'text' and 'subpoints'
        keys
    article_number: number of the enclosing article
    article: graph node of the enclosing article
    section, chapter: graph nodes of the enclosing section and chapter;
        each one is linked only when it is not None

    A numbered point becomes a named resource in the GDPR namespace with its
    number and an alternative title; an unnumbered one becomes a blank node.
    '''
    if point['number'] is not None:
        node_point = GDPR['article{}-{}'.format(
            article_number, point['number'])]
        graph.add((node_point, number, Literal(
            point['number'], datatype=XSD.string)))
        # Same "Article <n>" spelling (with a space) as graph_article uses
        graph.add((node_point, title_alternative, Literal(
            'Article {}({})'.format(article_number, point['number']),
            datatype=XSD.string)))
    else:
        node_point = BNode()
    graph.add((node_point, RDF.type, LRS))
    graph.add((node_point, is_part_of, node_gdpr))
    # chapter and section both default to None, which is not a valid triple
    # object, so only link the ones that were actually supplied
    if chapter is not None:
        graph.add((node_point, is_part_of, chapter))
    if section is not None:
        graph.add((node_point, is_part_of, section))
    graph.add((node_point, is_part_of, article))
    graph.add((node_point, description, Literal(
        point['text'], datatype=XSD.string)))
    # point['number'] is handed down as-is (possibly None) so the subpoint
    # can build its own identifier from it
    for subpoint in point['subpoints']:
        graph_subpoint(
                subpoint, article_number, point['number'],
                node_point, article, section, chapter)


def graph_article(article, section=None, chapter=None):
    '''Adds an article, and all of its points, to the graph.

    article: dict read for its 'number' and 'contents' keys; 'number' is
        concatenated to a string, so it has to be a string itself
    section, chapter: graph nodes of the enclosing section and chapter;
        each one is linked only when it is not None
    '''
    node_article = GDPR['article{}'.format(article['number'])]
    graph.add((node_article, RDF.type, LRS))
    graph.add((node_article, number, Literal(
        article['number'], datatype=XSD.string)))
    graph.add((node_article, title_alternative, Literal(
        'Article ' + article['number'], datatype=XSD.string)))
    graph.add((node_article, is_part_of, node_gdpr))
    # chapter and section both default to None, which is not a valid triple
    # object, so only link the ones that were actually supplied
    if chapter is not None:
        graph.add((node_article, is_part_of, chapter))
    if section is not None:
        graph.add((node_article, is_part_of, section))
    for point in article['contents']:
        graph_point(point, article['number'], node_article, section, chapter)


def graph_section(section, chapter):
    '''Adds a section, and every article it contains, to the graph.

    section: dict read for its 'number', 'title' and 'contents' keys
    chapter: graph node of the chapter the section belongs to
    '''
    sec_node = GDPR['section{}'.format(section['number'])]
    graph.add((sec_node, RDF.type, LRS))

    # String-valued attributes of the section, all typed as xsd:string
    string_attributes = (
        (title, section['title']),
        (number, section['number']),
        (title_alternative, 'Section ' + section['number']),
    )
    for predicate, text in string_attributes:
        graph.add((sec_node, predicate, Literal(text, datatype=XSD.string)))

    # The section is part of both the regulation and its chapter
    for parent in (node_gdpr, chapter):
        graph.add((sec_node, is_part_of, parent))

    for entry in section['contents']:
        graph_article(entry, node_section := sec_node, chapter) \
            if False else graph_article(entry, sec_node, chapter)


def graph_chapter(chapter):
    '''Adds a chapter, and everything it contains, to the graph.

    chapter: dict read for its 'number', 'title' and 'contents' keys;
        'contents' is a list whose items are either all sections or all
        articles, which is decided by the 'type' key of the first item
    '''
    node_chapter = GDPR['chapter{}'.format(chapter['number'])]
    graph.add((node_chapter, RDF.type, LRS))
    graph.add((node_chapter, title, Literal(
        chapter['title'], datatype=XSD.string)))
    graph.add((node_chapter, number, Literal(
        chapter['number'], datatype=XSD.string)))
    graph.add((node_chapter, title_alternative, Literal(
        'Chapter ' + chapter['number'], datatype=XSD.string)))
    graph.add((node_chapter, is_part_of, node_gdpr))

    contents = chapter['contents']
    if not contents:
        # Nothing below the chapter node; looking at contents[0] would
        # raise IndexError
        return
    # Section (if any)
    if contents[0]['type'] == 'section':
        for item in contents:
            graph_section(item, node_chapter)
    else:
        # Articles sit directly under the chapter, without a section
        for item in contents:
            graph_article(item, None, node_chapter)


# Build the triples chapter by chapter; each call also handles everything
# nested inside that chapter
for chapter in gdpr_json['chapters']:
    graph_chapter(chapter)

# Write the finished graph as Turtle into the current working directory
graph.serialize(format='turtle', destination='gdpr.ttl')