#!/usr/bin/env python3
# generate_rdf_pairings.py
# author: Harshvardhan Pandit
# Generates RDF graph for GDPR text from JSON (gdpr.json)
# The JSON file is in ../deliverables/gdpr.json
# The algorithm is roughly as follows:
# For every chapter, article, subpoint, et al., it defines the following
# attributes using the ELI vocabulary
# @prefix ELI https://publications.europa.eu/en/mdr/resource/eli/eli.owl
#
# - ELI:title (xsd:string) title of the item
# - ELI:number (xsd:string) number of the item in text
# - ELI:title_alternative (xsd:string) an alternative representation of the
#   item as <item_type> followed by <item_number>; e.g. Chapter 1
# - ELI:description (xsd:string) text of the item
# - ELI:is_part_of (ELI:LegalResource) gdpr:GDPR
#   here, this is an instance of a LegalResource created to represent
#   the GDPR, and is present in another OWL file, which is being edited
#   in Protege. The reference does not need to be resolved.
# - ELI:id_local (xsd:string) the ID of the resource in the HTML file
#   This is a bungling of the correct use of the vocabulary, but it keeps
#   all terms within the intended usage. The same ID can be used to
#   look up the resource in the HTML file (so it is the ID attribute of
#   the resource).
#
# This script generates the RDF pairings for the text of the GDPR. The bulk
# of the work and the part where the GDPR itself is referenced is actually
# done using Protege, and does not need the use of a script, such as this,
# since it is manual labour.
##############################################################################
  30. from rdflib import Graph, RDF, RDFS, XSD, Literal, URIRef, BNode
  31. from rdflib import Namespace
  32. # Load the JSON from file
  33. with open('../deliverables/gdpr.json') as fd:
  34. import json
  35. gdpr_json = json.load(fd)
# import pprint
# pprint.pprint(gdpr_json)
  38. # This will be the graph used to hold the triples as they are being generated
  39. graph = Graph()
  40. GDPRtEXT_URI = URIRef('http://purl.org/adaptcentre/ontologies/GDPRtEXT#')
  41. GDPRtEXT = Namespace(GDPRtEXT_URI)
  42. graph.namespace_manager.bind('GDPRtext', GDPRtEXT)
  43. DCTERMS = Namespace('http://purl.org/dc/terms/')
  44. graph.namespace_manager.bind('dcterms', DCTERMS)
  45. # This is the ELI namespace, used as the legal vocabulary for EU text
  46. ELI = Namespace("http://data.europa.eu/eli/ontology#")
  47. graph.namespace_manager.bind('eli', ELI)
  48. # bind ELI items to names that are easier to access (argumentative)
  49. LRS = ELI.LegalResourceSubdivision
  50. TITLE = ELI.title
  51. NOS = ELI.number
  52. TITLE_ALT = ELI.title_alternative
  53. PART_OF = ELI.is_part_of
  54. # graph.add((PART_OF, RDF.type, OWL.TransitiveProperty))
  55. DESC = ELI.description
  56. ##############################################################################
  57. # GDPR as named individual
  58. # base URI
  59. GDPR_URI = URIRef('http://purl.org/adaptcentre/resources/GDPRtEXT#')
  60. GDPR = Namespace(GDPR_URI)
  61. graph.namespace_manager.bind('gdpr', GDPR)
  62. gdpr = GDPR.GDPR
  63. # graph.add((gdpr, RDF.type, OWL.NamedIndividual))
  64. graph.add((gdpr, RDF.type, ELI.LR))
  65. graph.add((gdpr, RDF.type, DCTERMS.Policy))
  66. graph.add((gdpr, DCTERMS.identifier, Literal(
  67. '2016/679', datatype=XSD.string)))
  68. graph.add((gdpr, DCTERMS.language, Literal(
  69. 'English', datatype=XSD.string)))
  70. graph.add((gdpr, DCTERMS.publisher, Literal(
  71. 'Official Journal of the European Union', datatype=XSD.string)))
  72. graph.add((gdpr, DCTERMS.source, Literal(
  73. 'http://eur-lex.europa.eu/eli/reg/2016/679/oj', datatype=XSD.string)))
  74. graph.add((gdpr, DCTERMS.title, Literal(
  75. 'General Data Protection Regulation', datatype=XSD.string)))
  76. graph.add((gdpr, DCTERMS.title_alternative, Literal(
  77. 'GDPR', datatype=XSD.string)))
  78. graph.add((gdpr, DCTERMS.title_alternative, Literal(
  79. 'REGULATION (EU) 2016/679', datatype=XSD.string)))
  80. graph.add((gdpr, DCTERMS.creator, Literal(
  81. 'European Parliament', datatype=XSD.string)))
  82. graph.add((gdpr, DCTERMS.creator, Literal(
  83. 'Council of the European Union', datatype=XSD.string)))
  84. graph.add((gdpr, DCTERMS.abstract, Literal((
  85. 'REGULATION (EU) 2016/679 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL '
  86. 'of 27 April 2016 '
  87. 'on the protection of natural persons with regard to the processing of '
  88. 'personal data and on the free movement of such data, '
  89. 'and repealing Directive 95/46/EC '
  90. '(General Data Protection Regulation)'), datatype=XSD.string)))
  91. gdpr_description = BNode()
  92. graph.add((gdpr_description, RDF.type, RDF.Statement))
  93. gdpr_description1 = BNode()
  94. graph.add((gdpr_description1, RDF.type, RDF.Statement))
  95. graph.add((gdpr_description1, DCTERMS.description, Literal((
  96. 'THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION, '
  97. 'Having regard to the Treaty on the Functioning of the European Union, '
  98. 'and in particular Article 16 thereof, Having regard to the proposal from '
  99. 'the European Commission, After transmission of the draft legislative act '
  100. 'to the national parliaments,'), datatype=XSD.string)))
  101. graph.add((gdpr_description1, DCTERMS.isPartOf, gdpr_description))
  102. graph.add((gdpr_description, DCTERMS.hasPart, gdpr_description1))
  103. gdpr_description2 = BNode()
  104. graph.add((gdpr_description2, RDF.type, RDF.Statement))
  105. graph.add((gdpr_description2, DCTERMS.description, Literal((
  106. 'Having regard to the opinion of the European Economic '
  107. 'and Social Committee,'), datatype=XSD.string)))
  108. graph.add((gdpr_description2, DCTERMS.isPartOf, gdpr_description))
  109. graph.add((gdpr_description, DCTERMS.hasPart, gdpr_description2))
  110. graph.add((gdpr_description2, DCTERMS.references, GDPR['citation1']))
  111. graph.add((gdpr_description2, ELI.cites, GDPR['citation1']))
  112. gdpr_description3 = BNode()
  113. graph.add((gdpr_description3, RDF.type, RDF.Statement))
  114. graph.add((gdpr_description3, DCTERMS.description, Literal(
  115. 'Having regard to the opinion of the Committee of the Regions,',
  116. datatype=XSD.string)))
  117. graph.add((gdpr_description3, DCTERMS.isPartOf, gdpr_description))
  118. graph.add((gdpr_description, DCTERMS.hasPart, gdpr_description3))
  119. graph.add((gdpr_description3, DCTERMS.references, GDPR['citation2']))
  120. graph.add((gdpr_description3, ELI.cites, GDPR['citation2']))
  121. graph.add((gdpr, DCTERMS.description, gdpr_description))
  122. gdpr_description4 = BNode()
  123. graph.add((gdpr_description4, RDF.type, RDF.Statement))
  124. graph.add((gdpr_description4, DCTERMS.description, Literal(
  125. 'Acting in accordance with the ordinary legislative procedure,',
  126. datatype=XSD.string)))
  127. graph.add((gdpr_description4, DCTERMS.isPartOf, gdpr_description))
  128. graph.add((gdpr_description, DCTERMS.hasPart, gdpr_description4))
  129. graph.add((gdpr_description4, DCTERMS.references, GDPR['citation3']))
  130. graph.add((gdpr_description4, ELI.cites, GDPR['citation3']))
  131. graph.add((gdpr, DCTERMS.description, gdpr_description))
  132. graph.add((gdpr, ELI.date_document, Literal('2016-04-27', datatype=XSD.date)))
  133. graph.add((gdpr, DCTERMS.date, Literal('2016-04-27', datatype=XSD.date)))
  134. graph.add((gdpr, ELI.date_publication, Literal(
  135. '2016-05-04', datatype=XSD.date)))
  136. graph.add((gdpr, DCTERMS.issued, Literal('2016-05-04', datatype=XSD.date)))
  137. graph.add((gdpr, ELI.in_force, Literal(
  138. '2016-05-24', datatype=XSD.date)))
  139. graph.add((gdpr, ELI.date_applicability, Literal(
  140. '2018-05-25', datatype=XSD.date)))
  141. class_Chapter = GDPRtEXT['Chapter']
  142. class_Section = GDPRtEXT['Section']
  143. class_Article = GDPRtEXT['Article']
  144. class_Point = GDPRtEXT['Point']
  145. class_SubPoint = GDPRtEXT['SubPoint']
  146. class_Recital = GDPRtEXT['Recital']
  147. class_Citation = GDPRtEXT['Citation']
  148. property_partof_chapter = GDPRtEXT['isPartOfChapter']
  149. property_partof_section = GDPRtEXT['isPartOfSection']
  150. property_partof_article = GDPRtEXT['isPartOfArticle']
  151. property_partof_point = GDPRtEXT['isPartOfPoint']
#############################################################################
# The chapters are in json.chapters as an array of dicts
# { number, title, contents }
# A chapter may contain sections, and if it does, then each section has the
# same structure as well.
# If a chapter has sections, then the section, else the chapter itself, has
# several articles with the same structure.
# Each article has several points, which may or may not be numbered.
# Each point may have several subpoints, which may or may not be numbered.
  161. def graph_subpoint(
  162. subpoint, article_number, point_number,
  163. point, article, section=None, chapter=None):
  164. '''adds subpoint to graph'''
  165. # print('SP', subpoint['number'])
  166. node_subpoint = GDPR['article{}-{}-{}'.format(
  167. article_number, point_number, subpoint['number'])]
  168. graph.add((node_subpoint, NOS, Literal(
  169. subpoint['number'], datatype=XSD.string)))
  170. graph.add((node_subpoint, TITLE_ALT, Literal(
  171. 'Article' + article_number + '({})({})'.format(
  172. point_number, subpoint['number']),
  173. datatype=XSD.string)))
  174. graph.add((node_subpoint, RDF.type, LRS))
  175. graph.add((node_subpoint, RDF.type, class_SubPoint))
  176. graph.add((node_subpoint, PART_OF, chapter))
  177. graph.add((node_subpoint, property_partof_chapter, chapter))
  178. if section is not None:
  179. graph.add((node_subpoint, PART_OF, section))
  180. graph.add((node_subpoint, property_partof_section, section))
  181. graph.add((node_subpoint, PART_OF, article))
  182. graph.add((node_subpoint, property_partof_article, article))
  183. graph.add((node_subpoint, PART_OF, point))
  184. graph.add((node_subpoint, property_partof_point, point))
  185. graph.add((node_subpoint, PART_OF, gdpr))
  186. graph.add((node_subpoint, DESC, Literal(
  187. subpoint['text'], datatype=XSD.string)))
  188. def graph_point(point, article_number, article, section=None, chapter=None):
  189. '''adds point to graph'''
  190. # print('P', point['number'])
  191. node_point = GDPR['article{}-{}'.format(
  192. article_number, point['number'])]
  193. graph.add((node_point, NOS, Literal(
  194. point['number'], datatype=XSD.string)))
  195. graph.add((node_point, TITLE_ALT, Literal(
  196. 'Article' + article_number + '({})'.format(point['number']),
  197. datatype=XSD.string)))
  198. graph.add((node_point, RDF.type, LRS))
  199. graph.add((node_point, RDF.type, class_Point))
  200. graph.add((node_point, PART_OF, chapter))
  201. graph.add((node_point, property_partof_chapter, chapter))
  202. if section is not None:
  203. graph.add((node_point, PART_OF, section))
  204. graph.add((node_point, property_partof_section, section))
  205. graph.add((node_point, PART_OF, article))
  206. graph.add((node_point, property_partof_article, article))
  207. graph.add((node_point, PART_OF, gdpr))
  208. graph.add((node_point, DESC, Literal(
  209. point['text'], datatype=XSD.string)))
  210. # subpoint number to be used only when they are un-numbered
  211. subpoint_nos = 1
  212. for subpoint in point['subpoints']:
  213. if subpoint['number'] is None:
  214. subpoint['number'] = subpoint_nos
  215. subpoint_nos += 1
  216. graph_subpoint(
  217. subpoint, article_number, point['number'],
  218. node_point, article, section, chapter)
  219. def graph_article(article, section=None, chapter=None):
  220. '''adds article to graph'''
  221. # print('A', article['number'])
  222. node_article = GDPR['article{}'.format(article['number'])]
  223. graph.add((node_article, RDF.type, LRS))
  224. graph.add((node_article, RDF.type, class_Article))
  225. graph.add((node_article, NOS, Literal(
  226. article['number'], datatype=XSD.string)))
  227. graph.add((node_article, TITLE_ALT, Literal(
  228. 'Article ' + article['number'], datatype=XSD.string)))
  229. graph.add((node_article, PART_OF, chapter))
  230. graph.add((node_article, property_partof_chapter, chapter))
  231. graph.add((node_article, PART_OF, gdpr))
  232. if section is not None:
  233. graph.add((node_article, PART_OF, section))
  234. graph.add((node_article, property_partof_section, section))
  235. # point number for when points are unnumbered
  236. point_nos = 1
  237. for point in article['contents']:
  238. if point['number'] is None:
  239. point['number'] = point_nos
  240. point_nos += 1
  241. graph_point(point, article['number'], node_article, section, chapter)
  242. def graph_section(section, chapter, chapter_number):
  243. '''adds section to graph'''
  244. # print('S', section['number'], section['title'])
  245. node_section = GDPR[
  246. 'chapter{}-{}'.format(chapter_number, section['number'])]
  247. graph.add((node_section, RDF.type, LRS))
  248. graph.add((node_section, RDF.type, class_Section))
  249. graph.add((node_section, TITLE, Literal(
  250. section['title'], datatype=XSD.string)))
  251. graph.add((node_section, NOS, Literal(
  252. section['number'], datatype=XSD.string)))
  253. graph.add((node_section, TITLE_ALT, Literal(
  254. 'Section ' + section['number'], datatype=XSD.string)))
  255. graph.add((node_section, PART_OF, chapter))
  256. graph.add((node_section, PART_OF, gdpr))
  257. graph.add((node_section, property_partof_chapter, chapter))
  258. for article in section['contents']:
  259. graph_article(article, node_section, chapter)
  260. def graph_chapter(chapter):
  261. '''adds chapter to graph'''
  262. # print('C', chapter['number'], chapter['title'])
  263. node_chapter = GDPR['chapter{}'.format(chapter['number'])]
  264. graph.add((node_chapter, RDF.type, LRS))
  265. graph.add((node_chapter, RDF.type, class_Chapter))
  266. graph.add((node_chapter, TITLE, Literal(
  267. chapter['title'], datatype=XSD.string)))
  268. graph.add((node_chapter, NOS, Literal(
  269. chapter['number'], datatype=XSD.string)))
  270. graph.add((node_chapter, TITLE_ALT, Literal(
  271. 'Chapter ' + chapter['number'], datatype=XSD.string)))
  272. graph.add((node_chapter, PART_OF, gdpr))
  273. contents = chapter['contents']
  274. # Section (if any)
  275. if contents[0]['type'] == 'section':
  276. for item in contents:
  277. graph_section(item, node_chapter, chapter['number'])
  278. else:
  279. for item in contents:
  280. graph_article(item, None, node_chapter)
  281. for chapter in gdpr_json['chapters']:
  282. graph_chapter(chapter)
  283. for recital in gdpr_json['recitals']:
  284. node_recital = GDPR['recital{}'.format(recital['number'])]
  285. graph.add((node_recital, RDF.type, LRS))
  286. graph.add((node_recital, RDF.type, class_Recital))
  287. graph.add((node_recital, NOS, Literal(
  288. recital['number'], datatype=XSD.string)))
  289. graph.add((node_recital, DESC, Literal(
  290. recital['text'], datatype=XSD.string)))
  291. graph.add((node_recital, PART_OF, gdpr))
  292. for citation in gdpr_json['citations'].values():
  293. node_citation = GDPR['citation{}'.format(citation['number'])]
  294. graph.add((node_citation, RDF.type, LRS))
  295. graph.add((node_citation, RDF.type, class_Citation))
  296. graph.add((node_citation, NOS, Literal(
  297. citation['number'], datatype=XSD.string)))
  298. graph.add((node_citation, DESC, Literal(
  299. citation['text'], datatype=XSD.string)))
  300. graph.add((node_citation, PART_OF, gdpr))
  301. # Add citations
  302. graph.add((GDPR['recital3'], ELI.cites, GDPR['citation4']))
  303. graph.add((GDPR['recital13'], ELI.cites, GDPR['citation5']))
  304. graph.add((GDPR['recital17'], ELI.cites, GDPR['citation6']))
  305. graph.add((GDPR['recital19'], ELI.cites, GDPR['citation7']))
  306. graph.add((GDPR['recital21'], ELI.cites, GDPR['citation8']))
  307. graph.add((GDPR['recital35'], ELI.cites, GDPR['citation9']))
  308. graph.add((GDPR['recital42'], ELI.cites, GDPR['citation10']))
  309. graph.add((GDPR['recital54'], ELI.cites, GDPR['citation11']))
  310. graph.add((GDPR['recital106'], ELI.cites, GDPR['citation12']))
  311. graph.add((GDPR['recital147'], ELI.cites, GDPR['citation13']))
  312. graph.add((GDPR['recital154'], ELI.cites, GDPR['citation14']))
  313. graph.add((GDPR['recital161'], ELI.cites, GDPR['citation15']))
  314. graph.add((GDPR['recital163'], ELI.cites, GDPR['citation16']))
  315. graph.add((GDPR['recital172'], ELI.cites, GDPR['citation17']))
  316. graph.add((GDPR['recital173'], ELI.cites, GDPR['citation18']))
  317. graph.add((GDPR['article4-25'], ELI.cites, GDPR['citation19']))
  318. graph.add((GDPR['article43-1-2'], ELI.cites, GDPR['citation20']))
  319. graph.add((GDPR['article76-2'], ELI.cites, GDPR['citation21']))
  320. # Serialize
  321. graph.serialize(destination='../deliverables/gdpr.ttl', format='turtle')
  322. graph.serialize(destination='../deliverables/gdpr.rdf', format='pretty-xml')
  323. graph.serialize(destination='../deliverables/gdpr.n3', format='n3')
  324. graph.serialize(destination='../deliverables/gdpr.nt', format='nt')
  325. graph.serialize(destination='../deliverables/gdpr.jsonld', format='json-ld')