generate_rdf_pairings.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346
  1. #!/usr/bin/env python3
  2. # author: Harshvardhan Pandit
  3. # Generates RDF graph for GDPR text from JSON (gdpr.json)
  4. # The JSON file is in ../deliverables/gdpr.json
  5. # The algorithm is roughly as follows:
# For every chapter, section, article, point and sub-point it defines the
# following attributes using the ELI vocabulary
  8. # @prefix ELI https://publications.europa.eu/en/mdr/resource/eli/eli.owl
  9. #
  10. # - ELI:title (xsd:string) title of the item
  11. # - ELI:number (xsd:string) number of the item in text
# - ELI:title_alternative (xsd:string) an alternative representation of
#   the item as <item_type> followed by <item_number>; e.g. Chapter 1
# - ELI:description (xsd:string) text of the item
# - ELI:is_part_of (ELI:LegalResource) gdpr:GDPR
#   Here, gdpr:GDPR is an instance of LegalResource created to represent
#   the GDPR. It is present in another OWL file, which is being edited
#   in Protege. The reference does not need to be resolved.
  19. # - ELI:id_local (xsd:string) the ID of the resource in the HTML file
  20. # This is a bungling of the correct use of the vocabulary, but it keeps
  21. # all terms within the intended usage. The same ID can be used to
  22. # lookup the resource in the HTML file (so it is the ID attribute of
  23. # the resource).
  24. #
# This script generates the RDF pairings for the text of the GDPR. The bulk
# of the work, including the part where the GDPR itself is referenced, is
# done by hand in Protege; that part is manual labour and does not need a
# script such as this one.
  29. ##############################################################################
  30. from rdflib import Graph, RDF, RDFS, XSD, Literal, URIRef, BNode
  31. from rdflib import Namespace
  32. # Load the JSON from file
  33. with open('../deliverables/gdpr.json') as fd:
  34. import json
  35. gdpr_json = json.load(fd)
  36. # import p# print
  37. # p# print.p# print(gdpr_json)
  38. # This will be the graph used to hold the triples as they are being generated
  39. graph = Graph()
  40. GDPRtEXT_URI = URIRef('http://purl.org/adaptcentre/ontologies/GDPRtEXT#')
  41. GDPRtEXT = Namespace(GDPRtEXT_URI)
  42. graph.namespace_manager.bind('GDPRtEXT', GDPRtEXT)
  43. DCTERMS = Namespace('http://purl.org/dc/terms/')
  44. graph.namespace_manager.bind('dcterms', DCTERMS)
  45. # This is the ELI namespace, used as the legal vocabulary for EU text
  46. ELI = Namespace("http://data.europa.eu/eli/ontology#")
  47. graph.namespace_manager.bind('eli', ELI)
  48. # bind ELI items to names that are easier to access (argumentative)
  49. LRS = ELI.LegalResourceSubdivision
  50. TITLE = ELI.title
  51. NOS = ELI.number
  52. TITLE_ALT = ELI.title_alternative
  53. PART_OF = ELI.is_part_of
  54. # graph.add((PART_OF, RDF.type, OWL.TransitiveProperty))
  55. DESC = ELI.description
  56. HAS_RECITAL = GDPRtEXT['hasRecital']
  57. ##############################################################################
  58. # GDPR as named individual
  59. # base URI
  60. GDPR_URI = URIRef('http://purl.org/adaptcentre/resources/GDPRtEXT#')
  61. GDPR = Namespace(GDPR_URI)
  62. graph.namespace_manager.bind('gdpr', GDPR)
  63. gdpr = GDPR.GDPR
  64. # graph.add((gdpr, RDF.type, OWL.NamedIndividual))
  65. graph.add((gdpr, RDF.type, ELI.LR))
  66. graph.add((gdpr, RDF.type, DCTERMS.Policy))
  67. graph.add((gdpr, DCTERMS.identifier, Literal(
  68. '2016/679', datatype=XSD.string)))
  69. graph.add((gdpr, DCTERMS.language, Literal(
  70. 'English', datatype=XSD.string)))
  71. graph.add((gdpr, DCTERMS.publisher, Literal(
  72. 'Official Journal of the European Union', datatype=XSD.string)))
  73. graph.add((gdpr, DCTERMS.source, Literal(
  74. 'http://eur-lex.europa.eu/eli/reg/2016/679/oj', datatype=XSD.string)))
  75. graph.add((gdpr, DCTERMS.title, Literal(
  76. 'General Data Protection Regulation', datatype=XSD.string)))
  77. graph.add((gdpr, DCTERMS.title_alternative, Literal(
  78. 'GDPR', datatype=XSD.string)))
  79. graph.add((gdpr, DCTERMS.title_alternative, Literal(
  80. 'REGULATION (EU) 2016/679', datatype=XSD.string)))
  81. graph.add((gdpr, DCTERMS.creator, Literal(
  82. 'European Parliament', datatype=XSD.string)))
  83. graph.add((gdpr, DCTERMS.creator, Literal(
  84. 'Council of the European Union', datatype=XSD.string)))
  85. graph.add((gdpr, DCTERMS.abstract, Literal((
  86. 'REGULATION (EU) 2016/679 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL '
  87. 'of 27 April 2016 '
  88. 'on the protection of natural persons with regard to the processing of '
  89. 'personal data and on the free movement of such data, '
  90. 'and repealing Directive 95/46/EC '
  91. '(General Data Protection Regulation)'), datatype=XSD.string)))
  92. gdpr_description = GDPR['description']
  93. graph.add((gdpr_description, RDF.type, ELI.LRS))
  94. graph.add((gdpr_description, DCTERMS.description, Literal((
  95. 'THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION, '
  96. 'Having regard to the Treaty on the Functioning of the European Union, '
  97. 'and in particular Article 16 thereof, Having regard to the proposal from '
  98. 'the European Commission, After transmission of the draft legislative act '
  99. 'to the national parliaments, '
  100. 'Having regard to the opinion of the European Economic '
  101. 'and Social Committee, '
  102. 'Having regard to the opinion of the Committee of the Regions, '
  103. 'Acting in accordance with the ordinary legislative procedure,'
  104. ), datatype=XSD.string)))
  105. graph.add((gdpr_description, ELI.cites, GDPR['citation1']))
  106. graph.add((gdpr_description, ELI.cites, GDPR['citation2']))
  107. graph.add((gdpr_description, ELI.cites, GDPR['citation3']))
  108. graph.add((gdpr, DCTERMS.description, gdpr_description))
  109. graph.add((gdpr, ELI.date_document, Literal('2016-04-27', datatype=XSD.date)))
  110. graph.add((gdpr, DCTERMS.date, Literal('2016-04-27', datatype=XSD.date)))
  111. graph.add((gdpr, ELI.date_publication, Literal(
  112. '2016-05-04', datatype=XSD.date)))
  113. graph.add((gdpr, DCTERMS.issued, Literal('2016-05-04', datatype=XSD.date)))
  114. graph.add((gdpr, ELI.in_force, Literal(
  115. '2016-05-24', datatype=XSD.date)))
  116. graph.add((gdpr, ELI.date_applicability, Literal(
  117. '2018-05-25', datatype=XSD.date)))
  118. class_Chapter = GDPRtEXT['Chapter']
  119. class_Section = GDPRtEXT['Section']
  120. class_Article = GDPRtEXT['Article']
  121. class_Point = GDPRtEXT['Point']
  122. class_SubPoint = GDPRtEXT['SubPoint']
  123. class_Recital = GDPRtEXT['Recital']
  124. class_Citation = GDPRtEXT['Citation']
  125. property_partof_chapter = GDPRtEXT['isPartOfChapter']
  126. property_partof_section = GDPRtEXT['isPartOfSection']
  127. property_partof_article = GDPRtEXT['isPartOfArticle']
  128. property_partof_point = GDPRtEXT['isPartOfPoint']
  129. property_has_chapter = GDPRtEXT['hasChapter']
  130. property_has_section = GDPRtEXT['hasSection']
  131. property_has_article = GDPRtEXT['hasArticle']
  132. property_has_point = GDPRtEXT['hasPoint']
  133. property_has_subpoint = GDPRtEXT['hasSubPoint']
  134. #############################################################################
# The chapters are in gdpr_json['chapters'] as an array of dicts
# { number, title, contents }
# A chapter may contain sections; if it does, each section has the same
# structure as a chapter.
# If a chapter has sections, then each section, else the chapter itself,
# holds several articles with the same structure.
# Each article has several points, which may or may not be numbered.
# Each point may have several sub-points, which may or may not be numbered.
  143. def graph_subpoint(
  144. subpoint, article_number, point_number,
  145. point, article, section=None, chapter=None):
  146. '''adds subpoint to graph'''
  147. # print('SP', subpoint['number'])
  148. node_subpoint = GDPR['article{}-{}-{}'.format(
  149. article_number, point_number, subpoint['number'])]
  150. graph.add((node_subpoint, NOS, Literal(
  151. subpoint['number'], datatype=XSD.string)))
  152. graph.add((node_subpoint, TITLE_ALT, Literal(
  153. 'Article' + article_number + '({})({})'.format(
  154. point_number, subpoint['number']),
  155. datatype=XSD.string)))
  156. graph.add((node_subpoint, RDF.type, LRS))
  157. graph.add((node_subpoint, RDF.type, class_SubPoint))
  158. graph.add((node_subpoint, PART_OF, chapter))
  159. graph.add((node_subpoint, property_partof_chapter, chapter))
  160. if section is not None:
  161. graph.add((node_subpoint, PART_OF, section))
  162. graph.add((node_subpoint, property_partof_section, section))
  163. graph.add((node_subpoint, PART_OF, article))
  164. graph.add((node_subpoint, property_partof_article, article))
  165. graph.add((node_subpoint, PART_OF, point))
  166. graph.add((node_subpoint, property_partof_point, point))
  167. graph.add((point, property_has_subpoint, node_subpoint))
  168. graph.add((node_subpoint, PART_OF, gdpr))
  169. graph.add((node_subpoint, DESC, Literal(
  170. subpoint['text'], datatype=XSD.string)))
  171. def graph_point(point, article_number, article, section=None, chapter=None):
  172. '''adds point to graph'''
  173. # print('P', point['number'])
  174. node_point = GDPR['article{}-{}'.format(
  175. article_number, point['number'])]
  176. graph.add((node_point, NOS, Literal(
  177. point['number'], datatype=XSD.string)))
  178. graph.add((node_point, TITLE_ALT, Literal(
  179. 'Article' + article_number + '({})'.format(point['number']),
  180. datatype=XSD.string)))
  181. graph.add((node_point, RDF.type, LRS))
  182. graph.add((node_point, RDF.type, class_Point))
  183. graph.add((node_point, PART_OF, chapter))
  184. graph.add((node_point, property_partof_chapter, chapter))
  185. if section is not None:
  186. graph.add((node_point, PART_OF, section))
  187. graph.add((node_point, property_partof_section, section))
  188. graph.add((node_point, PART_OF, article))
  189. graph.add((node_point, property_partof_article, article))
  190. graph.add((article, property_has_point, node_point))
  191. graph.add((node_point, PART_OF, gdpr))
  192. graph.add((node_point, DESC, Literal(
  193. point['text'], datatype=XSD.string)))
  194. # subpoint number to be used only when they are un-numbered
  195. subpoint_nos = 1
  196. for subpoint in point['subpoints']:
  197. if subpoint['number'] is None:
  198. subpoint['number'] = subpoint_nos
  199. subpoint_nos += 1
  200. graph_subpoint(
  201. subpoint, article_number, point['number'],
  202. node_point, article, section, chapter)
  203. def graph_article(article, section=None, chapter=None):
  204. '''adds article to graph'''
  205. # print('A', article['number'])
  206. node_article = GDPR['article{}'.format(article['number'])]
  207. graph.add((node_article, RDF.type, LRS))
  208. graph.add((node_article, RDF.type, class_Article))
  209. graph.add((node_article, NOS, Literal(
  210. article['number'], datatype=XSD.string)))
  211. graph.add((node_article, TITLE_ALT, Literal(
  212. 'Article ' + article['number'], datatype=XSD.string)))
  213. graph.add((node_article, PART_OF, chapter))
  214. graph.add((node_article, property_partof_chapter, chapter))
  215. graph.add((chapter, property_has_article, node_article))
  216. graph.add((gdpr, property_has_article, node_article))
  217. graph.add((node_article, PART_OF, gdpr))
  218. if section is not None:
  219. graph.add((node_article, PART_OF, section))
  220. graph.add((node_article, property_partof_section, section))
  221. graph.add((section, property_has_article, node_article))
  222. # point number for when points are unnumbered
  223. point_nos = 1
  224. for point in article['contents']:
  225. if point['number'] is None:
  226. point['number'] = point_nos
  227. point_nos += 1
  228. graph_point(point, article['number'], node_article, section, chapter)
  229. def graph_section(section, chapter, chapter_number):
  230. '''adds section to graph'''
  231. # print('S', section['number'], section['title'])
  232. node_section = GDPR[
  233. 'chapter{}-{}'.format(chapter_number, section['number'])]
  234. graph.add((node_section, RDF.type, LRS))
  235. graph.add((node_section, RDF.type, class_Section))
  236. graph.add((node_section, TITLE, Literal(
  237. section['title'], datatype=XSD.string)))
  238. graph.add((node_section, NOS, Literal(
  239. section['number'], datatype=XSD.string)))
  240. graph.add((node_section, TITLE_ALT, Literal(
  241. 'Section ' + section['number'], datatype=XSD.string)))
  242. graph.add((node_section, PART_OF, chapter))
  243. graph.add((node_section, PART_OF, gdpr))
  244. graph.add((node_section, property_partof_chapter, chapter))
  245. graph.add((chapter, property_has_section, node_section))
  246. for article in section['contents']:
  247. graph_article(article, node_section, chapter)
  248. def graph_chapter(chapter):
  249. '''adds chapter to graph'''
  250. # print('C', chapter['number'], chapter['title'])
  251. node_chapter = GDPR['chapter{}'.format(chapter['number'])]
  252. graph.add((node_chapter, RDF.type, LRS))
  253. graph.add((node_chapter, RDF.type, class_Chapter))
  254. graph.add((node_chapter, TITLE, Literal(
  255. chapter['title'], datatype=XSD.string)))
  256. graph.add((node_chapter, NOS, Literal(
  257. chapter['number'], datatype=XSD.string)))
  258. graph.add((node_chapter, TITLE_ALT, Literal(
  259. 'Chapter ' + chapter['number'], datatype=XSD.string)))
  260. graph.add((node_chapter, PART_OF, gdpr))
  261. graph.add((gdpr, property_has_chapter, node_chapter))
  262. contents = chapter['contents']
  263. # Section (if any)
  264. if contents[0]['type'] == 'section':
  265. for item in contents:
  266. graph_section(item, node_chapter, chapter['number'])
  267. else:
  268. for item in contents:
  269. graph_article(item, None, node_chapter)
  270. for chapter in gdpr_json['chapters']:
  271. graph_chapter(chapter)
  272. for recital in gdpr_json['recitals']:
  273. node_recital = GDPR['recital{}'.format(recital['number'])]
  274. graph.add((node_recital, RDF.type, LRS))
  275. graph.add((node_recital, RDF.type, class_Recital))
  276. graph.add((node_recital, NOS, Literal(
  277. recital['number'], datatype=XSD.string)))
  278. graph.add((node_recital, DESC, Literal(
  279. recital['text'], datatype=XSD.string)))
  280. graph.add((node_recital, PART_OF, gdpr))
  281. graph.add((gdpr, HAS_RECITAL, node_recital))
  282. for citation in gdpr_json['citations'].values():
  283. node_citation = GDPR['citation{}'.format(citation['number'])]
  284. graph.add((node_citation, RDF.type, LRS))
  285. graph.add((node_citation, RDF.type, class_Citation))
  286. graph.add((node_citation, NOS, Literal(
  287. citation['number'], datatype=XSD.string)))
  288. graph.add((node_citation, DESC, Literal(
  289. citation['text'], datatype=XSD.string)))
  290. graph.add((node_citation, PART_OF, gdpr))
  291. graph.add((gdpr, ELI.cites, node_citation))
  292. # Add citations
  293. graph.add((GDPR['recital3'], ELI.cites, GDPR['citation4']))
  294. graph.add((GDPR['recital13'], ELI.cites, GDPR['citation5']))
  295. graph.add((GDPR['recital17'], ELI.cites, GDPR['citation6']))
  296. graph.add((GDPR['recital19'], ELI.cites, GDPR['citation7']))
  297. graph.add((GDPR['recital21'], ELI.cites, GDPR['citation8']))
  298. graph.add((GDPR['recital35'], ELI.cites, GDPR['citation9']))
  299. graph.add((GDPR['recital42'], ELI.cites, GDPR['citation10']))
  300. graph.add((GDPR['recital54'], ELI.cites, GDPR['citation11']))
  301. graph.add((GDPR['recital106'], ELI.cites, GDPR['citation12']))
  302. graph.add((GDPR['recital147'], ELI.cites, GDPR['citation13']))
  303. graph.add((GDPR['recital154'], ELI.cites, GDPR['citation14']))
  304. graph.add((GDPR['recital161'], ELI.cites, GDPR['citation15']))
  305. graph.add((GDPR['recital163'], ELI.cites, GDPR['citation16']))
  306. graph.add((GDPR['recital172'], ELI.cites, GDPR['citation17']))
  307. graph.add((GDPR['recital173'], ELI.cites, GDPR['citation18']))
  308. graph.add((GDPR['article4-25'], ELI.cites, GDPR['citation19']))
  309. graph.add((GDPR['article43-1-2'], ELI.cites, GDPR['citation20']))
  310. graph.add((GDPR['article76-2'], ELI.cites, GDPR['citation21']))
  311. # Serialize
  312. graph.serialize(destination='../deliverables/gdpr.ttl', format='turtle')
  313. graph.serialize(destination='../deliverables/gdpr.rdf', format='pretty-xml')
  314. graph.serialize(destination='../deliverables/gdpr.n3', format='n3')
  315. graph.serialize(destination='../deliverables/gdpr.nt', format='nt')
  316. graph.serialize(destination='../deliverables/gdpr.jsonld', format='json-ld')