generate_rdf_pairings.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
#!/usr/bin/env python3
# author: Harshvardhan Pandit
# Generates RDF graph for GDPR text from JSON (gdpr.json)
# The JSON file is in ../deliverables/gdpr.json
# The algorithm is roughly as follows:
# For every chapter, article, subpoint, et al. it defines the following
# attributes using the ELI vocabulary
# @prefix ELI https://publications.europa.eu/en/mdr/resource/eli/eli.owl
#
# - ELI:title (xsd:string) title of the item
# - ELI:number (xsd:string) number of the item in text
# - ELI:TITLE_ALT (xsd:string) an alternative representation of the
#   item as <item_type> followed by <item_number>; e.g. Chapter 1
# - ELI:DESC (xsd:string) text of the item
# - ELI:PART_OF (ELI:LegalResource) gdpr:GDPR
#   here, this is an instance of a LegalResource created to represent
#   the GDPR, and is present in another OWL file, which is being edited
#   in Protege. The reference does not need to be resolved.
# - ELI:id_local (xsd:string) the ID of the resource in the HTML file
#   This is a bungling of the correct use of the vocabulary, but it keeps
#   all terms within the intended usage. The same ID can be used to
#   look up the resource in the HTML file (so it is the ID attribute of
#   the resource).
#
# This script generates the RDF pairings for the text of the GDPR. The bulk
# of the work and the part where the GDPR itself is referenced is actually
# done using Protege, and does not need the use of a script, such as this,
# since it is manual labour.
##############################################################################
import json

from rdflib import Graph, RDF, RDFS, XSD, Literal, URIRef, BNode
from rdflib import Namespace
  32. # Load the JSON from file
  33. with open('../deliverables/gdpr.json') as fd:
  34. import json
  35. gdpr_json = json.load(fd)
  36. # import p# print
  37. # p# print.p# print(gdpr_json)
  38. # This will be the graph used to hold the triples as they are being generated
  39. graph = Graph()
  40. GDPRtEXT_URI = URIRef(
  41. 'http://purl.org/adaptcentre/openscience/ontologies/GDPRtEXT#')
  42. GDPRtEXT = Namespace(GDPRtEXT_URI)
  43. graph.namespace_manager.bind('GDPRtext', GDPRtEXT)
  44. DCTERMS = Namespace('http://purl.org/dc/terms/')
  45. graph.namespace_manager.bind('dcterms', DCTERMS)
  46. # This is the ELI namespace, used as the legal vocabulary for EU text
  47. ELI = Namespace("http://data.europa.eu/eli/ontology#")
  48. graph.namespace_manager.bind('eli', ELI)
  49. # bind ELI items to names that are easier to access (argumentative)
  50. LRS = ELI.LegalResourceSubdivision
  51. TITLE = ELI.title
  52. NOS = ELI.number
  53. TITLE_ALT = ELI.title_alternative
  54. PART_OF = ELI.is_part_of
  55. # graph.add((PART_OF, RDF.type, OWL.TransitiveProperty))
  56. DESC = ELI.description
  57. HAS_RECITAL = GDPRtEXT['hasRecital']
  58. ##############################################################################
  59. # GDPR as named individual
  60. # base URI
  61. GDPR_URI = URIRef(
  62. 'http://purl.org/adaptcentre/openscience/resources/GDPRtEXT#')
  63. GDPR = Namespace(GDPR_URI)
  64. graph.namespace_manager.bind('gdpr', GDPR)
  65. gdpr = GDPR.GDPR
  66. # graph.add((gdpr, RDF.type, OWL.NamedIndividual))
  67. graph.add((gdpr, RDF.type, ELI.LR))
  68. graph.add((gdpr, RDF.type, DCTERMS.Policy))
  69. graph.add((gdpr, DCTERMS.identifier, Literal(
  70. '2016/679', datatype=XSD.string)))
  71. graph.add((gdpr, DCTERMS.language, Literal(
  72. 'English', datatype=XSD.string)))
  73. graph.add((gdpr, DCTERMS.publisher, Literal(
  74. 'Official Journal of the European Union', datatype=XSD.string)))
  75. graph.add((gdpr, DCTERMS.source, Literal(
  76. 'http://eur-lex.europa.eu/eli/reg/2016/679/oj', datatype=XSD.string)))
  77. graph.add((gdpr, DCTERMS.title, Literal(
  78. 'General Data Protection Regulation', datatype=XSD.string)))
  79. graph.add((gdpr, DCTERMS.title_alternative, Literal(
  80. 'GDPR', datatype=XSD.string)))
  81. graph.add((gdpr, DCTERMS.title_alternative, Literal(
  82. 'REGULATION (EU) 2016/679', datatype=XSD.string)))
  83. graph.add((gdpr, DCTERMS.creator, Literal(
  84. 'European Parliament', datatype=XSD.string)))
  85. graph.add((gdpr, DCTERMS.creator, Literal(
  86. 'Council of the European Union', datatype=XSD.string)))
  87. graph.add((gdpr, DCTERMS.abstract, Literal((
  88. 'REGULATION (EU) 2016/679 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL '
  89. 'of 27 April 2016 '
  90. 'on the protection of natural persons with regard to the processing of '
  91. 'personal data and on the free movement of such data, '
  92. 'and repealing Directive 95/46/EC '
  93. '(General Data Protection Regulation)'), datatype=XSD.string)))
  94. gdpr_description = GDPR['description']
  95. graph.add((gdpr_description, RDF.type, ELI.LRS))
  96. graph.add((gdpr_description, DCTERMS.description, Literal((
  97. 'THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION, '
  98. 'Having regard to the Treaty on the Functioning of the European Union, '
  99. 'and in particular Article 16 thereof, Having regard to the proposal from '
  100. 'the European Commission, After transmission of the draft legislative act '
  101. 'to the national parliaments, '
  102. 'Having regard to the opinion of the European Economic '
  103. 'and Social Committee, '
  104. 'Having regard to the opinion of the Committee of the Regions, '
  105. 'Acting in accordance with the ordinary legislative procedure,'
  106. ), datatype=XSD.string)))
  107. graph.add((gdpr_description, ELI.cites, GDPR['citation1']))
  108. graph.add((gdpr_description, ELI.cites, GDPR['citation2']))
  109. graph.add((gdpr_description, ELI.cites, GDPR['citation3']))
  110. graph.add((gdpr, DCTERMS.description, gdpr_description))
  111. graph.add((gdpr, ELI.date_document, Literal('2016-04-27', datatype=XSD.date)))
  112. graph.add((gdpr, DCTERMS.date, Literal('2016-04-27', datatype=XSD.date)))
  113. graph.add((gdpr, ELI.date_publication, Literal(
  114. '2016-05-04', datatype=XSD.date)))
  115. graph.add((gdpr, DCTERMS.issued, Literal('2016-05-04', datatype=XSD.date)))
  116. graph.add((gdpr, ELI.in_force, Literal(
  117. '2016-05-24', datatype=XSD.date)))
  118. graph.add((gdpr, ELI.date_applicability, Literal(
  119. '2018-05-25', datatype=XSD.date)))
  120. class_Chapter = GDPRtEXT['Chapter']
  121. class_Section = GDPRtEXT['Section']
  122. class_Article = GDPRtEXT['Article']
  123. class_Point = GDPRtEXT['Point']
  124. class_SubPoint = GDPRtEXT['SubPoint']
  125. class_Recital = GDPRtEXT['Recital']
  126. class_Citation = GDPRtEXT['Citation']
  127. property_partof_chapter = GDPRtEXT['isPartOfChapter']
  128. property_partof_section = GDPRtEXT['isPartOfSection']
  129. property_partof_article = GDPRtEXT['isPartOfArticle']
  130. property_partof_point = GDPRtEXT['isPartOfPoint']
  131. property_has_chapter = GDPRtEXT['hasChapter']
  132. property_has_section = GDPRtEXT['hasSection']
  133. property_has_article = GDPRtEXT['hasArticle']
  134. property_has_point = GDPRtEXT['hasPoint']
  135. property_has_subpoint = GDPRtEXT['hasSubPoint']
#############################################################################
# The chapters are in json.chapters as an array of dicts
# { number, title, contents }
# A chapter may contain sections, and if it does, then it has the same
# structure as well.
# If a chapter has sections, then the section, else the chapter itself, has
# several articles with the same structure.
# Each article has several points, which may or may not be numbered.
# Each point may have several subpoints, which may or may not be numbered.
  145. def graph_subpoint(
  146. subpoint, article_number, point_number,
  147. point, article, section=None, chapter=None):
  148. '''adds subpoint to graph'''
  149. # print('SP', subpoint['number'])
  150. node_subpoint = GDPR['article{}-{}-{}'.format(
  151. article_number, point_number, subpoint['number'])]
  152. graph.add((node_subpoint, NOS, Literal(
  153. subpoint['number'], datatype=XSD.string)))
  154. graph.add((node_subpoint, TITLE_ALT, Literal(
  155. 'Article' + article_number + '({})({})'.format(
  156. point_number, subpoint['number']),
  157. datatype=XSD.string)))
  158. graph.add((node_subpoint, RDF.type, LRS))
  159. graph.add((node_subpoint, RDF.type, class_SubPoint))
  160. graph.add((node_subpoint, PART_OF, chapter))
  161. graph.add((node_subpoint, property_partof_chapter, chapter))
  162. if section is not None:
  163. graph.add((node_subpoint, PART_OF, section))
  164. graph.add((node_subpoint, property_partof_section, section))
  165. graph.add((node_subpoint, PART_OF, article))
  166. graph.add((node_subpoint, property_partof_article, article))
  167. graph.add((node_subpoint, PART_OF, point))
  168. graph.add((node_subpoint, property_partof_point, point))
  169. graph.add((point, property_has_subpoint, node_subpoint))
  170. graph.add((node_subpoint, PART_OF, gdpr))
  171. graph.add((node_subpoint, DESC, Literal(
  172. subpoint['text'], datatype=XSD.string)))
  173. def graph_point(point, article_number, article, section=None, chapter=None):
  174. '''adds point to graph'''
  175. # print('P', point['number'])
  176. node_point = GDPR['article{}-{}'.format(
  177. article_number, point['number'])]
  178. graph.add((node_point, NOS, Literal(
  179. point['number'], datatype=XSD.string)))
  180. graph.add((node_point, TITLE_ALT, Literal(
  181. 'Article' + article_number + '({})'.format(point['number']),
  182. datatype=XSD.string)))
  183. graph.add((node_point, RDF.type, LRS))
  184. graph.add((node_point, RDF.type, class_Point))
  185. graph.add((node_point, PART_OF, chapter))
  186. graph.add((node_point, property_partof_chapter, chapter))
  187. if section is not None:
  188. graph.add((node_point, PART_OF, section))
  189. graph.add((node_point, property_partof_section, section))
  190. graph.add((node_point, PART_OF, article))
  191. graph.add((node_point, property_partof_article, article))
  192. graph.add((article, property_has_point, node_point))
  193. graph.add((node_point, PART_OF, gdpr))
  194. graph.add((node_point, DESC, Literal(
  195. point['text'], datatype=XSD.string)))
  196. # subpoint number to be used only when they are un-numbered
  197. subpoint_nos = 1
  198. for subpoint in point['subpoints']:
  199. if subpoint['number'] is None:
  200. subpoint['number'] = subpoint_nos
  201. subpoint_nos += 1
  202. graph_subpoint(
  203. subpoint, article_number, point['number'],
  204. node_point, article, section, chapter)
  205. def graph_article(article, section=None, chapter=None):
  206. '''adds article to graph'''
  207. # print('A', article['number'])
  208. node_article = GDPR['article{}'.format(article['number'])]
  209. graph.add((node_article, RDF.type, LRS))
  210. graph.add((node_article, RDF.type, class_Article))
  211. graph.add((node_article, NOS, Literal(
  212. article['number'], datatype=XSD.string)))
  213. graph.add((node_article, TITLE_ALT, Literal(
  214. 'Article ' + article['number'], datatype=XSD.string)))
  215. graph.add((node_article, PART_OF, chapter))
  216. graph.add((node_article, property_partof_chapter, chapter))
  217. graph.add((chapter, property_has_article, node_article))
  218. graph.add((gdpr, property_has_article, node_article))
  219. graph.add((node_article, PART_OF, gdpr))
  220. if section is not None:
  221. graph.add((node_article, PART_OF, section))
  222. graph.add((node_article, property_partof_section, section))
  223. graph.add((section, property_has_article, node_article))
  224. # point number for when points are unnumbered
  225. point_nos = 1
  226. for point in article['contents']:
  227. if point['number'] is None:
  228. point['number'] = point_nos
  229. point_nos += 1
  230. graph_point(point, article['number'], node_article, section, chapter)
  231. def graph_section(section, chapter, chapter_number):
  232. '''adds section to graph'''
  233. # print('S', section['number'], section['title'])
  234. node_section = GDPR[
  235. 'chapter{}-{}'.format(chapter_number, section['number'])]
  236. graph.add((node_section, RDF.type, LRS))
  237. graph.add((node_section, RDF.type, class_Section))
  238. graph.add((node_section, TITLE, Literal(
  239. section['title'], datatype=XSD.string)))
  240. graph.add((node_section, NOS, Literal(
  241. section['number'], datatype=XSD.string)))
  242. graph.add((node_section, TITLE_ALT, Literal(
  243. 'Section ' + section['number'], datatype=XSD.string)))
  244. graph.add((node_section, PART_OF, chapter))
  245. graph.add((node_section, PART_OF, gdpr))
  246. graph.add((node_section, property_partof_chapter, chapter))
  247. graph.add((chapter, property_has_section, node_section))
  248. for article in section['contents']:
  249. graph_article(article, node_section, chapter)
  250. def graph_chapter(chapter):
  251. '''adds chapter to graph'''
  252. # print('C', chapter['number'], chapter['title'])
  253. node_chapter = GDPR['chapter{}'.format(chapter['number'])]
  254. graph.add((node_chapter, RDF.type, LRS))
  255. graph.add((node_chapter, RDF.type, class_Chapter))
  256. graph.add((node_chapter, TITLE, Literal(
  257. chapter['title'], datatype=XSD.string)))
  258. graph.add((node_chapter, NOS, Literal(
  259. chapter['number'], datatype=XSD.string)))
  260. graph.add((node_chapter, TITLE_ALT, Literal(
  261. 'Chapter ' + chapter['number'], datatype=XSD.string)))
  262. graph.add((node_chapter, PART_OF, gdpr))
  263. graph.add((gdpr, property_has_chapter, node_chapter))
  264. contents = chapter['contents']
  265. # Section (if any)
  266. if contents[0]['type'] == 'section':
  267. for item in contents:
  268. graph_section(item, node_chapter, chapter['number'])
  269. else:
  270. for item in contents:
  271. graph_article(item, None, node_chapter)
  272. for chapter in gdpr_json['chapters']:
  273. graph_chapter(chapter)
  274. for recital in gdpr_json['recitals']:
  275. node_recital = GDPR['recital{}'.format(recital['number'])]
  276. graph.add((node_recital, RDF.type, LRS))
  277. graph.add((node_recital, RDF.type, class_Recital))
  278. graph.add((node_recital, NOS, Literal(
  279. recital['number'], datatype=XSD.string)))
  280. graph.add((node_recital, DESC, Literal(
  281. recital['text'], datatype=XSD.string)))
  282. graph.add((node_recital, PART_OF, gdpr))
  283. graph.add((gdpr, HAS_RECITAL, node_recital))
  284. for citation in gdpr_json['citations'].values():
  285. node_citation = GDPR['citation{}'.format(citation['number'])]
  286. graph.add((node_citation, RDF.type, LRS))
  287. graph.add((node_citation, RDF.type, class_Citation))
  288. graph.add((node_citation, NOS, Literal(
  289. citation['number'], datatype=XSD.string)))
  290. graph.add((node_citation, DESC, Literal(
  291. citation['text'], datatype=XSD.string)))
  292. graph.add((node_citation, PART_OF, gdpr))
  293. graph.add((gdpr, ELI.cites, node_citation))
  294. # Add citations
  295. graph.add((GDPR['recital3'], ELI.cites, GDPR['citation4']))
  296. graph.add((GDPR['recital13'], ELI.cites, GDPR['citation5']))
  297. graph.add((GDPR['recital17'], ELI.cites, GDPR['citation6']))
  298. graph.add((GDPR['recital19'], ELI.cites, GDPR['citation7']))
  299. graph.add((GDPR['recital21'], ELI.cites, GDPR['citation8']))
  300. graph.add((GDPR['recital35'], ELI.cites, GDPR['citation9']))
  301. graph.add((GDPR['recital42'], ELI.cites, GDPR['citation10']))
  302. graph.add((GDPR['recital54'], ELI.cites, GDPR['citation11']))
  303. graph.add((GDPR['recital106'], ELI.cites, GDPR['citation12']))
  304. graph.add((GDPR['recital147'], ELI.cites, GDPR['citation13']))
  305. graph.add((GDPR['recital154'], ELI.cites, GDPR['citation14']))
  306. graph.add((GDPR['recital161'], ELI.cites, GDPR['citation15']))
  307. graph.add((GDPR['recital163'], ELI.cites, GDPR['citation16']))
  308. graph.add((GDPR['recital172'], ELI.cites, GDPR['citation17']))
  309. graph.add((GDPR['recital173'], ELI.cites, GDPR['citation18']))
  310. graph.add((GDPR['article4-25'], ELI.cites, GDPR['citation19']))
  311. graph.add((GDPR['article43-1-2'], ELI.cites, GDPR['citation20']))
  312. graph.add((GDPR['article76-2'], ELI.cites, GDPR['citation21']))
  313. # Serialize
  314. graph.serialize(destination='../deliverables/gdpr.ttl', format='turtle')
  315. graph.serialize(destination='../deliverables/gdpr.rdf', format='pretty-xml')
  316. graph.serialize(destination='../deliverables/gdpr.n3', format='n3')
  317. graph.serialize(destination='../deliverables/gdpr.nt', format='nt')
  318. graph.serialize(destination='../deliverables/gdpr.jsonld', format='json-ld')