generate_rdf_pairings.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. #!/usr/bin/env python3
  2. # author: Harshvardhan Pandit
  3. # Generates RDF graph for GDPR text from JSON (gdpr.json)
  4. # The JSON file is in ../deliverables/gdpr.json
  5. # The algorithm is roughly as follows:
  6. # For every chapter, article, subpoint, et al., it defines the following
  7. # attributes using the ELI vocabulary
  8. # @prefix ELI http://data.europa.eu/eli/ontology# (ontology file: https://publications.europa.eu/en/mdr/resource/eli/eli.owl)
  9. #
  10. # - ELI:title (xsd:string) title of the item
  11. # - ELI:number (xsd:string) number of the item in text
  12. # - ELI:title_alternative (xsd:string) an alternative representation of the
  13. # item as <item_type> followed by <item_number>; e.g. Chapter 1
  14. # - ELI:description (xsd:string) text of the item
  15. # - ELI:is_part_of (ELI:LegalResource) gdpr:GDPR
  16. # here, this is an instance of a LegalResource created to represent
  17. # the GDPR, and is present in another OWL file, which is being edited
  18. # in Protege. The reference does not need to be resolved.
  19. # - ELI:id_local (xsd:string) the ID of the resource in the HTML file
  20. # This is a bungling of the proper use of the vocabulary, but it keeps
  21. # all terms within the intended usage. The same ID can be used to
  22. # lookup the resource in the HTML file (so it is the ID attribute of
  23. # the resource).
  24. #
  25. # This script generates the RDF pairings for the text of the GDPR. The bulk
  26. # of the work and the part where the GDPR itself is referenced is actually
  27. # done using Protege, and does not need the use of a script, such as this,
  28. # since it is manual labour.
  29. ##############################################################################
  30. # Load the JSON from file
  31. with open('../deliverables/gdpr.json') as fd:
  32. import json
  33. gdpr_json = json.load(fd)
  34. # import p# print
  35. # p# print.p# print(gdpr_json)
  36. from rdflib import Graph, RDF, XSD, Literal, BNode
  37. # This will be the graph used to hold the triples as they are being generated
  38. graph = Graph()
  39. from rdflib import Namespace
  40. # This is the ELI namespace, used as the legal vocabulary for EU text
  41. ELI = Namespace("http://data.europa.eu/eli/ontology#")
  42. LRS = ELI.LegalResourceSubdivision
  43. title = ELI.title
  44. number = ELI.number
  45. title_alternative = ELI.title_alternative
  46. is_part_of = ELI.is_part_of
  47. description = ELI.description
  48. # This is the GDPR namespace used by the project
  49. # NOTE: this is temporary
  50. GDPR = Namespace("http://www.semanticweb.org/harsh/ontologies/GDPR#")
  51. node_gdpr = GDPR.GDPR
  52. ##############################################################################
  53. # The chapters are in json.chapters as an array of dicts
  54. # { number, title, contents }
  55. # A chapter may contain sections, and if it does, then it has the same
  56. # structure as well.
  57. # If a chapter has sections, then the section, else the chapter itself, has
  58. # several articles with the same structure.
  59. # Each article has several points, which may or may not be numbered.
  60. # Each point may have several subpoints, which may or may not be numbered.
  61. def graph_subpoint(
  62. subpoint, article_number, point_number,
  63. point, article, section=None, chapter=None):
  64. '''adds subpoint to graph'''
  65. # print('SP', subpoint['number'])
  66. if subpoint['number'] is not None:
  67. node_subpoint = GDPR['article{}-{}{}'.format(
  68. article_number, point_number, subpoint['number'])]
  69. graph.add((node_subpoint, number, Literal(
  70. subpoint['number'], datatype=XSD.string)))
  71. graph.add((node_subpoint, title_alternative, Literal(
  72. 'Article' + article_number + '({}{})'.format(
  73. point_number, subpoint['number']),
  74. datatype=XSD.string)))
  75. else:
  76. node_subpoint = BNode()
  77. graph.add((node_subpoint, RDF.type, LRS))
  78. graph.add((node_subpoint, is_part_of, node_gdpr))
  79. graph.add((node_subpoint, is_part_of, chapter))
  80. if section is not None:
  81. graph.add((node_subpoint, is_part_of, section))
  82. graph.add((node_subpoint, is_part_of, article))
  83. graph.add((node_subpoint, is_part_of, point))
  84. graph.add((node_subpoint, description, Literal(
  85. subpoint['text'], datatype=XSD.string)))
  86. def graph_point(point, article_number, article, section=None, chapter=None):
  87. '''adds point to graph'''
  88. # print('P', point['number'])
  89. if point['number'] is not None:
  90. node_point = GDPR['article{}-{}'.format(
  91. article_number, point['number'])]
  92. graph.add((node_point, number, Literal(
  93. point['number'], datatype=XSD.string)))
  94. graph.add((node_point, title_alternative, Literal(
  95. 'Article' + article_number + '({})'.format(point['number']),
  96. datatype=XSD.string)))
  97. else:
  98. node_point = BNode()
  99. graph.add((node_point, RDF.type, LRS))
  100. graph.add((node_point, is_part_of, node_gdpr))
  101. graph.add((node_point, is_part_of, chapter))
  102. if section is not None:
  103. graph.add((node_point, is_part_of, section))
  104. graph.add((node_point, is_part_of, article))
  105. graph.add((node_point, description, Literal(
  106. point['text'], datatype=XSD.string)))
  107. for subpoint in point['subpoints']:
  108. graph_subpoint(
  109. subpoint, article_number, point['number'],
  110. node_point, article, section, chapter)
  111. def graph_article(article, section=None, chapter=None):
  112. '''adds article to graph'''
  113. # print('A', article['number'])
  114. node_article = GDPR['article{}'.format(article['number'])]
  115. graph.add((node_article, RDF.type, LRS))
  116. graph.add((node_article, number, Literal(
  117. article['number'], datatype=XSD.string)))
  118. graph.add((node_article, title_alternative, Literal(
  119. 'Article ' + article['number'], datatype=XSD.string)))
  120. graph.add((node_article, is_part_of, node_gdpr))
  121. graph.add((node_article, is_part_of, chapter))
  122. if section is not None:
  123. graph.add((node_article, is_part_of, section))
  124. for point in article['contents']:
  125. graph_point(point, article['number'], node_article, section, chapter)
  126. def graph_section(section, chapter):
  127. '''adds section to graph'''
  128. # print('S', section['number'], section['title'])
  129. node_section = GDPR['section{}'.format(section['number'])]
  130. graph.add((node_section, RDF.type, LRS))
  131. graph.add((node_section, title, Literal(
  132. section['title'], datatype=XSD.string)))
  133. graph.add((node_section, number, Literal(
  134. section['number'], datatype=XSD.string)))
  135. graph.add((node_section, title_alternative, Literal(
  136. 'Section ' + section['number'], datatype=XSD.string)))
  137. graph.add((node_section, is_part_of, node_gdpr))
  138. graph.add((node_section, is_part_of, chapter))
  139. for article in section['contents']:
  140. graph_article(article, node_section, chapter)
  141. def graph_chapter(chapter):
  142. '''adds chapter to graph'''
  143. # print('C', chapter['number'], chapter['title'])
  144. node_chapter = GDPR['chapter{}'.format(chapter['number'])]
  145. graph.add((node_chapter, RDF.type, LRS))
  146. graph.add((node_chapter, title, Literal(
  147. chapter['title'], datatype=XSD.string)))
  148. graph.add((node_chapter, number, Literal(
  149. chapter['number'], datatype=XSD.string)))
  150. graph.add((node_chapter, title_alternative, Literal(
  151. 'Chapter ' + chapter['number'], datatype=XSD.string)))
  152. graph.add((node_chapter, is_part_of, node_gdpr))
  153. contents = chapter['contents']
  154. # Section (if any)
  155. if contents[0]['type'] == 'section':
  156. for item in contents:
  157. graph_section(item, node_chapter)
  158. else:
  159. for item in contents:
  160. graph_article(item, None, node_chapter)
  161. for chapter in gdpr_json['chapters']:
  162. graph_chapter(chapter)
  163. graph.serialize(destination='gdpr.ttl', format='turtle')