Bladeren bron

initial:readme,scripts,deliverables

Contains a formatted README, lacks a license. The deliverables
folder contains gdpr HTML and JSON versions; where the HTML
version contains ID markers for resources. The scripts folder
contains the scripts used to create the deliverables.
Harshvardhan Pandit 7 jaren geleden
commit
fd5b669ab7
6 gewijzigde bestanden met toevoegingen van 1225 en 0 verwijderingen
  1. 24 0
      README.md
  2. 58 0
      deliverables/gdpr.html
  3. 0 0
      deliverables/gdpr.json
  4. 546 0
      scripts/GDPR_en.html
  5. 153 0
      scripts/fancy_format_gdpr.js
  6. 444 0
      scripts/parse_gdpr.js

+ 24 - 0
README.md

@@ -0,0 +1,24 @@
+# GDPR tEXT
+Project for annotating GDPR text to make it referenceable using Linked Open
+Data ontologies/vocabularies. 
+The canonical GDPR text can be found [here](http://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32016R0679&qid=1499685310592&from=en).
+This repository acts as version control for the project. The official
+website is at [openscience.adaptcentre.ie](https://openscience.adaptcentre.ie/).
+
+## Deliverables
+
+1. [GDPR (En) text with HTML id for each item/article](https://openscience.adaptcentre.ie/resources/GDPRtEXT/gdpr.html)
+2. [GDPR (En) text as JSON](https://openscience.adaptcentre.ie/resources/GDPRtEXT/gdpr.json)
+3. [GDPR (En) as RDF dataset](https://openscience.adaptcentre.ie/resources/GDPRtEXT/gdpr.rdf)
+4. [GDPR (En) text with HTML id + RDFa](https://openscience.adaptcentre.ie/resources/GDPRtEXT/gdpr_rdfa.html)
+
+## Project Goals
+The project aims to annotate this text to fulfil the following goals:
+
+1. Reference individual chapters, articles, and points using HTML ID attributes
+2. Provide additional annotations for the content related to the context and use
+3. Provide additional annotations for the GDPRov project
+4. Annotate using legal ontologies and (perhaps) SKOS
+
+## ADAPT Centre
+Made by Harshvardhan Pandit as part of PhD research

File diff suppressed because it is too large
+ 58 - 0
deliverables/gdpr.html


File diff suppressed because it is too large
+ 0 - 0
deliverables/gdpr.json


File diff suppressed because it is too large
+ 546 - 0
scripts/GDPR_en.html


+ 153 - 0
scripts/fancy_format_gdpr.js

@@ -0,0 +1,153 @@
+/**
+ * FANCY FORMAT GDPR TEXT
+ * @author Harshvardhan Pandit
+ * @author me a.t. harshp dot com
+ * LICENSE: MIT
+ *
+ * Using the data from gdpr.json, this script restructures the HTML to create
+ * a hierarchical page that can be referenced by HTML id attributes
+ */
+
+// clear the body
+$('body').empty();
+
+// insert title and other items
+$('body').append(
+    $('<h1>', {
+        'id': 'title',
+        'class': 'title',
+        'text': data.title + ' (' + data.abbrv + ')' 
+    }));
+$('body').append(
+    $('<p>', {
+        'id': 'about',
+        'class': 'description',
+        'text': data.about
+    }));
+
+// recitals
+var element_recital = $('<div>', { 'id': 'recitals' });
+element_recital.append(
+    $('<h2>', {
+        'class': 'recital-title',
+        'text': 'Recitals'
+    }));
+data.recitals.map(function(recital) {
+   element_recital.append(
+      $('<p>', {
+         'id': 'recital-' + recital.number,
+         'class': 'recital',
+         'text': '(' + recital.number + ') ' + recital.text
+      })); 
+});
+$('body').append(element_recital);
+
+//  chapters
+data.chapters.map(function(chapter) {
+    var element_chapter = $('<div>', {
+        'id': 'chapter' + chapter.number,
+        'class': 'chapter'
+    });
+    element_chapter.append(
+        $('<h2>', {
+            'class': 'chapter-title',
+            'text': "Chapter " + chapter.number + " " + chapter.title
+        }));
+
+    // process entries in chapter
+    // store the article handler so that it can be called on sections and
+    // articles and reduces the things written in if-else blocks
+
+    /**
+     * handles article and creates elements
+     * returns article div containing points and sub-points
+     */
+    var handler_article = function(article) {
+        var element_article = $('<div>', {
+            'id': 'article' + article.number,
+            'class': 'article'
+        });
+        element_article.append(
+            $('<h3>', {
+                'class': 'article-title',
+                'text': 'Article ' + article.number + " " + article.title
+            }));
+        // handle points
+        article.contents.map(function(point) {
+            if (point.number != null) {
+                var element_point = $('<p>', {
+                    'id': 'article' + article.number + '-' + point.number,
+                    'class': 'point',
+                    'text': '(' + point.number + ') ' + point.text
+                });
+            } else {
+                var element_point = $('<p>', {
+                    'class': 'point',
+                    'text': point.text
+                });
+            }
+            point.subpoints.map(function(subpoint) {
+                if (subpoint.number != null) {
+                    element_point.append(
+                        $('<p>', {
+                           'id': 'article' + article.number + '-' + point.number
+                                + '-' + subpoint.number,
+                           'class': 'subpoint',
+                           'text': '(' + subpoint.number + ') ' + subpoint.text
+                        })); 
+                } else {
+                    element_point.append(
+                        $('<p>', {
+                           'class': 'subpoint',
+                           'text': '- ' + subpoint.text
+                        })); 
+                }
+            });
+            element_article.append(element_point);
+        });
+    
+        return element_article;
+    };
+    // process sections if chapter has sections
+    if (chapter.contents[0].type == 'section') {
+        chapter.contents.map(function(section) {
+            var element_section = $('<div>', {
+                'id': 'section' + section.number,
+                'class': 'section'
+            });
+            element_section.append(
+                $('<h3>', {
+                    'class': 'section-title',
+                    'text': 'Section ' + section.number + " " + section.title
+                }));
+            section.contents.map(function(article) {
+                element_section.append(handler_article(article));
+            });
+            element_chapter.append(element_section);
+        });
+    } else {
+        chapter.contents.map(function(article) {
+            element_chapter.append(handler_article(article));
+        });
+    }
+
+    // append chapter to body
+    $('body').append(element_chapter);
+});
+
+// Citations / References
+var element_citation = $('<div>', { 'id': 'citations' });
+element_citation.append(
+    $('<h2>', {
+        'class': 'citation-title',
+        'text': 'References'
+    }));
+data.citations.map(function(index, citation) {
+    element_citation.append(
+      $('<p>', {
+         'id': 'citation-' + citation.number,
+         'class': 'citation',
+         'text': '(' + citation.number + ') ' + citation.text
+      })); 
+});
+$('body').append(element_citation);

+ 444 - 0
scripts/parse_gdpr.js

@@ -0,0 +1,444 @@
+/**
+ * Parse GDPR text
+ * @author Harshvardhan Pandit me a.t. harshp.com
+ * @name parse_gdpr
+ * LICENSE: MIT
+ *
+ * This script parses GDPR text from the webpage into a JSON document
+ * It also re-arranges the text into a better version where each individual
+ * item in the text is assigned a contextual ID attribute making it
+ * easy to reference the particular item.
+ */
+
+var FLG_DEBUG = true;
+if (!FLG_DEBUG) {
+	console.log = function() {};
+}
+
+/**
+ * @return {jQuery array} returns all GDPR article text
+ */
+var extract_gdpr_articles_from_page = function() {
+    // starting element is the one just before CHAPTER I
+    var begin = $('p#d1e1374-1-1').prev();
+    // ending element is the div marked final
+    var end = $('div.final');
+    // first processing element is the one after begin (CHAPTER I)
+    var element = begin.next();
+    var text = [];
+
+    // continue while we haven't reached the end element
+    while (!element.is(end)) {
+        text.push(element);
+        element = element.next();
+    }
+
+    return text;
+}
+var txt_gdpr = extract_gdpr_articles_from_page();
+
+/**
+ * @param  {jQuery array} text contains the text of the GDPR
+ * @return {[array of arrays]} chapters extracted from text
+ * Each chapter contains the sections and articles within it sequentially
+ * in the arrays inside it
+ * [chapters] where [chapter] contains everything as jQuery array
+ */
+var extract_chapters_from_gdpr_text = function(text) {
+    var chapters = [];
+    var chapter = null;
+    var chapter_contents = null;
+    for (var i=0; i<text.length; i++) {
+        var element = text[i];
+        // mark for new chapter
+        if(
+            // check element type is P
+            element.prop('nodeName') == 'P' &&
+            // test id syntax
+            /^d1e\d+-1-1$/.test(element.prop('id')) && 
+            // test text is chapter
+            /^\s*CHAPTER [IVX]+\s*$/.test(element.text())
+        ) {
+            // extract one chapter
+        	// the title is in the next element
+        	var chapter_title = text[i+1].text().trim();
+        	i += 1
+        	chapter_contents = [];
+			chapter = {
+                'number': element.text().trim().slice(8),
+                'title': chapter_title,
+                "type": "chapter",
+                "contents": chapter_contents
+            };
+            chapters.push(chapter);
+        } else {
+        	chapter_contents.push(element);
+        }
+    }
+    for(let ch of chapters) {
+        // console.log(ch.type, ch.number, ch.contents.length);
+    }
+    return chapters;
+}
+var chapters = extract_chapters_from_gdpr_text(txt_gdpr);
+
+/**
+ * @param  {[jQuery array]} list contents of chapter
+ * @return {[array of arrays]} articles within that chapter
+ * Extracts articles from text (chapter)
+ */
+var extract_articles_from_text = function(text) {
+    var articles = [];
+    var article = null;
+    var article_contents = null;
+    for (var i=0; i<text.length; i++) {
+        var element = text[i];
+        if (
+            // if its a P element
+            // element.prop('nodeName') == 'P' &&
+            // with id of the form
+            /^d1e\d+-1-1$/.test(element.prop('id')) &&
+            // and text is Article
+            /^Article \d+$/.test(element.text())
+        ) {
+        	var article_title = text[i+1].text().trim();
+        	i += 1;
+        	article_contents = [];
+        	article = {
+        		'number': element.text().slice(8),
+        		'title': article_title,
+        		'type': 'article',
+        		'contents': article_contents
+        	};
+            articles.push(article);
+        } else {
+        	article_contents.push(element);
+        }
+    }
+    for (let at of articles) {
+    	// console.log(at.type, at.number, at.contents.length);
+    }
+
+    return articles;
+}
+
+/** Article 4 needs special handling
+ *	This is because in Article 4, the first point is a generic point without a number,
+ *	but the other points are all (weirdly) in a table. Additionally, some of the points
+ *	have subpoints as internal tables.
+ *
+ * The difference between the points in Article 4 and the other articles is the way
+ * they are parsed. In Article 4, in addition to each point being a table, the subpoints
+ * are embedded in the point itself, whereas in other articles, the subpoints though
+ * being a table, are a separate element/item.
+ */
+
+/**
+ * @param  {[jQuery array]}
+ * @return {[map of points]}
+ * Each point has the attribute
+ * 	- number: digit/null
+ * 	- text: text of the point
+ * 	- type: text/point
+ * 	- subpoints: []
+ * 	This function is only to extract points from Article 4
+ */
+var extract_points_from_article4 = function(article4) {
+	var text = article4.contents;
+	var points = [];
+	var point = null;
+	var subpoints = null;
+	var subpoint = null;
+	for (var i=0; i<text.length; i++) {
+		var element = text[i];
+		var p_in_element = element.find('p');
+		if (p_in_element.length == 0) {
+			points.push({
+				'number': null,
+				'text': element.text().trim(),
+				'type': 'text',
+				'subpoints': []
+			});
+			continue;
+		}
+		var p_number = $(p_in_element[0]);
+		var match = p_number.text().match('^\\((\\d+)\\)');
+        if (match != undefined && match != null) {
+            p_number = match[1];
+        } else {
+        	p_number = null;
+        }
+		var p_text = $(p_in_element[1]).text().trim();
+		var p_subpoints = p_in_element.slice(2);
+		subpoints = [];
+		for (var j=0; j<p_subpoints.length; j++) {
+			var subpoint_number = $(p_subpoints[j]).text();
+			var match = subpoint_number.match('^\\((\\w+)\\)');
+			if (match != undefined && match != null) {
+	            subpoint_number = match[1];
+	        } else {
+	        	subpoint_number = null;
+	        }
+			var subpoint_text = $(p_subpoints[j+1]).text().trim();
+			j += 1;
+			subpoints.push({
+				'number': subpoint_number,
+				'text': subpoint_text,
+				'type': 'subpoint'
+			});
+		}
+		points.push({
+			'number': p_number,
+			'text': p_text,
+			'type': 'point',
+			'subpoints': subpoints
+		});
+	}
+	for(let pt of points) {
+		// console.log(pt.type, pt.number, pt.subpoints.length);
+		for(let spt of pt.subpoints) {
+			// console.log(spt.type, spt.number);
+		}
+	}
+
+	return points;
+}
+// var points_in_article4 = extract_points_from_article4(article4);
+
+
+/**
+ * @param  {[jQuery array]} article
+ * @return {[map of points]}
+ * Each point has the attribute
+ * 	- number: digit/null
+ * 	- text: text of the point
+ * 	- type: text/point
+ * 	- subpoints: []
+ * Extracts points from article
+ */
+var extract_points_from_article = function(article) {
+	var text = article.contents;
+	// The extraction mechanism works on the basis of sequential points.
+	// This means that if a point has a subpoint,
+	// then it will be in the NEXT element as a table
+
+	var points = [];
+	var point = null;
+	var subpoints = null;
+	for(var element of text) {
+		var element_type = element.prop('nodeName');
+		if (element_type == 'P') {
+			var element_text = element.text().trim();
+			var match = element_text.match('^(\\d+).');
+			if (match == undefined || match == null) {
+				// this point has no number
+				point = {
+					'number': null,
+					'text': element_text,
+					'type': 'text',
+					'subpoints': []
+				};
+				points.push(point);
+			} else {
+				// point has a number
+				point = {
+					'number': match[1],
+					'type': 'point',
+					'subpoints': []
+				};
+				point.text = element_text.match('^\\d+.\\s+(.*)')[1];
+				points.push(point);
+			}
+		} else if (element_type == 'TABLE') {
+			var p_in_element = element.find('p');
+			var p_number = $(p_in_element[0]).text().trim();
+			var match = p_number.match('^(\\w+).');
+			if (match == undefined || match == null) {
+				p_number = null;
+			} else {
+				p_number = match[1];
+			}
+			var p_text = $(p_in_element[1]).text().trim();
+			point.subpoints.push({
+				'number': p_number,
+				'text': p_text,
+				'type': 'subpoint'
+			});
+		}
+	}
+	for(pt of points) {
+		// console.log(pt.type, pt.number, pt.text);
+	}
+
+	return points;
+}
+
+/**
+ * combine articles in chapter 1 together
+ */
+var articles_in_chapter_1 = extract_articles_from_text(chapters[0].contents);
+var points_in_articles = articles_in_chapter_1.slice(0,3).map(extract_points_from_article);
+points_in_articles.push(extract_points_from_article4(articles_in_chapter_1[3]));
+for(var i=0; i<articles_in_chapter_1.length; i++) {
+	var article = articles_in_chapter_1[i];
+	article.contents = points_in_articles[i];
+}
+chapters[0].contents = articles_in_chapter_1;
+
+/**
+ * create the final data object that will hold all the items together
+ */
+var data = {
+    'title': "General Data Protection Regulation",
+    'abbrv': "GDPR",
+    'regulation': "2016/679",
+    'dated': "27/04/2016",
+    'updated': "04/05/2016",
+    'about': "protection of natural persons with regard to the processing of personal data and on the free movement of such data, and repealing Directive 95/46/EC (General Data Protection Regulation)",
+    'identifier': "L 119/1",
+    'language': "EN",
+	'chapters': [chapters[0]],
+    'recitals': []
+};
+// chapters yet to be processed
+chapters = chapters.slice(1);
+
+/**
+ * Process the other chapters
+ * Now these other chapters can also contain sections, so the first test to be made
+ * is whether a chapter contains any sections. If it does, then the sections get
+ * extracted first, and after that the articles need to be extracted from the sections.
+ * If a chapter does not contain any sections, then the articles will be extracted
+ * directly from the chapter.
+ */
+
+var extract_sections_from_text = function(text) {
+	var sections = [];
+    var section = null;
+    var section_contents = null;
+    for (var i=0; i<text.length; i++) {
+        var element = text[i];
+        if (
+            element.prop('nodeName') == 'P' &&
+            /^d1e\d+-1-1$/.test(element.prop('id')) &&
+            $(element).children('span.expanded').length == 1
+        ) {
+        	var section_title = text[i+1].text().trim();
+        	i += 1;
+        	section_contents = [];
+        	section = {
+        		'number': element.text().trim().slice(7).trim(),
+        		'title': section_title,
+        		'type': 'section',
+        		'contents': section_contents
+        	};
+            sections.push(section);
+        } else {
+        	if (section_contents == null) {
+        		return null;
+        	}
+        	section_contents.push(element);
+        }
+    }
+    for (let sec of sections) {
+    	// console.log(sec.type, sec.number, sec.contents.length);
+    }
+
+    return sections;
+}
+for(let ch of chapters) {
+	var sections = extract_sections_from_text(ch.contents);
+	if (sections == null) {
+		// console.log(ch.type, ch.number, "does not have Sections");
+		var articles = extract_articles_from_text(ch.contents);
+		ch.contents = articles;
+		for(let at of articles) {
+			// console.log(at.type, at.number);
+			at.contents = extract_points_from_article(at);
+		}
+	} else {
+		// console.log(ch.type, ch.number, "has Sections");
+		ch.contents = sections;
+		for(let sec of sections) {
+			var articles = extract_articles_from_text(sec.contents);
+			for(let at of articles) {
+				at.contents = extract_points_from_article(at);
+			}
+;			sec.contents = articles;
+		}
+	}
+	data.chapters.push(ch);
+}
+
+/**
+ * DEBUG OUTPUT
+ */
+// for(let ch of data.chapters) {
+// 	console.log(ch.type, ch.number, ch.contents.length);
+// 	if (ch.contents[0].type == 'section') {
+// 		for(let sec of ch.contents) {
+// 			console.log(sec.type, sec.number, sec.contents.length);
+// 			for(let at of sec.contents) {
+// 				console.log(at.type, at.number, at.contents.length);
+// 				for(let pt of at.contents) {
+// 					console.log(pt.type, pt.number, pt.subpoints.length);
+// 				}
+// 			}
+// 		}
+// 	} else {
+// 		for(let at of ch.contents) {
+// 			console.log(at.type, at.number, at.contents.length);
+// 			for(let pt of at.contents) {
+// 				console.log(pt.type, pt.number, pt.subpoints.length);
+// 			}
+// 		}
+// 	}
+// }
+
+
+/**
+ * RECITALS
+ */
+// The recitals start at the table (second) which is after the p element
+// with id = d1e40-1-1
+var element = $('p#d1e40-1-1');
+while(element.prop('nodeName') != 'TABLE') element = element.next();
+// At this stage, element points to the table where recitals begin
+// Each table holds two p elements, where the first one contains the number
+// and the second contains the text 
+while(element.prop('nodeName') != 'P') {
+    var p_in_element = element.find('p');
+    var p_text = $(p_in_element[1]).text().trim();
+    var p_number = $(p_in_element[0]).text().match('^\\((\\d+)\\)')[1];
+    data.recitals.push({
+        'number': p_number,
+        'text': p_text,
+        'type': 'recital'
+    });
+    element = element.next();
+}
+
+/**
+ * CITATIONS
+ */
+// citations are p elements with class note
+data.citations = $('p.note').map(function(index, element) {
+    var match = $(element).text().trim().match('^\\((\\d+)\\)\\s+(.*)$')
+    return {
+        'number': match[1],
+        'text': match[2],
+        'type': 'citation'
+    };
+});
+
+
+/**
+ * Download data as JSON
+ */
+$('<a id="downloadAnchorElem" style="display:none"></a>').appendTo('body');
+var dataStr = "data:text/json;charset=utf-8," + encodeURIComponent(JSON.stringify(data));
+var btn_download = document.getElementById('downloadAnchorElem');
+btn_download.setAttribute("href", dataStr);
+btn_download.setAttribute("download", "gdpr.json");
+/** TO DOWNLOAD, CALL THIS IN BROWSER CONSOLE **/
+// btn_download.click();

Some files were not shown because too many files changed in this diff