// harsh / GDPRtEXT

							/**
 * Parse GDPR text
 * @author Harshvardhan Pandit me a.t. harshp.com
 * @name parse_gdpr
 * LICENSE: MIT
 *
 * This script parses GDPR text from the webpage into a JSON document.
 * It also re-arranges the text into a better version where each individual
 * item in the text is assigned a contextual ID attribute, making it
 * easy to reference the particular item.
 */

// Debug switch for the whole script: when it is false, console.log is
// replaced by a no-op so that every logging call goes quiet.
var FLG_DEBUG = true;
if (FLG_DEBUG) {
	// debugging enabled: console.log is left exactly as the host provides it
} else {
	console.log = function() {};
}

/**
 * Collects the page elements that hold the GDPR chapters and articles.
 * The walk starts at the CHAPTER I heading (p#d1e1374-1-1) and moves forward
 * through its following siblings until the element matching div.final.
 * If the siblings run out before div.final is reached (or the heading is not
 * on the page at all) the walk stops there instead of looping forever, and a
 * warning is written to the console.
 * @return {jQuery array} the elements from CHAPTER I up to, but not
 * including, div.final, each as a jQuery object, in document order
 */
var extract_gdpr_articles_from_page = function() {
    // ending element is the div marked final
    var end = $('div.final');
    // first processing element is the CHAPTER I heading itself
    var element = $('p#d1e1374-1-1');
    var text = [];

    // continue while there is a sibling left and it is not the end element;
    // an empty jQuery set never matches `end`, so its length must be checked
    while (element.length > 0 && !element.is(end)) {
        text.push(element);
        element = element.next();
    }
    if (element.length === 0) {
        console.warn('extract_gdpr_articles_from_page: ran out of siblings before reaching div.final');
    }

    return text;
}
// Runs at load time: the elements between CHAPTER I and div.final, in page order.
var txt_gdpr = extract_gdpr_articles_from_page();

/**
 * Splits the flat list of GDPR elements into chapters.
 * A chapter starts at a P element whose id looks like d1e<digits>-1-1 and
 * whose text is "CHAPTER <roman numeral>"; the element right after that
 * heading supplies the chapter title. Everything up to the next chapter
 * heading is stored, untouched, in that chapter's contents.
 * Note: an element that comes before the first chapter heading has no
 * chapter to go into and makes the function throw.
 * @param  {jQuery array} text contains the text of the GDPR
 * @return {[array of objects]} one object per chapter with the keys
 * number, title, type ("chapter") and contents (jQuery array)
 */
var extract_chapters_from_gdpr_text = function(text) {
    // true when the element is a "CHAPTER <numeral>" heading paragraph
    var is_chapter_heading = function(candidate) {
        if (candidate.prop('nodeName') != 'P') {
            return false;
        }
        if (!/^d1e\d+-1-1$/.test(candidate.prop('id'))) {
            return false;
        }
        return /^\s*CHAPTER [IVX]+\s*$/.test(candidate.text());
    };

    var chapters = [];
    // contents array of the chapter that is currently being filled
    var open_contents = null;
    var idx = 0;
    while (idx < text.length) {
        var element = text[idx];
        if (!is_chapter_heading(element)) {
            open_contents.push(element);
            idx += 1;
            continue;
        }
        // the title lives in the element that follows the heading
        var chapter_title = text[idx + 1].text().trim();
        open_contents = [];
        chapters.push({
            // drop the leading "CHAPTER " (8 characters) to keep the numeral
            'number': element.text().trim().slice(8),
            'title': chapter_title,
            "type": "chapter",
            "contents": open_contents
        });
        // step over both the heading and its title
        idx += 2;
    }
    return chapters;
}
// Runs at load time: the page elements grouped into chapter objects.
var chapters = extract_chapters_from_gdpr_text(txt_gdpr);

/**
 * Extracts articles from text (the contents of a chapter or of a section).
 * An article starts at an element whose id looks like d1e<digits>-1-1 and
 * whose text, ignoring surrounding whitespace, is "Article <number>"; the
 * element right after the heading supplies the article title. Everything up
 * to the next article heading is stored in that article's contents.
 * Elements that appear before the first article heading belong to no article
 * and are skipped. A heading that is the last element gets an empty title.
 * @param  {[jQuery array]} text contents of a chapter or section
 * @return {[array of objects]} one object per article with the keys
 * number, title, type ("article") and contents (jQuery array)
 */
var extract_articles_from_text = function(text) {
    var articles = [];
    // contents array of the article that is currently being filled
    var article_contents = null;
    for (var i=0; i<text.length; i++) {
        var element = text[i];
        // surrounding whitespace is ignored, like in the chapter and section
        // headings; the capture group holds the article number
        var heading = element.text().trim().match(/^Article\s+(\d+)$/);
        if (
            // id of the form used by heading paragraphs
            /^d1e\d+-1-1$/.test(element.prop('id')) &&
            // and text is Article
            heading != null
        ) {
            // the title is in the next element, when there is one
            var title_element = text[i+1];
            var article_title = title_element ? title_element.text().trim() : '';
            i += 1;
            article_contents = [];
            articles.push({
                'number': heading[1],
                'title': article_title,
                'type': 'article',
                'contents': article_contents
            });
        } else if (article_contents != null) {
            article_contents.push(element);
        }
        // otherwise: content ahead of the first heading, nothing to attach it to
    }

    return articles;
}

/** Article 4 needs special handling
 *	This is because in Article 4, the first point is a generic point without a number,
 *	but the other points are all (weirdly) in a table. Additionally, some of the points
 *	have subpoints as internal tables.
 *
 * The difference between the points in Article 4 and the other articles is the way
 * they are parsed. In Article 4, in addition to each point being a table, the subpoints
 * are embedded in the point itself, whereas in other articles, the subpoints though
 * being a table, are a separate element/item.
 */

/**
 * Extracts the points of Article 4 (and only Article 4).
 * Every element of the article is searched for p elements:
 *  - no p inside: the element is kept as an unnumbered 'text' point;
 *  - otherwise the first p carries the "(<digits>)" marker, the second p the
 *    point text, and the remaining p elements come in pairs of
 *    "(<marker>)" + subpoint text.
 * @param  {object} article4 article object whose contents is a jQuery array
 * @return {[array of points]}
 * Each point has the attributes
 * 	- number: digits as a string, or null when no marker was found
 * 	- text: text of the point
 * 	- type: text/point
 * 	- subpoints: [] of {number, text, type: 'subpoint'}
 */
var extract_points_from_article4 = function(article4) {
	// returns what the pattern captured from raw, or null without a match
	var read_marker = function(raw, pattern) {
		var found = raw.match(pattern);
		return found != null ? found[1] : null;
	};

	var items = article4.contents;
	var points = [];
	for (var i=0; i<items.length; i++) {
		var item = items[i];
		var paragraphs = item.find('p');
		if (paragraphs.length == 0) {
			// plain element: the introductory sentence without a number
			points.push({
				'number': null,
				'text': item.text().trim(),
				'type': 'text',
				'subpoints': []
			});
			continue;
		}

		var point_number = read_marker($(paragraphs[0]).text(), '^\\((\\d+)\\)');
		var point_text = $(paragraphs[1]).text().trim();

		// whatever follows the first two p elements is marker/text pairs
		var remainder = paragraphs.slice(2);
		var collected = [];
		for (var j=0; j<remainder.length; j += 2) {
			var marker = read_marker($(remainder[j]).text(), '^\\((\\w+)\\)');
			var body = $(remainder[j+1]).text().trim();
			collected.push({
				'number': marker,
				'text': body,
				'type': 'subpoint'
			});
		}

		points.push({
			'number': point_number,
			'text': point_text,
			'type': 'point',
			'subpoints': collected
		});
	}

	return points;
}
// var points_in_article4 = extract_points_from_article4(article4);


/**
 * Extracts points from an article.
 * The extraction works on sequential elements: a P element is a point, and a
 * TABLE element is a subpoint of the point that came just before it.
 *  - A P whose text is "<digits>." followed by whitespace and the body (or by
 *    nothing at all) becomes a numbered 'point'; the number and the body are
 *    stored separately and the body keeps any line breaks it contains.
 *  - Any other P becomes an unnumbered 'text' point holding the whole text.
 *  - In a TABLE the first p gives the subpoint marker (first run of word
 *    characters) and the second p the subpoint text.
 *  - A TABLE that shows up before any P is attached to an empty unnumbered
 *    'text' point so that its content is not lost.
 * Elements of any other type are ignored.
 * @param  {object} article article object whose contents is a jQuery array
 * @return {[array of points]}
 * Each point has the attributes
 * 	- number: digits as a string, or null
 * 	- text: text of the point
 * 	- type: text/point
 * 	- subpoints: [] of {number, text, type: 'subpoint'}
 */
var extract_points_from_article = function(article) {
	var text = article.contents;

	var points = [];
	// the point that subpoint tables get attached to
	var point = null;
	for (var element of text) {
		var element_type = element.prop('nodeName');
		if (element_type == 'P') {
			var element_text = element.text().trim();
			// "<digits>." with a literal dot, then either the end of the text
			// or whitespace plus the body; [\s\S] also crosses line breaks
			var match = element_text.match(/^(\d+)\.(?:\s+([\s\S]*))?$/);
			if (match == null) {
				// this point has no number
				point = {
					'number': null,
					'text': element_text,
					'type': 'text',
					'subpoints': []
				};
			} else {
				// point has a number
				point = {
					'number': match[1],
					'type': 'point',
					'subpoints': []
				};
				// the body is absent when the paragraph is only "<digits>."
				point.text = match[2] || '';
			}
			points.push(point);
		} else if (element_type == 'TABLE') {
			if (point == null) {
				// subpoint without a preceding paragraph: give it a holder
				point = {
					'number': null,
					'text': '',
					'type': 'text',
					'subpoints': []
				};
				points.push(point);
			}
			var p_in_element = element.find('p');
			var marker = $(p_in_element[0]).text().trim().match('(\\w+)');
			var p_text = $(p_in_element[1]).text().trim();
			point.subpoints.push({
				'number': marker == null ? null : marker[1],
				'text': p_text,
				'type': 'subpoint'
			});
		}
	}

	return points;
}

/**
 * Chapter 1 is assembled by hand: its first three articles go through the
 * regular point extraction, while its fourth article (Article 4) goes
 * through the dedicated table-based extraction. The extracted points then
 * replace the raw contents of each article, and the articles replace the raw
 * contents of the chapter.
 */
var articles_in_chapter_1 = extract_articles_from_text(chapters[0].contents);
var points_in_articles = articles_in_chapter_1
	.slice(0, 3)
	.map(extract_points_from_article)
	// wrapped in an array so the points of Article 4 are added as ONE entry
	.concat([extract_points_from_article4(articles_in_chapter_1[3])]);
for(var i=0; i<articles_in_chapter_1.length; i++) {
	var article = articles_in_chapter_1[i];
	// same index in both lists: article i receives points i
	article.contents = points_in_articles[i];
}
chapters[0].contents = articles_in_chapter_1;

/**
 * The final data object: document metadata plus the chapters and recitals.
 * It starts out with the already processed first chapter; the remaining
 * chapters are appended once they have been processed.
 */
var data = {
    title: "General Data Protection Regulation",
    abbrv: "GDPR",
    regulation: "2016/679",
    dated: "27/04/2016",
    updated: "04/05/2016",
    about: "protection of natural persons with regard to the processing of personal data and on the free movement of such data, and repealing Directive 95/46/EC (General Data Protection Regulation)",
    identifier: "L 119/1",
    language: "EN",
    chapters: [],
    recitals: []
};
data.chapters.push(chapters[0]);
// from here on `chapters` only holds the chapters that still need processing
chapters = chapters.slice(1);

/**
 * Process the other chapters
 * These other chapters can also contain sections, so the first test to be made
 * is whether a chapter contains any sections. If it does, then the sections get
 * extracted first, and after that the articles are extracted from the sections.
 * If a chapter does not contain any sections, then the articles are extracted
 * directly from the chapter.
 */

/**
 * Splits the contents of a chapter into sections.
 * A section starts at a P element whose id looks like d1e<digits>-1-1 and
 * which has exactly one span.expanded child; the element right after that
 * heading supplies the section title. Everything up to the next section
 * heading is stored in that section's contents.
 * @param  {[jQuery array]} text contents of a chapter
 * @return {[array of objects] or null} one object per section with the keys
 * number, title, type ("section") and contents (jQuery array); null as soon
 * as an element is met before any section heading, which the caller takes to
 * mean that the chapter has no sections
 */
var extract_sections_from_text = function(text) {
	var sections = [];
	// contents array of the section that is currently being filled
	var open_contents = null;
	var idx = 0;
	while (idx < text.length) {
		var element = text[idx];
		var is_section_heading =
			element.prop('nodeName') == 'P' &&
			/^d1e\d+-1-1$/.test(element.prop('id')) &&
			$(element).children('span.expanded').length == 1;

		if (is_section_heading) {
			// the title is in the element after the heading
			var section_title = text[idx + 1].text().trim();
			open_contents = [];
			sections.push({
				// strip the leading "Section" and the whitespace around the number
				'number': element.text().trim().slice(7).trim(),
				'title': section_title,
				'type': 'section',
				'contents': open_contents
			});
			// step over both the heading and its title
			idx += 2;
			continue;
		}

		if (open_contents == null) {
			// content ahead of any section heading
			return null;
		}
		open_contents.push(element);
		idx += 1;
	}

	return sections;
}
// Turn the raw contents of every remaining chapter into structured data and
// append the chapter to data.chapters. A chapter is either
//   chapter -> sections -> articles -> points, or
//   chapter -> articles -> points
// depending on whether section headings were found in it.
for(let ch of chapters) {
	var sections = extract_sections_from_text(ch.contents);
	if (sections != null) {
		// chapter with sections: articles are taken from each section
		ch.contents = sections;
		for(let sec of sections) {
			var articles = extract_articles_from_text(sec.contents);
			articles.forEach(function(at) {
				at.contents = extract_points_from_article(at);
			});
			sec.contents = articles;
		}
	} else {
		// chapter without sections: articles are taken from the chapter itself
		var articles = extract_articles_from_text(ch.contents);
		ch.contents = articles;
		articles.forEach(function(at) {
			at.contents = extract_points_from_article(at);
		});
	}
	data.chapters.push(ch);
}

/**
 * DEBUG OUTPUT
 */
// for(let ch of data.chapters) {
// 	console.log(ch.type, ch.number, ch.contents.length);
// 	if (ch.contents[0].type == 'section') {
// 		for(let sec of ch.contents) {
// 			console.log(sec.type, sec.number, sec.contents.length);
// 			for(let at of sec.contents) {
// 				console.log(at.type, at.number, at.contents.length);
// 				for(let pt of at.contents) {
// 					console.log(pt.type, pt.number, pt.subpoints.length);
// 				}
// 			}
// 		}
// 	} else {
// 		for(let at of ch.contents) {
// 			console.log(at.type, at.number, at.contents.length);
// 			for(let pt of at.contents) {
// 				console.log(pt.type, pt.number, pt.subpoints.length);
// 			}
// 		}
// 	}
// }


/**
 * RECITALS
 * Each recital is appended to data.recitals as
 * {number, text, type: 'recital'}.
 */
// The recitals start at the first table that follows the p element
// with id = d1e40-1-1
var element = $('p#d1e40-1-1');
// An empty jQuery set has no nodeName, so the length is checked as well;
// otherwise a page without such a table would keep this loop spinning forever.
while(element.length > 0 && element.prop('nodeName') != 'TABLE') element = element.next();
// At this stage, element points to the table where recitals begin
// Each table holds two p elements, where the first one contains the number
// and the second contains the text
// The walk ends at the next P sibling, or when the siblings run out.
while(element.length > 0 && element.prop('nodeName') != 'P') {
    var p_in_element = element.find('p');
    var p_text = $(p_in_element[1]).text().trim();
    var p_match = $(p_in_element[0]).text().trim().match('^\\((\\d+)\\)');
    if (p_match != null) {
        var p_number = p_match[1];
        data.recitals.push({
            'number': p_number,
            'text': p_text,
            'type': 'recital'
        });
    } else {
        // not a "(<digits>)" recital row: report it rather than crash
        console.warn('recitals: skipped an element without a recital number');
    }
    element = element.next();
}

/**
 * CITATIONS
 * Each citation is stored as {number, text, type: 'citation'}.
 * Note that data.citations is the jQuery object returned by .map(), not a
 * plain array.
 */
// citations are p elements with class note, whose text is "(<digits>) <text>"
data.citations = $('p.note').map(function(index, element) {
    // [\s\S]* instead of .* so that a line break inside the citation text
    // does not prevent the match
    var match = $(element).text().trim().match(/^\((\d+)\)\s+([\s\S]*)$/);
    if (match == null) {
        // returning null makes jQuery's .map() leave this element out
        console.warn('citations: skipped a p.note element that is not of the form "(n) text"');
        return null;
    }
    return {
        'number': match[1],
        'text': match[2],
        'type': 'citation'
    };
});


/**
 * Download data as JSON
 * Prepares a hidden anchor whose href is a data: URL holding the serialised
 * data object and whose download name is gdpr.json.
 */
// data.citations is a jQuery object: remove its bookkeeping properties
// before it gets serialised
['prevObject', 'context', 'length'].forEach(function(key) {
    delete data.citations[key];
});
$('<a id="downloadAnchorElem" style="display:none"></a>').appendTo('body');
var dataStr = "data:text/json;charset=utf-8," + encodeURIComponent(JSON.stringify(data));
var btn_download = document.getElementById('downloadAnchorElem');
[["href", dataStr], ["download", "gdpr.json"]].forEach(function(attribute) {
    btn_download.setAttribute(attribute[0], attribute[1]);
});
/** TO DOWNLOAD, CALL THIS IN BROWSER CONSOLE **/
// btn_download.click();