// File: parse_gdpr.js
  /**
   * Parse GDPR text
   * @author Harshvardhan Pandit me a.t. harshp.com
   * @name parse_gdpr
   * LICENSE: MIT
   *
   * This script parses the GDPR text from the webpage into a JSON document.
   * It also rearranges the text into a cleaner structure in which each
   * individual item is assigned a contextual ID attribute, making it easy
   * to reference that particular item.
   */
  // Diagnostic logging switch: leave it true to keep console.log output,
  // set it to false to silence console.log for the rest of the script.
  var FLG_DEBUG = true;
  if (FLG_DEBUG) {
    // logging stays enabled; nothing to replace
  } else {
    console.log = function() {};
  }
  /**
   * Collects the sibling elements that hold the GDPR chapters and articles.
   *
   * The walk starts at the element matched by `p#d1e1374-1-1` (the CHAPTER I
   * heading) and follows `.next()` siblings until it reaches the element
   * matched by `div.final`, which is not included in the result.
   *
   * The walk also stops when the siblings run out, so a page without the
   * end marker (or without the start element) yields a finite, possibly
   * empty, list instead of looping forever.
   *
   * @return {jQuery[]} one jQuery object per collected element, in document order
   */
  var extract_gdpr_articles_from_page = function() {
    // ending element is the div marked final
    const end = $('div.final');
    const text = [];
    // first processing element is the CHAPTER I heading itself
    let element = $('p#d1e1374-1-1');
    // .next() on the last sibling returns an empty set (length 0); without
    // the length check that empty set would never match `end`
    while (element.length > 0 && !element.is(end)) {
      text.push(element);
      element = element.next();
    }
    return text;
  };
  // Collect the page elements once; the parsing steps work from this list.
  var txt_gdpr = extract_gdpr_articles_from_page();
  /**
   * Groups the flat list of page elements into chapters.
   *
   * A chapter begins at a P element whose id has the form d1e<digits>-1-1
   * and whose text is "CHAPTER <roman numeral>". The element right after the
   * heading supplies the chapter title. Every later element, up to the next
   * chapter heading, is appended to that chapter's contents.
   *
   * Elements that appear before the first chapter heading belong to no
   * chapter and are skipped. A heading that is the last element of the input
   * gets an empty title.
   *
   * @param {jQuery[]} text elements of the GDPR text, in document order
   * @return {Object[]} chapters, each {number, title, type: 'chapter', contents}
   *     where contents holds the chapter's elements (sections and articles)
   *     sequentially as jQuery objects
   */
  var extract_chapters_from_gdpr_text = function(text) {
    const chapters = [];
    // contents array of the chapter currently being filled; null until the
    // first chapter heading has been seen
    let chapter_contents = null;
    for (let i = 0; i < text.length; i++) {
      const element = text[i];
      const is_chapter_heading =
        // check element type is P
        element.prop('nodeName') === 'P' &&
        // test id syntax
        /^d1e\d+-1-1$/.test(element.prop('id')) &&
        // test text is chapter
        /^\s*CHAPTER [IVX]+\s*$/.test(element.text());
      if (is_chapter_heading) {
        // the title is in the next element, which is consumed here
        const title_element = text[i + 1];
        const chapter_title = title_element ? title_element.text().trim() : '';
        i += 1;
        chapter_contents = [];
        chapters.push({
          // drop the leading "CHAPTER " (8 characters) to keep the numeral
          'number': element.text().trim().slice(8),
          'title': chapter_title,
          "type": "chapter",
          "contents": chapter_contents
        });
      } else if (chapter_contents !== null) {
        chapter_contents.push(element);
      }
    }
    return chapters;
  };
  // Split the collected page elements into chapter records.
  var chapters = extract_chapters_from_gdpr_text(txt_gdpr);
  /**
   * Extracts articles from a run of elements (the contents of a chapter or
   * of a section).
   *
   * An article begins at an element whose id has the form d1e<digits>-1-1
   * and whose trimmed text is "Article <number>". The element right after
   * the heading supplies the article title. Every later element, up to the
   * next article heading, is appended to that article's contents.
   *
   * Elements that appear before the first article heading are skipped. A
   * heading that is the last element of the input gets an empty title.
   *
   * @param {jQuery[]} text elements to split into articles, in document order
   * @return {Object[]} articles, each {number, title, type: 'article', contents}
   *     where contents holds the article's elements as jQuery objects
   */
  var extract_articles_from_text = function(text) {
    const articles = [];
    // contents array of the article currently being filled; null until the
    // first article heading has been seen
    let article_contents = null;
    for (let i = 0; i < text.length; i++) {
      const element = text[i];
      const is_article_heading =
        // with id of the form
        /^d1e\d+-1-1$/.test(element.prop('id')) &&
        // and text is Article (surrounding whitespace tolerated)
        /^Article \d+$/.test(element.text().trim());
      if (is_article_heading) {
        // the title is in the next element, which is consumed here
        const title_element = text[i + 1];
        const article_title = title_element ? title_element.text().trim() : '';
        i += 1;
        article_contents = [];
        articles.push({
          // drop the leading "Article " (8 characters) to keep the number
          'number': element.text().trim().slice(8),
          'title': article_title,
          'type': 'article',
          'contents': article_contents
        });
      } else if (article_contents !== null) {
        article_contents.push(element);
      }
    }
    return articles;
  };
  /**
   * Article 4 needs special handling.
   * In Article 4 the first point is a generic point without a number, but
   * the other points are all (weirdly) in a table. Additionally, some of the
   * points have subpoints as internal tables.
   *
   * The difference between Article 4 and the other articles is the way the
   * points are parsed. In Article 4, in addition to each point being a
   * table, the subpoints are embedded in the point itself, whereas in other
   * articles the subpoints, though being a table, are a separate element.
   */
  /**
   * Extracts the points of Article 4. Only meant for Article 4.
   *
   * For each element of article4.contents:
   * - an element containing no <p> becomes an unnumbered 'text' point whose
   *   text is the element's trimmed text;
   * - otherwise the first <p> is read as the "(n)" label, the second <p> as
   *   the point text, and the remaining <p> elements are consumed in pairs
   *   of (label, text) as subpoints.
   * A label that does not start with "(...)" yields number null.
   *
   * @param {Object} article4 article record whose contents is a jQuery[]
   * @return {Object[]} points, each with
   *     - number: digit string or null
   *     - text: text of the point
   *     - type: 'text' or 'point'
   *     - subpoints: array of {number, text, type: 'subpoint'}
   * @example
   * // var points_in_article4 = extract_points_from_article4(article4);
   */
  var extract_points_from_article4 = function(article4) {
    const points = [];
    for (const element of article4.contents) {
      const p_in_element = element.find('p');
      if (p_in_element.length === 0) {
        points.push({
          'number': null,
          'text': element.text().trim(),
          'type': 'text',
          'subpoints': []
        });
        continue;
      }
      const number_match = $(p_in_element[0]).text().match(/^\((\d+)\)/);
      const p_text = $(p_in_element[1]).text().trim();
      // everything after the label and the text belongs to the subpoints
      const p_subpoints = p_in_element.slice(2);
      const subpoints = [];
      // subpoints come as consecutive (label, text) pairs, hence the step of 2
      for (let j = 0; j < p_subpoints.length; j += 2) {
        const subpoint_match = $(p_subpoints[j]).text().match(/^\((\w+)\)/);
        subpoints.push({
          'number': subpoint_match === null ? null : subpoint_match[1],
          'text': $(p_subpoints[j + 1]).text().trim(),
          'type': 'subpoint'
        });
      }
      points.push({
        'number': number_match === null ? null : number_match[1],
        'text': p_text,
        'type': 'point',
        'subpoints': subpoints
      });
    }
    return points;
  };
  /**
   * Extracts points from an article.
   *
   * The extraction works on the basis of sequential points: if a point has
   * a subpoint, the subpoint is in a FOLLOWING element as a table.
   *
   * - A P element whose trimmed text starts with "<digits>." followed by
   *   whitespace becomes a numbered 'point'; the text after the number is
   *   kept in full, including any line breaks.
   * - Any other P element becomes an unnumbered 'text' point.
   * - A TABLE element becomes a subpoint of the most recent point: its
   *   first <p> supplies the label (first run of word characters) and its
   *   second <p> the text. A table that precedes every paragraph is attached
   *   to an empty unnumbered point so that its data is not lost.
   * - Elements of any other type are ignored.
   *
   * @param {Object} article article record whose contents is a jQuery[]
   * @return {Object[]} points, each with
   *     - number: digit string or null
   *     - text: text of the point
   *     - type: 'text' or 'point'
   *     - subpoints: array of {number, text, type: 'subpoint'}
   */
  var extract_points_from_article = function(article) {
    const points = [];
    // most recent point; tables are attached to it as subpoints
    let point = null;
    for (const element of article.contents) {
      const element_type = element.prop('nodeName');
      if (element_type === 'P') {
        const element_text = element.text().trim();
        // the dot is escaped so that only "1.   text" counts as a numbered
        // point; [\s\S] lets the captured text span line breaks
        const match = element_text.match(/^(\d+)\.\s+([\s\S]*)$/);
        if (match === null) {
          // this point has no number
          point = {
            'number': null,
            'text': element_text,
            'type': 'text',
            'subpoints': []
          };
        } else {
          // point has a number
          point = {
            'number': match[1],
            'type': 'point',
            'subpoints': [],
            'text': match[2]
          };
        }
        points.push(point);
      } else if (element_type === 'TABLE') {
        const p_in_element = element.find('p');
        const number_match = $(p_in_element[0]).text().trim().match(/(\w+)/);
        if (point === null) {
          // no paragraph seen yet: create a parent to hang the subpoint on
          point = {
            'number': null,
            'text': '',
            'type': 'text',
            'subpoints': []
          };
          points.push(point);
        }
        point.subpoints.push({
          'number': number_match === null ? null : number_match[1],
          'text': $(p_in_element[1]).text().trim(),
          'type': 'subpoint'
        });
      }
    }
    return points;
  };
  /**
   * combine articles in chapter 1 together
   *
   * Article 4 uses a table-based layout and needs its own parser. It is
   * recognised by its article number rather than by its position, so the
   * step keeps working whatever the number of articles found in the chapter.
   */
  var articles_in_chapter_1 = extract_articles_from_text(chapters[0].contents);
  var points_in_articles = articles_in_chapter_1.map(function(article) {
    return article.number === '4'
      ? extract_points_from_article4(article)
      : extract_points_from_article(article);
  });
  // replace each article's raw elements with its parsed points
  articles_in_chapter_1.forEach(function(article, index) {
    article.contents = points_in_articles[index];
  });
  chapters[0].contents = articles_in_chapter_1;
  /**
   * create the final data object that will hold all the items together
   */
  var data = {
    'title': "General Data Protection Regulation",
    'abbrv': "GDPR",
    'regulation': "2016/679",
    'dated': "27/04/2016",
    'updated': "04/05/2016",
    'about': "protection of natural persons with regard to the processing of personal data and on the free movement of such data, and repealing Directive 95/46/EC (General Data Protection Regulation)",
    'identifier': "L 119/1",
    'language': "EN",
    'chapters': [chapters[0]],
    'recitals': []
  };
  // chapters yet to be processed
  chapters = chapters.slice(1);
  /**
   * Process the other chapters
   * These other chapters can also contain sections, so the first test to be
   * made is whether a chapter contains any sections. If it does, the
   * sections get extracted first, and after that the articles need to be
   * extracted from the sections. If a chapter does not contain any sections,
   * the articles are extracted directly from the chapter.
   */
  /**
   * Splits the contents of a chapter into sections.
   *
   * A section begins at a P element whose id has the form d1e<digits>-1-1
   * and which has exactly one child matching `span.expanded`. The element
   * right after the heading supplies the section title (empty when the
   * heading is the last element). Every later element, up to the next
   * section heading, is appended to that section's contents.
   *
   * @param {jQuery[]} text contents of a chapter, in document order
   * @return {Object[]|null} sections, each {number, title, type: 'section',
   *     contents}; null as soon as an element is met before any section
   *     heading, which signals that the chapter has no sections; an empty
   *     array for empty input
   */
  var extract_sections_from_text = function(text) {
    const sections = [];
    // contents array of the section currently being filled; null until the
    // first section heading has been seen
    let section_contents = null;
    for (let i = 0; i < text.length; i++) {
      const element = text[i];
      const is_section_heading =
        element.prop('nodeName') === 'P' &&
        /^d1e\d+-1-1$/.test(element.prop('id')) &&
        $(element).children('span.expanded').length === 1;
      if (is_section_heading) {
        // the title is in the next element, which is consumed here
        const title_element = text[i + 1];
        const section_title = title_element ? title_element.text().trim() : '';
        i += 1;
        section_contents = [];
        sections.push({
          // drops the first 7 characters of the heading text
          // (presumably the word "Section" -- TODO confirm against the page)
          'number': element.text().trim().slice(7).trim(),
          'title': section_title,
          'type': 'section',
          'contents': section_contents
        });
      } else if (section_contents === null) {
        // content before any section heading: this text has no sections
        return null;
      } else {
        section_contents.push(element);
      }
    }
    return sections;
  };
  // Parse every remaining chapter and add it to the final data object.
  // A chapter either holds sections (each of which holds articles) or holds
  // articles directly; in both cases the raw elements of the innermost
  // container are replaced by articles whose contents are parsed points.
  for (const ch of chapters) {
    var sections = extract_sections_from_text(ch.contents);
    // null means the chapter has no sections, so the chapter itself is the
    // container whose contents must be turned into articles
    const containers = sections == null ? [ch] : sections;
    if (sections != null) {
      ch.contents = sections;
    }
    for (const container of containers) {
      var articles = extract_articles_from_text(container.contents);
      for (const at of articles) {
        at.contents = extract_points_from_article(at);
      }
      container.contents = articles;
    }
    data.chapters.push(ch);
  }
  /**
   * DEBUG OUTPUT
   */
  // for(let ch of data.chapters) {
  //   console.log(ch.type, ch.number, ch.contents.length);
  //   if (ch.contents[0].type == 'section') {
  //     for(let sec of ch.contents) {
  //       console.log(sec.type, sec.number, sec.contents.length);
  //       for(let at of sec.contents) {
  //         console.log(at.type, at.number, at.contents.length);
  //         for(let pt of at.contents) {
  //           console.log(pt.type, pt.number, pt.subpoints.length);
  //         }
  //       }
  //     }
  //   } else {
  //     for(let at of ch.contents) {
  //       console.log(at.type, at.number, at.contents.length);
  //       for(let pt of at.contents) {
  //         console.log(pt.type, pt.number, pt.subpoints.length);
  //       }
  //     }
  //   }
  // }
  /**
   * RECITALS
   */
  // The recitals start at the table (second) which is after the p element
  // with id = d1e40-1-1
  var element = $('p#d1e40-1-1');
  // Advance to the first TABLE sibling. The length check ends the walk when
  // the siblings run out, instead of spinning forever on an empty set.
  while (element.length > 0 && element.prop('nodeName') !== 'TABLE') {
    element = element.next();
  }
  // At this stage, element points to the table where recitals begin
  // Each table holds two p elements, where the first one contains the number
  // and the second contains the text
  // The walk ends at the first P sibling, or when the siblings run out.
  while (element.length > 0 && element.prop('nodeName') !== 'P') {
    const p_in_element = element.find('p');
    const number_match = $(p_in_element[0]).text().match(/^\((\d+)\)/);
    if (number_match === null) {
      // not a "(n)" recital table: report and skip rather than abort the run
      console.log('skipping element without a recital number', element.prop('nodeName'));
    } else {
      data.recitals.push({
        'number': number_match[1],
        'text': $(p_in_element[1]).text().trim(),
        'type': 'recital'
      });
    }
    element = element.next();
  }
  /**
   * CITATIONS
   */
  // citations are p elements with class note
  // Each note is expected to read "(n) text". The result is the jQuery
  // object returned by .map(), not a plain array.
  data.citations = $('p.note').map(function(index, element) {
    // [\s\S] lets the citation text span line breaks
    var match = $(element).text().trim().match(/^\((\d+)\)\s+([\s\S]*)$/);
    if (match === null) {
      // jQuery's .map() inserts nothing for a null return, so a note that
      // does not follow the "(n) text" pattern is left out instead of
      // aborting the whole run with a TypeError
      return null;
    }
    return {
      'number': match[1],
      'text': match[2],
      'type': 'citation'
    };
  });
  /**
   * Download data as JSON
   */
  // data.citations is the jQuery object produced by .map(); strip these
  // properties from it before serialising (presumably so that only the
  // indexed citation entries end up in the JSON -- confirm against output)
  ['prevObject', 'context', 'length'].forEach(function(key) {
    delete data.citations[key];
  });
  // hidden anchor that carries the serialised data as a data: URL
  $('body').append('<a id="downloadAnchorElem" style="display:none"></a>');
  var dataStr = "data:text/json;charset=utf-8," + encodeURIComponent(JSON.stringify(data));
  var btn_download = document.getElementById('downloadAnchorElem');
  btn_download.setAttribute("href", dataStr);
  btn_download.setAttribute("download", "gdpr.json");
  /** TO DOWNLOAD, CALL THIS IN BROWSER CONSOLE **/
  // btn_download.click();