Browse Source

Support for treating CSV files as tables

Christophe Debruyne 7 years ago
parent
commit
3480db2f61

+ 4 - 1
README.md

@@ -25,8 +25,11 @@ Where `config.properties` is a properties file containing:
 - `format`, format of the output files (default "TURTLE")
 - `filePerGraph`, flag to write the different graphs in separate files (default "false")
 - `baseIRI`, used in resolving relative IRIs produced by the R2RML mapping
+- `CSVFiles`, a semicolon-separated list of paths to CSV files.
 
-When named graphs are used in the R2RML mapping, one should use serialization that support graphs such as N-QUADS and TRIG. The use of other serializations formats (such as TURTLE) results in all triples of all graphs being written away to that file. When setting the flag `filePerGraph` to `true` for serialization formats that do not support graphs, however, the value for `outputFile` will be used to create a directory in which a file will be created for each graph in RDF dataset.
+When named graphs are used in the R2RML mapping, one should use a serialization that supports graphs, such as N-QUADS or TRIG. The use of other serialization formats (such as TURTLE) results in all triples of all graphs being written to that file. When setting the flag `filePerGraph` to `true` for serialization formats that do not support graphs, however, the value for `outputFile` will be used to create a directory in which a file will be created for each graph in the RDF dataset.
+
+Note that you cannot use both `CSVFiles` and `connectionURL` at the same time. For each CSV file, the name of the table will be the base name of that file.
 
 ## Example
 

+ 6 - 0
pom.xml

@@ -77,6 +77,12 @@
 			<artifactId>postgresql</artifactId>
 			<version>9.1-901.jdbc4</version>
 		</dependency>
+		<!-- h2 allows us to create tables from CSV files in an easy way -->
+		<dependency>
+			<groupId>com.h2database</groupId>
+			<artifactId>h2</artifactId>
+			<version>1.4.193</version>
+		</dependency>
 	</dependencies>
 	<organization>
 		<name>Adapt Centre, Trinity College Dublin</name>

+ 29 - 1
src/r2rml/engine/Configuration.java

@@ -3,7 +3,10 @@ package r2rml.engine;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Properties;
+import java.util.StringTokenizer;
 
 import org.apache.log4j.Logger;
 
@@ -26,7 +29,8 @@ public class Configuration {
 	private String format = null;
 	private String baseIRI = null;
 	private boolean filePerGraph = false;
-	
+	private List<String> CSVFiles = new ArrayList<String>();
+
 	public Configuration(String path) throws R2RMLException {
 		Properties properties = new Properties();
 		try {
@@ -43,6 +47,14 @@ public class Configuration {
 		format = properties.getProperty("format", "TURTLE");
 		setFilePerGraph("true".equals(properties.getProperty("filePerGraph", "false").toLowerCase()));
 		baseIRI = properties.getProperty("baseIRI");
+		
+		String files = properties.getProperty("CSVFiles");
+		if(files != null && !"".equals(files)) {
+			StringTokenizer tk = new StringTokenizer(files, ";");
+			while(tk.hasMoreTokens()) {
+				CSVFiles.add(tk.nextToken());
+			}
+		}
 	}
 	
 	public Configuration() {
@@ -119,4 +131,20 @@ public class Configuration {
 		this.filePerGraph = filePerGraph;
 	}
 	
+	/**
+	 * Returns the paths of the CSV files registered in this configuration.
+	 *
+	 * @return the list held by this configuration itself (not a copy), so
+	 *         callers can add paths to it directly
+	 */
+	public List<String> getCSVFiles() {
+		return this.CSVFiles;
+	}
+
+	/**
+	 * Replaces the list of CSV file paths held by this configuration.
+	 *
+	 * @param cSVFiles the new list of paths; stored as-is, without copying
+	 */
+	public void setCSVFiles(List<String> cSVFiles) {
+		this.CSVFiles = cSVFiles;
+	}
+
+	/**
+	 * Tells whether a connection URL has been provided.
+	 *
+	 * @return true if the connection URL is neither null nor the empty string
+	 */
+	public boolean hasConnectionURL() {
+		if(connectionURL == null)
+			return false;
+		return !connectionURL.isEmpty();
+	}
+
+	/**
+	 * Tells whether at least one CSV file has been provided.
+	 *
+	 * @return true if the list of CSV files is non-null and not empty
+	 */
+	public boolean hasCSVFiles() {
+		if(CSVFiles == null)
+			return false;
+		return !CSVFiles.isEmpty();
+	}
+	
 }

+ 61 - 9
src/r2rml/engine/R2RMLProcessor.java

@@ -1,10 +1,13 @@
 package r2rml.engine;
 
+import java.io.File;
 import java.sql.Connection;
 import java.sql.DriverManager;
 import java.sql.SQLException;
+import java.sql.Statement;
 import java.util.Properties;
 
+import org.apache.commons.io.FilenameUtils;
 import org.apache.jena.query.Dataset;
 import org.apache.jena.query.DatasetFactory;
 import org.apache.log4j.Logger;
@@ -58,16 +61,35 @@ public class R2RMLProcessor {
 	}
 
 	private void createDatabaseConnection() {
-		String user = configuration.getUser();
-		String pass = configuration.getPassword();
-		Properties props = new Properties();
-		if(user != null && !"".equals(user))
-			props.setProperty("user", user);
-		if(pass != null && !"".equals(pass))
-			props.setProperty("password", pass);
-
+		// Determine situation
+		if(configuration.hasConnectionURL() && configuration.hasCSVFiles()) {
+			logger.error("You cannot provide a connection URL and a list of CSV files at the same time.");
+			System.exit(-1);
+		}
+		
 		try {
-			connection = DriverManager.getConnection(configuration.getConnectionURL(), props);
+			Properties props = new Properties();
+			
+			// If files, create in-memory database
+			if(configuration.hasCSVFiles()) {
+				try {
+					// This method will create a new connection URL that will be added to the configuration
+					connection = createTablesFromCSVFiles();
+				} catch (Exception ex) {
+					logger.error("Exception during database startup.", ex);
+					System.exit(-1);
+				}
+			} else {
+				// Connecting to a database
+				String user = configuration.getUser();
+				String pass = configuration.getPassword();
+				if(user != null && !"".equals(user))
+					props.setProperty("user", user);
+				if(pass != null && !"".equals(pass))
+					props.setProperty("password", pass);			
+				connection = DriverManager.getConnection(configuration.getConnectionURL(), props);
+			}
+			
 		} catch (SQLException e) {
 			logger.error("Error connecting to database.", e);
 			System.exit(-1);
@@ -75,6 +97,36 @@ public class R2RMLProcessor {
 
 	}
 
+	/**
+	 * Starts an in-memory H2 database and loads every CSV file of the
+	 * configuration into it as a table. The table name is obtained from
+	 * {@link #createTableNameForFile(File)}; the generated connection URL is
+	 * stored in the configuration.
+	 *
+	 * @return an open connection to the in-memory database; the caller is
+	 *         responsible for closing it
+	 * @throws Exception if the database cannot be started or a file cannot be
+	 *         loaded; the connection is closed before the exception propagates
+	 */
+	private Connection createTablesFromCSVFiles() throws Exception {
+		String connectionURL = "jdbc:h2:mem:" + System.currentTimeMillis();
+		configuration.setConnectionURL(connectionURL);
+
+		// A single connection suffices: an H2 in-memory database is created on
+		// first connect and dropped again when its last connection closes, so
+		// opening and closing a separate "create" connection achieves nothing.
+		logger.info("Starting in-memory database for CSV files");
+		Connection csvConnection = DriverManager.getConnection(connectionURL);
+
+		try {
+			Statement statement = csvConnection.createStatement();
+			try {
+				// for each file, load file as table...
+				for(String f : configuration.getCSVFiles()) {
+					File file = new File(f);
+					String name = createTableNameForFile(file);
+					// Double any single quote so the path cannot terminate the SQL string literal
+					String path = file.getAbsolutePath().replace("'", "''");
+					logger.info("Loading " + file + " as table " + name);
+					String sql = "CREATE TABLE " + name + " AS SELECT * FROM CSVREAD('"  + path + "', NULL, NULL);";
+					statement.execute(sql);
+					logger.info("Loaded " + file + " as table " + name);
+				}
+			} finally {
+				// only close the statement. The connection is returned to the caller
+				statement.close();
+			}
+		} catch (Exception e) {
+			// Loading failed: release the database before reporting the problem
+			try {
+				csvConnection.close();
+			} catch (SQLException ignored) {
+				// the original failure is the one worth reporting
+			}
+			throw e;
+		}
+
+		return csvConnection;
+	}
+
+	/**
+	 * Derives the table name for a CSV file: the base name of the file's
+	 * absolute path as computed by {@code FilenameUtils.getBaseName}.
+	 *
+	 * @param file the CSV file
+	 * @return the name to use for the table holding the file's content
+	 */
+	private String createTableNameForFile(File file) {
+		return FilenameUtils.getBaseName(file.getAbsolutePath());
+	}
+
 	private void closeDatabaseConnection() {
 		try {
 			if(!connection.isClosed())

+ 16 - 0
test/resources/CSV01.mapping.ttl

@@ -0,0 +1,16 @@
+@prefix rr: <http://www.w3.org/ns/r2rml#> .
+@prefix ex: <http://example.com/ns#> .
+
+<#TriplesMap1>
+    rr:logicalTable [ rr:tableName "EMP" ];
+    rr:subjectMap [
+        rr:template "http://data.example.com/employee/{EMPNO}";
+        rr:class ex:Employee;
+    ];
+    rr:predicateObjectMap [
+        rr:predicate ex:name;
+        rr:objectMap [ rr:column "ENAME" ];
+    ].
+	
+<#a> rr:objectMap <#b> .
+<#b> rr:column "ENAME" .

+ 4 - 0
test/resources/CSV01.output.ttl

@@ -0,0 +1,4 @@
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix ex: <http://example.com/ns#> .
+<http://data.example.com/employee/7369> rdf:type ex:Employee.
+<http://data.example.com/employee/7369> ex:name "SMITH".

+ 32 - 0
test/resources/CSV02.mapping.ttl

@@ -0,0 +1,32 @@
+@prefix rr: <http://www.w3.org/ns/r2rml#> .
+@prefix ex: <http://example.com/ns#> .
+
+<#TriplesMap1>
+    rr:logicalTable [ rr:tableName "EMP" ];
+    rr:subjectMap [
+        rr:template "http://data.example.com/employee/{EMPNO}";
+    ];
+	rr:predicateObjectMap [
+		rr:predicate ex:department;
+		rr:objectMap [
+			rr:parentTriplesMap <#TriplesMap2>;
+			rr:joinCondition [
+				rr:child "DEPTNO";
+				rr:parent "DEPTNO";
+			];
+		];
+	].
+	
+<#TriplesMap2>
+    rr:logicalTable <#DeptTableView>;
+    rr:subjectMap [
+        rr:template "http://data.example.com/department/{DEPTNO}";
+    ].
+	
+<#DeptTableView> rr:sqlQuery """
+	SELECT DEPTNO,
+	       DNAME,
+	       LOC,
+	       (SELECT COUNT(*) FROM EMP WHERE EMP.DEPTNO=DEPT.DEPTNO) AS STAFF
+	FROM DEPT
+	""".

+ 3 - 0
test/resources/CSV02.output.ttl

@@ -0,0 +1,3 @@
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix ex: <http://example.com/ns#> .
+<http://data.example.com/employee/7369> ex:department <http://data.example.com/department/10> .

+ 1 - 0
test/resources/DEPT.CSV

@@ -0,0 +1 @@
+DEPTNO,DNAME,LOC
10,APPSERVER,NEW YORK

+ 1 - 0
test/resources/EMP.CSV

@@ -0,0 +1 @@
+EMPNO,ENAME,JOB,DEPTNO
7369,SMITH,CLERK,10

+ 62 - 0
test/test/TestR2RMLCSV.java

@@ -0,0 +1,62 @@
+package test;
+
+import org.apache.jena.rdf.model.Model;
+import org.apache.jena.rdf.model.ModelFactory;
+import org.apache.log4j.BasicConfigurator;
+import org.junit.BeforeClass;
+
+import junit.framework.TestCase;
+import r2rml.engine.Configuration;
+import r2rml.engine.R2RMLProcessor;
+
+/**
+ * Unit tests for running R2RML mappings over CSV files that the
+ * processor exposes as tables.
+ * 
+ * @author Christophe Debruyne
+ *
+ */
+public class TestR2RMLCSV extends TestCase {
+
+	// Ensures BasicConfigurator.configure() runs only once for this class
+	private static boolean loggingConfigured = false;
+
+	public TestR2RMLCSV(String testName) {
+		super(testName);
+	}
+
+	/**
+	 * Log4J junit configuration. Safe to call repeatedly: only the first
+	 * call configures Log4J.
+	 */
+	@BeforeClass
+	public static void init() throws Exception {
+		if(!loggingConfigured) {
+			BasicConfigurator.configure();
+			loggingConfigured = true;
+		}
+	}
+
+	/**
+	 * JUnit 3 style test cases (junit.framework.TestCase) do not honour
+	 * the BeforeClass annotation, so the logging set-up is triggered here.
+	 */
+	@Override
+	protected void setUp() throws Exception {
+		super.setUp();
+		init();
+	}
+
+	public void testExampleCSV01() {
+		assertMappingProduces("./test/resources/CSV01.mapping.ttl", "./test/resources/CSV01.output.ttl");
+	}
+
+	public void testExampleCSV02() {
+		assertMappingProduces("./test/resources/CSV02.mapping.ttl", "./test/resources/CSV02.output.ttl");
+	}
+
+	/**
+	 * Runs the given mapping over the EMP and DEPT CSV files and checks that
+	 * the default graph contains exactly the triples of the expected file.
+	 * 
+	 * @param mappingFile path of the R2RML mapping to execute
+	 * @param expectedOutputFile path of the file with the expected triples
+	 */
+	private void assertMappingProduces(String mappingFile, String expectedOutputFile) {
+		Configuration configuration = new Configuration();
+		configuration.setMappingFile(mappingFile);
+		configuration.getCSVFiles().add("./test/resources/EMP.CSV");
+		configuration.getCSVFiles().add("./test/resources/DEPT.CSV");
+
+		R2RMLProcessor engine = new R2RMLProcessor(configuration);
+		engine.execute();
+		Model model = engine.getDataset().getDefaultModel();
+
+		Model target = ModelFactory.createDefaultModel();
+		target.read(expectedOutputFile);
+
+		// Compare in both directions: nothing extra generated, nothing expected missing
+		assertTrue("Unexpected triples were generated", model.difference(target).isEmpty());
+		assertTrue("Expected triples are missing", target.difference(model).isEmpty());
+	}
+
+}