Coverage Report

Coverage Report - org.apache.any23.extractor.csv.CSVExtractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

CSVExtractor

0/79

0/26

2.545

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.csv;
 
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.SimpleExtractorFactory;
 import org.apache.any23.rdf.RDFUtils;
 import org.apache.any23.vocab.CSV;
 import org.apache.commons.csv.CSVParser;
 import org.openrdf.model.URI;
 import org.openrdf.model.Value;
 import org.openrdf.model.impl.LiteralImpl;
 import org.openrdf.model.impl.URIImpl;
 import org.openrdf.model.vocabulary.RDF;
 import org.openrdf.model.vocabulary.RDFS;
 import org.openrdf.model.vocabulary.XMLSchema;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Arrays;
 import java.util.Locale;
 
 /**
  * This extractor produces <i>RDF</i> from a <i>CSV file</i> .
  * It automatically detects fields <i>delimiter</i>. If not able uses
  * the one provided in the <i>Any23</i> configuration.
  *
  * @see {@link CSVReaderBuilder}
  * @author Davide Palmisano ( dpalmisano@gmail.com )
  */
 public class CSVExtractor implements Extractor.ContentExtractor {
 
     private CSVParser csvParser;
 
     private URI[] headerURIs;
 
     private CSV csv = CSV.getInstance();
 
     public final static ExtractorFactory<CSVExtractor> factory =
             SimpleExtractorFactory.create(
                     "csv",
                     null,
                     Arrays.asList(
                             "text/csv;q=0.1"
                     ),
                     "example-csv.csv",
                     CSVExtractor.class
             );
 
     /**
      * {@inheritDoc}
      */
     public void setStopAtFirstError(boolean f) {
     }
 
     /**
      * {@inheritDoc}
      */
     public void run(
             ExtractionParameters extractionParameters,
             ExtractionContext extractionContext,
             InputStream in
             , ExtractionResult out
     ) throws IOException, ExtractionException {
         final URI documentURI = extractionContext.getDocumentURI();
 
         // build the parser
         csvParser = CSVReaderBuilder.build(in);
 
         // get the header and generate the URIs for column names
         String[] header = csvParser.getLine();
         headerURIs = processHeader(header, documentURI);
 
         // write triples to describe properties
         writeHeaderPropertiesMetadata(header, out);
 
         String[] nextLine;
         int index = 0;
         while ((nextLine = csvParser.getLine()) != null) {
             URI rowSubject = RDFUtils.uri(
                     documentURI.toString(),
                     "row/" + index
             );
             // add a row type
             out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
             // for each row produce its statements
             produceRowStatements(rowSubject, nextLine, out);
             // link the row to the document
             out.writeTriple(documentURI, csv.row, rowSubject);
             // the progressive row number
             out.writeTriple(
                     rowSubject,
                     csv.rowPosition,
                     new LiteralImpl(String.valueOf(index))
             );
             index++;
         }
         // add some CSV metadata such as the number of rows and columns
         addTableMetadataStatements(
                 documentURI,
                 out,
                 index,
                 headerURIs.length
         );
     }
 
     /**
      * Check whether a number is an integer.
      *
      * @param number
      * @return
      */
     private boolean isInteger(String number) {
         try {
             Integer.valueOf(number);
             return true;
         } catch (NumberFormatException e) {
             return false;
         }
     }
 
     /**
      * Check whether a number is a float.
      *
      * @param number
      * @return
      */
     private boolean isFloat(String number) {
         try {
             Float.valueOf(number);
             return true;
         } catch (NumberFormatException e) {
             return false;
         }
     }
 
     /**
      * It writes <i>RDF</i> statements representing properties of the header.
      *
      * @param header
      * @param out
      */
     private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) {
         int index = 0;
         for (URI singleHeader : headerURIs) {
             if (index > headerURIs.length) {
                 break;
             }
             if (!RDFUtils.isAbsoluteURI(header[index])) {
                 out.writeTriple(
                         singleHeader,
                         RDFS.LABEL,
                         new LiteralImpl(header[index])
                 );
             }
             out.writeTriple(
                     singleHeader,
                     csv.columnPosition,
                     new LiteralImpl(String.valueOf(index), XMLSchema.INTEGER)
             );
             index++;
         }
     }
 
     /**
      * It process the first row of the file, returning a list of {@link URI}s representing
      * the properties for each column. If a value of the header is an absolute <i>URI</i>
      * then it leave it as is. Otherwise the {@link org.apache.any23.vocab.CSV} vocabulary is used.
      *
      * @param header
      * @return an array of {@link URI}s identifying the column names.
      */
     private URI[] processHeader(String[] header, URI documentURI) {
         URI[] result = new URI[header.length];
         int index = 0;
         for (String h : header) {
             String candidate = h.trim();
             if (RDFUtils.isAbsoluteURI(candidate)) {
                 result[index] = new URIImpl(candidate);
             } else {
                 result[index] = normalize(candidate, documentURI);
             }
             index++;
         }
         return result;
     }
 
     private URI normalize(String toBeNormalized, URI documentURI) {
         String candidate = toBeNormalized;
         candidate = candidate.trim().toLowerCase().replace("?", "").replace("&", "");
         String[] tokens = candidate.split(" ");
         candidate = tokens[0];
         for (int i = 1; i < tokens.length; i++) {
             String firstChar = ("" + tokens[i].charAt(0)).toUpperCase();
             candidate += firstChar + tokens[i].substring(1);
         }
         return new URIImpl(documentURI.toString() + candidate);
     }
 
     /**
      * It writes on the provided {@link ExtractionResult}, the </>RDF statements</>
      * representing the row <i>cell</i>. If a  row <i>cell</i> is an absolute <i>URI</i>
      * then an object property is written, literal otherwise.
      *
      * @param rowSubject
      * @param values
      * @param out
      */
     private void produceRowStatements(
             URI rowSubject,
             String[] values,
             ExtractionResult out
     ) {
         int index = 0;
         for (String cell : values) {
             if (index >= headerURIs.length) {
                 // there are some row cells that don't have an associated column name
                 break;
             }
             if (cell.equals("")) {
                 continue;
             }
             URI predicate = headerURIs[index];
             Value object = getObjectFromCell(cell);
             out.writeTriple(rowSubject, predicate, object);
             index++;
         }
     }
 
     private Value getObjectFromCell(String cell) {
         Value object;
         cell = cell.trim();
         if (RDFUtils.isAbsoluteURI(cell)) {
             object = new URIImpl(cell);
         } else {
             URI datatype = XMLSchema.STRING;
             if (isInteger(cell)) {
                 datatype = XMLSchema.INTEGER;
             } else if(isFloat(cell)) {
                 datatype = XMLSchema.FLOAT;
             }
             object = new LiteralImpl(cell, datatype);
         }
         return object;
     }
 
     /**
      * It writes on the provided {@link ExtractionResult} some <i>RDF Statements</i>
      * on generic properties of the <i>CSV</i> file, such as number of rows and columns.
      *
      * @param documentURI
      * @param out
      * @param numberOfRows
      * @param numberOfColumns
      */
     private void addTableMetadataStatements(
             URI documentURI,
             ExtractionResult out,
             int numberOfRows,
             int numberOfColumns) {
         out.writeTriple(
                 documentURI,
                 csv.numberOfRows,
                 new LiteralImpl(String.valueOf(numberOfRows), XMLSchema.INTEGER)
         );
         out.writeTriple(
                 documentURI,
                 csv.numberOfColumns,
                 new LiteralImpl(String.valueOf(numberOfColumns), XMLSchema.INTEGER)
         );
     }
 
     /**
      * {@inheritDoc}
      */
     public ExtractorDescription getDescription() {
         return factory;
     }
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.csv;
19
20		import org.apache.any23.extractor.ExtractionContext;
21		import org.apache.any23.extractor.ExtractionException;
22		import org.apache.any23.extractor.ExtractionParameters;
23		import org.apache.any23.extractor.ExtractionResult;
24		import org.apache.any23.extractor.Extractor;
25		import org.apache.any23.extractor.ExtractorDescription;
26		import org.apache.any23.extractor.ExtractorFactory;
27		import org.apache.any23.extractor.SimpleExtractorFactory;
28		import org.apache.any23.rdf.RDFUtils;
29		import org.apache.any23.vocab.CSV;
30		import org.apache.commons.csv.CSVParser;
31		import org.openrdf.model.URI;
32		import org.openrdf.model.Value;
33		import org.openrdf.model.impl.LiteralImpl;
34		import org.openrdf.model.impl.URIImpl;
35		import org.openrdf.model.vocabulary.RDF;
36		import org.openrdf.model.vocabulary.RDFS;
37		import org.openrdf.model.vocabulary.XMLSchema;
38
39		import java.io.IOException;
40		import java.io.InputStream;
41		import java.util.Arrays;
42
43		/**
44		* This extractor produces <i>RDF</i> from a <i>CSV file</i> .
45		* It automatically detects fields <i>delimiter</i>. If not able uses
46		* the one provided in the <i>Any23</i> configuration.
47		*
48		* @see CSVReaderBuilder
49		* @author Davide Palmisano ( dpalmisano@gmail.com )
50		*/
51	0	public class CSVExtractor implements Extractor.ContentExtractor {
52
53		private CSVParser csvParser;
54
55		private URI[] headerURIs;
56
57	0	private CSV csv = CSV.getInstance();
58
59	0	public final static ExtractorFactory<CSVExtractor> factory =
60		SimpleExtractorFactory.create(
61		"csv",
62		null,
63		Arrays.asList(
64		"text/csv;q=0.1"
65		),
66		"example-csv.csv",
67		CSVExtractor.class
68		);
69
70		/**
71		* {@inheritDoc}
72		*/
73		public void setStopAtFirstError(boolean f) {
74	0	}
75
76		/**
77		* {@inheritDoc}
78		*/
79		public void run(
80		ExtractionParameters extractionParameters,
81		ExtractionContext extractionContext,
82		InputStream in
83		, ExtractionResult out
84		) throws IOException, ExtractionException {
85	0	final URI documentURI = extractionContext.getDocumentURI();
86
87		// build the parser
88	0	csvParser = CSVReaderBuilder.build(in);
89
90		// get the header and generate the URIs for column names
91	0	String[] header = csvParser.getLine();
92	0	headerURIs = processHeader(header, documentURI);
93
94		// write triples to describe properties
95	0	writeHeaderPropertiesMetadata(header, out);
96
97		String[] nextLine;
98	0	int index = 0;
99	0	while ((nextLine = csvParser.getLine()) != null) {
100	0	URI rowSubject = RDFUtils.uri(
101		documentURI.toString(),
102		"row/" + index
103		);
104		// add a row type
105	0	out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
106		// for each row produce its statements
107	0	produceRowStatements(rowSubject, nextLine, out);
108		// link the row to the document
109	0	out.writeTriple(documentURI, csv.row, rowSubject);
110		// the progressive row number
111	0	out.writeTriple(
112		rowSubject,
113		csv.rowPosition,
114		new LiteralImpl(String.valueOf(index))
115		);
116	0	index++;
117	0	}
118		// add some CSV metadata such as the number of rows and columns
119	0	addTableMetadataStatements(
120		documentURI,
121		out,
122		index,
123		headerURIs.length
124		);
125	0	}
126
127		/**
128		* Check whether a number is an integer.
129		*
130		* @param number
131		* @return
132		*/
133		private boolean isInteger(String number) {
134		try {
135	0	Integer.valueOf(number);
136	0	return true;
137	0	} catch (NumberFormatException e) {
138	0	return false;
139		}
140		}
141
142		/**
143		* Check whether a number is a float.
144		*
145		* @param number
146		* @return
147		*/
148		private boolean isFloat(String number) {
149		try {
150	0	Float.valueOf(number);
151	0	return true;
152	0	} catch (NumberFormatException e) {
153	0	return false;
154		}
155		}
156
157		/**
158		* It writes <i>RDF</i> statements representing properties of the header.
159		*
160		* @param header
161		* @param out
162		*/
163		private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) {
164	0	int index = 0;
165	0	for (URI singleHeader : headerURIs) {
166	0	if (index > headerURIs.length) {
167	0	break;
168		}
169	0	if (!RDFUtils.isAbsoluteURI(header[index])) {
170	0	out.writeTriple(
171		singleHeader,
172		RDFS.LABEL,
173		new LiteralImpl(header[index])
174		);
175		}
176	0	out.writeTriple(
177		singleHeader,
178		csv.columnPosition,
179		new LiteralImpl(String.valueOf(index), XMLSchema.INTEGER)
180		);
181	0	index++;
182		}
183	0	}
184
185		/**
186		* It process the first row of the file, returning a list of {@link URI}s representing
187		* the properties for each column. If a value of the header is an absolute <i>URI</i>
188		* then it leave it as is. Otherwise the {@link org.apache.any23.vocab.CSV} vocabulary is used.
189		*
190		* @param header
191		* @return an array of {@link URI}s identifying the column names.
192		*/
193		private URI[] processHeader(String[] header, URI documentURI) {
194	0	URI[] result = new URI[header.length];
195	0	int index = 0;
196	0	for (String h : header) {
197	0	String candidate = h.trim();
198	0	if (RDFUtils.isAbsoluteURI(candidate)) {
199	0	result[index] = new URIImpl(candidate);
200		} else {
201	0	result[index] = normalize(candidate, documentURI);
202		}
203	0	index++;
204		}
205	0	return result;
206		}
207
208		private URI normalize(String toBeNormalized, URI documentURI) {
209	0	String candidate = toBeNormalized;
210	0	candidate = candidate.trim().toLowerCase().replace("?", "").replace("&", "");
211	0	String[] tokens = candidate.split(" ");
212	0	candidate = tokens[0];
213	0	for (int i = 1; i < tokens.length; i++) {
214	0	String firstChar = ("" + tokens[i].charAt(0)).toUpperCase();
215	0	candidate += firstChar + tokens[i].substring(1);
216		}
217	0	return new URIImpl(documentURI.toString() + candidate);
218		}
219
220		/**
221		* It writes on the provided {@link ExtractionResult}, the <i>RDF statements</i>
222		* representing the row <i>cell</i>. If a row <i>cell</i> is an absolute <i>URI</i>
223		* then an object property is written, literal otherwise.
224		*
225		* @param rowSubject
226		* @param values
227		* @param out
228		*/
229		private void produceRowStatements(
230		URI rowSubject,
231		String[] values,
232		ExtractionResult out
233		) {
234	0	int index = 0;
235	0	for (String cell : values) {
236	0	if (index >= headerURIs.length) {
237		// there are some row cells that don't have an associated column name
238	0	break;
239		}
240	0	if (cell.equals("")) {
241	0	continue;
242		}
243	0	URI predicate = headerURIs[index];
244	0	Value object = getObjectFromCell(cell);
245	0	out.writeTriple(rowSubject, predicate, object);
246	0	index++;
247		}
248	0	}
249
250		private Value getObjectFromCell(String cell) {
251		Value object;
252	0	cell = cell.trim();
253	0	if (RDFUtils.isAbsoluteURI(cell)) {
254	0	object = new URIImpl(cell);
255		} else {
256	0	URI datatype = XMLSchema.STRING;
257	0	if (isInteger(cell)) {
258	0	datatype = XMLSchema.INTEGER;
259	0	} else if(isFloat(cell)) {
260	0	datatype = XMLSchema.FLOAT;
261		}
262	0	object = new LiteralImpl(cell, datatype);
263		}
264	0	return object;
265		}
266
267		/**
268		* It writes on the provided {@link ExtractionResult} some <i>RDF Statements</i>
269		* on generic properties of the <i>CSV</i> file, such as number of rows and columns.
270		*
271		* @param documentURI
272		* @param out
273		* @param numberOfRows
274		* @param numberOfColumns
275		*/
276		private void addTableMetadataStatements(
277		URI documentURI,
278		ExtractionResult out,
279		int numberOfRows,
280		int numberOfColumns) {
281	0	out.writeTriple(
282		documentURI,
283		csv.numberOfRows,
284		new LiteralImpl(String.valueOf(numberOfRows), XMLSchema.INTEGER)
285		);
286	0	out.writeTriple(
287		documentURI,
288		csv.numberOfColumns,
289		new LiteralImpl(String.valueOf(numberOfColumns), XMLSchema.INTEGER)
290		);
291	0	}
292
293		/**
294		* {@inheritDoc}
295		*/
296		public ExtractorDescription getDescription() {
297	0	return factory;
298		}
299		}