View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.csv;
19  
20  import static java.lang.Character.toUpperCase;
21  
22  import org.apache.any23.extractor.ExtractionContext;
23  import org.apache.any23.extractor.ExtractionException;
24  import org.apache.any23.extractor.ExtractionParameters;
25  import org.apache.any23.extractor.ExtractionResult;
26  import org.apache.any23.extractor.Extractor;
27  import org.apache.any23.extractor.ExtractorDescription;
28  import org.apache.any23.extractor.ExtractorFactory;
29  import org.apache.any23.extractor.SimpleExtractorFactory;
30  import org.apache.any23.rdf.RDFUtils;
31  import org.apache.any23.vocab.CSV;
32  import org.apache.commons.csv.CSVParser;
33  import org.openrdf.model.URI;
34  import org.openrdf.model.Value;
35  import org.openrdf.model.impl.LiteralImpl;
36  import org.openrdf.model.impl.URIImpl;
37  import org.openrdf.model.vocabulary.RDF;
38  import org.openrdf.model.vocabulary.RDFS;
39  import org.openrdf.model.vocabulary.XMLSchema;
40  
41  import java.io.IOException;
42  import java.io.InputStream;
43  import java.util.Arrays;
44  import java.util.StringTokenizer;
45  
46  /**
47   * This extractor produces <i>RDF</i> from a <i>CSV file</i> .
48   * It automatically detects fields <i>delimiter</i>. If not able uses
49   * the one provided in the <i>Any23</i> configuration.
50   *
51   * @see CSVReaderBuilder
52   * @author Davide Palmisano ( dpalmisano@gmail.com )
53   */
54  public class CSVExtractor implements Extractor.ContentExtractor {
55  
56      private CSVParser csvParser;
57  
58      private URI[] headerURIs;
59  
60      private CSV csv = CSV.getInstance();
61  
62      public final static ExtractorFactory<CSVExtractor> factory =
63              SimpleExtractorFactory.create(
64                      "csv",
65                      null,
66                      Arrays.asList(
67                              "text/csv;q=0.1"
68                      ),
69                      "example-csv.csv",
70                      CSVExtractor.class
71              );
72  
73      /**
74       * {@inheritDoc}
75       */
76      public void setStopAtFirstError(boolean f) {
77      }
78  
79      /**
80       * {@inheritDoc}
81       */
82      public void run(
83              ExtractionParameters extractionParameters,
84              ExtractionContext extractionContext,
85              InputStream in
86              , ExtractionResult out
87      ) throws IOException, ExtractionException {
88          final URI documentURI = extractionContext.getDocumentURI();
89  
90          // build the parser
91          csvParser = CSVReaderBuilder.build(in);
92  
93          // get the header and generate the URIs for column names
94          String[] header = csvParser.getLine();
95          headerURIs = processHeader(header, documentURI);
96  
97          // write triples to describe properties
98          writeHeaderPropertiesMetadata(header, out);
99  
100         String[] nextLine;
101         int index = 0;
102         while ((nextLine = csvParser.getLine()) != null) {
103             URI rowSubject = RDFUtils.uri(
104                     documentURI.toString(),
105                     "row/" + index
106             );
107             // add a row type
108             out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
109             // for each row produce its statements
110             produceRowStatements(rowSubject, nextLine, out);
111             // link the row to the document
112             out.writeTriple(documentURI, csv.row, rowSubject);
113             // the progressive row number
114             out.writeTriple(
115                     rowSubject,
116                     csv.rowPosition,
117                     new LiteralImpl(String.valueOf(index))
118             );
119             index++;
120         }
121         // add some CSV metadata such as the number of rows and columns
122         addTableMetadataStatements(
123                 documentURI,
124                 out,
125                 index,
126                 headerURIs.length
127         );
128     }
129 
130     /**
131      * Check whether a number is an integer.
132      *
133      * @param number
134      * @return
135      */
136     private boolean isInteger(String number) {
137         try {
138             Integer.valueOf(number);
139             return true;
140         } catch (NumberFormatException e) {
141             return false;
142         }
143     }
144 
145     /**
146      * Check whether a number is a float.
147      *
148      * @param number
149      * @return
150      */
151     private boolean isFloat(String number) {
152         try {
153             Float.valueOf(number);
154             return true;
155         } catch (NumberFormatException e) {
156             return false;
157         }
158     }
159 
160     /**
161      * It writes <i>RDF</i> statements representing properties of the header.
162      *
163      * @param header
164      * @param out
165      */
166     private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) {
167         int index = 0;
168         for (URI singleHeader : headerURIs) {
169             if (index > headerURIs.length) {
170                 break;
171             }
172             if (!RDFUtils.isAbsoluteURI(header[index])) {
173                 out.writeTriple(
174                         singleHeader,
175                         RDFS.LABEL,
176                         new LiteralImpl(header[index])
177                 );
178             }
179             out.writeTriple(
180                     singleHeader,
181                     csv.columnPosition,
182                     new LiteralImpl(String.valueOf(index), XMLSchema.INTEGER)
183             );
184             index++;
185         }
186     }
187 
188     /**
189      * It process the first row of the file, returning a list of {@link URI}s representing
190      * the properties for each column. If a value of the header is an absolute <i>URI</i>
191      * then it leave it as is. Otherwise the {@link org.apache.any23.vocab.CSV} vocabulary is used.
192      *
193      * @param header
194      * @return an array of {@link URI}s identifying the column names.
195      */
196     private URI[] processHeader(String[] header, URI documentURI) {
197         URI[] result = new URI[header.length];
198         int index = 0;
199         for (String h : header) {
200             String candidate = h.trim();
201             if (RDFUtils.isAbsoluteURI(candidate)) {
202                 result[index] = new URIImpl(candidate);
203             } else {
204                 result[index] = normalize(candidate, documentURI);
205             }
206             index++;
207         }
208         return result;
209     }
210 
211     private URI normalize(String toBeNormalized, URI documentURI) {
212         toBeNormalized = toBeNormalized.trim().toLowerCase().replace("?", "").replace("&", "");
213 
214         StringBuilder result = new StringBuilder(documentURI.toString());
215 
216         StringTokenizer tokenizer = new StringTokenizer(toBeNormalized, " ");
217         while (tokenizer.hasMoreTokens()) {
218             String current = tokenizer.nextToken();
219 
220             result.append(toUpperCase(current.charAt(0))).append(current.substring(1));
221         }
222 
223         return new URIImpl(result.toString());
224     }
225 
226     /**
227      * It writes on the provided {@link ExtractionResult}, the </>RDF statements</>
228      * representing the row <i>cell</i>. If a  row <i>cell</i> is an absolute <i>URI</i>
229      * then an object property is written, literal otherwise.
230      *
231      * @param rowSubject
232      * @param values
233      * @param out
234      */
235     private void produceRowStatements(
236             URI rowSubject,
237             String[] values,
238             ExtractionResult out
239     ) {
240         int index = 0;
241         for (String cell : values) {
242             if (index >= headerURIs.length) {
243                 // there are some row cells that don't have an associated column name
244                 break;
245             }
246             if (cell.equals("")) {
247                 continue;
248             }
249             URI predicate = headerURIs[index];
250             Value object = getObjectFromCell(cell);
251             out.writeTriple(rowSubject, predicate, object);
252             index++;
253         }
254     }
255 
256     private Value getObjectFromCell(String cell) {
257         Value object;
258         cell = cell.trim();
259         if (RDFUtils.isAbsoluteURI(cell)) {
260             object = new URIImpl(cell);
261         } else {
262             URI datatype = XMLSchema.STRING;
263             if (isInteger(cell)) {
264                 datatype = XMLSchema.INTEGER;
265             } else if(isFloat(cell)) {
266                 datatype = XMLSchema.FLOAT;
267             }
268             object = new LiteralImpl(cell, datatype);
269         }
270         return object;
271     }
272 
273     /**
274      * It writes on the provided {@link ExtractionResult} some <i>RDF Statements</i>
275      * on generic properties of the <i>CSV</i> file, such as number of rows and columns.
276      *
277      * @param documentURI
278      * @param out
279      * @param numberOfRows
280      * @param numberOfColumns
281      */
282     private void addTableMetadataStatements(
283             URI documentURI,
284             ExtractionResult out,
285             int numberOfRows,
286             int numberOfColumns) {
287         out.writeTriple(
288                 documentURI,
289                 csv.numberOfRows,
290                 new LiteralImpl(String.valueOf(numberOfRows), XMLSchema.INTEGER)
291         );
292         out.writeTriple(
293                 documentURI,
294                 csv.numberOfColumns,
295                 new LiteralImpl(String.valueOf(numberOfColumns), XMLSchema.INTEGER)
296         );
297     }
298 
299     /**
300      * {@inheritDoc}
301      */
302     public ExtractorDescription getDescription() {
303         return factory;
304     }
305 }