Coverage Report - org.apache.any23.extractor.csv.CSVExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
CSVExtractor
0%
0/79
0%
0/26
2.545
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.csv;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionContext;
 21  
 import org.apache.any23.extractor.ExtractionException;
 22  
 import org.apache.any23.extractor.ExtractionParameters;
 23  
 import org.apache.any23.extractor.ExtractionResult;
 24  
 import org.apache.any23.extractor.Extractor;
 25  
 import org.apache.any23.extractor.ExtractorDescription;
 26  
 import org.apache.any23.extractor.ExtractorFactory;
 27  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 28  
 import org.apache.any23.rdf.RDFUtils;
 29  
 import org.apache.any23.vocab.CSV;
 30  
 import org.apache.commons.csv.CSVParser;
 31  
 import org.openrdf.model.URI;
 32  
 import org.openrdf.model.Value;
 33  
 import org.openrdf.model.impl.LiteralImpl;
 34  
 import org.openrdf.model.impl.URIImpl;
 35  
 import org.openrdf.model.vocabulary.RDF;
 36  
 import org.openrdf.model.vocabulary.RDFS;
 37  
 import org.openrdf.model.vocabulary.XMLSchema;
 38  
 
 39  
 import java.io.IOException;
 40  
 import java.io.InputStream;
 41  
 import java.util.Arrays;
 42  
 
 43  
 /**
 44  
  * This extractor produces <i>RDF</i> from a <i>CSV file</i> .
 45  
  * It automatically detects fields <i>delimiter</i>. If not able uses
 46  
  * the one provided in the <i>Any23</i> configuration.
 47  
  *
 48  
  * @see {@link CSVReaderBuilder}
 49  
  * @author Davide Palmisano ( dpalmisano@gmail.com )
 50  
  */
 51  0
 public class CSVExtractor implements Extractor.ContentExtractor {
 52  
 
 53  
     private CSVParser csvParser;
 54  
 
 55  
     private URI[] headerURIs;
 56  
 
 57  0
     private CSV csv = CSV.getInstance();
 58  
 
 59  0
     public final static ExtractorFactory<CSVExtractor> factory =
 60  
             SimpleExtractorFactory.create(
 61  
                     "csv",
 62  
                     null,
 63  
                     Arrays.asList(
 64  
                             "text/csv;q=0.1"
 65  
                     ),
 66  
                     "example-csv.csv",
 67  
                     CSVExtractor.class
 68  
             );
 69  
 
 70  
     /**
 71  
      * {@inheritDoc}
 72  
      */
 73  
     public void setStopAtFirstError(boolean f) {
 74  0
     }
 75  
 
 76  
     /**
 77  
      * {@inheritDoc}
 78  
      */
 79  
     public void run(
 80  
             ExtractionParameters extractionParameters,
 81  
             ExtractionContext extractionContext,
 82  
             InputStream in
 83  
             , ExtractionResult out
 84  
     ) throws IOException, ExtractionException {
 85  0
         final URI documentURI = extractionContext.getDocumentURI();
 86  
 
 87  
         // build the parser
 88  0
         csvParser = CSVReaderBuilder.build(in);
 89  
 
 90  
         // get the header and generate the URIs for column names
 91  0
         String[] header = csvParser.getLine();
 92  0
         headerURIs = processHeader(header, documentURI);
 93  
 
 94  
         // write triples to describe properties
 95  0
         writeHeaderPropertiesMetadata(header, out);
 96  
 
 97  
         String[] nextLine;
 98  0
         int index = 0;
 99  0
         while ((nextLine = csvParser.getLine()) != null) {
 100  0
             URI rowSubject = RDFUtils.uri(
 101  
                     documentURI.toString(),
 102  
                     "row/" + index
 103  
             );
 104  
             // add a row type
 105  0
             out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
 106  
             // for each row produce its statements
 107  0
             produceRowStatements(rowSubject, nextLine, out);
 108  
             // link the row to the document
 109  0
             out.writeTriple(documentURI, csv.row, rowSubject);
 110  
             // the progressive row number
 111  0
             out.writeTriple(
 112  
                     rowSubject,
 113  
                     csv.rowPosition,
 114  
                     new LiteralImpl(String.valueOf(index))
 115  
             );
 116  0
             index++;
 117  0
         }
 118  
         // add some CSV metadata such as the number of rows and columns
 119  0
         addTableMetadataStatements(
 120  
                 documentURI,
 121  
                 out,
 122  
                 index,
 123  
                 headerURIs.length
 124  
         );
 125  0
     }
 126  
 
 127  
     /**
 128  
      * Check whether a number is an integer.
 129  
      *
 130  
      * @param number
 131  
      * @return
 132  
      */
 133  
     private boolean isInteger(String number) {
 134  
         try {
 135  0
             Integer.valueOf(number);
 136  0
             return true;
 137  0
         } catch (NumberFormatException e) {
 138  0
             return false;
 139  
         }
 140  
     }
 141  
 
 142  
     /**
 143  
      * Check whether a number is a float.
 144  
      *
 145  
      * @param number
 146  
      * @return
 147  
      */
 148  
     private boolean isFloat(String number) {
 149  
         try {
 150  0
             Float.valueOf(number);
 151  0
             return true;
 152  0
         } catch (NumberFormatException e) {
 153  0
             return false;
 154  
         }
 155  
     }
 156  
 
 157  
     /**
 158  
      * It writes <i>RDF</i> statements representing properties of the header.
 159  
      *
 160  
      * @param header
 161  
      * @param out
 162  
      */
 163  
     private void writeHeaderPropertiesMetadata(String[] header, ExtractionResult out) {
 164  0
         int index = 0;
 165  0
         for (URI singleHeader : headerURIs) {
 166  0
             if (index > headerURIs.length) {
 167  0
                 break;
 168  
             }
 169  0
             if (!RDFUtils.isAbsoluteURI(header[index])) {
 170  0
                 out.writeTriple(
 171  
                         singleHeader,
 172  
                         RDFS.LABEL,
 173  
                         new LiteralImpl(header[index])
 174  
                 );
 175  
             }
 176  0
             out.writeTriple(
 177  
                     singleHeader,
 178  
                     csv.columnPosition,
 179  
                     new LiteralImpl(String.valueOf(index), XMLSchema.INTEGER)
 180  
             );
 181  0
             index++;
 182  
         }
 183  0
     }
 184  
 
 185  
     /**
 186  
      * It process the first row of the file, returning a list of {@link URI}s representing
 187  
      * the properties for each column. If a value of the header is an absolute <i>URI</i>
 188  
      * then it leave it as is. Otherwise the {@link org.apache.any23.vocab.CSV} vocabulary is used.
 189  
      *
 190  
      * @param header
 191  
      * @return an array of {@link URI}s identifying the column names.
 192  
      */
 193  
     private URI[] processHeader(String[] header, URI documentURI) {
 194  0
         URI[] result = new URI[header.length];
 195  0
         int index = 0;
 196  0
         for (String h : header) {
 197  0
             String candidate = h.trim();
 198  0
             if (RDFUtils.isAbsoluteURI(candidate)) {
 199  0
                 result[index] = new URIImpl(candidate);
 200  
             } else {
 201  0
                 result[index] = normalize(candidate, documentURI);
 202  
             }
 203  0
             index++;
 204  
         }
 205  0
         return result;
 206  
     }
 207  
 
 208  
     private URI normalize(String toBeNormalized, URI documentURI) {
 209  0
         String candidate = toBeNormalized;
 210  0
         candidate = candidate.trim().toLowerCase().replace("?", "").replace("&", "");
 211  0
         String[] tokens = candidate.split(" ");
 212  0
         candidate = tokens[0];
 213  0
         for (int i = 1; i < tokens.length; i++) {
 214  0
             String firstChar = ("" + tokens[i].charAt(0)).toUpperCase();
 215  0
             candidate += firstChar + tokens[i].substring(1);
 216  
         }
 217  0
         return new URIImpl(documentURI.toString() + candidate);
 218  
     }
 219  
 
 220  
     /**
 221  
      * It writes on the provided {@link ExtractionResult}, the </>RDF statements</>
 222  
      * representing the row <i>cell</i>. If a  row <i>cell</i> is an absolute <i>URI</i>
 223  
      * then an object property is written, literal otherwise.
 224  
      *
 225  
      * @param rowSubject
 226  
      * @param values
 227  
      * @param out
 228  
      */
 229  
     private void produceRowStatements(
 230  
             URI rowSubject,
 231  
             String[] values,
 232  
             ExtractionResult out
 233  
     ) {
 234  0
         int index = 0;
 235  0
         for (String cell : values) {
 236  0
             if (index >= headerURIs.length) {
 237  
                 // there are some row cells that don't have an associated column name
 238  0
                 break;
 239  
             }
 240  0
             if (cell.equals("")) {
 241  0
                 continue;
 242  
             }
 243  0
             URI predicate = headerURIs[index];
 244  0
             Value object = getObjectFromCell(cell);
 245  0
             out.writeTriple(rowSubject, predicate, object);
 246  0
             index++;
 247  
         }
 248  0
     }
 249  
 
 250  
     private Value getObjectFromCell(String cell) {
 251  
         Value object;
 252  0
         cell = cell.trim();
 253  0
         if (RDFUtils.isAbsoluteURI(cell)) {
 254  0
             object = new URIImpl(cell);
 255  
         } else {
 256  0
             URI datatype = XMLSchema.STRING;
 257  0
             if (isInteger(cell)) {
 258  0
                 datatype = XMLSchema.INTEGER;
 259  0
             } else if(isFloat(cell)) {
 260  0
                 datatype = XMLSchema.FLOAT;
 261  
             }
 262  0
             object = new LiteralImpl(cell, datatype);
 263  
         }
 264  0
         return object;
 265  
     }
 266  
 
 267  
     /**
 268  
      * It writes on the provided {@link ExtractionResult} some <i>RDF Statements</i>
 269  
      * on generic properties of the <i>CSV</i> file, such as number of rows and columns.
 270  
      *
 271  
      * @param documentURI
 272  
      * @param out
 273  
      * @param numberOfRows
 274  
      * @param numberOfColumns
 275  
      */
 276  
     private void addTableMetadataStatements(
 277  
             URI documentURI,
 278  
             ExtractionResult out,
 279  
             int numberOfRows,
 280  
             int numberOfColumns) {
 281  0
         out.writeTriple(
 282  
                 documentURI,
 283  
                 csv.numberOfRows,
 284  
                 new LiteralImpl(String.valueOf(numberOfRows), XMLSchema.INTEGER)
 285  
         );
 286  0
         out.writeTriple(
 287  
                 documentURI,
 288  
                 csv.numberOfColumns,
 289  
                 new LiteralImpl(String.valueOf(numberOfColumns), XMLSchema.INTEGER)
 290  
         );
 291  0
     }
 292  
 
 293  
     /**
 294  
      * {@inheritDoc}
 295  
      */
 296  
     public ExtractorDescription getDescription() {
 297  0
         return factory;
 298  
     }
 299  
 }