Coverage Report - org.apache.any23.extractor.csv.CSVReaderBuilder
 
Classes in this File Line Coverage Branch Coverage Complexity
CSVReaderBuilder
0%
0/46
0%
0/24
3.429
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.csv;
 19  
 
 20  
 import org.apache.any23.configuration.DefaultConfiguration;
 21  
 import org.apache.commons.csv.CSVParser;
 22  
 import org.apache.commons.csv.CSVStrategy;
 23  
 
 24  
 import java.io.IOException;
 25  
 import java.io.InputStream;
 26  
 import java.io.InputStreamReader;
 27  
 
 28  
 /**
 29  
  * This class is responsible for building a reader, first guessing the configuration
 29  
  * from the file itself and then, if not successful, from the {@link org.apache.any23.configuration.DefaultConfiguration}.
 31  
  *
 32  
  * @author Davide Palmisano ( dpalmisano@gmail.com )
 33  
  * @author Michele Mostarda ( michele.mostarda@gmail.com )
 34  
  */
 35  0
 public class CSVReaderBuilder {
 36  
 
 37  
     private static final String DEFAULT_FIELD_DELIMITER = ",";
 38  
 
 39  
     private static final String DEFAULT_COMMENT_DELIMITER = "#";
 40  
 
 41  
     public static final char NULL_CHAR = ' ';
 42  
 
 43  0
     private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
 44  
 
 45  0
     private static DefaultConfiguration defaultConfiguration =
 46  
             DefaultConfiguration.singleton();
 47  
 
 48  
     private static final CSVStrategy[] strategies;
 49  
 
 50  
     static {
 51  0
         strategies = new CSVStrategy[ popularDelimiters.length + 1 ];
 52  0
         strategies[0] = CSVStrategy.DEFAULT_STRATEGY;
 53  0
         int index = 1;
 54  0
         for(char dlmt : popularDelimiters) {
 55  0
             strategies[index++] = getCsvStrategy(dlmt, NULL_CHAR);
 56  
         }
 57  0
     }
 58  
 
 59  
     /**
 60  
      * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser} guessing
 61  
      * from the provided <i>CSV</i> file.
 62  
      *
 63  
      * @param is {@link InputStream} of the <i>CSV</i> file where guess the configuration.
 64  
      * @return a {@link CSVParser}
 65  
      * @throws java.io.IOException
 66  
      */
 67  
     public static CSVParser build(InputStream is) throws IOException {
 68  0
         CSVStrategy bestStrategy = getBestStrategy(is);
 69  0
         if(bestStrategy == null) bestStrategy = getCSVStrategyFromConfiguration();
 70  0
         return new CSVParser( new InputStreamReader(is), bestStrategy );
 71  
     }
 72  
 
 73  
     /**
 74  
      * Checks whether the given input stream is a CSV or not.
 75  
      *
 76  
      * @param is input stream to be verified.
 77  
      * @return
 78  
      * @throws IOException
 79  
      */
 80  
     public static boolean isCSV(InputStream is) throws IOException {
 81  0
         return getBestStrategy(is) != null;
 82  
     }
 83  
 
 84  
     private static CSVStrategy getBestStrategy(InputStream is) throws IOException {
 85  0
         for( CSVStrategy strategy : strategies ) {
 86  0
             if( testStrategy(is, strategy) ) {
 87  0
                 return strategy;
 88  
             }
 89  
         }
 90  0
         return null;
 91  
     }
 92  
 
 93  
     private static CSVStrategy getCsvStrategy(char delimiter, char comment) {
 94  0
         return new CSVStrategy(delimiter, '\'', comment);
 95  
     }
 96  
 
 97  
     private static CSVStrategy getCSVStrategyFromConfiguration() {
 98  0
         char fieldDelimiter = getCharValueFromConfiguration(
 99  
                 "any23.extraction.csv.field",
 100  
                 DEFAULT_FIELD_DELIMITER
 101  
         );
 102  0
         char commentDelimiter = getCharValueFromConfiguration(
 103  
                 "any23.extraction.csv.comment",
 104  
                 DEFAULT_COMMENT_DELIMITER
 105  
         );
 106  0
         return new CSVStrategy(fieldDelimiter, '\'', commentDelimiter);
 107  
     }
 108  
 
 109  
     private static char getCharValueFromConfiguration(String property, String defaultValue) {
 110  0
         String delimiter = defaultConfiguration.getProperty(
 111  
                 property,
 112  
                 defaultValue
 113  
         );
 114  0
         if (delimiter.length() != 1 || delimiter.equals("")) {
 115  0
             throw new RuntimeException(property + " value must be a single character");
 116  
         }
 117  0
         return delimiter.charAt(0);
 118  
     }
 119  
 
 120  
     /**
 121  
      * make sure the reader has correct delimiter and quotation set.
 122  
      * Check first lines and make sure they have the same amount of columns and at least 2
 123  
      *
 124  
      * @param is input stream to be checked
 125  
      * @param strategy strategy to be verified.
 126  
      * @return
 127  
      * @throws IOException
 128  
      * @param is
 129  
      */
 130  
     private static boolean testStrategy(InputStream is, CSVStrategy strategy) throws IOException {
 131  0
         final int MIN_COLUMNS = 2;
 132  
 
 133  0
         is.mark(Integer.MAX_VALUE);
 134  
         try {
 135  0
             final CSVParser parser = new CSVParser(new InputStreamReader(is), strategy);
 136  0
             int linesToCheck = 5;
 137  0
             int headerColumnCount = -1;
 138  0
             while (linesToCheck > 0) {
 139  
                 String[] row;
 140  0
                 row = parser.getLine();
 141  0
                 if (row == null) {
 142  0
                     break;
 143  
                 }
 144  0
                 if (row.length < MIN_COLUMNS) {
 145  0
                     return false;
 146  
                 }
 147  0
                 if (headerColumnCount == -1) { // first row
 148  0
                     headerColumnCount = row.length;
 149  
                 } else { // make sure rows have the same number of columns or one more than the header
 150  0
                     if (row.length < headerColumnCount) {
 151  0
                         return false;
 152  0
                     } else if (row.length - 1 > headerColumnCount) {
 153  0
                         return false;
 154  
                     }
 155  
                 }
 156  0
                 linesToCheck--;
 157  0
             }
 158  0
             return true;
 159  
         } finally {
 160  0
             is.reset();
 161  
         }
 162  
     }
 163  
 
 164  
 
 165  
 }