Coverage Report - org.apache.any23.mime.TikaMIMETypeDetector
 
Classes in this File Line Coverage Branch Coverage Complexity
TikaMIMETypeDetector
0%
0/115
0%
0/72
4.615
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.mime;
 19  
 
 20  
 import org.apache.any23.extractor.csv.CSVReaderBuilder;
 21  
 import org.apache.any23.mime.purifier.Purifier;
 22  
 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
 23  
 import org.apache.tika.Tika;
 24  
 import org.apache.tika.config.TikaConfig;
 25  
 import org.apache.tika.metadata.Metadata;
 26  
 import org.apache.tika.mime.MimeType;
 27  
 import org.apache.tika.mime.MimeTypeException;
 28  
 import org.apache.tika.mime.MimeTypes;
 29  
 import org.openrdf.rio.RDFParser;
 30  
 import org.openrdf.rio.turtle.TurtleParser;
 31  
 
 32  
 import java.io.BufferedReader;
 33  
 import java.io.ByteArrayInputStream;
 34  
 import java.io.IOException;
 35  
 import java.io.InputStream;
 36  
 import java.io.InputStreamReader;
 37  
 import java.util.regex.Pattern;
 38  
 
 39  
 /**
 40  
  * Implementation of {@link MIMETypeDetector} based on
 41  
  * <a href="http://lucene.apache.org/tika/">Apache Tika</a>.
 42  
  *
 43  
  * @author Michele Mostarda (michele.mostarda@gmail.com)
 44  
  * @author Davide Palmisano (dpalmisano@gmail.com)
 45  
  */
 46  
 public class TikaMIMETypeDetector implements MIMETypeDetector {
 47  
 
 48  
     private Purifier purifier;
 49  
 
 50  
     // TODO: centralize mimetype strings somewhere.
 51  
 
 52  
     public static final String N3_MIMETYPE = "text/n3";
 53  
 
 54  
     public static final String NQUADS_MIMETYPE = "text/nq";
 55  
 
 56  
     public static final String TURTLE_MIMETYPE = "application/turtle";
 57  
 
 58  
     public static final String CSV_MIMETYPE = "text/csv";
 59  
 
 60  
     public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml";
 61  
 
 62  
     /**
 63  
      * N3 patterns.
 64  
      */
 65  0
     private static final Pattern[] N3_PATTERNS = {
 66  
             Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\."             ), // * URI URI .
 67  
             Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\."             ), // * URI BNODE .
 68  
             Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\."     ), // * URI LLITERAL .
 69  
             Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.")  // * URI TLITERAL .
 70  
     };
 71  
 
 72  
     /**
 73  
      * N-Quads patterns.
 74  
      */
 75  0
     private static final Pattern[] NQUADS_PATTERNS = {
 76  
             Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\."             ), // * URI URI      URI .
 77  
             Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\."             ), // * URI BNODE    URI .
 78  
             Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\."     ), // * URI LLITERAL URI .
 79  
             Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.")  // * URI TLITERAL URI .
 80  
     };
 81  
 
 82  0
     private static TikaConfig config = null;
 83  
 
 84  
     private static Tika tika;
 85  
 
 86  
     private static MimeTypes types;
 87  
 
 88  
     /**
 89  
      * Checks if the stream contains the <i>N3</i> triple patterns.
 90  
      *
 91  
      * @param is input stream to be verified.
 92  
      * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise.
 93  
      * @throws IOException
 94  
      */
 95  
     public static boolean checkN3Format(InputStream is) throws IOException {
 96  0
         return findPattern(N3_PATTERNS, '.', is);
 97  
     }
 98  
 
 99  
     /**
 100  
      * Checks if the stream contains the <i>NQuads</i> patterns.
 101  
      *
 102  
      * @param is input stream to be verified.
 103  
      * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise.
 104  
      * @throws IOException
 105  
      */
 106  
     public static boolean checkNQuadsFormat(InputStream is) throws IOException {
 107  0
         return findPattern(NQUADS_PATTERNS, '.', is);
 108  
     }
 109  
 
 110  
     /**
 111  
      * Checks if the stream contains <i>Turtle</i> triple patterns.
 112  
      *
 113  
      * @param is input stream to be verified.
 114  
      * @return <code>true</code> if <i>Turtle</i> patterns are detected, <code>false</code> otherwise.
 115  
      * @throws IOException
 116  
      */
 117  
     public static boolean checkTurtleFormat(InputStream is) throws IOException {
 118  0
         String sample = extractDataSample(is, '.');
 119  0
         TurtleParser turtleParser = new TurtleParser();
 120  0
         turtleParser.setDatatypeHandling(RDFParser.DatatypeHandling.VERIFY);
 121  0
         turtleParser.setStopAtFirstError(true);
 122  0
         turtleParser.setVerifyData(true);
 123  0
         ByteArrayInputStream bais = new ByteArrayInputStream( sample.getBytes() );
 124  
         try {
 125  0
             turtleParser.parse(bais, "");
 126  0
             return true;
 127  0
         } catch (Exception e) {
 128  0
             return false;
 129  
         }
 130  
     }
 131  
 
 132  
     /**
 133  
      * Checks if the stream contains a valid <i>CSV</i> pattern.
 134  
      *
 135  
      * @param is input stream to be verified.
 136  
      * @return <code>true</code> if <i>CSV</i> patterns are detected, <code>false</code> otherwise.
 137  
      * @throws IOException
 138  
      */
 139  
     public static boolean checkCSVFormat(InputStream is) throws IOException {
 140  0
         return CSVReaderBuilder.isCSV(is);
 141  
     }
 142  
 
 143  
     /**
 144  
      * Tries to apply one of the given patterns on a sample of the input stream.
 145  
      *
 146  
      * @param patterns the patterns to apply.
 147  
      * @param delimiterChar the delimiter of the sample.
 148  
      * @param is the input stream to sample.
 149  
      * @return <code>true</code> if a pattern has been applied, <code>false</code> otherwise.
 150  
      * @throws IOException
 151  
      */
 152  
     private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is)
 153  
     throws IOException {
 154  0
         String sample = extractDataSample(is, delimiterChar);
 155  0
         for(Pattern pattern : patterns) {
 156  0
             if(pattern.matcher(sample).find()) {
 157  0
                 return true;
 158  
             }
 159  
         }
 160  0
         return false;
 161  
     }
 162  
 
 163  
     /**
 164  
      * Extracts a sample data from the input stream, from the current
 165  
      * mark to the first <i>breakChar</i> char.
 166  
      *
 167  
      * @param is the input stream to sample.
 168  
      * @param breakChar the char to break to sample.
 169  
      * @return the sample string.
 170  
      * @throws IOException if an error occurs during sampling.
 171  
      */
 172  
     private static String extractDataSample(InputStream is, char breakChar) throws IOException {
 173  0
         BufferedReader br = new BufferedReader(new InputStreamReader(is));
 174  0
         StringBuilder sb = new StringBuilder();
 175  0
         final int MAX_SIZE = 1024 * 2;
 176  
         int c;
 177  0
         boolean insideBlock = false;
 178  0
         int read = 0;
 179  0
         br.mark(MAX_SIZE);
 180  
         try {
 181  0
             while ((c = br.read()) != -1) {
 182  0
                 read++;
 183  0
                 if (read > MAX_SIZE) {
 184  0
                     break;
 185  
                 }
 186  0
                 if ('<' == c) {
 187  0
                     insideBlock = true;
 188  0
                 } else if ('>' == c) {
 189  0
                     insideBlock = false;
 190  0
                 } else if ('"' == c) {
 191  0
                     insideBlock = !insideBlock;
 192  
                 }
 193  0
                 sb.append((char) c);
 194  0
                 if (!insideBlock && breakChar == c) {
 195  0
                     break;
 196  
                 }
 197  
             }
 198  
         } finally {
 199  0
             is.reset();
 200  0
             br.reset();
 201  0
         }
 202  0
         return sb.toString();
 203  
     }
 204  
 
 205  0
     public TikaMIMETypeDetector(Purifier purifier) {
 206  0
         this.purifier = purifier;
 207  0
         InputStream is = getResourceAsStream();
 208  0
         if (config == null) {
 209  
             try {
 210  0
                 config = new TikaConfig(is);
 211  0
             } catch (Exception e) {
 212  0
                 throw new RuntimeException("Error while loading Tika configuration.", e);
 213  0
             }
 214  
         }
 215  
 
 216  0
         if (types == null) {
 217  0
             types = config.getMimeRepository();
 218  
         }
 219  
 
 220  0
         if(tika == null) {
 221  0
             tika = new Tika(config);
 222  
         }
 223  0
     }
 224  
 
 225  
     public TikaMIMETypeDetector() {
 226  0
         this( new WhiteSpacesPurifier() );
 227  0
     }
 228  
 
 229  
     /**
 230  
      * Estimates the <code>MIME</code> type of the content of input file.
 231  
      * The <i>input</i> stream must be resettable.
 232  
      *
 233  
      * @param fileName name of the data source.
 234  
      * @param input <code>null</code> or a <b>resettable</i> input stream containing data.
 235  
      * @param mimeTypeFromMetadata mimetype declared in metadata.
 236  
      * @return the supposed mime type or <code>null</code> if nothing appropriate found.
 237  
      * @throws IllegalArgumentException if <i>input</i> is not <code>null</code> and is not resettable.
 238  
      */
 239  
     public MIMEType guessMIMEType(
 240  
             String fileName,
 241  
             InputStream input,
 242  
             MIMEType mimeTypeFromMetadata
 243  
     ) {
 244  0
         if(input != null) {
 245  
             try {
 246  0
                 this.purifier.purify(input);
 247  0
             } catch (IOException e) {
 248  0
                 throw new RuntimeException("Error while purifying the provided input", e);
 249  0
             }
 250  
         }
 251  
 
 252  0
         final Metadata meta = new Metadata();
 253  0
         if (mimeTypeFromMetadata != null)
 254  0
             meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType());
 255  0
         if (fileName != null)
 256  0
             meta.set(Metadata.RESOURCE_NAME_KEY, fileName);
 257  
 
 258  
         String type;
 259  
         try {
 260  0
             final String mt = guessMimeTypeByInputAndMeta(input, meta);
 261  0
             if( ! MimeTypes.OCTET_STREAM.equals(mt) ) {
 262  0
                 type = mt;
 263  
             } else {
 264  0
                 if( checkN3Format(input) ) {
 265  0
                     type = N3_MIMETYPE;
 266  0
                 } else if( checkNQuadsFormat(input) ) {
 267  0
                     type = NQUADS_MIMETYPE;
 268  0
                 } else if( checkTurtleFormat(input) ) {
 269  0
                     type = TURTLE_MIMETYPE;
 270  0
                 } else if( checkCSVFormat(input) ) {
 271  0
                     type = CSV_MIMETYPE;
 272  
                 }
 273  
                 else {
 274  0
                     type = MimeTypes.OCTET_STREAM; 
 275  
                 }
 276  
             }
 277  0
         } catch (IOException ioe) {
 278  0
             throw new RuntimeException("Error while retrieving mime type.", ioe);
 279  0
         }
 280  0
         return MIMEType.parse(type);
 281  
     }
 282  
 
 283  
      /**
 284  
       * Loads the <code>Tika</code> configuration file.
 285  
       *
 286  
       * @return the input stream containing the configuration.
 287  
       */
 288  
      private InputStream getResourceAsStream() {
 289  
          InputStream result;
 290  0
          result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME);
 291  0
          if (result == null) {
 292  0
              result = TikaMIMETypeDetector.class.getClassLoader().getResourceAsStream(RESOURCE_NAME);
 293  0
              if (result == null) {
 294  0
                  result = ClassLoader.getSystemResourceAsStream(RESOURCE_NAME);
 295  
              }
 296  
          }
 297  0
          return result;
 298  
      }
 299  
 
 300  
     /**
 301  
      * Automatically detects the MIME type of a document based on magic
 302  
      * markers in the stream prefix and any given metadata hints.
 303  
      * <p/>
 304  
      * The given stream is expected to support marks, so that this method
 305  
      * can reset the stream to the position it was in before this method
 306  
      * was called.
 307  
      *
 308  
      * @param stream   document stream
 309  
      * @param metadata metadata hints
 310  
      * @return MIME type of the document
 311  
      * @throws IOException if the document stream could not be read
 312  
      */
 313  
     private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata)
 314  
     throws IOException {
 315  0
         if (stream != null) {
 316  0
             final String type = tika.detect(stream);
 317  0
             if ( type != null && ! isGenericMIMEType(type) ) {
 318  0
                 return type;
 319  
             }
 320  
         }
 321  
 
 322  
         // Determines the MIMEType based on Content-Type hint if available.
 323  0
         final String contentType = metadata.get(Metadata.CONTENT_TYPE);
 324  0
         String candidateMIMEType = null;
 325  0
         if (contentType != null) {
 326  
             try {
 327  0
                 MimeType type = types.forName(contentType);
 328  0
                 if (type != null) {
 329  0
                     if( ! isPlainMIMEType(type.getName()) ) {
 330  0
                         return type.getName();
 331  
                     } else {
 332  0
                         candidateMIMEType = type.getName();
 333  
                     }
 334  
                 }
 335  
             }
 336  0
             catch (MimeTypeException mte) {
 337  
                 // Malformed ocntent-type value, ignore.
 338  0
             }
 339  
         }
 340  
 
 341  
         // Determines the MIMEType based on resource name hint if available.
 342  0
         final String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
 343  0
         if (resourceName != null) {
 344  0
             MimeType type = types.getMimeType(resourceName);
 345  0
             if (type != null) {
 346  0
                 return type.getName();
 347  
             }
 348  
         }
 349  
 
 350  
         // Finally, use the default type if no matches found
 351  0
         if(candidateMIMEType != null) {
 352  0
             return candidateMIMEType;
 353  
         } else {
 354  0
             return MimeTypes.OCTET_STREAM;
 355  
         }
 356  
     }
 357  
 
 358  
     private boolean isPlainMIMEType(String type) {
 359  0
         return
 360  
             type.equals(MimeTypes.OCTET_STREAM)
 361  
                 ||
 362  
             type.equals(MimeTypes.PLAIN_TEXT);
 363  
     }
 364  
 
 365  
     private boolean isGenericMIMEType(String type) {
 366  0
         return
 367  
             isPlainMIMEType(type)
 368  
                 ||
 369  
             type.equals(MimeTypes.XML);
 370  
     }
 371  
 
 372  
 }
 373