Coverage Report - org.apache.any23.Any23
 
Classes in this File Line Coverage Branch Coverage Complexity
Any23
0%
0/78
0%
0/32
2.111
Any23$1
0%
0/5
N/A
2.111
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23;
 19  
 
 20  
 import org.apache.any23.configuration.Configuration;
 21  
 import org.apache.any23.configuration.DefaultConfiguration;
 22  
 import org.apache.any23.extractor.ExtractionException;
 23  
 import org.apache.any23.extractor.ExtractionParameters;
 24  
 import org.apache.any23.extractor.ExtractorFactory;
 25  
 import org.apache.any23.extractor.ExtractorGroup;
 26  
 import org.apache.any23.extractor.ExtractorRegistry;
 27  
 import org.apache.any23.extractor.SingleDocumentExtraction;
 28  
 import org.apache.any23.extractor.SingleDocumentExtractionReport;
 29  
 import org.apache.any23.http.AcceptHeaderBuilder;
 30  
 import org.apache.any23.http.DefaultHTTPClient;
 31  
 import org.apache.any23.http.HTTPClient;
 32  
 import org.apache.any23.http.HTTPClientConfiguration;
 33  
 import org.apache.any23.mime.MIMEType;
 34  
 import org.apache.any23.mime.MIMETypeDetector;
 35  
 import org.apache.any23.mime.TikaMIMETypeDetector;
 36  
 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
 37  
 import org.apache.any23.source.DocumentSource;
 38  
 import org.apache.any23.source.FileDocumentSource;
 39  
 import org.apache.any23.source.HTTPDocumentSource;
 40  
 import org.apache.any23.source.LocalCopyFactory;
 41  
 import org.apache.any23.source.MemCopyFactory;
 42  
 import org.apache.any23.source.StringDocumentSource;
 43  
 import org.apache.any23.writer.TripleHandler;
 44  
 import org.slf4j.Logger;
 45  
 import org.slf4j.LoggerFactory;
 46  
 
 47  
 import java.io.File;
 48  
 import java.io.IOException;
 49  
 import java.net.URI;
 50  
 import java.net.URISyntaxException;
 51  
 import java.util.ArrayList;
 52  
 import java.util.Arrays;
 53  
 import java.util.Collection;
 54  
 
 55  
 
 56  
 /**
 57  
  * A facade with convenience methods for typical <i>Any23</i> extraction
 58  
  * operations.
 59  
  *
 60  
  * @author Richard Cyganiak (richard@cyganiak.de)
 61  
  * @author Michele Mostarda (michele.mostarda@gmail.com)
 62  
  */
 63  0
 public class Any23 {
 64  
 
 65  
     /**
 66  
      * Any23 core library version.
 67  
      * NOTE: there's also a version string in pom.xml, they should match.
 68  
      */
 69  0
     public static final String VERSION = DefaultConfiguration.singleton().getPropertyOrFail("any23.core.version");
 70  
 
 71  
     /**
 72  
      * Default HTTP User Agent defined in default configuration.
 73  
      */
 74  0
     public static final String DEFAULT_HTTP_CLIENT_USER_AGENT = DefaultConfiguration.singleton().getPropertyOrFail(
 75  
             "any23.http.user.agent.default"
 76  
     );
 77  
 
 78  0
     protected static final Logger logger = LoggerFactory.getLogger(Any23.class);
 79  
 
 80  
     /** Configuration this instance was built with; read for the HTTP client settings and handed to each extraction. */
     private final Configuration configuration;
 81  
     /** Value of the <code>any23.http.user.agent.default</code> property, used when a <code>null</code> user agent is set. */
     private final String        defaultUserAgent;
 82  
 
 83  0
     /** MIME type detector passed to every extraction; replaceable through {@link #setMIMETypeDetector(MIMETypeDetector)}. */
     private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector( new WhiteSpacesPurifier() );
 84  
 
 85  0
     /** HTTP client used for HTTP(S) document sources; initialized lazily by {@link #getHTTPClient()}. */
     private HTTPClient httpClient = new DefaultHTTPClient();
 86  
 
 87  0
     /** Becomes <code>true</code> once the HTTP client has been initialized; afterwards the HTTP settings cannot be changed. */
     private boolean httpClientInitialized = false;
 88  
 
 89  
     /** Group of extractors applied by this instance. */
     private final ExtractorGroup factories;
 90  
     /** Local copy factory handed to every extraction. */
     private LocalCopyFactory     streamCache;
 91  
     /** User agent sent by the HTTP client; <code>null</code> until {@link #setHTTPUserAgent(String)} is invoked. */
     private String               userAgent;
 92  
 
 93  
     /**
 94  
      * Constructor that allows the specification of a
 95  
      * custom configuration and of a list of extractors.
 96  
      *
 97  
      * @param configuration configuration used to build the <i>Any23</i> instance.
 98  
      * @param extractorGroup the group of extractors to be applied.
 99  
      */
 100  0
     public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
 101  0
         if(configuration == null) throw new NullPointerException("configuration must be not null.");
 102  0
         this.configuration = configuration;
 103  0
         logger.info( configuration.getConfigurationDump() );
 104  
 
 105  0
         this.defaultUserAgent = configuration.getPropertyOrFail("any23.http.user.agent.default");
 106  
 
 107  0
         this.factories = (extractorGroup == null)
 108  
                 ? ExtractorRegistry.getInstance().getExtractorGroup()
 109  
                 : extractorGroup;
 110  0
         setCacheFactory(new MemCopyFactory());
 111  0
     }
 112  
 
 113  
     /**
 114  
      * Constructor that allows the specification of a list of extractors.
 115  
      *
 116  
      * @param extractorGroup the group of extractors to be applied.
 117  
      */
 118  
     public Any23(ExtractorGroup extractorGroup) {
 119  0
         this(DefaultConfiguration.singleton(), extractorGroup);
 120  0
     }
 121  
 
 122  
     /**
 123  
      * Constructor that allows the specification of a
 124  
      * custom configuration and of list of extractor names.
 125  
      *
 126  
      * @param extractorNames list of extractor's names.
 127  
      */
 128  
     public Any23(Configuration configuration, String... extractorNames) {
 129  0
         this(
 130  
                 configuration,
 131  
                 extractorNames == null
 132  
                         ?
 133  
                 null
 134  
                         :
 135  
                 ExtractorRegistry.getInstance().getExtractorGroup( Arrays.asList(extractorNames))
 136  
         );
 137  0
     }
 138  
 
 139  
     /**
 140  
      * Constructor that allows the specification of a list of extractor names.
 141  
      *
 142  
      * @param extractorNames list of extractor's names.
 143  
      */
 144  
     public Any23(String... extractorNames) {
 145  0
         this( DefaultConfiguration.singleton(), extractorNames );
 146  0
     }
 147  
 
 148  
     /**
 149  
      * Constructor accepting {@link Configuration}.
 150  
      */
 151  
     public Any23(Configuration configuration) {
 152  0
         this(configuration, (String[]) null);
 153  0
     }
 154  
 
 155  
     /**
 156  
      * Constructor with default configuration.
 157  
      */
 158  
     public Any23() {
 159  0
         this( DefaultConfiguration.singleton() );
 160  0
     }
 161  
 
 162  
     /**
 163  
      * Sets the <i>HTTP Header User Agent</i>,
 164  
      * see <i>RFC 2616-14.43</i>.
 165  
      *
 166  
      * @param userAgent text describing the user agent.
 167  
      */
 168  
     public void setHTTPUserAgent(String userAgent) {
 169  0
         if (httpClientInitialized) {
 170  0
             throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
 171  
         }
 172  0
         if(userAgent == null) {
 173  0
             userAgent = defaultUserAgent;
 174  
         }
 175  0
         if(userAgent.trim().length() == 0) {
 176  0
             throw new IllegalArgumentException( String.format("Invalid user agent: '%s'", userAgent) );
 177  
         }
 178  0
         this.userAgent = userAgent;
 179  0
     }
 180  
 
 181  
     /**
 182  
      * Returns the <i>HTTP Header User Agent</i>,
 183  
      * see <i>RFC 2616-14.43</i>.
 184  
      *
 185  
      * @return text describing the user agent.
 186  
      */
 187  
     public String getHTTPUserAgent() {
 188  0
         return this.userAgent;
 189  
     }
 190  
 
 191  
     /**
 192  
      * Allows to set the {@link org.apache.any23.http.HTTPClient} implementation
 193  
      * used to retrieve contents. The default instance is {@link org.apache.any23.http.DefaultHTTPClient}.
 194  
      *
 195  
      * @param httpClient a valid client instance.
 196  
      * @throws IllegalStateException if invoked after client has been initialized.
 197  
      */
 198  
     public void setHTTPClient(HTTPClient httpClient) {
 199  0
         if(httpClient == null) {
 200  0
             throw new NullPointerException("httpClient cannot be null.");
 201  
         }
 202  0
         if (httpClientInitialized) {
 203  0
             throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
 204  
         }
 205  0
         this.httpClient = httpClient;
 206  0
     }
 207  
 
 208  
     /**
 209  
      * Returns the current {@link org.apache.any23.http.HTTPClient} implementation.
 210  
      *
 211  
      * @return instance of HTTPClient.
 212  
      * @throws IOException if the HTTP client has not initialized.
 213  
      */
 214  
     public HTTPClient getHTTPClient() throws IOException {
 215  0
         if (!httpClientInitialized) {
 216  0
             if (userAgent == null) {
 217  0
                 throw new IOException("Must call " + Any23.class.getSimpleName() +
 218  
                         ".setHTTPUserAgent(String) before extracting from HTTP URI");
 219  
             }
 220  0
             httpClient.init( new HTTPClientConfiguration() {
 221  
                 public String getUserAgent() {
 222  0
                     return userAgent;
 223  
                 }
 224  
                 public String getAcceptHeader() {
 225  0
                     return Any23.this.getAcceptHeader();
 226  
                 }
 227  
                 public int getDefaultTimeout() {
 228  0
                     return configuration.getPropertyIntOrFail("any23.http.client.timeout");
 229  
                 }
 230  
                 public int getMaxConnections() {
 231  0
                     return configuration.getPropertyIntOrFail("any23.http.client.max.connections");
 232  
                 }
 233  
             } );
 234  0
             httpClientInitialized = true;
 235  
         }
 236  0
         return httpClient;
 237  
     }
 238  
 
 239  
     /**
 240  
      * Allows to set a {@link org.apache.any23.source.LocalCopyFactory} instance.
 241  
      *
 242  
      * @param cache valid cache instance.
 243  
      */
 244  
     public void setCacheFactory(LocalCopyFactory cache) {
 245  0
         if(cache == null) {
 246  0
             throw new NullPointerException("cache cannot be null.");
 247  
         }
 248  0
         this.streamCache = cache;
 249  0
     }
 250  
 
 251  
     /**
 252  
      * Allows to set an instance of {@link org.apache.any23.mime.MIMETypeDetector}.
 253  
      *
 254  
      * @param detector a valid detector instance, if <code>null</code> all the detectors
 255  
      *        will be used.
 256  
      */
 257  
     public void setMIMETypeDetector(MIMETypeDetector detector) {
 258  0
         this.mimeTypeDetector = detector;
 259  0
     }
 260  
 
 261  
     /**
 262  
      * Returns the most appropriate {@link DocumentSource} for the given<code>documentURI</code>.
 263  
      *
 264  
      * @param documentURI the document <i>URI</i>.
 265  
      * @return a new instance of DocumentSource.
 266  
      * @throws URISyntaxException if an error occurs while parsing the <code>documentURI</code> as a <i>URI</i>.
 267  
      * @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
 268  
      */
 269  
     public DocumentSource createDocumentSource(String documentURI) throws URISyntaxException, IOException {
 270  0
         if(documentURI == null) throw new NullPointerException("documentURI cannot be null.");
 271  0
         if (documentURI.toLowerCase().startsWith("file:")) {
 272  0
             return new FileDocumentSource( new File(new URI(documentURI)) );
 273  
         }
 274  0
         if (documentURI.toLowerCase().startsWith("http:") || documentURI.toLowerCase().startsWith("https:")) {
 275  0
             return new HTTPDocumentSource(getHTTPClient(), documentURI);
 276  
         }
 277  0
         throw new IllegalArgumentException(
 278  
                 String.format("Unsupported protocol for document URI: '%s' .", documentURI)
 279  
         );
 280  
     }
 281  
 
 282  
 
 283  
     /**
 284  
      * Performs metadata extraction from the content of the given
 285  
      * <code>in</code> document source, sending the generated events
 286  
      * to the specified <code>outputHandler</code>.
 287  
      *
 288  
      * @param eps the extraction parameters to be applied.
 289  
      * @param in the input document source.
 290  
      * @param outputHandler handler responsible for collecting of the extracted metadata.
 291  
      * @param encoding explicit encoding see
 292  
      *        <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
 293  
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
 294  
      * @throws IOException
 295  
      * @throws org.apache.any23.extractor.ExtractionException
 296  
      */
 297  
     public ExtractionReport extract(
 298  
             ExtractionParameters eps,
 299  
             DocumentSource in,
 300  
             TripleHandler outputHandler,
 301  
             String encoding
 302  
     ) throws IOException, ExtractionException {
 303  0
         final SingleDocumentExtraction ex = new SingleDocumentExtraction(configuration, in, factories, outputHandler);
 304  0
         ex.setMIMETypeDetector(mimeTypeDetector);
 305  0
         ex.setLocalCopyFactory(streamCache);
 306  0
         ex.setParserEncoding(encoding);
 307  0
         final SingleDocumentExtractionReport sder = ex.run(eps);
 308  0
         return new ExtractionReport(
 309  
                 ex.getMatchingExtractors(),
 310  
                 ex.getParserEncoding(),
 311  
                 ex.getDetectedMIMEType(),
 312  
                 sder.getValidationReport(),
 313  
                 sder.getExtractorToErrors()
 314  
         );
 315  
     }
 316  
 
 317  
     /**
 318  
      * Performs metadata extraction on the <code>in</code> string
 319  
      * associated to the <code>documentURI</code> URI, declaring
 320  
      * <code>contentType</code> and <code>encoding</code>.
 321  
      * The generated events are sent to the specified <code>outputHandler</code>.
 322  
      *
 323  
      * @param in raw data to be analyzed.
 324  
      * @param documentURI URI from which the raw data has been extracted.
 325  
      * @param contentType declared data content type.
 326  
      * @param encoding declared data encoding.
 327  
      * @param outputHandler handler responsible for collecting of the extracted metadata.
 328  
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
 329  
      * @throws IOException
 330  
      * @throws ExtractionException
 331  
      */
 332  
     public ExtractionReport extract(
 333  
             String in,
 334  
             String documentURI,
 335  
             String contentType,
 336  
             String encoding,
 337  
             TripleHandler outputHandler
 338  
     ) throws IOException, ExtractionException {
 339  0
         return extract(new StringDocumentSource(in, documentURI, contentType, encoding), outputHandler);
 340  
     }
 341  
 
 342  
     /**
 343  
      * Performs metadata extraction on the <code>in</code> string
 344  
      * associated to the <code>documentURI</code> URI, sending the generated
 345  
      * events to the specified <code>outputHandler</code>.
 346  
      *
 347  
      * @param in raw data to be analyzed.
 348  
      * @param documentURI URI from which the raw data has been extracted.
 349  
      * @param outputHandler handler responsible for collecting of the extracted metadata.
 350  
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
 351  
      * @throws IOException
 352  
      * @throws ExtractionException
 353  
      */
 354  
     public ExtractionReport extract(String in, String documentURI, TripleHandler outputHandler)
 355  
     throws IOException, ExtractionException {
 356  0
         return extract(new StringDocumentSource(in, documentURI), outputHandler);
 357  
     }
 358  
 
 359  
     /**
 360  
      * Performs metadata extraction from the content of the given <code>file</code>
 361  
      * sending the generated events to the specified <code>outputHandler</code>.
 362  
      *
 363  
      * @param file file containing raw data.
 364  
      * @param outputHandler handler responsible for collecting of the extracted metadata.
 365  
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
 366  
      * @throws IOException
 367  
      * @throws ExtractionException
 368  
      */
 369  
     public ExtractionReport extract(File file, TripleHandler outputHandler)
 370  
     throws IOException, ExtractionException {
 371  0
         return extract(new FileDocumentSource(file), outputHandler);
 372  
     }
 373  
 
 374  
     /**
 375  
      * Performs metadata extraction from the content of the given <code>documentURI</code>
 376  
      * sending the generated events to the specified <code>outputHandler</code>.
 377  
      * If the <i>URI</i> is replied with a redirect, the last will be followed.
 378  
      *
 379  
      * @param eps the parameters to be applied to the extraction.
 380  
      * @param documentURI the URI from which retrieve document.
 381  
      * @param outputHandler handler responsible for collecting of the extracted metadata.
 382  
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
 383  
      * @throws IOException
 384  
      * @throws ExtractionException
 385  
      */
 386  
     public ExtractionReport extract(ExtractionParameters eps, String documentURI, TripleHandler outputHandler)
 387  
     throws IOException, ExtractionException {
 388  
         try {
 389  0
             return extract(eps, createDocumentSource(documentURI), outputHandler);
 390  0
         } catch (URISyntaxException ex) {
 391  0
             throw new ExtractionException("Error while extracting data from document URI.", ex);
 392  
         }
 393  
     }
 394  
 
 395  
     /**
 396  
      * Performs metadata extraction from the content of the given <code>documentURI</code>
 397  
      * sending the generated events to the specified <code>outputHandler</code>.
 398  
      * If the <i>URI</i> is replied with a redirect, the last will be followed.
 399  
      *
 400  
      * @param documentURI the URI from which retrieve document.
 401  
      * @param outputHandler handler responsible for collecting of the extracted metadata.
 402  
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
 403  
      * @throws IOException
 404  
      * @throws ExtractionException
 405  
      */
 406  
     public ExtractionReport extract(String documentURI, TripleHandler outputHandler)
 407  
     throws IOException, ExtractionException {
 408  0
         return extract((ExtractionParameters) null, documentURI, outputHandler);
 409  
     }
 410  
 
 411  
     /**
 412  
      * Performs metadata extraction from the content of the given
 413  
      * <code>in</code> document source, sending the generated events
 414  
      * to the specified <code>outputHandler</code>.
 415  
      *
 416  
      * @param in the input document source.
 417  
      * @param outputHandler handler responsible for collecting of the extracted metadata.
 418  
      * @param encoding explicit encoding see
 419  
      *        <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
 420  
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
 421  
      * @throws IOException
 422  
      * @throws ExtractionException
 423  
      */
 424  
     public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler, String encoding)
 425  
     throws IOException, ExtractionException {
 426  0
         return extract(null, in, outputHandler, encoding);
 427  
     }
 428  
 
 429  
     /**
 430  
      * Performs metadata extraction from the content of the given
 431  
      * <code>in</code> document source, sending the generated events
 432  
      * to the specified <code>outputHandler</code>.
 433  
      *
 434  
      * @param in the input document source.
 435  
      * @param outputHandler handler responsible for collecting of the extracted metadata.
 436  
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
 437  
      * @throws IOException
 438  
      * @throws ExtractionException
 439  
      */
 440  
     public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler)
 441  
     throws IOException, ExtractionException {
 442  0
         return extract(null, in, outputHandler, null);
 443  
     }
 444  
 
 445  
     /**
 446  
      * Performs metadata extraction from the content of the given
 447  
      * <code>in</code> document source, sending the generated events
 448  
      * to the specified <code>outputHandler</code>.
 449  
      *
 450  
      * @param eps the parameters to be applied for the extraction phase.
 451  
      * @param in the input document source.
 452  
      * @param outputHandler handler responsible for collecting of the extracted metadata.
 453  
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
 454  
      * @throws IOException
 455  
      * @throws ExtractionException
 456  
      */
 457  
     public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler)
 458  
     throws IOException, ExtractionException {
 459  0
         return extract(eps, in, outputHandler, null);
 460  
     }
 461  
 
 462  
     private String getAcceptHeader() {
 463  0
         Collection<MIMEType> mimeTypes = new ArrayList<MIMEType>();
 464  0
         for (ExtractorFactory<?> factory : factories) {
 465  0
             mimeTypes.addAll(factory.getSupportedMIMETypes());
 466  
         }
 467  0
         return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
 468  
     }
 469  
     
 470  
 }