Coverage Report

Coverage Report - org.apache.any23.Any23

Classes in this File

Line Coverage

Branch Coverage

Complexity

Any23

0/78

0/32

2.111

Any23$1

0/5

N/A

2.111

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23;
 
 import org.apache.any23.configuration.Configuration;
 import org.apache.any23.configuration.DefaultConfiguration;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.ExtractorGroup;
 import org.apache.any23.extractor.ExtractorRegistry;
 import org.apache.any23.extractor.SingleDocumentExtraction;
 import org.apache.any23.extractor.SingleDocumentExtractionReport;
 import org.apache.any23.http.AcceptHeaderBuilder;
 import org.apache.any23.http.DefaultHTTPClient;
 import org.apache.any23.http.HTTPClient;
 import org.apache.any23.http.HTTPClientConfiguration;
 import org.apache.any23.mime.MIMEType;
 import org.apache.any23.mime.MIMETypeDetector;
 import org.apache.any23.mime.TikaMIMETypeDetector;
 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
 import org.apache.any23.source.DocumentSource;
 import org.apache.any23.source.FileDocumentSource;
 import org.apache.any23.source.HTTPDocumentSource;
 import org.apache.any23.source.LocalCopyFactory;
 import org.apache.any23.source.MemCopyFactory;
 import org.apache.any23.source.StringDocumentSource;
 import org.apache.any23.writer.TripleHandler;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.File;
 import java.io.IOException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 
 
 /**
  * A facade with convenience methods for typical <i>Any23</i> extraction
  * operations.
  *
  * @author Richard Cyganiak (richard@cyganiak.de)
  * @author Michele Mostarda (michele.mostarda@gmail.com)
  */
 public class Any23 {
 
     /**
      * Any23 core library version.
      * NOTE: there's also a version string in pom.xml, they should match.
      */
     public static final String VERSION = DefaultConfiguration.singleton().getPropertyOrFail("any23.core.version");
 
     /**
      * Default HTTP User Agent defined in default configuration.
      */
     public static final String DEFAULT_HTTP_CLIENT_USER_AGENT = DefaultConfiguration.singleton().getPropertyOrFail(
             "any23.http.user.agent.default"
     );
 
     protected static final Logger logger = LoggerFactory.getLogger(Any23.class);
 
     private final Configuration configuration;
     private final String        defaultUserAgent;
 
     private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector( new WhiteSpacesPurifier() );
 
     private HTTPClient httpClient = new DefaultHTTPClient();
 
     private boolean httpClientInitialized = false;
 
     private final ExtractorGroup factories;
     private LocalCopyFactory     streamCache;
     private String               userAgent;
 
     /**
      * Constructor that allows the specification of a
      * custom configuration and of a list of extractors.
      *
      * @param configuration configuration used to build the <i>Any23</i> instance.
      * @param extractorGroup the group of extractors to be applied.
      */
     public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
         if(configuration == null) throw new NullPointerException("configuration must be not null.");
         this.configuration = configuration;
         logger.info( configuration.getConfigurationDump() );
 
         this.defaultUserAgent = configuration.getPropertyOrFail("any23.http.user.agent.default");
 
         this.factories = (extractorGroup == null)
                 ? ExtractorRegistry.getInstance().getExtractorGroup()
                 : extractorGroup;
         setCacheFactory(new MemCopyFactory());
     }
 
     /**
      * Constructor that allows the specification of a list of extractors.
      *
      * @param extractorGroup the group of extractors to be applied.
      */
     public Any23(ExtractorGroup extractorGroup) {
         this(DefaultConfiguration.singleton(), extractorGroup);
     }
 
     /**
      * Constructor that allows the specification of a
      * custom configuration and of list of extractor names.
      *
      * @param extractorNames list of extractor's names.
      */
     public Any23(Configuration configuration, String... extractorNames) {
         this(
                 configuration,
                 extractorNames == null
                         ?
                 null
                         :
                 ExtractorRegistry.getInstance().getExtractorGroup( Arrays.asList(extractorNames))
         );
     }
 
     /**
      * Constructor that allows the specification of a list of extractor names.
      *
      * @param extractorNames list of extractor's names.
      */
     public Any23(String... extractorNames) {
         this( DefaultConfiguration.singleton(), extractorNames );
     }
 
     /**
      * Constructor accepting {@link Configuration}.
      */
     public Any23(Configuration configuration) {
         this(configuration, (String[]) null);
     }
 
     /**
      * Constructor with default configuration.
      */
     public Any23() {
         this( DefaultConfiguration.singleton() );
     }
 
     /**
      * Sets the <i>HTTP Header User Agent</i>,
      * see <i>RFC 2616-14.43</i>.
      *
      * @param userAgent text describing the user agent.
      */
     public void setHTTPUserAgent(String userAgent) {
         if (httpClientInitialized) {
             throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
         }
         if(userAgent == null) {
             userAgent = defaultUserAgent;
         }
         if(userAgent.trim().length() == 0) {
             throw new IllegalArgumentException( String.format("Invalid user agent: '%s'", userAgent) );
         }
         this.userAgent = userAgent;
     }
 
     /**
      * Returns the <i>HTTP Header User Agent</i>,
      * see <i>RFC 2616-14.43</i>.
      *
      * @return text describing the user agent.
      */
     public String getHTTPUserAgent() {
         return this.userAgent;
     }
 
     /**
      * Allows to set the {@link org.apache.any23.http.HTTPClient} implementation
      * used to retrieve contents. The default instance is {@link org.apache.any23.http.DefaultHTTPClient}.
      *
      * @param httpClient a valid client instance.
      * @throws IllegalStateException if invoked after client has been initialized.
      */
     public void setHTTPClient(HTTPClient httpClient) {
         if(httpClient == null) {
             throw new NullPointerException("httpClient cannot be null.");
         }
         if (httpClientInitialized) {
             throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
         }
         this.httpClient = httpClient;
     }
 
     /**
      * Returns the current {@link org.apache.any23.http.HTTPClient} implementation.
      *
      * @return instance of HTTPClient.
      * @throws IOException if the HTTP client has not initialized.
      */
     public HTTPClient getHTTPClient() throws IOException {
         if (!httpClientInitialized) {
             if (userAgent == null) {
                 throw new IOException("Must call " + Any23.class.getSimpleName() +
                         ".setHTTPUserAgent(String) before extracting from HTTP URI");
             }
             httpClient.init( new HTTPClientConfiguration() {
                 public String getUserAgent() {
                     return userAgent;
                 }
                 public String getAcceptHeader() {
                     return Any23.this.getAcceptHeader();
                 }
                 public int getDefaultTimeout() {
                     return configuration.getPropertyIntOrFail("any23.http.client.timeout");
                 }
                 public int getMaxConnections() {
                     return configuration.getPropertyIntOrFail("any23.http.client.max.connections");
                 }
             } );
             httpClientInitialized = true;
         }
         return httpClient;
     }
 
     /**
      * Allows to set a {@link org.apache.any23.source.LocalCopyFactory} instance.
      *
      * @param cache valid cache instance.
      */
     public void setCacheFactory(LocalCopyFactory cache) {
         if(cache == null) {
             throw new NullPointerException("cache cannot be null.");
         }
         this.streamCache = cache;
     }
 
     /**
      * Allows to set an instance of {@link org.apache.any23.mime.MIMETypeDetector}.
      *
      * @param detector a valid detector instance, if <code>null</code> all the detectors
      *        will be used.
      */
     public void setMIMETypeDetector(MIMETypeDetector detector) {
         this.mimeTypeDetector = detector;
     }
 
     /**
      * Returns the most appropriate {@link DocumentSource} for the given<code>documentURI</code>.
      *
      * @param documentURI the document <i>URI</i>.
      * @return a new instance of DocumentSource.
      * @throws URISyntaxException if an error occurs while parsing the <code>documentURI</code> as a <i>URI</i>.
      * @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
      */
     public DocumentSource createDocumentSource(String documentURI) throws URISyntaxException, IOException {
         if(documentURI == null) throw new NullPointerException("documentURI cannot be null.");
         if (documentURI.toLowerCase().startsWith("file:")) {
             return new FileDocumentSource( new File(new URI(documentURI)) );
         }
         if (documentURI.toLowerCase().startsWith("http:") || documentURI.toLowerCase().startsWith("https:")) {
             return new HTTPDocumentSource(getHTTPClient(), documentURI);
         }
         throw new IllegalArgumentException(
                 String.format("Unsupported protocol for document URI: '%s' .", documentURI)
         );
     }
 
 
     /**
      * Performs metadata extraction from the content of the given
      * <code>in</code> document source, sending the generated events
      * to the specified <code>outputHandler</code>.
      *
      * @param eps the extraction parameters to be applied.
      * @param in the input document source.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @param encoding explicit encoding see
      *        <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws org.apache.any23.extractor.ExtractionException
      */
     public ExtractionReport extract(
             ExtractionParameters eps,
             DocumentSource in,
             TripleHandler outputHandler,
             String encoding
     ) throws IOException, ExtractionException {
         final SingleDocumentExtraction ex = new SingleDocumentExtraction(configuration, in, factories, outputHandler);
         ex.setMIMETypeDetector(mimeTypeDetector);
         ex.setLocalCopyFactory(streamCache);
         ex.setParserEncoding(encoding);
         final SingleDocumentExtractionReport sder = ex.run(eps);
         return new ExtractionReport(
                 ex.getMatchingExtractors(),
                 ex.getParserEncoding(),
                 ex.getDetectedMIMEType(),
                 sder.getValidationReport(),
                 sder.getExtractorToErrors()
         );
     }
 
     /**
      * Performs metadata extraction on the <code>in</code> string
      * associated to the <code>documentURI</code> URI, declaring
      * <code>contentType</code> and <code>encoding</code>.
      * The generated events are sent to the specified <code>outputHandler</code>.
      *
      * @param in raw data to be analyzed.
      * @param documentURI URI from which the raw data has been extracted.
      * @param contentType declared data content type.
      * @param encoding declared data encoding.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public ExtractionReport extract(
             String in,
             String documentURI,
             String contentType,
             String encoding,
             TripleHandler outputHandler
     ) throws IOException, ExtractionException {
         return extract(new StringDocumentSource(in, documentURI, contentType, encoding), outputHandler);
     }
 
     /**
      * Performs metadata extraction on the <code>in</code> string
      * associated to the <code>documentURI</code> URI, sending the generated
      * events to the specified <code>outputHandler</code>.
      *
      * @param in raw data to be analyzed.
      * @param documentURI URI from which the raw data has been extracted.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public ExtractionReport extract(String in, String documentURI, TripleHandler outputHandler)
     throws IOException, ExtractionException {
         return extract(new StringDocumentSource(in, documentURI), outputHandler);
     }
 
     /**
      * Performs metadata extraction from the content of the given <code>file</code>
      * sending the generated events to the specified <code>outputHandler</code>.
      *
      * @param file file containing raw data.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public ExtractionReport extract(File file, TripleHandler outputHandler)
     throws IOException, ExtractionException {
         return extract(new FileDocumentSource(file), outputHandler);
     }
 
     /**
      * Performs metadata extraction from the content of the given <code>documentURI</code>
      * sending the generated events to the specified <code>outputHandler</code>.
      * If the <i>URI</i> is replied with a redirect, the last will be followed.
      *
      * @param eps the parameters to be applied to the extraction.
      * @param documentURI the URI from which retrieve document.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public ExtractionReport extract(ExtractionParameters eps, String documentURI, TripleHandler outputHandler)
     throws IOException, ExtractionException {
         try {
             return extract(eps, createDocumentSource(documentURI), outputHandler);
         } catch (URISyntaxException ex) {
             throw new ExtractionException("Error while extracting data from document URI.", ex);
         }
     }
 
     /**
      * Performs metadata extraction from the content of the given <code>documentURI</code>
      * sending the generated events to the specified <code>outputHandler</code>.
      * If the <i>URI</i> is replied with a redirect, the last will be followed.
      *
      * @param documentURI the URI from which retrieve document.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public ExtractionReport extract(String documentURI, TripleHandler outputHandler)
     throws IOException, ExtractionException {
         return extract((ExtractionParameters) null, documentURI, outputHandler);
     }
 
     /**
      * Performs metadata extraction from the content of the given
      * <code>in</code> document source, sending the generated events
      * to the specified <code>outputHandler</code>.
      *
      * @param in the input document source.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @param encoding explicit encoding see
      *        <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler, String encoding)
     throws IOException, ExtractionException {
         return extract(null, in, outputHandler, encoding);
     }
 
     /**
      * Performs metadata extraction from the content of the given
      * <code>in</code> document source, sending the generated events
      * to the specified <code>outputHandler</code>.
      *
      * @param in the input document source.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler)
     throws IOException, ExtractionException {
         return extract(null, in, outputHandler, null);
     }
 
     /**
      * Performs metadata extraction from the content of the given
      * <code>in</code> document source, sending the generated events
      * to the specified <code>outputHandler</code>.
      *
      * @param eps the parameters to be applied for the extraction phase.
      * @param in the input document source.
      * @param outputHandler handler responsible for collecting of the extracted metadata.
      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
      * @throws IOException
      * @throws ExtractionException
      */
     public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler)
     throws IOException, ExtractionException {
         return extract(eps, in, outputHandler, null);
     }
 
     private String getAcceptHeader() {
         Collection<MIMEType> mimeTypes = new ArrayList<MIMEType>();
         for (ExtractorFactory<?> factory : factories) {
             mimeTypes.addAll(factory.getSupportedMIMETypes());
         }
         return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
     }
     
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23;
19
20		import org.apache.any23.configuration.Configuration;
21		import org.apache.any23.configuration.DefaultConfiguration;
22		import org.apache.any23.extractor.ExtractionException;
23		import org.apache.any23.extractor.ExtractionParameters;
24		import org.apache.any23.extractor.ExtractorFactory;
25		import org.apache.any23.extractor.ExtractorGroup;
26		import org.apache.any23.extractor.ExtractorRegistry;
27		import org.apache.any23.extractor.SingleDocumentExtraction;
28		import org.apache.any23.extractor.SingleDocumentExtractionReport;
29		import org.apache.any23.http.AcceptHeaderBuilder;
30		import org.apache.any23.http.DefaultHTTPClient;
31		import org.apache.any23.http.HTTPClient;
32		import org.apache.any23.http.HTTPClientConfiguration;
33		import org.apache.any23.mime.MIMEType;
34		import org.apache.any23.mime.MIMETypeDetector;
35		import org.apache.any23.mime.TikaMIMETypeDetector;
36		import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
37		import org.apache.any23.source.DocumentSource;
38		import org.apache.any23.source.FileDocumentSource;
39		import org.apache.any23.source.HTTPDocumentSource;
40		import org.apache.any23.source.LocalCopyFactory;
41		import org.apache.any23.source.MemCopyFactory;
42		import org.apache.any23.source.StringDocumentSource;
43		import org.apache.any23.writer.TripleHandler;
44		import org.slf4j.Logger;
45		import org.slf4j.LoggerFactory;
46
47		import java.io.File;
48		import java.io.IOException;
49		import java.net.URI;
50		import java.net.URISyntaxException;
51		import java.util.ArrayList;
52		import java.util.Arrays;
53		import java.util.Collection;
54
55
56		/**
57		* A facade with convenience methods for typical <i>Any23</i> extraction
58		* operations.
59		*
60		* @author Richard Cyganiak (richard@cyganiak.de)
61		* @author Michele Mostarda (michele.mostarda@gmail.com)
62		*/
63	0	public class Any23 {
64
65		/**
66		* Any23 core library version.
67		* NOTE: there's also a version string in pom.xml, they should match.
68		*/
69	0	public static final String VERSION = DefaultConfiguration.singleton().getPropertyOrFail("any23.core.version");
70
71		/**
72		* Default HTTP User Agent defined in default configuration.
73		*/
74	0	public static final String DEFAULT_HTTP_CLIENT_USER_AGENT = DefaultConfiguration.singleton().getPropertyOrFail(
75		"any23.http.user.agent.default"
76		);
77
78	0	protected static final Logger logger = LoggerFactory.getLogger(Any23.class);
79
80		private final Configuration configuration;
81		private final String defaultUserAgent;
82
83	0	private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector( new WhiteSpacesPurifier() );
84
85	0	private HTTPClient httpClient = new DefaultHTTPClient();
86
87	0	private boolean httpClientInitialized = false;
88
89		private final ExtractorGroup factories;
90		private LocalCopyFactory streamCache;
91		private String userAgent;
92
93		/**
94		* Constructor that allows the specification of a
95		* custom configuration and of a list of extractors.
96		*
97		* @param configuration configuration used to build the <i>Any23</i> instance.
98		* @param extractorGroup the group of extractors to be applied.
99		*/
100	0	public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
101	0	if(configuration == null) throw new NullPointerException("configuration must be not null.");
102	0	this.configuration = configuration;
103	0	logger.info( configuration.getConfigurationDump() );
104
105	0	this.defaultUserAgent = configuration.getPropertyOrFail("any23.http.user.agent.default");
106
107	0	this.factories = (extractorGroup == null)
108		? ExtractorRegistry.getInstance().getExtractorGroup()
109		: extractorGroup;
110	0	setCacheFactory(new MemCopyFactory());
111	0	}
112
113		/**
114		* Constructor that allows the specification of a list of extractors.
115		*
116		* @param extractorGroup the group of extractors to be applied.
117		*/
118		public Any23(ExtractorGroup extractorGroup) {
119	0	this(DefaultConfiguration.singleton(), extractorGroup);
120	0	}
121
122		/**
123		* Constructor that allows the specification of a
124		* custom configuration and of list of extractor names.
125		*
126		* @param extractorNames list of extractor's names.
127		*/
128		public Any23(Configuration configuration, String... extractorNames) {
129	0	this(
130		configuration,
131		extractorNames == null
132		?
133		null
134		:
135		ExtractorRegistry.getInstance().getExtractorGroup( Arrays.asList(extractorNames))
136		);
137	0	}
138
139		/**
140		* Constructor that allows the specification of a list of extractor names.
141		*
142		* @param extractorNames list of extractor's names.
143		*/
144		public Any23(String... extractorNames) {
145	0	this( DefaultConfiguration.singleton(), extractorNames );
146	0	}
147
148		/**
149		* Constructor accepting {@link Configuration}.
150		*/
151		public Any23(Configuration configuration) {
152	0	this(configuration, (String[]) null);
153	0	}
154
155		/**
156		* Constructor with default configuration.
157		*/
158		public Any23() {
159	0	this( DefaultConfiguration.singleton() );
160	0	}
161
162		/**
163		* Sets the <i>HTTP Header User Agent</i>,
164		* see <i>RFC 2616-14.43</i>.
165		*
166		* @param userAgent text describing the user agent.
167		*/
168		public void setHTTPUserAgent(String userAgent) {
169	0	if (httpClientInitialized) {
170	0	throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
171		}
172	0	if(userAgent == null) {
173	0	userAgent = defaultUserAgent;
174		}
175	0	if(userAgent.trim().length() == 0) {
176	0	throw new IllegalArgumentException( String.format("Invalid user agent: '%s'", userAgent) );
177		}
178	0	this.userAgent = userAgent;
179	0	}
180
181		/**
182		* Returns the <i>HTTP Header User Agent</i>,
183		* see <i>RFC 2616-14.43</i>.
184		*
185		* @return text describing the user agent.
186		*/
187		public String getHTTPUserAgent() {
188	0	return this.userAgent;
189		}
190
191		/**
192		* Allows to set the {@link org.apache.any23.http.HTTPClient} implementation
193		* used to retrieve contents. The default instance is {@link org.apache.any23.http.DefaultHTTPClient}.
194		*
195		* @param httpClient a valid client instance.
196		* @throws IllegalStateException if invoked after client has been initialized.
197		*/
198		public void setHTTPClient(HTTPClient httpClient) {
199	0	if(httpClient == null) {
200	0	throw new NullPointerException("httpClient cannot be null.");
201		}
202	0	if (httpClientInitialized) {
203	0	throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
204		}
205	0	this.httpClient = httpClient;
206	0	}
207
208		/**
209		* Returns the current {@link org.apache.any23.http.HTTPClient} implementation.
210		*
211		* @return instance of HTTPClient.
212		* @throws IOException if the HTTP client has not initialized.
213		*/
214		public HTTPClient getHTTPClient() throws IOException {
215	0	if (!httpClientInitialized) {
216	0	if (userAgent == null) {
217	0	throw new IOException("Must call " + Any23.class.getSimpleName() +
218		".setHTTPUserAgent(String) before extracting from HTTP URI");
219		}
220	0	httpClient.init( new HTTPClientConfiguration() {
221		public String getUserAgent() {
222	0	return userAgent;
223		}
224		public String getAcceptHeader() {
225	0	return Any23.this.getAcceptHeader();
226		}
227		public int getDefaultTimeout() {
228	0	return configuration.getPropertyIntOrFail("any23.http.client.timeout");
229		}
230		public int getMaxConnections() {
231	0	return configuration.getPropertyIntOrFail("any23.http.client.max.connections");
232		}
233		} );
234	0	httpClientInitialized = true;
235		}
236	0	return httpClient;
237		}
238
239		/**
240		* Allows to set a {@link org.apache.any23.source.LocalCopyFactory} instance.
241		*
242		* @param cache valid cache instance.
243		*/
244		public void setCacheFactory(LocalCopyFactory cache) {
245	0	if(cache == null) {
246	0	throw new NullPointerException("cache cannot be null.");
247		}
248	0	this.streamCache = cache;
249	0	}
250
251		/**
252		* Allows to set an instance of {@link org.apache.any23.mime.MIMETypeDetector}.
253		*
254		* @param detector a valid detector instance, if <code>null</code> all the detectors
255		* will be used.
256		*/
257		public void setMIMETypeDetector(MIMETypeDetector detector) {
258	0	this.mimeTypeDetector = detector;
259	0	}
260
261		/**
262		* Returns the most appropriate {@link DocumentSource} for the given<code>documentURI</code>.
263		*
264		* @param documentURI the document <i>URI</i>.
265		* @return a new instance of DocumentSource.
266		* @throws URISyntaxException if an error occurs while parsing the <code>documentURI</code> as a <i>URI</i>.
267		* @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
268		*/
269		public DocumentSource createDocumentSource(String documentURI) throws URISyntaxException, IOException {
270	0	if(documentURI == null) throw new NullPointerException("documentURI cannot be null.");
271	0	if (documentURI.toLowerCase().startsWith("file:")) {
272	0	return new FileDocumentSource( new File(new URI(documentURI)) );
273		}
274	0	if (documentURI.toLowerCase().startsWith("http:") || documentURI.toLowerCase().startsWith("https:")) {
275	0	return new HTTPDocumentSource(getHTTPClient(), documentURI);
276		}
277	0	throw new IllegalArgumentException(
278		String.format("Unsupported protocol for document URI: '%s' .", documentURI)
279		);
280		}
281
282
283		/**
284		* Performs metadata extraction from the content of the given
285		* <code>in</code> document source, sending the generated events
286		* to the specified <code>outputHandler</code>.
287		*
288		* @param eps the extraction parameters to be applied.
289		* @param in the input document source.
290		* @param outputHandler handler responsible for collecting of the extracted metadata.
291		* @param encoding explicit encoding see
292		* <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
293		* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
294		* @throws IOException
295		* @throws org.apache.any23.extractor.ExtractionException
296		*/
297		public ExtractionReport extract(
298		ExtractionParameters eps,
299		DocumentSource in,
300		TripleHandler outputHandler,
301		String encoding
302		) throws IOException, ExtractionException {
303	0	final SingleDocumentExtraction ex = new SingleDocumentExtraction(configuration, in, factories, outputHandler);
304	0	ex.setMIMETypeDetector(mimeTypeDetector);
305	0	ex.setLocalCopyFactory(streamCache);
306	0	ex.setParserEncoding(encoding);
307	0	final SingleDocumentExtractionReport sder = ex.run(eps);
308	0	return new ExtractionReport(
309		ex.getMatchingExtractors(),
310		ex.getParserEncoding(),
311		ex.getDetectedMIMEType(),
312		sder.getValidationReport(),
313		sder.getExtractorToErrors()
314		);
315		}
316
317		/**
318		* Performs metadata extraction on the <code>in</code> string
319		* associated to the <code>documentURI</code> URI, declaring
320		* <code>contentType</code> and <code>encoding</code>.
321		* The generated events are sent to the specified <code>outputHandler</code>.
322		*
323		* @param in raw data to be analyzed.
324		* @param documentURI URI from which the raw data has been extracted.
325		* @param contentType declared data content type.
326		* @param encoding declared data encoding.
327		* @param outputHandler handler responsible for collecting of the extracted metadata.
328		* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
329		* @throws IOException
330		* @throws ExtractionException
331		*/
332		public ExtractionReport extract(
333		String in,
334		String documentURI,
335		String contentType,
336		String encoding,
337		TripleHandler outputHandler
338		) throws IOException, ExtractionException {
339	0	return extract(new StringDocumentSource(in, documentURI, contentType, encoding), outputHandler);
340		}
341
342		/**
343		* Performs metadata extraction on the <code>in</code> string
344		* associated to the <code>documentURI</code> URI, sending the generated
345		* events to the specified <code>outputHandler</code>.
346		*
347		* @param in raw data to be analyzed.
348		* @param documentURI URI from which the raw data has been extracted.
349		* @param outputHandler handler responsible for collecting of the extracted metadata.
350		* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
351		* @throws IOException
352		* @throws ExtractionException
353		*/
354		public ExtractionReport extract(String in, String documentURI, TripleHandler outputHandler)
355		throws IOException, ExtractionException {
356	0	return extract(new StringDocumentSource(in, documentURI), outputHandler);
357		}
358
359		/**
360		* Performs metadata extraction from the content of the given <code>file</code>
361		* sending the generated events to the specified <code>outputHandler</code>.
362		*
363		* @param file file containing raw data.
364		* @param outputHandler handler responsible for collecting of the extracted metadata.
365		* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
366		* @throws IOException
367		* @throws ExtractionException
368		*/
369		public ExtractionReport extract(File file, TripleHandler outputHandler)
370		throws IOException, ExtractionException {
371	0	return extract(new FileDocumentSource(file), outputHandler);
372		}
373
374		/**
375		* Performs metadata extraction from the content of the given <code>documentURI</code>
376		* sending the generated events to the specified <code>outputHandler</code>.
377		* If the <i>URI</i> is replied with a redirect, the last will be followed.
378		*
379		* @param eps the parameters to be applied to the extraction.
380		* @param documentURI the URI from which retrieve document.
381		* @param outputHandler handler responsible for collecting of the extracted metadata.
382		* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
383		* @throws IOException
384		* @throws ExtractionException
385		*/
386		public ExtractionReport extract(ExtractionParameters eps, String documentURI, TripleHandler outputHandler)
387		throws IOException, ExtractionException {
388		try {
389	0	return extract(eps, createDocumentSource(documentURI), outputHandler);
390	0	} catch (URISyntaxException ex) {
391	0	throw new ExtractionException("Error while extracting data from document URI.", ex);
392		}
393		}
394
395		/**
396		* Performs metadata extraction from the content of the given <code>documentURI</code>
397		* sending the generated events to the specified <code>outputHandler</code>.
398		* If the <i>URI</i> is replied with a redirect, the last will be followed.
399		*
400		* @param documentURI the URI from which retrieve document.
401		* @param outputHandler handler responsible for collecting of the extracted metadata.
402		* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
403		* @throws IOException
404		* @throws ExtractionException
405		*/
406		public ExtractionReport extract(String documentURI, TripleHandler outputHandler)
407		throws IOException, ExtractionException {
408	0	return extract((ExtractionParameters) null, documentURI, outputHandler);
409		}
410
411		/**
412		* Performs metadata extraction from the content of the given
413		* <code>in</code> document source, sending the generated events
414		* to the specified <code>outputHandler</code>.
415		*
416		* @param in the input document source.
417		* @param outputHandler handler responsible for collecting of the extracted metadata.
418		* @param encoding explicit encoding see
419		* <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
420		* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
421		* @throws IOException
422		* @throws ExtractionException
423		*/
424		public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler, String encoding)
425		throws IOException, ExtractionException {
426	0	return extract(null, in, outputHandler, encoding);
427		}
428
429		/**
430		* Performs metadata extraction from the content of the given
431		* <code>in</code> document source, sending the generated events
432		* to the specified <code>outputHandler</code>.
433		*
434		* @param in the input document source.
435		* @param outputHandler handler responsible for collecting of the extracted metadata.
436		* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
437		* @throws IOException
438		* @throws ExtractionException
439		*/
440		public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler)
441		throws IOException, ExtractionException {
442	0	return extract(null, in, outputHandler, null);
443		}
444
445		/**
446		* Performs metadata extraction from the content of the given
447		* <code>in</code> document source, sending the generated events
448		* to the specified <code>outputHandler</code>.
449		*
450		* @param eps the parameters to be applied for the extraction phase.
451		* @param in the input document source.
452		* @param outputHandler handler responsible for collecting of the extracted metadata.
453		* @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
454		* @throws IOException
455		* @throws ExtractionException
456		*/
457		public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler)
458		throws IOException, ExtractionException {
459	0	return extract(eps, in, outputHandler, null);
460		}
461
462		private String getAcceptHeader() {
463	0	Collection<MIMEType> mimeTypes = new ArrayList<MIMEType>();
464	0	for (ExtractorFactory<?> factory : factories) {
465	0	mimeTypes.addAll(factory.getSupportedMIMETypes());
466		}
467	0	return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
468		}
469
470		}