Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
Any23 |
|
| 2.111111111111111;2.111 | ||||
Any23$1 |
|
| 2.111111111111111;2.111 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.any23; | |
19 | ||
20 | import org.apache.any23.configuration.Configuration; | |
21 | import org.apache.any23.configuration.DefaultConfiguration; | |
22 | import org.apache.any23.extractor.ExtractionException; | |
23 | import org.apache.any23.extractor.ExtractionParameters; | |
24 | import org.apache.any23.extractor.ExtractorFactory; | |
25 | import org.apache.any23.extractor.ExtractorGroup; | |
26 | import org.apache.any23.extractor.ExtractorRegistry; | |
27 | import org.apache.any23.extractor.SingleDocumentExtraction; | |
28 | import org.apache.any23.extractor.SingleDocumentExtractionReport; | |
29 | import org.apache.any23.http.AcceptHeaderBuilder; | |
30 | import org.apache.any23.http.DefaultHTTPClient; | |
31 | import org.apache.any23.http.HTTPClient; | |
32 | import org.apache.any23.http.HTTPClientConfiguration; | |
33 | import org.apache.any23.mime.MIMEType; | |
34 | import org.apache.any23.mime.MIMETypeDetector; | |
35 | import org.apache.any23.mime.TikaMIMETypeDetector; | |
36 | import org.apache.any23.mime.purifier.WhiteSpacesPurifier; | |
37 | import org.apache.any23.source.DocumentSource; | |
38 | import org.apache.any23.source.FileDocumentSource; | |
39 | import org.apache.any23.source.HTTPDocumentSource; | |
40 | import org.apache.any23.source.LocalCopyFactory; | |
41 | import org.apache.any23.source.MemCopyFactory; | |
42 | import org.apache.any23.source.StringDocumentSource; | |
43 | import org.apache.any23.writer.TripleHandler; | |
44 | import org.slf4j.Logger; | |
45 | import org.slf4j.LoggerFactory; | |
46 | ||
47 | import java.io.File; | |
48 | import java.io.IOException; | |
49 | import java.net.URI; | |
50 | import java.net.URISyntaxException; | |
51 | import java.util.ArrayList; | |
52 | import java.util.Arrays; | |
53 | import java.util.Collection; | |
54 | ||
55 | ||
56 | /** | |
57 | * A facade with convenience methods for typical <i>Any23</i> extraction | |
58 | * operations. | |
59 | * | |
60 | * @author Richard Cyganiak (richard@cyganiak.de) | |
61 | * @author Michele Mostarda (michele.mostarda@gmail.com) | |
62 | */ | |
63 | 0 | public class Any23 { |
64 | ||
65 | /** | |
66 | * Any23 core library version. | |
67 | * NOTE: there's also a version string in pom.xml, they should match. | |
68 | */ | |
69 | 0 | public static final String VERSION = DefaultConfiguration.singleton().getPropertyOrFail("any23.core.version"); |
70 | ||
71 | /** | |
72 | * Default HTTP User Agent defined in default configuration. | |
73 | */ | |
74 | 0 | public static final String DEFAULT_HTTP_CLIENT_USER_AGENT = DefaultConfiguration.singleton().getPropertyOrFail( |
75 | "any23.http.user.agent.default" | |
76 | ); | |
77 | ||
78 | 0 | protected static final Logger logger = LoggerFactory.getLogger(Any23.class); |
79 | ||
80 | private final Configuration configuration; | |
81 | private final String defaultUserAgent; | |
82 | ||
83 | 0 | private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector( new WhiteSpacesPurifier() ); |
84 | ||
85 | 0 | private HTTPClient httpClient = new DefaultHTTPClient(); |
86 | ||
87 | 0 | private boolean httpClientInitialized = false; |
88 | ||
89 | private final ExtractorGroup factories; | |
90 | private LocalCopyFactory streamCache; | |
91 | private String userAgent; | |
92 | ||
93 | /** | |
94 | * Constructor that allows the specification of a | |
95 | * custom configuration and of a list of extractors. | |
96 | * | |
97 | * @param configuration configuration used to build the <i>Any23</i> instance. | |
98 | * @param extractorGroup the group of extractors to be applied. | |
99 | */ | |
100 | 0 | public Any23(Configuration configuration, ExtractorGroup extractorGroup) { |
101 | 0 | if(configuration == null) throw new NullPointerException("configuration must be not null."); |
102 | 0 | this.configuration = configuration; |
103 | 0 | logger.info( configuration.getConfigurationDump() ); |
104 | ||
105 | 0 | this.defaultUserAgent = configuration.getPropertyOrFail("any23.http.user.agent.default"); |
106 | ||
107 | 0 | this.factories = (extractorGroup == null) |
108 | ? ExtractorRegistry.getInstance().getExtractorGroup() | |
109 | : extractorGroup; | |
110 | 0 | setCacheFactory(new MemCopyFactory()); |
111 | 0 | } |
112 | ||
113 | /** | |
114 | * Constructor that allows the specification of a list of extractors. | |
115 | * | |
116 | * @param extractorGroup the group of extractors to be applied. | |
117 | */ | |
118 | public Any23(ExtractorGroup extractorGroup) { | |
119 | 0 | this(DefaultConfiguration.singleton(), extractorGroup); |
120 | 0 | } |
121 | ||
122 | /** | |
123 | * Constructor that allows the specification of a | |
124 | * custom configuration and of list of extractor names. | |
125 | * | |
126 | * @param extractorNames list of extractor's names. | |
127 | */ | |
128 | public Any23(Configuration configuration, String... extractorNames) { | |
129 | 0 | this( |
130 | configuration, | |
131 | extractorNames == null | |
132 | ? | |
133 | null | |
134 | : | |
135 | ExtractorRegistry.getInstance().getExtractorGroup( Arrays.asList(extractorNames)) | |
136 | ); | |
137 | 0 | } |
138 | ||
139 | /** | |
140 | * Constructor that allows the specification of a list of extractor names. | |
141 | * | |
142 | * @param extractorNames list of extractor's names. | |
143 | */ | |
144 | public Any23(String... extractorNames) { | |
145 | 0 | this( DefaultConfiguration.singleton(), extractorNames ); |
146 | 0 | } |
147 | ||
148 | /** | |
149 | * Constructor accepting {@link Configuration}. | |
150 | */ | |
151 | public Any23(Configuration configuration) { | |
152 | 0 | this(configuration, (String[]) null); |
153 | 0 | } |
154 | ||
155 | /** | |
156 | * Constructor with default configuration. | |
157 | */ | |
158 | public Any23() { | |
159 | 0 | this( DefaultConfiguration.singleton() ); |
160 | 0 | } |
161 | ||
162 | /** | |
163 | * Sets the <i>HTTP Header User Agent</i>, | |
164 | * see <i>RFC 2616-14.43</i>. | |
165 | * | |
166 | * @param userAgent text describing the user agent. | |
167 | */ | |
168 | public void setHTTPUserAgent(String userAgent) { | |
169 | 0 | if (httpClientInitialized) { |
170 | 0 | throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized"); |
171 | } | |
172 | 0 | if(userAgent == null) { |
173 | 0 | userAgent = defaultUserAgent; |
174 | } | |
175 | 0 | if(userAgent.trim().length() == 0) { |
176 | 0 | throw new IllegalArgumentException( String.format("Invalid user agent: '%s'", userAgent) ); |
177 | } | |
178 | 0 | this.userAgent = userAgent; |
179 | 0 | } |
180 | ||
181 | /** | |
182 | * Returns the <i>HTTP Header User Agent</i>, | |
183 | * see <i>RFC 2616-14.43</i>. | |
184 | * | |
185 | * @return text describing the user agent. | |
186 | */ | |
187 | public String getHTTPUserAgent() { | |
188 | 0 | return this.userAgent; |
189 | } | |
190 | ||
191 | /** | |
192 | * Allows to set the {@link org.apache.any23.http.HTTPClient} implementation | |
193 | * used to retrieve contents. The default instance is {@link org.apache.any23.http.DefaultHTTPClient}. | |
194 | * | |
195 | * @param httpClient a valid client instance. | |
196 | * @throws IllegalStateException if invoked after client has been initialized. | |
197 | */ | |
198 | public void setHTTPClient(HTTPClient httpClient) { | |
199 | 0 | if(httpClient == null) { |
200 | 0 | throw new NullPointerException("httpClient cannot be null."); |
201 | } | |
202 | 0 | if (httpClientInitialized) { |
203 | 0 | throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized"); |
204 | } | |
205 | 0 | this.httpClient = httpClient; |
206 | 0 | } |
207 | ||
208 | /** | |
209 | * Returns the current {@link org.apache.any23.http.HTTPClient} implementation. | |
210 | * | |
211 | * @return instance of HTTPClient. | |
212 | * @throws IOException if the HTTP client has not initialized. | |
213 | */ | |
214 | public HTTPClient getHTTPClient() throws IOException { | |
215 | 0 | if (!httpClientInitialized) { |
216 | 0 | if (userAgent == null) { |
217 | 0 | throw new IOException("Must call " + Any23.class.getSimpleName() + |
218 | ".setHTTPUserAgent(String) before extracting from HTTP URI"); | |
219 | } | |
220 | 0 | httpClient.init( new HTTPClientConfiguration() { |
221 | public String getUserAgent() { | |
222 | 0 | return userAgent; |
223 | } | |
224 | public String getAcceptHeader() { | |
225 | 0 | return Any23.this.getAcceptHeader(); |
226 | } | |
227 | public int getDefaultTimeout() { | |
228 | 0 | return configuration.getPropertyIntOrFail("any23.http.client.timeout"); |
229 | } | |
230 | public int getMaxConnections() { | |
231 | 0 | return configuration.getPropertyIntOrFail("any23.http.client.max.connections"); |
232 | } | |
233 | } ); | |
234 | 0 | httpClientInitialized = true; |
235 | } | |
236 | 0 | return httpClient; |
237 | } | |
238 | ||
239 | /** | |
240 | * Allows to set a {@link org.apache.any23.source.LocalCopyFactory} instance. | |
241 | * | |
242 | * @param cache valid cache instance. | |
243 | */ | |
244 | public void setCacheFactory(LocalCopyFactory cache) { | |
245 | 0 | if(cache == null) { |
246 | 0 | throw new NullPointerException("cache cannot be null."); |
247 | } | |
248 | 0 | this.streamCache = cache; |
249 | 0 | } |
250 | ||
251 | /** | |
252 | * Allows to set an instance of {@link org.apache.any23.mime.MIMETypeDetector}. | |
253 | * | |
254 | * @param detector a valid detector instance, if <code>null</code> all the detectors | |
255 | * will be used. | |
256 | */ | |
257 | public void setMIMETypeDetector(MIMETypeDetector detector) { | |
258 | 0 | this.mimeTypeDetector = detector; |
259 | 0 | } |
260 | ||
261 | /** | |
262 | * Returns the most appropriate {@link DocumentSource} for the given<code>documentURI</code>. | |
263 | * | |
264 | * @param documentURI the document <i>URI</i>. | |
265 | * @return a new instance of DocumentSource. | |
266 | * @throws URISyntaxException if an error occurs while parsing the <code>documentURI</code> as a <i>URI</i>. | |
267 | * @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}. | |
268 | */ | |
269 | public DocumentSource createDocumentSource(String documentURI) throws URISyntaxException, IOException { | |
270 | 0 | if(documentURI == null) throw new NullPointerException("documentURI cannot be null."); |
271 | 0 | if (documentURI.toLowerCase().startsWith("file:")) { |
272 | 0 | return new FileDocumentSource( new File(new URI(documentURI)) ); |
273 | } | |
274 | 0 | if (documentURI.toLowerCase().startsWith("http:") || documentURI.toLowerCase().startsWith("https:")) { |
275 | 0 | return new HTTPDocumentSource(getHTTPClient(), documentURI); |
276 | } | |
277 | 0 | throw new IllegalArgumentException( |
278 | String.format("Unsupported protocol for document URI: '%s' .", documentURI) | |
279 | ); | |
280 | } | |
281 | ||
282 | ||
283 | /** | |
284 | * Performs metadata extraction from the content of the given | |
285 | * <code>in</code> document source, sending the generated events | |
286 | * to the specified <code>outputHandler</code>. | |
287 | * | |
288 | * @param eps the extraction parameters to be applied. | |
289 | * @param in the input document source. | |
290 | * @param outputHandler handler responsible for collecting of the extracted metadata. | |
291 | * @param encoding explicit encoding see | |
292 | * <a href="http://www.iana.org/assignments/character-sets">available encodings</a>. | |
293 | * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise. | |
294 | * @throws IOException | |
295 | * @throws org.apache.any23.extractor.ExtractionException | |
296 | */ | |
297 | public ExtractionReport extract( | |
298 | ExtractionParameters eps, | |
299 | DocumentSource in, | |
300 | TripleHandler outputHandler, | |
301 | String encoding | |
302 | ) throws IOException, ExtractionException { | |
303 | 0 | final SingleDocumentExtraction ex = new SingleDocumentExtraction(configuration, in, factories, outputHandler); |
304 | 0 | ex.setMIMETypeDetector(mimeTypeDetector); |
305 | 0 | ex.setLocalCopyFactory(streamCache); |
306 | 0 | ex.setParserEncoding(encoding); |
307 | 0 | final SingleDocumentExtractionReport sder = ex.run(eps); |
308 | 0 | return new ExtractionReport( |
309 | ex.getMatchingExtractors(), | |
310 | ex.getParserEncoding(), | |
311 | ex.getDetectedMIMEType(), | |
312 | sder.getValidationReport(), | |
313 | sder.getExtractorToErrors() | |
314 | ); | |
315 | } | |
316 | ||
317 | /** | |
318 | * Performs metadata extraction on the <code>in</code> string | |
319 | * associated to the <code>documentURI</code> URI, declaring | |
320 | * <code>contentType</code> and <code>encoding</code>. | |
321 | * The generated events are sent to the specified <code>outputHandler</code>. | |
322 | * | |
323 | * @param in raw data to be analyzed. | |
324 | * @param documentURI URI from which the raw data has been extracted. | |
325 | * @param contentType declared data content type. | |
326 | * @param encoding declared data encoding. | |
327 | * @param outputHandler handler responsible for collecting of the extracted metadata. | |
328 | * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise. | |
329 | * @throws IOException | |
330 | * @throws ExtractionException | |
331 | */ | |
332 | public ExtractionReport extract( | |
333 | String in, | |
334 | String documentURI, | |
335 | String contentType, | |
336 | String encoding, | |
337 | TripleHandler outputHandler | |
338 | ) throws IOException, ExtractionException { | |
339 | 0 | return extract(new StringDocumentSource(in, documentURI, contentType, encoding), outputHandler); |
340 | } | |
341 | ||
342 | /** | |
343 | * Performs metadata extraction on the <code>in</code> string | |
344 | * associated to the <code>documentURI</code> URI, sending the generated | |
345 | * events to the specified <code>outputHandler</code>. | |
346 | * | |
347 | * @param in raw data to be analyzed. | |
348 | * @param documentURI URI from which the raw data has been extracted. | |
349 | * @param outputHandler handler responsible for collecting of the extracted metadata. | |
350 | * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise. | |
351 | * @throws IOException | |
352 | * @throws ExtractionException | |
353 | */ | |
354 | public ExtractionReport extract(String in, String documentURI, TripleHandler outputHandler) | |
355 | throws IOException, ExtractionException { | |
356 | 0 | return extract(new StringDocumentSource(in, documentURI), outputHandler); |
357 | } | |
358 | ||
359 | /** | |
360 | * Performs metadata extraction from the content of the given <code>file</code> | |
361 | * sending the generated events to the specified <code>outputHandler</code>. | |
362 | * | |
363 | * @param file file containing raw data. | |
364 | * @param outputHandler handler responsible for collecting of the extracted metadata. | |
365 | * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise. | |
366 | * @throws IOException | |
367 | * @throws ExtractionException | |
368 | */ | |
369 | public ExtractionReport extract(File file, TripleHandler outputHandler) | |
370 | throws IOException, ExtractionException { | |
371 | 0 | return extract(new FileDocumentSource(file), outputHandler); |
372 | } | |
373 | ||
374 | /** | |
375 | * Performs metadata extraction from the content of the given <code>documentURI</code> | |
376 | * sending the generated events to the specified <code>outputHandler</code>. | |
377 | * If the <i>URI</i> is replied with a redirect, the last will be followed. | |
378 | * | |
379 | * @param eps the parameters to be applied to the extraction. | |
380 | * @param documentURI the URI from which retrieve document. | |
381 | * @param outputHandler handler responsible for collecting of the extracted metadata. | |
382 | * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise. | |
383 | * @throws IOException | |
384 | * @throws ExtractionException | |
385 | */ | |
386 | public ExtractionReport extract(ExtractionParameters eps, String documentURI, TripleHandler outputHandler) | |
387 | throws IOException, ExtractionException { | |
388 | try { | |
389 | 0 | return extract(eps, createDocumentSource(documentURI), outputHandler); |
390 | 0 | } catch (URISyntaxException ex) { |
391 | 0 | throw new ExtractionException("Error while extracting data from document URI.", ex); |
392 | } | |
393 | } | |
394 | ||
395 | /** | |
396 | * Performs metadata extraction from the content of the given <code>documentURI</code> | |
397 | * sending the generated events to the specified <code>outputHandler</code>. | |
398 | * If the <i>URI</i> is replied with a redirect, the last will be followed. | |
399 | * | |
400 | * @param documentURI the URI from which retrieve document. | |
401 | * @param outputHandler handler responsible for collecting of the extracted metadata. | |
402 | * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise. | |
403 | * @throws IOException | |
404 | * @throws ExtractionException | |
405 | */ | |
406 | public ExtractionReport extract(String documentURI, TripleHandler outputHandler) | |
407 | throws IOException, ExtractionException { | |
408 | 0 | return extract((ExtractionParameters) null, documentURI, outputHandler); |
409 | } | |
410 | ||
411 | /** | |
412 | * Performs metadata extraction from the content of the given | |
413 | * <code>in</code> document source, sending the generated events | |
414 | * to the specified <code>outputHandler</code>. | |
415 | * | |
416 | * @param in the input document source. | |
417 | * @param outputHandler handler responsible for collecting of the extracted metadata. | |
418 | * @param encoding explicit encoding see | |
419 | * <a href="http://www.iana.org/assignments/character-sets">available encodings</a>. | |
420 | * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise. | |
421 | * @throws IOException | |
422 | * @throws ExtractionException | |
423 | */ | |
424 | public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler, String encoding) | |
425 | throws IOException, ExtractionException { | |
426 | 0 | return extract(null, in, outputHandler, encoding); |
427 | } | |
428 | ||
429 | /** | |
430 | * Performs metadata extraction from the content of the given | |
431 | * <code>in</code> document source, sending the generated events | |
432 | * to the specified <code>outputHandler</code>. | |
433 | * | |
434 | * @param in the input document source. | |
435 | * @param outputHandler handler responsible for collecting of the extracted metadata. | |
436 | * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise. | |
437 | * @throws IOException | |
438 | * @throws ExtractionException | |
439 | */ | |
440 | public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler) | |
441 | throws IOException, ExtractionException { | |
442 | 0 | return extract(null, in, outputHandler, null); |
443 | } | |
444 | ||
445 | /** | |
446 | * Performs metadata extraction from the content of the given | |
447 | * <code>in</code> document source, sending the generated events | |
448 | * to the specified <code>outputHandler</code>. | |
449 | * | |
450 | * @param eps the parameters to be applied for the extraction phase. | |
451 | * @param in the input document source. | |
452 | * @param outputHandler handler responsible for collecting of the extracted metadata. | |
453 | * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise. | |
454 | * @throws IOException | |
455 | * @throws ExtractionException | |
456 | */ | |
457 | public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler) | |
458 | throws IOException, ExtractionException { | |
459 | 0 | return extract(eps, in, outputHandler, null); |
460 | } | |
461 | ||
462 | private String getAcceptHeader() { | |
463 | 0 | Collection<MIMEType> mimeTypes = new ArrayList<MIMEType>(); |
464 | 0 | for (ExtractorFactory<?> factory : factories) { |
465 | 0 | mimeTypes.addAll(factory.getSupportedMIMETypes()); |
466 | } | |
467 | 0 | return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader(); |
468 | } | |
469 | ||
470 | } |