Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
Extractor |
|
| 1.0;1 | ||||
Extractor$BlindExtractor |
|
| 1.0;1 | ||||
Extractor$ContentExtractor |
|
| 1.0;1 | ||||
Extractor$TagSoupDOMExtractor |
|
| 1.0;1 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.any23.extractor; | |
19 | ||
20 | import org.openrdf.model.URI; | |
21 | import org.w3c.dom.Document; | |
22 | ||
23 | import java.io.IOException; | |
24 | import java.io.InputStream; | |
25 | ||
26 | /** | |
27 | * Defines the signature of a generic extractor. | |
28 | * | |
29 | * @param <Input> the type of the input data to be processed. | |
30 | */ | |
31 | public interface Extractor<Input> { | |
32 | ||
33 | /** | |
34 | * This interface specializes an {@link Extractor} able to handle | |
35 | * {@link org.openrdf.model.URI} as input format. Use it if you need to fetch a document before the extraction. | |
36 | */ | |
37 | public interface BlindExtractor extends Extractor<URI> { | |
38 | } | |
39 | ||
40 | /** | |
41 | * This interface specializes an {@link Extractor} able to handle | |
42 | * {@link java.io.InputStream} as input format. | |
43 | */ | |
44 | public interface ContentExtractor extends Extractor<InputStream> { | |
45 | ||
46 | /** | |
47 | * If <code>true</code>, the extractor will stop at the first parsing error; | |
48 | * if <code>false</code>, the extractor will attempt to ignore all parsing errors. | |
49 | * | |
50 | * @param f <code>true</code> to stop at the first parsing error, <code>false</code> to ignore parsing errors. | |
51 | */ | |
52 | void setStopAtFirstError(boolean f); | |
53 | ||
54 | } | |
55 | ||
56 | /** | |
57 | * This interface specializes an {@link Extractor} able to handle | |
58 | * {@link org.w3c.dom.Document} as input format. | |
59 | */ | |
60 | public interface TagSoupDOMExtractor extends Extractor<Document> { | |
61 | } | |
62 | ||
63 | /** | |
64 | * Executes the extractor. Will be invoked only once: extractors are | |
65 | * not reusable. | |
66 | * | |
67 | * @param extractionParameters the parameters to be applied during the extraction. | |
68 | * @param context the document context. | |
69 | * @param in the extractor input data. | |
70 | * @param out the collector for the extracted data. | |
71 | * @throws IOException on error while reading from the input stream. | |
72 | * @throws ExtractionException on other errors, such as parse errors. | |
73 | */ | |
74 | void run(ExtractionParameters extractionParameters, ExtractionContext context, Input in, ExtractionResult out) | |
75 | throws IOException, ExtractionException; | |
76 | ||
77 | /** | |
78 | * Returns the {@link ExtractorDescription} of this extractor. | |
79 | * | |
80 | * @return the object representing the extractor description. | |
81 | */ | |
82 | ExtractorDescription getDescription(); | |
83 | ||
84 | } |