Coverage Report

Coverage Report - org.apache.any23.extractor.Extractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

Extractor

N/A

Extractor$BlindExtractor

N/A

Extractor$ContentExtractor

N/A

Extractor$TagSoupDOMExtractor

N/A

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor;
 
 import org.openrdf.model.URI;
 import org.w3c.dom.Document;
 
 import java.io.IOException;
 import java.io.InputStream;
 
 /**
  * It defines the signature of a generic Extractor.
  *
  * @param <Input> the type of the input data to be processed.
  */
 public interface Extractor<Input> {
 
     /**
      * This interface specializes an {@link Extractor} able to handle
      * {@link java.net.URI} as input format. Use it if you need to fetch a document before the extraction
      */
     public interface BlindExtractor extends Extractor<URI> {
     }
 
     /**
      * This interface specializes an {@link Extractor} able to handle
      * {@link java.io.InputStream} as input format.
      */
     public interface ContentExtractor extends Extractor<InputStream> {
         
         /**
          * If <code>true</code>, the extractor will stop at first parsing error,
          * if<code>false</code> the extractor will attempt to ignore all parsing errors.
          *
          * @param f tolerance flag.
          */
         void setStopAtFirstError(boolean f);
 
     }
 
     /**
      * This interface specializes an {@link Extractor} able to handle
      * {@link org.w3c.dom.Document} as input format.
      */
     public interface TagSoupDOMExtractor extends Extractor<Document> {
     }
 
     /**
      * Executes the extractor. Will be invoked only once, extractors are
      * not reusable.
      *
      * @param extractionParameters the parameters to be applied during the extraction.
      * @param context The document context.
      * @param in The extractor input data.
      * @param out the collector for the extracted data.
      * @throws IOException On error while reading from the input stream.
      * @throws ExtractionException On other error, such as parse errors.
      */
     void run(ExtractionParameters extractionParameters, ExtractionContext context, Input in, ExtractionResult out)
     throws IOException, ExtractionException;
 
     /**
      * Returns a {@link ExtractorDescription} of this extractor.
      *
      * @return the object representing the extractor description.
      */
     ExtractorDescription getDescription();
 
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor;
19
20		import org.openrdf.model.URI;
21		import org.w3c.dom.Document;
22
23		import java.io.IOException;
24		import java.io.InputStream;
25
26		/**
27		* It defines the signature of a generic Extractor.
28		*
29		* @param <Input> the type of the input data to be processed.
30		*/
31		public interface Extractor<Input> {
32
33		/**
34		* This interface specializes an {@link Extractor} able to handle
35		* {@link org.openrdf.model.URI} as input format. Use it if you need to fetch a document before the extraction.
36		*/
37		public interface BlindExtractor extends Extractor<URI> {
38		}
39
40		/**
41		* This interface specializes an {@link Extractor} able to handle
42		* {@link java.io.InputStream} as input format.
43		*/
44		public interface ContentExtractor extends Extractor<InputStream> {
45
46		/**
47		* If <code>true</code>, the extractor will stop at first parsing error,
48		* if <code>false</code> the extractor will attempt to ignore all parsing errors.
49		*
50		* @param f tolerance flag.
51		*/
52		void setStopAtFirstError(boolean f);
53
54		}
55
56		/**
57		* This interface specializes an {@link Extractor} able to handle
58		* {@link org.w3c.dom.Document} as input format.
59		*/
60		public interface TagSoupDOMExtractor extends Extractor<Document> {
61		}
62
63		/**
64		* Executes the extractor. Will be invoked only once, extractors are
65		* not reusable.
66		*
67		* @param extractionParameters the parameters to be applied during the extraction.
68		* @param context The document context.
69		* @param in The extractor input data.
70		* @param out the collector for the extracted data.
71		* @throws IOException On error while reading from the input stream.
72		* @throws ExtractionException On other error, such as parse errors.
73		*/
74		void run(ExtractionParameters extractionParameters, ExtractionContext context, Input in, ExtractionResult out)
75		throws IOException, ExtractionException;
76
77		/**
78		* Returns a {@link ExtractorDescription} of this extractor.
79		*
80		* @return the object representing the extractor description.
81		*/
82		ExtractorDescription getDescription();
83
84		}