Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
EntityBasedMicroformatExtractor |
|
| 1.2;1.2 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.any23.extractor.html; | |
19 | ||
20 | import org.apache.any23.extractor.ExtractionException; | |
21 | import org.apache.any23.extractor.ExtractionResult; | |
22 | import org.apache.any23.rdf.RDFUtils; | |
23 | import org.openrdf.model.BNode; | |
24 | import org.w3c.dom.Node; | |
25 | ||
26 | import java.util.List; | |
27 | ||
28 | /** | |
29 | * Base class for microformat extractors based on entities. | |
30 | * | |
31 | * @author Gabriele Renzi | |
32 | */ | |
33 | 0 | public abstract class EntityBasedMicroformatExtractor extends MicroformatExtractor { |
34 | ||
35 | /** | |
36 | * Returns the base class name for the extractor. | |
37 | * | |
38 | * @return a string containing the base of the extractor. | |
39 | */ | |
40 | protected abstract String getBaseClassName(); | |
41 | ||
42 | /** | |
43 | * Resets the internal status of the extractor to prepare it to a new extraction section. | |
44 | */ | |
45 | protected abstract void resetExtractor(); | |
46 | ||
47 | /** | |
48 | * Extracts an entity from a <i>DOM</i> node. | |
49 | * | |
50 | * @param node the DOM node. | |
51 | * @param out the extraction result collector. | |
52 | * @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise. | |
53 | * @throws ExtractionException | |
54 | */ | |
55 | protected abstract boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException; | |
56 | ||
57 | @Override | |
58 | public boolean extract() throws ExtractionException { | |
59 | 0 | List<Node> nodes = DomUtils.findAllByClassName( getHTMLDocument().getDocument(), getBaseClassName()); |
60 | 0 | boolean foundAny = false; |
61 | 0 | int count = 1; |
62 | 0 | for (Node node : nodes) { |
63 | 0 | resetExtractor(); |
64 | 0 | String contextID = Integer.toString(count); |
65 | 0 | ExtractionResult subResult = openSubResult( getExtractionContext().copy(contextID) ); |
66 | 0 | foundAny |= extractEntity(node, subResult); |
67 | 0 | subResult.close(); |
68 | 0 | count++; |
69 | 0 | } |
70 | 0 | return foundAny; |
71 | } | |
72 | ||
73 | /** | |
74 | * @param node a DOM node representing a blank node | |
75 | * @return an RDF blank node corresponding to that DOM node, by using a | |
76 | * blank node ID like "MD5 of http://doc-uri/#xpath/to/node" | |
77 | */ | |
78 | protected BNode getBlankNodeFor(Node node) { | |
79 | 0 | return RDFUtils.getBNode(getDocumentURI() + "#" + DomUtils.getXPathForNode(node)); |
80 | } | |
81 | ||
82 | } |