Coverage Report

Coverage Report - org.apache.any23.extractor.html.EntityBasedMicroformatExtractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

EntityBasedMicroformatExtractor

0/14

0/2

1.2

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.html;
 
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.rdf.RDFUtils;
 import org.openrdf.model.BNode;
 import org.w3c.dom.Node;
 
 import java.util.List;
 
 /**
  * Base class for microformat extractors based on entities.
  *
  * @author Gabriele Renzi
  */
 public abstract class EntityBasedMicroformatExtractor extends MicroformatExtractor {
 
     /**
      * Returns the base class name for the extractor.
      *
      * @return a string containing the base of the extractor.
      */
     protected abstract String getBaseClassName();
 
     /**
      * Resets the internal status of the extractor to prepare it to a new extraction section.
      */
     protected abstract void resetExtractor();
 
     /**
      * Extracts an entity from a <i>DOM</i> node.
      *
      * @param node the DOM node.
      * @param out the extraction result collector.
      * @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise.
      * @throws ExtractionException
      */
     protected abstract boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException;
 
     @Override
     public boolean extract() throws ExtractionException {
         List<Node> nodes = DomUtils.findAllByClassName( getHTMLDocument().getDocument(), getBaseClassName());
         boolean foundAny = false;
         int count = 1;
         for (Node node : nodes) {
             resetExtractor();
             String contextID = Integer.toString(count);
             ExtractionResult subResult = openSubResult( getExtractionContext().copy(contextID) );
             foundAny |= extractEntity(node, subResult);
             subResult.close();
             count++;
         }
         return foundAny;
     }
 
     /**
      * @param node a DOM node representing a blank node
      * @return an RDF blank node corresponding to that DOM node, by using a
      *         blank node ID like "MD5 of http://doc-uri/#xpath/to/node"
      */
     protected BNode getBlankNodeFor(Node node) {
         return RDFUtils.getBNode(getDocumentURI() + "#" + DomUtils.getXPathForNode(node));
     }
     
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.html;
19
20		import org.apache.any23.extractor.ExtractionException;
21		import org.apache.any23.extractor.ExtractionResult;
22		import org.apache.any23.rdf.RDFUtils;
23		import org.openrdf.model.BNode;
24		import org.w3c.dom.Node;
25
26		import java.util.List;
27
28		/**
29		* Base class for microformat extractors based on entities.
30		*
31		* @author Gabriele Renzi
32		*/
33	0	public abstract class EntityBasedMicroformatExtractor extends MicroformatExtractor {
34
35		/**
36		* Returns the base class name for the extractor.
37		*
38		* @return a string containing the base of the extractor.
39		*/
40		protected abstract String getBaseClassName();
41
42		/**
43		* Resets the internal status of the extractor to prepare it to a new extraction section.
44		*/
45		protected abstract void resetExtractor();
46
47		/**
48		* Extracts an entity from a <i>DOM</i> node.
49		*
50		* @param node the DOM node.
51		* @param out the extraction result collector.
52		* @return <code>true</code> if the extraction has produced something, <code>false</code> otherwise.
53		* @throws ExtractionException
54		*/
55		protected abstract boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException;
56
57		@Override
58		public boolean extract() throws ExtractionException {
59	0	List<Node> nodes = DomUtils.findAllByClassName( getHTMLDocument().getDocument(), getBaseClassName());
60	0	boolean foundAny = false;
61	0	int count = 1;
62	0	for (Node node : nodes) {
63	0	resetExtractor();
64	0	String contextID = Integer.toString(count);
65	0	ExtractionResult subResult = openSubResult( getExtractionContext().copy(contextID) );
66	0	foundAny |= extractEntity(node, subResult);
67	0	subResult.close();
68	0	count++;
69	0	}
70	0	return foundAny;
71		}
72
73		/**
74		* @param node a DOM node representing a blank node
75		* @return an RDF blank node corresponding to that DOM node, by using a
76		* blank node ID like "MD5 of http://doc-uri/#xpath/to/node"
77		*/
78		protected BNode getBlankNodeFor(Node node) {
79	0	return RDFUtils.getBNode(getDocumentURI() + "#" + DomUtils.getXPathForNode(node));
80		}
81
82		}