Coverage Report

Coverage Report - org.apache.any23.extractor.html.SpeciesExtractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

SpeciesExtractor

0/36

0/6

1.333

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.html;
 
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.SimpleExtractorFactory;
 import org.apache.any23.extractor.TagSoupExtractionResult;
 import org.apache.any23.rdf.PopularPrefixes;
 import org.apache.any23.vocab.WO;
 import org.openrdf.model.BNode;
 import org.openrdf.model.Resource;
 import org.openrdf.model.URI;
 import org.openrdf.model.vocabulary.RDF;
 import org.w3c.dom.Node;
 
 import java.util.Arrays;
 
 /**
  * Extractor able to extract the <a href="http://microformats.org/wiki/species">Species Microformat</a>.
  * The data are represented using the
  * <a href="http://www.bbc.co.uk/ontologies/wildlife/2010-02-22.shtml">BBC Wildlife Ontology</a>.
  *
  * @see org.apache.any23.vocab.WO
  * @author Davide Palmisano (dpalmisano@gmail.com)
  */
 public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
 
     private static final WO vWO = WO.getInstance();
 
     private static final String[] classes = {
             "kingdom",
             "division",
             "phylum",
             "order",
             "family",
             "genus",
             "species",
             "class",
     };
 
     public final static ExtractorFactory<SpeciesExtractor> factory =
             SimpleExtractorFactory.create(
                     "html-mf-species",
                     PopularPrefixes.createSubset("rdf", "wo"),
                     Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
                     "example-mf-species.html",
                     SpeciesExtractor.class
             );
 
     /**
      * Returns the description of this extractor.
      *
      * @return a human readable description.
      */
     @Override
     public ExtractorDescription getDescription() {
         return factory;
     }
 
     /**
      * Returns the base class name for the extractor.
      *
      * @return a string containing the base of the extractor.
      */
     @Override
     protected String getBaseClassName() {
         return "biota";
     }
 
     /**
      * Resets the internal status of the extractor to prepare it to a new extraction section.
      */
     @Override
     protected void resetExtractor() {
         // empty
     }
 
     /**
      * Extracts an entity from a <i>DOM</i> node.
      *
      * @param node the DOM node.
      * @param out  the extraction result collector.
      * @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise.
      * @throws org.apache.any23.extractor.ExtractionException
      *
      */
     @Override
     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
         BNode biota = getBlankNodeFor(node);
         conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
 
         final HTMLDocument fragment = new HTMLDocument(node);
         addNames(fragment, biota);
         addClasses(fragment, biota);
 
         final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
         tser.addResourceRoot(
                 DomUtils.getXPathListForNode(node),
                 biota,
                 this.getClass()
         );
 
         return true;
     }
 
     private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
         HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
         conditionallyAddStringProperty(
                 binomial.source(), biota, vWO.scientificName, binomial.value()
         );
         HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
         conditionallyAddStringProperty(
                 vernacular.source(), biota, vWO.speciesName, vernacular.value()
         );
     }
 
     private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
         for (String clazz : classes) {
             HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
             conditionallyAddStringProperty(
                     classTextField.source(), biota, resolvePropertyName(clazz), classTextField.value());
         }
     }
 
     private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
         for(String clazz : classes) {
             HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
             if(classTextField.source() != null) {
                 BNode classBNode = getBlankNodeFor(classTextField.source());
                 addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
                 conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
                 HTMLDocument fragment = new HTMLDocument(classTextField.source());
                 addClassesName(fragment, classBNode);
             }
         }
     }
 
     private URI resolvePropertyName(String clazz) {
         return vWO.getProperty(
                 String.format(
                         "%sName",
                         clazz
                 )
         );
     }
 
     private URI resolveClassName(String clazz) {
         String upperCaseClass = clazz.substring(0, 1);
         return vWO.getClass(
                 String.format("%s%s",
                         upperCaseClass.toUpperCase(),
                         clazz.substring(1)
                 )
         );
     }
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.html;
19
20		import org.apache.any23.extractor.ExtractionException;
21		import org.apache.any23.extractor.ExtractionResult;
22		import org.apache.any23.extractor.ExtractorDescription;
23		import org.apache.any23.extractor.ExtractorFactory;
24		import org.apache.any23.extractor.SimpleExtractorFactory;
25		import org.apache.any23.extractor.TagSoupExtractionResult;
26		import org.apache.any23.rdf.PopularPrefixes;
27		import org.apache.any23.vocab.WO;
28		import org.openrdf.model.BNode;
29		import org.openrdf.model.Resource;
30		import org.openrdf.model.URI;
31		import org.openrdf.model.vocabulary.RDF;
32		import org.w3c.dom.Node;
33
34		import java.util.Arrays;
35
36		/**
37		* Extractor able to extract the <a href="http://microformats.org/wiki/species">Species Microformat</a>.
38		* The data are represented using the
39		* <a href="http://www.bbc.co.uk/ontologies/wildlife/2010-02-22.shtml">BBC Wildlife Ontology</a>.
40		*
41		* @see org.apache.any23.vocab.WO
42		* @author Davide Palmisano (dpalmisano@gmail.com)
43		*/
44	0	public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
45
46	0	private static final WO vWO = WO.getInstance();
47
48	0	private static final String[] classes = {
49		"kingdom",
50		"division",
51		"phylum",
52		"order",
53		"family",
54		"genus",
55		"species",
56		"class",
57		};
58
59	0	public final static ExtractorFactory<SpeciesExtractor> factory =
60		SimpleExtractorFactory.create(
61		"html-mf-species",
62		PopularPrefixes.createSubset("rdf", "wo"),
63		Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
64		"example-mf-species.html",
65		SpeciesExtractor.class
66		);
67
68		/**
69		* Returns the description of this extractor.
70		*
71		* @return a human readable description.
72		*/
73		@Override
74		public ExtractorDescription getDescription() {
75	0	return factory;
76		}
77
78		/**
79		* Returns the base class name for the extractor.
80		*
81		* @return a string containing the base of the extractor.
82		*/
83		@Override
84		protected String getBaseClassName() {
85	0	return "biota";
86		}
87
88		/**
89		* Resets the internal status of the extractor to prepare it to a new extraction section.
90		*/
91		@Override
92		protected void resetExtractor() {
93		// empty
94	0	}
95
96		/**
97		* Extracts an entity from a <i>DOM</i> node.
98		*
99		* @param node the DOM node.
100		* @param out the extraction result collector.
101		* @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise.
102		* @throws org.apache.any23.extractor.ExtractionException
103		*
104		*/
105		@Override
106		protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
107	0	BNode biota = getBlankNodeFor(node);
108	0	conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
109
110	0	final HTMLDocument fragment = new HTMLDocument(node);
111	0	addNames(fragment, biota);
112	0	addClasses(fragment, biota);
113
114	0	final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
115	0	tser.addResourceRoot(
116		DomUtils.getXPathListForNode(node),
117		biota,
118		this.getClass()
119		);
120
121	0	return true;
122		}
123
124		private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
125	0	HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
126	0	conditionallyAddStringProperty(
127		binomial.source(), biota, vWO.scientificName, binomial.value()
128		);
129	0	HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
130	0	conditionallyAddStringProperty(
131		vernacular.source(), biota, vWO.speciesName, vernacular.value()
132		);
133	0	}
134
135		private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
136	0	for (String clazz : classes) {
137	0	HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
138	0	conditionallyAddStringProperty(
139		classTextField.source(), biota, resolvePropertyName(clazz), classTextField.value());
140		}
141	0	}
142
143		private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
144	0	for(String clazz : classes) {
145	0	HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
146	0	if(classTextField.source() != null) {
147	0	BNode classBNode = getBlankNodeFor(classTextField.source());
148	0	addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
149	0	conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
150	0	HTMLDocument fragment = new HTMLDocument(classTextField.source());
151	0	addClassesName(fragment, classBNode);
152		}
153		}
154	0	}
155
156		private URI resolvePropertyName(String clazz) {
157	0	return vWO.getProperty(
158		String.format(
159		"%sName",
160		clazz
161		)
162		);
163		}
164
165		private URI resolveClassName(String clazz) {
166	0	String upperCaseClass = clazz.substring(0, 1);
167	0	return vWO.getClass(
168		String.format("%s%s",
169		upperCaseClass.toUpperCase(),
170		clazz.substring(1)
171		)
172		);
173		}
174		}