Coverage Report - org.apache.any23.extractor.html.SpeciesExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
SpeciesExtractor
0%
0/36
0%
0/6
1.333
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionException;
 21  
 import org.apache.any23.extractor.ExtractionResult;
 22  
 import org.apache.any23.extractor.ExtractorDescription;
 23  
 import org.apache.any23.extractor.ExtractorFactory;
 24  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 25  
 import org.apache.any23.extractor.TagSoupExtractionResult;
 26  
 import org.apache.any23.rdf.PopularPrefixes;
 27  
 import org.apache.any23.vocab.WO;
 28  
 import org.openrdf.model.BNode;
 29  
 import org.openrdf.model.Resource;
 30  
 import org.openrdf.model.URI;
 31  
 import org.openrdf.model.vocabulary.RDF;
 32  
 import org.w3c.dom.Node;
 33  
 
 34  
 import java.util.Arrays;
 35  
 
 36  
 /**
 37  
  * Extractor able to extract the <a href="http://microformats.org/wiki/species">Species Microformat</a>.
 38  
  * The data are represented using the
 39  
  * <a href="http://www.bbc.co.uk/ontologies/wildlife/2010-02-22.shtml">BBC Wildlife Ontology</a>.
 40  
  *
 41  
  * @see org.apache.any23.vocab.WO
 42  
  * @author Davide Palmisano (dpalmisano@gmail.com)
 43  
  */
 44  0
 public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
 45  
 
 46  0
     private static final WO vWO = WO.getInstance();
 47  
 
 48  0
     private static final String[] classes = {
 49  
             "kingdom",
 50  
             "division",
 51  
             "phylum",
 52  
             "order",
 53  
             "family",
 54  
             "genus",
 55  
             "species",
 56  
             "class",
 57  
     };
 58  
 
 59  0
     public final static ExtractorFactory<SpeciesExtractor> factory =
 60  
             SimpleExtractorFactory.create(
 61  
                     "html-mf-species",
 62  
                     PopularPrefixes.createSubset("rdf", "wo"),
 63  
                     Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
 64  
                     "example-mf-species.html",
 65  
                     SpeciesExtractor.class
 66  
             );
 67  
 
 68  
     /**
 69  
      * Returns the description of this extractor.
 70  
      *
 71  
      * @return a human readable description.
 72  
      */
 73  
     @Override
 74  
     public ExtractorDescription getDescription() {
 75  0
         return factory;
 76  
     }
 77  
 
 78  
     /**
 79  
      * Returns the base class name for the extractor.
 80  
      *
 81  
      * @return a string containing the base of the extractor.
 82  
      */
 83  
     @Override
 84  
     protected String getBaseClassName() {
 85  0
         return "biota";
 86  
     }
 87  
 
 88  
     /**
 89  
      * Resets the internal status of the extractor to prepare it to a new extraction section.
 90  
      */
 91  
     @Override
 92  
     protected void resetExtractor() {
 93  
         // empty
 94  0
     }
 95  
 
 96  
     /**
 97  
      * Extracts an entity from a <i>DOM</i> node.
 98  
      *
 99  
      * @param node the DOM node.
 100  
      * @param out  the extraction result collector.
 101  
      * @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise.
 102  
      * @throws org.apache.any23.extractor.ExtractionException
 103  
      *
 104  
      */
 105  
     @Override
 106  
     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
 107  0
         BNode biota = getBlankNodeFor(node);
 108  0
         conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
 109  
 
 110  0
         final HTMLDocument fragment = new HTMLDocument(node);
 111  0
         addNames(fragment, biota);
 112  0
         addClasses(fragment, biota);
 113  
 
 114  0
         final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
 115  0
         tser.addResourceRoot(
 116  
                 DomUtils.getXPathListForNode(node),
 117  
                 biota,
 118  
                 this.getClass()
 119  
         );
 120  
 
 121  0
         return true;
 122  
     }
 123  
 
 124  
     private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
 125  0
         HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
 126  0
         conditionallyAddStringProperty(
 127  
                 binomial.source(), biota, vWO.scientificName, binomial.value()
 128  
         );
 129  0
         HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
 130  0
         conditionallyAddStringProperty(
 131  
                 vernacular.source(), biota, vWO.speciesName, vernacular.value()
 132  
         );
 133  0
     }
 134  
 
 135  
     private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
 136  0
         for (String clazz : classes) {
 137  0
             HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
 138  0
             conditionallyAddStringProperty(
 139  
                     classTextField.source(), biota, resolvePropertyName(clazz), classTextField.value());
 140  
         }
 141  0
     }
 142  
 
 143  
     private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
 144  0
         for(String clazz : classes) {
 145  0
             HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
 146  0
             if(classTextField.source() != null) {
 147  0
                 BNode classBNode = getBlankNodeFor(classTextField.source());
 148  0
                 addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
 149  0
                 conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
 150  0
                 HTMLDocument fragment = new HTMLDocument(classTextField.source());
 151  0
                 addClassesName(fragment, classBNode);
 152  
             }
 153  
         }
 154  0
     }
 155  
 
 156  
     private URI resolvePropertyName(String clazz) {
 157  0
         return vWO.getProperty(
 158  
                 String.format(
 159  
                         "%sName",
 160  
                         clazz
 161  
                 )
 162  
         );
 163  
     }
 164  
 
 165  
     private URI resolveClassName(String clazz) {
 166  0
         String upperCaseClass = clazz.substring(0, 1);
 167  0
         return vWO.getClass(
 168  
                 String.format("%s%s",
 169  
                         upperCaseClass.toUpperCase(),
 170  
                         clazz.substring(1)
 171  
                 )
 172  
         );
 173  
     }
 174  
 }