Coverage Report - org.apache.any23.extractor.html.HResumeExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
HResumeExtractor
0%
0/77
0%
0/26
2.083
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionResult;
 21  
 import org.apache.any23.extractor.ExtractorDescription;
 22  
 import org.apache.any23.extractor.ExtractorFactory;
 23  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 24  
 import org.apache.any23.extractor.TagSoupExtractionResult;
 25  
 import org.apache.any23.rdf.PopularPrefixes;
 26  
 import org.apache.any23.vocab.DOAC;
 27  
 import org.apache.any23.vocab.FOAF;
 28  
 import org.openrdf.model.BNode;
 29  
 import org.openrdf.model.Resource;
 30  
 import org.openrdf.model.vocabulary.RDF;
 31  
 import org.w3c.dom.Node;
 32  
 
 33  
 import java.util.Arrays;
 34  
 import java.util.List;
 35  
 
 36  
 /**
 37  
  * Extractor for the <a href="http://microformats.org/wiki/hresume">hResume</a>
 38  
  * microformat.
 39  
  *
 40  
  * @author Gabriele Renzi
 41  
  */
 42  0
 public class HResumeExtractor extends EntityBasedMicroformatExtractor {
 43  
 
 44  0
     private static final FOAF vFOAF = FOAF.getInstance();
 45  0
     private static final DOAC vDOAC = DOAC.getInstance();
 46  
 
 47  0
     public final static ExtractorFactory<HResumeExtractor> factory =
 48  
             SimpleExtractorFactory.create(
 49  
                     "html-mf-hresume",
 50  
                     PopularPrefixes.createSubset("rdf", "doac", "foaf"),
 51  
                     Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
 52  
                     "example-mf-hresume.html",
 53  
                     HResumeExtractor.class
 54  
             );
 55  
 
 56  
     public ExtractorDescription getDescription() {
 57  0
         return factory;
 58  
     }
 59  
 
 60  
     public String getBaseClassName() {
 61  0
         return "hresume";
 62  
     }
 63  
 
 64  
     @Override
 65  
     protected void resetExtractor() {
 66  
         // Empty.
 67  0
     }
 68  
 
 69  
     @Override
 70  
     protected boolean extractEntity(Node node, ExtractionResult out) {
 71  0
         if (null == node) return false;
 72  0
         BNode person = getBlankNodeFor(node);
 73  
         // we have a person, at least
 74  0
         out.writeTriple(person, RDF.TYPE, vFOAF.Person);
 75  0
         final HTMLDocument fragment = new HTMLDocument(node);
 76  0
         addSummary(fragment, person);
 77  0
         addContact(fragment, person);
 78  0
         addExperiences(fragment, person);
 79  0
         addEducations(fragment, person);
 80  0
         addAffiliations(fragment, person);
 81  0
         addSkills(fragment, person);
 82  
 
 83  0
         final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
 84  0
         tser.addResourceRoot(
 85  
                 DomUtils.getXPathListForNode(node),
 86  
                 person,
 87  
                 this.getClass()
 88  
         );
 89  
 
 90  0
         return true;
 91  
     }
 92  
 
 93  
     private void addSummary(HTMLDocument doc, Resource person) {
 94  0
         HTMLDocument.TextField summary = doc.getSingularTextField("summary");
 95  0
         conditionallyAddStringProperty(
 96  
                 summary.source(),
 97  
                 person,
 98  
                 vDOAC.summary,
 99  
                 summary.value()
 100  
         );
 101  0
     }
 102  
 
 103  
     private void addContact(HTMLDocument doc, Resource person) {
 104  0
         List<Node> nodes = doc.findAllByClassName("contact");
 105  0
         if (nodes.size() > 0)
 106  0
             addBNodeProperty(
 107  
                     nodes.get(0),
 108  
                     person, vFOAF.isPrimaryTopicOf, getBlankNodeFor(nodes.get(0))
 109  
             );
 110  0
     }
 111  
 
 112  
     private void addExperiences(HTMLDocument doc, Resource person) {
 113  0
         List<Node> nodes = doc.findAllByClassName("experience");
 114  0
         for (Node node : nodes) {
 115  0
             BNode exp = valueFactory.createBNode();
 116  0
             if (addExperience(exp, new HTMLDocument(node)))
 117  0
             addBNodeProperty(
 118  
                     node,
 119  
                     person, vDOAC.experience, exp
 120  
             );
 121  0
         }
 122  0
     }
 123  
 
 124  
     private boolean addExperience(Resource exp, HTMLDocument document) {
 125  0
         final Node documentNode    = document.getDocument();
 126  0
         String check = "";
 127  
 
 128  0
         HTMLDocument.TextField value = document.getSingularTextField("title");
 129  0
         check += value;
 130  0
         conditionallyAddStringProperty(value.source(), exp, vDOAC.title, value.value().trim());
 131  
 
 132  0
         value = document.getSingularTextField("dtstart");
 133  0
         check += value;
 134  0
         conditionallyAddStringProperty(documentNode, exp, vDOAC.start_date, value.value().trim());
 135  
 
 136  0
         value = document.getSingularTextField("dtend");
 137  0
         check += value;
 138  0
         conditionallyAddStringProperty(documentNode, exp, vDOAC.end_date, value.value().trim());
 139  
 
 140  0
         value = document.getSingularTextField("summary");
 141  0
         check += value;
 142  0
         conditionallyAddStringProperty(documentNode, exp, vDOAC.organization, value.value().trim());
 143  
 
 144  0
         return !"".equals(check);
 145  
     }
 146  
 
 147  
     private void addEducations(HTMLDocument doc, Resource person) {
 148  0
         List<Node> nodes = doc.findAllByClassName("education");
 149  0
         for (Node node : nodes) {
 150  0
             BNode exp = valueFactory.createBNode();
 151  0
             if (addExperience(exp, new HTMLDocument(node)))
 152  0
             addBNodeProperty(
 153  
                     node,
 154  
                     person, vDOAC.education, exp
 155  
             );
 156  0
         }
 157  0
     }
 158  
 
 159  
     private void addAffiliations(HTMLDocument doc, Resource person) {
 160  0
         List<Node> nodes = doc.findAllByClassName("affiliation");
 161  0
         for (Node node : nodes) {
 162  0
             addBNodeProperty(
 163  
                     node,
 164  
                     person, vDOAC.affiliation, getBlankNodeFor(node)
 165  
             );
 166  
         }
 167  0
     }
 168  
 
 169  
     private void addSkills(HTMLDocument doc, Resource person) {
 170  
         List<Node> nodes;
 171  
 
 172  
         // Extracting data from single node.
 173  0
         nodes = doc.findAllByClassName("skill");
 174  0
         for (Node node : nodes) {
 175  0
             conditionallyAddStringProperty(
 176  
                     node,
 177  
                     person, vDOAC.skill, extractSkillValue(node)
 178  
             );
 179  
         }
 180  
         // Extracting from enlisting node.
 181  0
         nodes = doc.findAllByClassName("skills");
 182  0
         for(Node node : nodes) {
 183  0
             String nodeText = node.getTextContent();
 184  0
             String[] skills = nodeText.split(",");
 185  0
             for(String skill : skills) {
 186  0
                 conditionallyAddStringProperty(
 187  
                         node,
 188  
                         person, vDOAC.skill, skill.trim()
 189  
                 );
 190  
             }
 191  0
         }
 192  0
     }
 193  
 
 194  
     private String extractSkillValue(Node n) {
 195  0
         String name = n.getNodeName();
 196  0
         String skill = null;
 197  0
         if ("A".equals(name) && DomUtils.hasAttribute(n, "rel", "tag")) {
 198  0
             skill = n.getAttributes().getNamedItem("href").getTextContent();
 199  
         } else {
 200  0
             skill = n.getTextContent();
 201  
         }
 202  0
         return skill;
 203  
     }
 204  
 
 205  
 }