Coverage Report - org.apache.any23.extractor.html.HCardExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
HCardExtractor
0%
0/184
0%
0/78
2.88
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionException;
 21  
 import org.apache.any23.extractor.ExtractionResult;
 22  
 import org.apache.any23.extractor.ExtractorDescription;
 23  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 24  
 import org.apache.any23.extractor.TagSoupExtractionResult;
 25  
 import org.apache.any23.extractor.html.annotations.Includes;
 26  
 import org.apache.any23.rdf.PopularPrefixes;
 27  
 import org.apache.any23.vocab.VCARD;
 28  
 import org.apache.commons.lang.StringUtils;
 29  
 import org.apache.any23.extractor.ExtractorFactory;
 30  
 import org.openrdf.model.BNode;
 31  
 import org.openrdf.model.Resource;
 32  
 import org.openrdf.model.URI;
 33  
 import org.openrdf.model.vocabulary.RDF;
 34  
 import org.w3c.dom.NamedNodeMap;
 35  
 import org.w3c.dom.Node;
 36  
 
 37  
 import java.util.ArrayList;
 38  
 import java.util.Arrays;
 39  
 import java.util.Collection;
 40  
 import java.util.List;
 41  
 
 42  
 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
 43  
 
 44  
 
 45  
 /**
 46  
  * Extractor for the <a href="http://microformats.org/wiki/hcard">hCard</a>
 47  
  * microformat.
 48  
  *
 49  
  * @author Gabriele Renzi
 50  
  */
 51  
 @Includes( extractors = AdrExtractor.class )
 52  0
 public class HCardExtractor extends EntityBasedMicroformatExtractor {
 53  
 
 54  0
     private static final VCARD vCARD = VCARD.getInstance();
 55  
 
 56  0
     private HCardName name = new HCardName();
 57  
 
 58  
     private HTMLDocument fragment;
 59  
 
 60  0
     public final static ExtractorFactory<HCardExtractor> factory =
 61  
             SimpleExtractorFactory.create(
 62  
                     "html-mf-hcard",
 63  
                     PopularPrefixes.createSubset("rdf", "vcard"),
 64  
                     Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
 65  
                     "example-mf-hcard.html",
 66  
                     HCardExtractor.class
 67  
             );
 68  
 
 69  
     public ExtractorDescription getDescription() {
 70  0
         return factory;
 71  
     }
 72  
 
 73  
     @Override
 74  
     protected String getBaseClassName() {
 75  0
         return "vcard";
 76  
     }
 77  
 
 78  
     @Override
 79  
     protected void resetExtractor() {
 80  0
         name.reset(); // Cleanup of the HCardName content.
 81  0
     }
 82  
 
 83  
     private void fixIncludes(HTMLDocument document, Node node) {
 84  0
         NamedNodeMap attributes = node.getAttributes();
 85  
         // header case test 32
 86  0
         if ("TD".equals(node.getNodeName()) && (null != attributes.getNamedItem("headers"))) {
 87  0
             String id = attributes.getNamedItem("headers").getNodeValue();
 88  0
             Node header = document.findNodeById(id);
 89  0
             if (null != header) {
 90  0
                 node.appendChild(header.cloneNode(true));
 91  0
                 attributes.removeNamedItem("headers");
 92  
             }
 93  
         }
 94  
         // include pattern, test 31
 95  
 
 96  0
         for (Node current : document.findAll("//*[@class]")) {
 97  0
             if (!DomUtils.hasClassName(current, "include")) continue;
 98  
             // we have to remove the field soon to avoid infinite loops
 99  
             // no null check, we know it's there or we won't be in the loop
 100  0
             current.getAttributes().removeNamedItem("class");
 101  0
             ArrayList<TextField> res = new ArrayList<TextField>();
 102  0
             HTMLDocument.readUrlField(res, current);
 103  0
             TextField id = res.get(0);
 104  0
             if (null == id)
 105  0
                 continue;
 106  0
             id = new TextField( StringUtils.substringAfter(id.value(), "#"), id.source() );
 107  0
             Node included = document.findNodeById(id.value());
 108  0
             if (null == included)
 109  0
                 continue;
 110  0
             current.appendChild(included.cloneNode(true));
 111  0
         }
 112  0
     }
 113  
 
 114  
     @Override
 115  
     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
 116  0
         this.fragment = new HTMLDocument(node);
 117  0
         fixIncludes(getHTMLDocument(), node);
 118  0
         final BNode card = getBlankNodeFor(node);
 119  0
         boolean foundSomething = false;
 120  
 
 121  0
         readFn();
 122  0
         readNames();
 123  0
         readOrganization();
 124  0
         foundSomething |= addFn(card);
 125  0
         foundSomething |= addNames(card);
 126  0
         foundSomething |= addOrganizationName(card);
 127  0
         foundSomething |= addStringProperty("sort-string", card, vCARD.sort_string);
 128  0
         foundSomething |= addUrl(card);
 129  0
         foundSomething |= addEmail(card);
 130  0
         foundSomething |= addPhoto(card);
 131  0
         foundSomething |= addLogo(card);
 132  0
         foundSomething |= addUid(card);
 133  0
         foundSomething |= addClass(card);
 134  0
         foundSomething |= addStringProperty("bday", card, vCARD.bday);
 135  0
         foundSomething |= addStringProperty("rev", card, vCARD.rev);
 136  0
         foundSomething |= addStringProperty("tz", card, vCARD.tz);
 137  0
         foundSomething |= addCategory(card);
 138  0
         foundSomething |= addStringProperty("card", card, vCARD.class_);
 139  0
         foundSomething |= addSubMicroformat("adr", card, vCARD.adr);
 140  0
         foundSomething |= addTelephones(card);
 141  0
         foundSomething |= addStringProperty("title", card, vCARD.title);
 142  0
         foundSomething |= addStringProperty("role", card, vCARD.role);
 143  0
         foundSomething |= addStringMultiProperty("note", card, vCARD.note);
 144  0
         foundSomething |= addSubMicroformat("geo", card, vCARD.geo);
 145  
 
 146  0
         if (!foundSomething) return false;
 147  0
         out.writeTriple(card, RDF.TYPE, vCARD.VCard);
 148  
 
 149  0
         final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
 150  0
         tser.addResourceRoot( DomUtils.getXPathListForNode(node), card, this.getClass() );
 151  
 
 152  0
         return true;
 153  
     }
 154  
 
 155  
     private boolean addTelephones(Resource card) {
 156  0
         boolean found = false;
 157  0
         for (Node node : fragment.findAll(".//*[contains(@class,'tel')]")) {
 158  0
             HTMLDocument telFragment = new HTMLDocument(node);
 159  0
             TextField[] values = telFragment.getPluralUrlField("value");
 160  0
             if (values.length == 0) {
 161  
                 //no sub values
 162  0
                 String[] typeAndValue = telFragment.getSingularUrlField("tel").value().split(":");
 163  
                 //modem:goo fax:foo tel:bar
 164  0
                 if (typeAndValue.length > 1) {
 165  0
                     found |= addTel(card, "tel", typeAndValue[1]);
 166  
                 } else {
 167  0
                     found |= addTel(card, "tel", typeAndValue[0]);
 168  
                 }
 169  0
             } else {
 170  0
                 final String[] valuesStr = new String[values.length];
 171  0
                 for(int i = 0; i < values.length; i++) {
 172  0
                     valuesStr[i] = values[i].value();
 173  
                 }
 174  0
                 HTMLDocument.TextField[] types = telFragment.getPluralTextField("type");
 175  0
                 if (types.length == 0) {
 176  0
                     found |= addTel(card, "tel", StringUtils.join(valuesStr));
 177  
                 }
 178  0
                 for (HTMLDocument.TextField type : types) {
 179  0
                     found |= addTel(card, type.value(), StringUtils.join(valuesStr));
 180  
                 }
 181  
             }
 182  0
         }
 183  0
         return found;
 184  
     }
 185  
 
 186  
     private boolean addTel(Resource card, String type, String value) {
 187  0
         URI tel = super.fixLink(value, "tel");
 188  0
         URI composed = vCARD.getProperty(type + "Tel", null);
 189  0
         if (composed == null) {
 190  0
             URI simple = vCARD.getProperty(type, null);
 191  0
             if (simple == null) {
 192  0
                 return conditionallyAddResourceProperty(card, vCARD.tel, tel);
 193  
             }
 194  0
             return conditionallyAddResourceProperty(card, simple, tel);
 195  
         }
 196  0
         return conditionallyAddResourceProperty(card, composed, tel);
 197  
     }
 198  
 
 199  
     private boolean addSubMicroformat(String className, Resource resource, URI property) {
 200  0
         List<Node> nodes = fragment.findAllByClassName(className);
 201  0
         if (nodes.isEmpty()) return false;
 202  0
         for (Node node : nodes) {
 203  0
             addBNodeProperty(
 204  
                     node,
 205  
                     resource, property, getBlankNodeFor(node)
 206  
             );
 207  
         }
 208  0
         return true;
 209  
     }
 210  
 
 211  
     private boolean addStringProperty(String className, Resource resource, URI property) {
 212  0
         final HTMLDocument.TextField textField = fragment.getSingularTextField(className);
 213  0
         return conditionallyAddStringProperty(
 214  
                 textField.source(),
 215  
                 resource, property, textField.value()
 216  
         );
 217  
     }
 218  
 
 219  
     /**
 220  
      * Adds a property that can be associated to multiple values.
 221  
      *
 222  
      * @param className
 223  
      * @param resource
 224  
      * @param property
 225  
      * @return <code>true</code> if the multi property has been added, <code>false</code> otherwise.
 226  
      */
 227  
     private boolean addStringMultiProperty(String className, Resource resource, URI property) {
 228  0
         HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
 229  0
         boolean found = false;
 230  0
         final String extractorName = getDescription().getExtractorName();
 231  0
         for(HTMLDocument.TextField field : fields) {
 232  0
             found |= conditionallyAddStringProperty(
 233  
                     field.source(),
 234  
                     resource, property, field.value()
 235  
             );
 236  
         }
 237  0
         return found;
 238  
     }
 239  
 
 240  
     private boolean addCategory(Resource card) {
 241  0
         HTMLDocument.TextField[] categories = fragment.getPluralTextField("category");
 242  0
         boolean found = false;
 243  0
         for (HTMLDocument.TextField category : categories) {
 244  0
             found |= conditionallyAddStringProperty(
 245  
                     category.source(),
 246  
                     card, vCARD.category, category.value()
 247  
             );
 248  
         }
 249  0
         return found;
 250  
     }
 251  
 
 252  
     private boolean addUid(Resource card) {
 253  0
         TextField uid = fragment.getSingularUrlField("uid");
 254  0
         return conditionallyAddStringProperty(
 255  
                 fragment.getDocument(),
 256  
                 card, vCARD.uid, uid.value()
 257  
         );
 258  
     }
 259  
 
 260  
     private boolean addClass(Resource card) {
 261  0
         TextField class_ = fragment.getSingularUrlField("class");
 262  0
         return conditionallyAddStringProperty(
 263  
                 fragment.getDocument(),
 264  
                 card, vCARD.class_, class_.value()
 265  
         );
 266  
     }
 267  
 
 268  
     private boolean addLogo(Resource card) throws ExtractionException {
 269  0
         TextField[] links = fragment.getPluralUrlField("logo");
 270  0
         boolean found = false;
 271  0
         for (TextField link : links) {
 272  0
             found |= conditionallyAddResourceProperty(
 273  
                     card, vCARD.logo, getHTMLDocument().resolveURI(link.value())
 274  
             );
 275  
         }
 276  0
         return found;
 277  
     }
 278  
 
 279  
     private boolean addPhoto(Resource card) throws ExtractionException {
 280  0
         TextField[] links = fragment.getPluralUrlField("photo");
 281  0
         boolean found = false;
 282  0
         for (TextField link : links) {
 283  0
             found |= conditionallyAddResourceProperty(
 284  
                     card, vCARD.photo, getHTMLDocument().resolveURI(link.value())
 285  
             );
 286  
         }
 287  0
         return found;
 288  
     }
 289  
 
 290  
     private boolean addEmail(Resource card) {
 291  0
         String email = dropSubject(fragment.getSingularUrlField("email").value());
 292  0
         return conditionallyAddResourceProperty(
 293  
                 card,
 294  
                 vCARD.email,
 295  
                 fixLink(email, "mailto")
 296  
         );
 297  
     }
 298  
 
 299  
     private String dropSubject(String mail) {
 300  0
         if (mail == null) return null;
 301  0
         return mail.split("\\?")[0];
 302  
     }
 303  
 
 304  
     private void readNames() {
 305  0
         for (String field : HCardName.FIELDS) {
 306  0
             HTMLDocument.TextField[] values = fragment.getPluralTextField(field);
 307  0
             for (HTMLDocument.TextField text : values) {
 308  0
                 if ("".equals(text.value())) continue;
 309  0
                 name.setField(field, text);
 310  
             }
 311  
         }
 312  0
     }
 313  
 
 314  
     private void addFieldTriple(Node n, BNode bn, String fieldName, String fieldValue) {
 315  0
         conditionallyAddLiteralProperty(
 316  
                 n, bn, vCARD.getProperty(fieldName), valueFactory.createLiteral(fieldValue)
 317  
         );
 318  0
     }
 319  
 
 320  
     private boolean addNames(Resource card) {
 321  0
         BNode n = valueFactory.createBNode();
 322  0
         addBNodeProperty(
 323  
                 this.fragment.getDocument(),
 324  
                 card, vCARD.n, n
 325  
         );
 326  0
         addURIProperty(n, RDF.TYPE, vCARD.Name);
 327  
 
 328  0
         for (String fieldName : HCardName.FIELDS) {
 329  0
             if (!name.containsField(fieldName)) {
 330  0
                 continue;
 331  
             }
 332  0
             if (name.isMultiField(fieldName)) {
 333  0
                 Collection<HTMLDocument.TextField> values = name.getFields(fieldName);
 334  0
                 for(TextField value : values) {
 335  0
                     addFieldTriple(
 336  
                             value.source(),
 337  
                             n, fieldName, value.value()
 338  
                     );
 339  
                 }
 340  0
             } else {
 341  0
                 TextField value =  name.getField(fieldName);
 342  0
                 if(value == null) { continue; }
 343  0
                 addFieldTriple(
 344  
                         value.source(),
 345  
                         n, fieldName, value.value()
 346  
                 );
 347  
             }
 348  
         }
 349  0
         return true;
 350  
     }
 351  
 
 352  
     private void readFn() {
 353  0
         name.setFullName(fragment.getSingularTextField("fn"));
 354  0
     }
 355  
 
 356  
     private boolean addFn(Resource card) {
 357  0
         final TextField fullNameTextField = name.getFullName();
 358  0
         if(fullNameTextField == null) {
 359  0
             return false;
 360  
         }
 361  0
         return conditionallyAddStringProperty(
 362  
                 fullNameTextField.source(),
 363  
                 card, vCARD.fn, fullNameTextField.value()
 364  
         );
 365  
     }
 366  
 
 367  
     private void readOrganization() {
 368  0
         Node node = fragment.findMicroformattedObjectNode("*", "org");
 369  0
         if (node == null) return;
 370  0
         HTMLDocument doc = new HTMLDocument(node);
 371  0
         String nodeText = doc.getText();
 372  0
         if(nodeText != null) {
 373  0
             name.setOrganization( new HTMLDocument.TextField(nodeText, node) );
 374  
         }
 375  0
         nodeText = doc.getSingularTextField("organization-name").value();
 376  0
         if(nodeText == null || "".equals(nodeText) ) {
 377  0
             nodeText = HTMLDocument.readTextField(node).value();
 378  
         }
 379  0
         name.setOrganization( new TextField(nodeText, node) );
 380  
 
 381  0
         name.setOrganizationUnit(doc.getSingularTextField("organization-unit"));
 382  0
     }
 383  
 
 384  
     private boolean addOrganizationName(Resource card) {
 385  0
         if (name.getOrganization() == null) return false;
 386  0
         BNode org = valueFactory.createBNode();
 387  0
         final String extractorName =  getDescription().getExtractorName();
 388  0
         addBNodeProperty(
 389  
                 this.fragment.getDocument(),
 390  
                 card, vCARD.org, org
 391  
         );
 392  0
         addURIProperty(org, RDF.TYPE, vCARD.Organization);
 393  0
         final TextField organizationTextField = name.getOrganization();
 394  0
         conditionallyAddLiteralProperty(
 395  
                 organizationTextField.source(),
 396  
                 org, vCARD.organization_name, valueFactory.createLiteral( organizationTextField.value() )
 397  
         );
 398  0
         final TextField organizationUnitTextField = name.getOrganizationUnit();
 399  0
         if(organizationUnitTextField != null) {
 400  0
             conditionallyAddStringProperty(
 401  
                     organizationUnitTextField.source(),
 402  
                     org, vCARD.organization_unit, organizationUnitTextField.value()
 403  
             );
 404  
         }
 405  0
         return true;
 406  
     }
 407  
 
 408  
     private boolean addUrl(Resource card) throws ExtractionException {
 409  0
         TextField[] links = fragment.getPluralUrlField("url");
 410  0
         boolean found = false;
 411  0
         for (TextField link : links) {
 412  0
             found |= conditionallyAddResourceProperty(card, vCARD.url, getHTMLDocument().resolveURI(link.value()));
 413  
         }
 414  0
         return found;
 415  
     }
 416  
 
 417  
 }