Coverage Report - org.apache.any23.extractor.html.HListingExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
HListingExtractor
0%
0/117
0%
0/26
1.682
HListingExtractor$1
0%
0/11
N/A
1.682
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionException;
 21  
 import org.apache.any23.extractor.ExtractionResult;
 22  
 import org.apache.any23.extractor.ExtractorDescription;
 23  
 import org.apache.any23.extractor.ExtractorFactory;
 24  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 25  
 import org.apache.any23.extractor.TagSoupExtractionResult;
 26  
 import org.apache.any23.rdf.PopularPrefixes;
 27  
 import org.apache.any23.vocab.FOAF;
 28  
 import org.apache.any23.vocab.HLISTING;
 29  
 import org.openrdf.model.BNode;
 30  
 import org.openrdf.model.Resource;
 31  
 import org.openrdf.model.URI;
 32  
 import org.openrdf.model.vocabulary.RDF;
 33  
 import org.w3c.dom.Node;
 34  
 
 35  
 import java.util.ArrayList;
 36  
 import java.util.Arrays;
 37  
 import java.util.HashSet;
 38  
 import java.util.List;
 39  
 import java.util.Set;
 40  
 
 41  
 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
 42  
 
 43  
 /**
 44  
  * Extractor for the <a href="http://microformats.org/wiki/hlisting">hListing</a>
 45  
  * microformat.
 46  
  *
 47  
  * @author Gabriele Renzi
 48  
  */
 49  0
 public class HListingExtractor extends EntityBasedMicroformatExtractor {
 50  
 
 51  0
     private static final HLISTING hLISTING = HLISTING.getInstance();
 52  0
     private static final FOAF foaf     = FOAF.getInstance();
 53  
 
 54  0
     private static final Set<String> ActionClasses = new HashSet<String>() {
 55  
         {
 56  0
             add("sell"    );
 57  0
             add("rent"    );
 58  0
             add("trade"   );
 59  0
             add("meet"    );
 60  0
             add("announce");
 61  0
             add("offer"   );
 62  0
             add("wanted"  );
 63  0
             add("event"   );
 64  0
             add("service" );
 65  0
         }
 66  
     };
 67  
 
 68  0
     private static final List<String> validClassesForAddress = Arrays.asList(
 69  
             "post-office-box",
 70  
             "extended-address",
 71  
             "street-address",
 72  
             "locality",
 73  
             "region",
 74  
             "postal-code",
 75  
             "country-name"
 76  
     );
 77  
 
 78  
     private HTMLDocument fragment;
 79  
 
 80  0
     public final static ExtractorFactory<HListingExtractor> factory =
 81  
             SimpleExtractorFactory.create(
 82  
                     "html-mf-hlisting",
 83  
                     PopularPrefixes.createSubset("rdf", "hlisting"),
 84  
                     Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
 85  
                     "example-mf-hlisting.html",
 86  
                     HListingExtractor.class
 87  
             );
 88  
 
 89  
     public ExtractorDescription getDescription() {
 90  0
         return factory;
 91  
     }
 92  
 
 93  
     protected String getBaseClassName() {
 94  0
         return "hlisting";
 95  
     }
 96  
 
 97  
     @Override
 98  
     protected void resetExtractor() {
 99  
         // Empty.
 100  0
     }
 101  
 
 102  
     @Override
 103  
     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
 104  0
         this.fragment = new HTMLDocument(node);
 105  0
         BNode listing = getBlankNodeFor(node);
 106  0
         out.writeTriple(listing, RDF.TYPE, hLISTING.Listing);
 107  
 
 108  0
         for (String action : findActions(fragment)) {
 109  0
             out.writeTriple(listing, hLISTING.action, hLISTING.getClass(action));
 110  
         }
 111  0
         out.writeTriple(listing, hLISTING.lister, addLister() );
 112  0
         addItem(listing);
 113  0
         addDateTimes(listing);
 114  0
         addPrice(listing);
 115  0
         addDescription(listing);
 116  0
         addSummary(listing);
 117  0
         addPermalink(listing);
 118  
 
 119  0
         final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
 120  0
         tser.addResourceRoot(
 121  
                 DomUtils.getXPathListForNode(node),
 122  
                 listing,
 123  
                 this.getClass()
 124  
         );
 125  
 
 126  0
         return true;
 127  
     }
 128  
 
 129  
     private void addItem(Resource listing) throws ExtractionException {
 130  0
         Node node = fragment.findMicroformattedObjectNode("*", "item");
 131  0
         if (null == node) return;
 132  0
         BNode blankItem = valueFactory.createBNode();
 133  0
         addBNodeProperty(
 134  
                 node,
 135  
                 listing, hLISTING.item, blankItem
 136  
         );
 137  0
         addURIProperty(blankItem, RDF.TYPE, hLISTING.Item);
 138  
 
 139  0
         HTMLDocument item = new HTMLDocument(node);
 140  
 
 141  0
         addItemName(item, blankItem);
 142  0
         addItemUrl(item, blankItem);
 143  
         // the format is specified with photo into item, but kelkoo has it into the top level
 144  0
         addItemPhoto(fragment, blankItem);
 145  0
         addItemAddresses(fragment, blankItem);
 146  0
     }
 147  
 
 148  
     private void addItemAddresses(HTMLDocument doc, Resource blankItem) {
 149  0
         final String extractorName = getDescription().getExtractorName();
 150  0
         for (Node node : doc.findAll(".//*[contains(@class,'adr')]//*[@class]")) {
 151  0
             String[] klasses = node.getAttributes().getNamedItem("class").getNodeValue().split("\\s+");
 152  0
             for (String klass : klasses)
 153  0
                 if (validClassesForAddress.contains(klass)) {
 154  0
                     String value = node.getNodeValue();
 155  
                     // do not use conditionallyAdd, it won't work cause of evaluation rules
 156  0
                     if (!(null == value || "".equals(value))) {
 157  0
                         URI property = hLISTING.getPropertyCamelCase(klass);
 158  0
                         conditionallyAddLiteralProperty(
 159  
                                 node,
 160  
                                 blankItem, property, valueFactory.createLiteral(value)
 161  
                         );
 162  
                     }
 163  
                 }
 164  0
         }
 165  0
     }
 166  
 
 167  
     private void addPermalink(Resource listing) {
 168  0
         String link = fragment.find(".//A[contains(@rel,'self') and contains(@rel,'bookmark')]/@href");
 169  0
         conditionallyAddStringProperty(
 170  
                 fragment.getDocument(),
 171  
                 listing, hLISTING.permalink, link
 172  
         );
 173  0
     }
 174  
 
 175  
     private void addPrice(Resource listing) {
 176  0
         TextField price = fragment.getSingularTextField("price");
 177  0
         conditionallyAddStringProperty(
 178  
                 price.source(),
 179  
                 listing, hLISTING.price, price.value()
 180  
         );
 181  0
     }
 182  
 
 183  
     private void addDescription(Resource listing) {
 184  0
         TextField description = fragment.getSingularTextField("description");
 185  0
         conditionallyAddStringProperty(
 186  
                 description.source(),
 187  
                 listing, hLISTING.description, description.value()
 188  
         );
 189  0
     }
 190  
 
 191  
     private void addSummary(Resource listing) {
 192  0
         TextField summary = fragment.getSingularTextField("summary");
 193  0
         conditionallyAddStringProperty(
 194  
                 summary.source(),
 195  
                 listing, hLISTING.summary, summary.value()
 196  
         );
 197  0
     }
 198  
 
 199  
     private void addDateTimes(Resource listing) {
 200  0
         TextField listed = fragment.getSingularTextField("dtlisted");
 201  0
         conditionallyAddStringProperty(
 202  
                 listed.source(),
 203  
                 listing, hLISTING.dtlisted, listed.value()
 204  
         );
 205  0
         HTMLDocument.TextField expired = fragment.getSingularTextField("dtexpired");
 206  0
         conditionallyAddStringProperty(
 207  
                 expired.source(),
 208  
                 listing, hLISTING.dtexpired, expired.value()
 209  
         );
 210  0
     }
 211  
 
 212  
     private Resource addLister() throws ExtractionException {
 213  0
         Resource blankLister = valueFactory.createBNode();
 214  0
         addURIProperty(blankLister, RDF.TYPE, hLISTING.Lister);
 215  0
         Node node = fragment.findMicroformattedObjectNode("*", "lister");
 216  0
         if (null == node)
 217  0
             return blankLister;
 218  0
         HTMLDocument listerNode = new HTMLDocument(node);
 219  0
         addListerFn(listerNode, blankLister);
 220  0
         addListerOrg(listerNode, blankLister);
 221  0
         addListerEmail(listerNode, blankLister);
 222  0
         addListerUrl(listerNode, blankLister);
 223  0
         addListerTel(listerNode, blankLister);
 224  0
         addListerLogo(listerNode, blankLister);
 225  0
         return blankLister;
 226  
     }
 227  
 
 228  
     private void addListerTel(HTMLDocument doc, Resource blankLister) {
 229  0
         HTMLDocument.TextField tel = doc.getSingularTextField("tel");
 230  0
         conditionallyAddStringProperty(
 231  
                 tel.source(),
 232  
                 blankLister, hLISTING.tel, tel.value()
 233  
         );
 234  0
     }
 235  
 
 236  
     private void addListerUrl(HTMLDocument doc, Resource blankLister) throws ExtractionException {
 237  0
         TextField url = doc.getSingularUrlField("url");
 238  0
         conditionallyAddResourceProperty(blankLister, hLISTING.listerUrl, getHTMLDocument().resolveURI(url.value()));
 239  0
     }
 240  
 
 241  
     private void addListerEmail(HTMLDocument doc, Resource blankLister) {
 242  0
         TextField email = doc.getSingularUrlField("email");
 243  0
         conditionallyAddResourceProperty(blankLister, foaf.mbox, fixLink(email.value(), "mailto"));
 244  0
     }
 245  
 
 246  
     private void addListerFn(HTMLDocument doc, Resource blankLister) {
 247  0
         TextField fn = doc.getSingularTextField("fn");
 248  0
         conditionallyAddStringProperty(
 249  
                 fn.source(),
 250  
                 blankLister, hLISTING.listerName, fn.value()
 251  
         );
 252  0
     }
 253  
 
 254  
     private void addListerLogo(HTMLDocument doc, Resource blankLister) throws ExtractionException {
 255  0
         TextField logo = doc.getSingularUrlField("logo");
 256  0
         conditionallyAddResourceProperty(blankLister, hLISTING.listerLogo, getHTMLDocument().resolveURI(logo.value()));
 257  0
     }
 258  
 
 259  
     private void addListerOrg(HTMLDocument doc, Resource blankLister) {
 260  0
         TextField org = doc.getSingularTextField("org");
 261  0
         conditionallyAddStringProperty(
 262  
                 org.source(),
 263  
                 blankLister, hLISTING.listerOrg, org.value()
 264  
         );
 265  0
     }
 266  
 
 267  
     private void addItemName(HTMLDocument item, Resource blankItem) {
 268  0
         HTMLDocument.TextField fn = item.getSingularTextField("fn");
 269  0
         conditionallyAddStringProperty(
 270  
                 fn.source(),
 271  
                 blankItem, hLISTING.itemName, fn.value()
 272  
         );
 273  0
     }
 274  
 
 275  
     private void addItemUrl(HTMLDocument item, Resource blankItem) throws ExtractionException {
 276  0
         TextField url = item.getSingularUrlField("url");
 277  0
         conditionallyAddResourceProperty(blankItem, hLISTING.itemUrl, getHTMLDocument().resolveURI(url.value()));
 278  0
     }
 279  
 
 280  
     private void addItemPhoto(HTMLDocument doc, Resource blankLister) throws ExtractionException {
 281  
         // as per spec
 282  0
         String url = doc.findMicroformattedValue("*", "item", "A", "photo", "@href");
 283  0
         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveURI(url));
 284  0
         url = doc.findMicroformattedValue("*", "item", "IMG", "photo", "@src");
 285  0
         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveURI(url));
 286  
         // as per kelkoo. Remember that contains(foo,'') is true in xpath
 287  0
         url = doc.findMicroformattedValue("*", "photo", "IMG", "", "@src");
 288  0
         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveURI(url));
 289  0
     }
 290  
 
 291  
     private List<String> findActions(HTMLDocument doc) {
 292  0
         List<String> actions = new ArrayList<String>(0);
 293  
         // first check if values are inlined
 294  0
         String[] classes = doc.readAttribute("class").split("\\s+");
 295  0
         for (String klass : classes) {
 296  0
             if (ActionClasses.contains(klass))
 297  0
                 actions.add(klass);
 298  
         }
 299  
 
 300  0
         for (Node action : doc.findAll("./*[@class]/@class")) {
 301  0
             for (String substring : action.getNodeValue().split("\\s+")) {
 302  0
                 if (ActionClasses.contains(substring))
 303  0
                     actions.add(substring);
 304  
             }
 305  
         }
 306  0
         return actions;
 307  
     }
 308  
 
 309  
 }