Coverage Report - org.apache.any23.extractor.microdata.MicrodataExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
MicrodataExtractor
0%
0/171
0%
0/102
5.059
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.microdata;
 19  
 
 20  
 import org.apache.any23.extractor.ErrorReporter;
 21  
 import org.apache.any23.extractor.ExtractionContext;
 22  
 import org.apache.any23.extractor.ExtractionException;
 23  
 import org.apache.any23.extractor.ExtractionParameters;
 24  
 import org.apache.any23.extractor.ExtractionResult;
 25  
 import org.apache.any23.extractor.Extractor;
 26  
 import org.apache.any23.extractor.ExtractorDescription;
 27  
 import org.apache.any23.extractor.ExtractorFactory;
 28  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 29  
 import org.apache.any23.extractor.html.DomUtils;
 30  
 import org.apache.any23.rdf.PopularPrefixes;
 31  
 import org.apache.any23.rdf.RDFUtils;
 32  
 import org.apache.any23.vocab.DCTERMS;
 33  
 import org.apache.any23.vocab.XHTML;
 34  
 import org.openrdf.model.Literal;
 35  
 import org.openrdf.model.Resource;
 36  
 import org.openrdf.model.URI;
 37  
 import org.openrdf.model.Value;
 38  
 import org.openrdf.model.vocabulary.RDF;
 39  
 import org.openrdf.model.vocabulary.XMLSchema;
 40  
 import org.w3c.dom.Document;
 41  
 import org.w3c.dom.Node;
 42  
 import org.w3c.dom.NodeList;
 43  
 
 44  
 import java.io.IOException;
 45  
 import java.net.MalformedURLException;
 46  
 import java.net.URL;
 47  
 import java.util.Arrays;
 48  
 import java.util.Date;
 49  
 import java.util.HashMap;
 50  
 import java.util.HashSet;
 51  
 import java.util.List;
 52  
 import java.util.Map;
 53  
 import java.util.Set;
 54  
 
 55  
 /**
 56  
  * Default implementation of <a href="http://www.w3.org/TR/microdata/">Microdata</a> extractor,
 57  
  * based on {@link TagSoupDOMExtractor}.
 58  
  *
 59  
  * @author Michele Mostarda (mostarda@fbk.eu)
 60  
  * @author Davide Palmisano ( dpalmisano@gmail.com )
 61  
  */
 62  0
 public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
 63  
 
 64  0
     private static final URI MICRODATA_ITEM
 65  
             = RDFUtils.uri("http://www.w3.org/1999/xhtml/microdata#item");
 66  
 
 67  0
     public final static ExtractorFactory<MicrodataExtractor> factory =
 68  
             SimpleExtractorFactory.create(
 69  
                     "html-microdata",
 70  
                     PopularPrefixes.createSubset("rdf", "doac", "foaf"),
 71  
                     Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
 72  
                     "example-microdata.html",
 73  
                     MicrodataExtractor.class
 74  
             );
 75  
 
 76  
     private String documentLanguage;
 77  
 
 78  
     private boolean isStrict;
 79  
 
 80  
     private String defaultNamespace;
 81  
 
 82  
     public ExtractorDescription getDescription() {
 83  0
         return factory;
 84  
     }
 85  
 
 86  
     /**
 87  
      * This extraction performs the
 88  
      * <a href="http://www.w3.org/TR/microdata/#rdf">Microdata to RDF conversion algorithm</a>.
 89  
      * A slight modification of the specification algorithm has been introduced
 90  
      * to avoid performing actions 5.2.1, 5.2.2, 5.2.3, 5.2.4 if step 5.2.6 doesn't detect any
 91  
      * Microdata.
 92  
      */
 93  
     public void run(
 94  
             ExtractionParameters extractionParameters,
 95  
             ExtractionContext extractionContext,
 96  
             Document in,
 97  
             ExtractionResult out
 98  
     ) throws IOException, ExtractionException {
 99  
 
 100  0
         final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in);
 101  0
         if(parserReport.getErrors().length > 0) {
 102  0
             notifyError(parserReport.getErrors(), out);
 103  
         }
 104  0
         final ItemScope[] itemScopes = parserReport.getDetectedItemScopes();
 105  0
         if (itemScopes.length == 0) {
 106  0
             return;
 107  
         }
 108  
 
 109  0
         isStrict = extractionParameters.getFlag("any23.microdata.strict");
 110  0
         if (!isStrict) {
 111  0
             defaultNamespace = extractionParameters.getProperty("any23.microdata.ns.default");
 112  
         }
 113  
 
 114  0
         documentLanguage = getDocumentLanguage(in);
 115  
 
 116  
         /**
 117  
          * 5.2.6
 118  
          */
 119  0
         final URI documentURI = extractionContext.getDocumentURI();
 120  0
         final Map<ItemScope, Resource> mappings = new HashMap<ItemScope, Resource>();
 121  0
         for (ItemScope itemScope : itemScopes) {
 122  0
             Resource subject = processType(itemScope, documentURI, out, mappings);
 123  0
             out.writeTriple(
 124  
                     documentURI,
 125  
                     MICRODATA_ITEM,
 126  
                     subject
 127  
             );
 128  
         }
 129  
 
 130  
         /**
 131  
          * 5.2.1
 132  
          */
 133  0
         processTitle(in, documentURI, out);
 134  
         /**
 135  
          * 5.2.2
 136  
          */
 137  0
         processHREFElements(in, documentURI, out);
 138  
         /**
 139  
          * 5.2.3
 140  
          */
 141  0
         processMetaElements(in, documentURI, out);
 142  
 
 143  
         /**
 144  
          * 5.2.4
 145  
          */
 146  0
         processCiteElements(in, documentURI, out);
 147  0
     }
 148  
 
 149  
     /**
 150  
      * Returns the {@link Document} language if declared, <code>null</code> otherwise.
 151  
      *
 152  
      * @param in a instance of {@link Document}.
 153  
      * @return the language declared, could be <code>null</code>.
 154  
      */
 155  
     private String getDocumentLanguage(Document in) {
 156  0
         String lang = DomUtils.find(in, "string(/HTML/@lang)");
 157  0
         if (lang.equals("")) {
 158  0
             return null;
 159  
         }
 160  0
         return lang;
 161  
     }
 162  
 
 163  
     /**
 164  
      * Returns the {@link Node} language if declared, or the {@link Document} one
 165  
      * if not defined.
 166  
      *
 167  
      * @param node a {@link Node} instance.
 168  
      * @return the {@link Node} language or the {@link Document} one. Could be <code>null</code>
 169  
      */
 170  
     private String getLanguage(Node node) {
 171  0
         Node nodeLang = node.getAttributes().getNamedItem("lang");
 172  0
         if (nodeLang == null) {
 173  
             // if the element does not specify a lang, use the document one
 174  0
             return documentLanguage;
 175  
         }
 176  0
         return nodeLang.getTextContent();
 177  
     }
 178  
 
 179  
     /**
 180  
      * Implements step 5.2.1 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
 181  
      * extraction algorithm.
 182  
      *
 183  
      * @param in          {@link Document} to be processed.
 184  
      * @param documentURI Document current {@link URI}.
 185  
      * @param out         a valid not <code>null</code> {@link ExtractionResult}
 186  
      */
 187  
     private void processTitle(Document in, URI documentURI, ExtractionResult out) {
 188  0
         NodeList titles = in.getElementsByTagName("title");
 189  
         // just one title is allowed.
 190  0
         if (titles.getLength() == 1) {
 191  0
             Node title = titles.item(0);
 192  0
             String titleValue = title.getTextContent();
 193  
             Literal object;
 194  0
             String lang = getLanguage(title);
 195  0
             if (lang == null) {
 196  
                 // unable to decide the language, leave it unknown
 197  0
                 object = RDFUtils.literal(titleValue);
 198  
             } else {
 199  0
                 object = RDFUtils.literal(titleValue, lang);
 200  
             }
 201  0
             out.writeTriple(
 202  
                     documentURI,
 203  
                     DCTERMS.getInstance().title,
 204  
                     object
 205  
             );
 206  
         }
 207  0
     }
 208  
 
 209  
     /**
 210  
      * Implements step 5.2.2 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
 211  
      * extraction algorithm.
 212  
      *
 213  
      * @param in          {@link Document} to be processed.
 214  
      * @param documentURI Document current {@link URI}.
 215  
      * @param out         a valid not <code>null</code> {@link ExtractionResult}
 216  
      */
 217  
     private void processHREFElements(Document in, URI documentURI, ExtractionResult out) {
 218  0
         NodeList anchors = in.getElementsByTagName("a");
 219  0
         for (int i = 0; i < anchors.getLength(); i++) {
 220  0
             processHREFElement(anchors.item(i), documentURI, out);
 221  
         }
 222  0
         NodeList areas = in.getElementsByTagName("area");
 223  0
         for (int i = 0; i < areas.getLength(); i++) {
 224  0
             processHREFElement(areas.item(i), documentURI, out);
 225  
         }
 226  0
         NodeList links = in.getElementsByTagName("link");
 227  0
         for (int i = 0; i < links.getLength(); i++) {
 228  0
             processHREFElement(links.item(i), documentURI, out);
 229  
         }
 230  0
     }
 231  
 
 232  
     /**
 233  
      * Implements sub-step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
 234  
      * extraction algorithm.
 235  
      *
 236  
      * @param item        {@link Node} to be processed.
 237  
      * @param documentURI Document current {@link URI}.
 238  
      * @param out         a valid not <code>null</code> {@link ExtractionResult}
 239  
      */
 240  
     private void processHREFElement(Node item, URI documentURI, ExtractionResult out) {
 241  0
         Node rel = item.getAttributes().getNamedItem("rel");
 242  0
         if (rel == null) {
 243  0
             return;
 244  
         }
 245  0
         Node href = item.getAttributes().getNamedItem("href");
 246  0
         if (href == null) {
 247  0
             return;
 248  
         }
 249  
         URL absoluteURL;
 250  0
         if (!isAbsoluteURL(href.getTextContent())) {
 251  
             try {
 252  0
                 absoluteURL = toAbsoluteURL(
 253  
                         documentURI.toString(),
 254  
                         href.getTextContent(),
 255  
                         '/'
 256  
                 );
 257  0
             } catch (MalformedURLException e) {
 258  
                 // okay, it's not an absolute URL, return
 259  0
                 return;
 260  0
             }
 261  
         } else {
 262  
             try {
 263  0
                 absoluteURL = new URL(href.getTextContent());
 264  0
             } catch (MalformedURLException e) {
 265  
                 // cannot happen
 266  0
                 return;
 267  0
             }
 268  
         }
 269  0
         String[] relTokens = rel.getTextContent().split(" ");
 270  0
         Set<String> tokensWithNoDuplicates = new HashSet<String>();
 271  0
         for (String relToken : relTokens) {
 272  0
             if (relToken.contains(":")) {
 273  
                 // if contain semi-colon, skip
 274  0
                 continue;
 275  
             }
 276  0
             if (relToken.equals("alternate") || relToken.equals("stylesheet")) {
 277  0
                 tokensWithNoDuplicates.add("ALTERNATE-STYLESHEET");
 278  0
                 continue;
 279  
             }
 280  0
             tokensWithNoDuplicates.add(relToken.toLowerCase());
 281  
         }
 282  0
         for (String token : tokensWithNoDuplicates) {
 283  
             URI predicate;
 284  0
             if (isAbsoluteURL(token)) {
 285  0
                 predicate = RDFUtils.uri(token);
 286  
             } else {
 287  0
                 predicate = RDFUtils.uri(XHTML.NS + token);
 288  
             }
 289  0
             out.writeTriple(
 290  
                     documentURI,
 291  
                     predicate,
 292  
                     RDFUtils.uri(absoluteURL.toString())
 293  
             );
 294  0
         }
 295  0
     }
 296  
 
 297  
     /**
 298  
      * Implements step 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
 299  
      * extraction algorithm.
 300  
      *
 301  
      * @param in          {@link Document} to be processed.
 302  
      * @param documentURI Document current {@link URI}.
 303  
      * @param out         a valid not <code>null</code> {@link ExtractionResult}
 304  
      */
 305  
     private void processMetaElements(Document in, URI documentURI, ExtractionResult out) {
 306  0
         NodeList metas = in.getElementsByTagName("meta");
 307  0
         for (int i = 0; i < metas.getLength(); i++) {
 308  0
             Node meta = metas.item(i);
 309  0
             String name    = DomUtils.readAttribute(meta, "name"   , null);
 310  0
             String content = DomUtils.readAttribute(meta, "content", null);
 311  0
             if (name != null && content != null) {
 312  0
                 if (isAbsoluteURL(name)) {
 313  0
                     processMetaElement(
 314  
                             RDFUtils.uri(name),
 315  
                             content,
 316  
                             getLanguage(meta),
 317  
                             documentURI,
 318  
                             out
 319  
                     );
 320  
                 } else {
 321  0
                     processMetaElement(
 322  
                             name,
 323  
                             content,
 324  
                             getLanguage(meta),
 325  
                             documentURI,
 326  
                             out
 327  
                     );
 328  
                 }
 329  
             }
 330  
         }
 331  0
     }
 332  
 
 333  
     /**
 334  
      * Implements sub step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
 335  
      * extraction algorithm.
 336  
      *
 337  
      * @param uri
 338  
      * @param content
 339  
      * @param language
 340  
      * @param documentURI
 341  
      * @param out
 342  
      */
 343  
     private void processMetaElement(
 344  
             URI uri,
 345  
             String content,
 346  
             String language,
 347  
             URI documentURI,
 348  
             ExtractionResult out
 349  
     ) {
 350  0
         if (content.contains(":")) {
 351  
             // if it contains U+003A COLON, exit
 352  0
             return;
 353  
         }
 354  
         Literal subject;
 355  0
         if (language == null) {
 356  
             // ok, we don't know the language
 357  0
             subject = RDFUtils.literal(content);
 358  
         } else {
 359  0
             subject = RDFUtils.literal(content, language);
 360  
         }
 361  0
         out.writeTriple(
 362  
                 documentURI,
 363  
                 uri,
 364  
                 subject
 365  
         );
 366  0
     }
 367  
 
 368  
     /**
 369  
      * Implements sub step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
 370  
      * extraction algorithm.
 371  
      *
 372  
      * @param name
 373  
      * @param content
 374  
      * @param language
 375  
      * @param documentURI
 376  
      * @param out
 377  
      */
 378  
     private void processMetaElement(
 379  
             String name,
 380  
             String content,
 381  
             String language,
 382  
             URI documentURI,
 383  
             ExtractionResult out) {
 384  
         Literal subject;
 385  0
         if (language == null) {
 386  
             // ok, we don't know the language
 387  0
             subject = RDFUtils.literal(content);
 388  
         } else {
 389  0
             subject = RDFUtils.literal(content, language);
 390  
         }
 391  0
         out.writeTriple(
 392  
                 documentURI,
 393  
                 RDFUtils.uri(XHTML.NS + name.toLowerCase()),
 394  
                 subject
 395  
         );
 396  0
     }
 397  
 
 398  
     /**
 399  
      * Implements sub step for 5.2.4 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
 400  
      * extraction algorithm.
 401  
      *
 402  
      * @param in
 403  
      * @param documentURI
 404  
      * @param out
 405  
      */
 406  
     private void processCiteElements(Document in, URI documentURI, ExtractionResult out) {
 407  0
         NodeList blockQuotes = in.getElementsByTagName("blockquote");
 408  0
         for (int i = 0; i < blockQuotes.getLength(); i++) {
 409  0
             processCiteElement(blockQuotes.item(i), documentURI, out);
 410  
         }
 411  0
         NodeList quotes = in.getElementsByTagName("q");
 412  0
         for (int i = 0; i < quotes.getLength(); i++) {
 413  0
             processCiteElement(quotes.item(i), documentURI, out);
 414  
         }
 415  0
     }
 416  
 
 417  
     private void processCiteElement(Node item, URI documentURI, ExtractionResult out) {
 418  0
         if (item.getAttributes().getNamedItem("cite") != null) {
 419  0
             out.writeTriple(
 420  
                     documentURI,
 421  
                     DCTERMS.getInstance().source,
 422  
                     RDFUtils.uri(item.getAttributes().getNamedItem("cite").getTextContent())
 423  
             );
 424  
         }
 425  0
     }
 426  
 
 427  
     /**
 428  
      * Recursive method implementing 5.2.6.1 "generate the triple for the item" of
 429  
      * <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
 430  
      * extraction algorithm.
 431  
      *
 432  
      * @param itemScope
 433  
      * @param documentURI
 434  
      * @param out
 435  
      * @param mappings
 436  
      * @return
 437  
      * @throws ExtractionException
 438  
      */
 439  
     private Resource processType(
 440  
             ItemScope itemScope,
 441  
             URI documentURI, ExtractionResult out,
 442  
             Map<ItemScope, Resource> mappings
 443  
     ) throws ExtractionException {
 444  
         Resource subject;
 445  0
         if (mappings.containsKey(itemScope)) {
 446  0
             subject = mappings.get(itemScope);
 447  0
         } else if (isAbsoluteURL(itemScope.getItemId())) {
 448  0
             subject = RDFUtils.uri(itemScope.getItemId());
 449  
         } else {
 450  0
             subject = RDFUtils.getBNode(Integer.toString(itemScope.hashCode()));
 451  
         }
 452  0
         mappings.put(itemScope, subject);
 453  
 
 454  
         // ItemScope.type could be null, but surely it's a valid URL
 455  0
         String itemScopeType = "";
 456  0
         if (itemScope.getType() != null) {
 457  
             String itemType;
 458  0
             itemType = itemScope.getType().toString();
 459  0
             out.writeTriple(subject, RDF.TYPE, RDFUtils.uri(itemType));
 460  0
             itemScopeType = itemScope.getType().toString();
 461  
         }
 462  0
         for (String propName : itemScope.getProperties().keySet()) {
 463  0
             List<ItemProp> itemProps = itemScope.getProperties().get(propName);
 464  0
             for (ItemProp itemProp : itemProps) {
 465  
                 try {
 466  0
                     processProperty(
 467  
                             subject,
 468  
                             propName,
 469  
                             itemProp,
 470  
                             itemScopeType,
 471  
                             documentURI,
 472  
                             mappings,
 473  
                             out
 474  
                     );
 475  0
                 } catch (MalformedURLException e) {
 476  0
                     throw new ExtractionException(
 477  
                             "Error while processing on subject '" + subject +
 478  
                                     "' the itemProp: '" + itemProp + "' "
 479  
                     );
 480  0
                 }
 481  
             }
 482  0
         }
 483  0
         return subject;
 484  
     }
 485  
 
 486  
     private void processProperty(
 487  
             Resource subject,
 488  
             String propName,
 489  
             ItemProp itemProp,
 490  
             String itemScopeType,
 491  
             URI documentURI,
 492  
             Map<ItemScope, Resource> mappings,
 493  
             ExtractionResult out
 494  
     ) throws MalformedURLException, ExtractionException {
 495  
         URI predicate;
 496  0
         if (!isAbsoluteURL(propName) && itemScopeType.equals("") && isStrict) {
 497  0
             return;
 498  0
         } else if (!isAbsoluteURL(propName) && itemScopeType.equals("") && !isStrict) {
 499  0
             predicate = RDFUtils.uri(
 500  
                     toAbsoluteURL(
 501  
                             defaultNamespace,
 502  
                             propName,
 503  
                             '/'
 504  
                     ).toString()
 505  
             );
 506  
         } else {
 507  0
             predicate = RDFUtils.uri(
 508  
                     toAbsoluteURL(
 509  
                             itemScopeType,
 510  
                             propName,
 511  
                             '/'
 512  
                     ).toString());
 513  
         }
 514  
         Value value;
 515  0
         Object propValue = itemProp.getValue().getContent();
 516  0
         ItemPropValue.Type propType = itemProp.getValue().getType();
 517  0
         if (propType.equals(ItemPropValue.Type.Nested)) {
 518  0
             value = processType((ItemScope) propValue, documentURI, out, mappings);
 519  0
         } else if (propType.equals(ItemPropValue.Type.Plain)) {
 520  0
             value = RDFUtils.literal((String) propValue, documentLanguage);
 521  0
         } else if (propType.equals(ItemPropValue.Type.Link)) {
 522  0
             value = RDFUtils.uri(
 523  
                     toAbsoluteURL(
 524  
                             documentURI.toString(),
 525  
                             (String) propValue,
 526  
                             '/'
 527  
                     ).toString()
 528  
             );
 529  0
         } else if (propType.equals(ItemPropValue.Type.Date)) {
 530  0
             value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE);
 531  
         } else {
 532  0
             throw new RuntimeException("Invalid Type '" +
 533  
                     propType + "' for ItemPropValue with name: '" + propName + "'");
 534  
         }
 535  0
         out.writeTriple(subject, predicate, value);
 536  0
     }
 537  
 
 538  
     private boolean isAbsoluteURL(String urlString) {
 539  0
         boolean result = false;
 540  
         try {
 541  0
             URL url = new URL(urlString);
 542  0
             String protocol = url.getProtocol();
 543  0
             if (protocol != null && protocol.trim().length() > 0)
 544  0
                 result = true;
 545  0
         } catch (MalformedURLException e) {
 546  0
             return false;
 547  0
         }
 548  0
         return result;
 549  
     }
 550  
 
 551  
     private URL toAbsoluteURL(String ns, String part, char trailing)
 552  
             throws MalformedURLException {
 553  0
         if (isAbsoluteURL(part)) {
 554  0
             return new URL(part);
 555  
         }
 556  0
         char lastChar = ns.charAt(ns.length() - 1);
 557  0
         if (lastChar == '#' || lastChar == '/')
 558  0
             return new URL(ns + part);
 559  0
         return new URL(ns + trailing + part);
 560  
     }
 561  
 
 562  
     private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
 563  0
         for(MicrodataParserException mpe : errors) {
 564  0
             out.notifyError(
 565  
                     ErrorReporter.ErrorLevel.ERROR,
 566  
                     mpe.toJSON(),
 567  
                     mpe.getErrorLocationBeginRow() ,
 568  
                     mpe.getErrorLocationBeginCol()
 569  
             );
 570  
         }
 571  0
     }
 572  
 
 573  
 }