Coverage Report - org.apache.any23.extractor.microdata.MicrodataParser
 
Classes in this File Line Coverage Branch Coverage Complexity
MicrodataParser
0%
0/167
0%
0/94
4.778
MicrodataParser$ErrorMode
0%
0/3
N/A
4.778
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 package org.apache.any23.extractor.microdata;
 18  
 
 19  
 import org.apache.any23.extractor.html.DomUtils;
 20  
 import org.apache.any23.util.StringUtils;
 21  
 import org.w3c.dom.Document;
 22  
 import org.w3c.dom.Element;
 23  
 import org.w3c.dom.Node;
 24  
 
 25  
 import java.io.PrintStream;
 26  
 import java.text.ParseException;
 27  
 import java.util.ArrayList;
 28  
 import java.util.Arrays;
 29  
 import java.util.Collections;
 30  
 import java.util.Date;
 31  
 import java.util.HashMap;
 32  
 import java.util.HashSet;
 33  
 import java.util.List;
 34  
 import java.util.Map;
 35  
 import java.util.Set;
 36  
 
 37  
 /**
 38  
  * This class provides utility methods for handling <b>Microdata</b>
 39  
  * nodes contained within a <i>DOM</i> document.
 40  
  *
 41  
  * @author Michele Mostarda (mostarda@fbk.eu)
 42  
  */
 43  
 public class MicrodataParser {
 44  
 
 45  0
     enum ErrorMode {
 46  
         /** This mode raises an exception at first encountered error. */
 47  0
         StopAtFirstError,
 48  
         /**  This mode produces a full error report. */
 49  0
         FullReport
 50  
     }
 51  
 
 52  
     public static final String ITEMSCOPE_ATTRIBUTE = "itemscope";
 53  
     public static final String ITEMPROP_ATTRIBUTE  = "itemprop";
 54  
 
 55  
     /**
 56  
      * List of tags providing the <code>src</code> property.
 57  
      */
 58  0
     public static final Set<String> SRC_TAGS =  Collections.unmodifiableSet(
 59  
             new HashSet<String>( Arrays.asList("audio", "embed", "iframe", "img", "source", "track", "video") )
 60  
     );
 61  
 
 62  
     /**
 63  
      * List of tags providing the <code>href</code> property.
 64  
      */
 65  0
     public static final Set<String> HREF_TAGS =  Collections.unmodifiableSet(
 66  
             new HashSet<String>( Arrays.asList("a", "area", "link") )
 67  
     );
 68  
 
 69  
     private final Document document;
 70  
 
 71  
     /**
 72  
      * This set holds the name of properties being dereferenced.
 73  
      * The {@link #deferProperties(String...)} checks first if the
 74  
      * required dereference has been already asked, if so raises
 75  
      * a loop detection error. This map works in coordination
 76  
      * with {@link #dereferenceRecursionCounter}, so that at the end of
 77  
      * {@link #deferProperties(String...)} call recursion the
 78  
      * {@link #loopDetectorSet} can be cleaned up.
 79  
      */
 80  0
     private final Set<String> loopDetectorSet = new HashSet<String>();
 81  
 
 82  
     /**
 83  
      * {@link ItemScope} cache.
 84  
      */
 85  0
     private final Map<Node,ItemScope> itemScopes = new HashMap<Node,ItemScope>();
 86  
 
 87  
     /**
 88  
      * {@link ItemPropValue} cache.
 89  
      */
 90  0
     private final Map<Node, ItemPropValue> itemPropValues = new HashMap<Node, ItemPropValue>();
 91  
 
 92  
    /**
 93  
      * Counts the recursive call of {@link #deferProperties(String...)}.
 94  
      * It helps to cleanup the {@link #loopDetectorSet} when recursion ends.
 95  
      */
 96  0
     private int dereferenceRecursionCounter = 0;
 97  
 
 98  
     /**
 99  
      * Current error mode.
 100  
      */
 101  0
     private ErrorMode errorMode = ErrorMode.FullReport;
 102  
 
 103  
     /**
 104  
      * List of collected errors. Used when {@link #errorMode} <code>==</code> {@link ErrorMode#FullReport}.
 105  
      */
 106  0
     private List<MicrodataParserException> errors = new ArrayList<MicrodataParserException>();
 107  
 
 108  
     /**
 109  
      * Returns all the <i>itemScope</i>s detected within the given root node.
 110  
      *
 111  
      * @param node root node to search in.
 112  
      * @return list of detected items.
 113  
      */
 114  
     public static List<Node> getItemScopeNodes(Node node) {
 115  0
         return DomUtils.findAllByAttributeName(node, ITEMSCOPE_ATTRIBUTE);
 116  
     }
 117  
 
 118  
     /**
 119  
      * Check whether a node is an <i>itemScope</i>.
 120  
      *
 121  
      * @param node node to check.
 122  
      * @return <code>true</code> if the node is an <i>itemScope</i>., <code>false</code> otherwise.
 123  
      */
 124  
     public static boolean isItemScope(Node node) {
 125  0
         return DomUtils.readAttribute(node, ITEMSCOPE_ATTRIBUTE, null) != null;
 126  
     }
 127  
 
 128  
     /**
 129  
      * Returns all the <i>itemProp</i>s detected within the given root node.
 130  
      *
 131  
      * @param node root node to search in.
 132  
      * @return list of detected items.
 133  
      */
 134  
     public static List<Node> getItemPropNodes(Node node) {
 135  0
         return DomUtils.findAllByAttributeName(node, ITEMPROP_ATTRIBUTE);
 136  
     }
 137  
 
 138  
     /**
 139  
      * Check whether a node is an <i>itemProp</i>.
 140  
      *
 141  
      * @param node node to check.
 142  
      * @return <code>true</code> if the node is an <i>itemProp</i>., <code>false</code> otherwise.
 143  
      */
 144  
     public static boolean isItemProp(Node node) {
 145  0
         return DomUtils.readAttribute(node, ITEMPROP_ATTRIBUTE, null) != null;
 146  
     }
 147  
 
 148  
     /**
 149  
      * Returns only the <i>itemScope<i>s that are top level items.
 150  
      *
 151  
      * @param node root node to search in.
 152  
      * @return list of detected top item scopes.
 153  
      */
 154  
     public static List<Node> getTopLevelItemScopeNodes(Node node)  {
 155  0
         final List<Node> itemScopes = getItemScopeNodes(node);
 156  0
         final List<Node> topLevelItemScopes = new ArrayList<Node>();
 157  0
         for(Node itemScope : itemScopes) {
 158  0
             if( ! isItemProp(itemScope) ) {
 159  0
                 topLevelItemScopes.add(itemScope);
 160  
             }
 161  
         }
 162  0
         return getUnnestedNodes( topLevelItemScopes );
 163  
     }
 164  
 
 165  
     /**
 166  
      * Returns all the <b>Microdata items</b> detected within the given <code>document</code>.
 167  
      *
 168  
      * @param document document to be processed.
 169  
      * @param errorMode error management policy.
 170  
      * @return list of <b>itemscope</b> items.
 171  
      * @throws MicrodataParserException if <code>errorMode == {@link ErrorMode#StopAtFirstError}</code>
 172  
      *                                  and an error occurs.
 173  
      */
 174  
     public static MicrodataParserReport getMicrodata(Document document, ErrorMode errorMode)
 175  
     throws MicrodataParserException {
 176  0
         final List<Node> itemNodes = getTopLevelItemScopeNodes(document);
 177  0
         final List<ItemScope> items = new ArrayList<ItemScope>();
 178  0
         final MicrodataParser microdataParser = new MicrodataParser(document);
 179  0
         microdataParser.setErrorMode(errorMode);
 180  0
         for(Node itemNode : itemNodes) {
 181  0
             items.add( microdataParser.getItemScope(itemNode) );
 182  
         }
 183  0
         return new MicrodataParserReport(
 184  
                 items.toArray( new ItemScope[items.size()] ),
 185  
                 microdataParser.getErrors()
 186  
         );
 187  
     }
 188  
 
 189  
     /**
 190  
      * Returns all the <b>Microdata items</b> detected within the given <code>document</code>,
 191  
      * works in full report mode.
 192  
      *
 193  
      * @param document document to be processed.
 194  
      * @return list of <b>itemscope</b> items.
 195  
      */
 196  
     public static MicrodataParserReport getMicrodata(Document document) {
 197  
         try {
 198  0
             return getMicrodata(document, ErrorMode.FullReport);
 199  0
         } catch (MicrodataParserException mpe) {
 200  0
              throw new IllegalStateException("Unexpected exception.", mpe);
 201  
         }
 202  
     }
 203  
 
 204  
     /**
 205  
      * Returns a <i>JSON</i> containing the list of all extracted Microdata,
 206  
      * as described at <a href="http://www.w3.org/TR/microdata/#json">Microdata JSON Specification</a>.
 207  
      *
 208  
      * @param document document to be processed.
 209  
      * @param ps
 210  
      */
 211  
     public static void getMicrodataAsJSON(Document document, PrintStream ps) {
 212  0
         final MicrodataParserReport report = getMicrodata(document);
 213  0
         final ItemScope[] itemScopes            = report.getDetectedItemScopes();
 214  0
         final MicrodataParserException[] errors = report.getErrors();
 215  
 
 216  0
         ps.append("{ ");
 217  
 
 218  
         // Results.
 219  0
         ps.append("\"result\" : [");
 220  0
         for(int i = 0; i < itemScopes.length; i++) {
 221  0
             ps.print( itemScopes[i].toJSON() );
 222  0
             if( i < itemScopes.length - 1 ) {
 223  0
                 ps.print(", ");
 224  
             }
 225  
         }
 226  0
         ps.append("] ");
 227  
 
 228  
         // Errors.
 229  0
         if(errors != null && errors.length > 0) {
 230  0
             ps.append(", ");
 231  0
             ps.append("\"errors\" : [");
 232  0
             for (int i = 0; i < errors.length; i++) {
 233  0
                 ps.print( errors[i].toJSON() );
 234  0
                 if (i < itemScopes.length - 1) {
 235  0
                     ps.print(", ");
 236  
                 }
 237  
             }
 238  0
             ps.append("] ");
 239  
         }
 240  
 
 241  0
         ps.append("}");
 242  0
     }
 243  
 
 244  
     /**
 245  
      * Returns only nodes that are <b>not</b> nested one each other.
 246  
      *
 247  
      * @param candidates list of candidate nodes.
 248  
      * @return list of unnested nodes.
 249  
      */
 250  
     private static List<Node> getUnnestedNodes(List<Node> candidates) {
 251  0
         final List<Node> unnesteds  = new ArrayList<Node>();
 252  0
         for(int i = 0; i < candidates.size(); i++) {
 253  0
             boolean skip = false;
 254  0
             for(int j = 0; j < candidates.size(); j++) {
 255  0
                 if(i == j) continue;
 256  0
                 if(
 257  
                         StringUtils.isPrefix(
 258  
                                 DomUtils.getXPathForNode(candidates.get(j)),
 259  
                                 DomUtils.getXPathForNode(candidates.get(i))
 260  
                         )
 261  
                 ) {
 262  0
                     skip = true;
 263  0
                     break;
 264  
                 }
 265  
             }
 266  0
             if(!skip) {
 267  0
                 unnesteds.add( candidates.get(i) );
 268  
             }
 269  
         }
 270  0
         return unnesteds;
 271  
     }
 272  
 
 273  0
     public MicrodataParser(Document document) {
 274  0
         if(document == null) {
 275  0
             throw new NullPointerException("Document cannot be null.");
 276  
         }
 277  0
         this.document = document;
 278  0
     }
 279  
 
 280  
     public void setErrorMode(ErrorMode errorMode) {
 281  0
         if(errorMode == null) throw new IllegalArgumentException("errorMode must be not null.");
 282  0
         this.errorMode = errorMode;
 283  0
     }
 284  
 
 285  
     public ErrorMode getErrorMode() {
 286  0
         return this.errorMode;
 287  
     }
 288  
 
 289  
     public MicrodataParserException[] getErrors() {
 290  0
         return errors == null
 291  
                 ?
 292  
                 new MicrodataParserException[0]
 293  
                 :
 294  
                 errors.toArray( new MicrodataParserException[errors.size()] );
 295  
     }
 296  
 
 297  
     /**
 298  
      * Reads the value of a <b>itemprop</code> node.
 299  
      *
 300  
      * @param node itemprop node.
 301  
      * @return value detected within the given <code>node</code>.
 302  
      * @throws MicrodataParserException if an error occurs while extracting a nested item scope.
 303  
      */
 304  
     public ItemPropValue getPropertyValue(Node node) throws MicrodataParserException {
 305  0
         final ItemPropValue itemPropValue = itemPropValues.get(node);
 306  0
         if(itemPropValue != null) return itemPropValue;
 307  
 
 308  0
         final String nodeName = node.getNodeName().toLowerCase();
 309  0
         if ("meta".equals(nodeName)) {
 310  0
             return new ItemPropValue(DomUtils.readAttribute(node, "content"), ItemPropValue.Type.Plain);
 311  
         }
 312  
 
 313  0
         if( SRC_TAGS.contains(nodeName) ) {
 314  0
             return new ItemPropValue( DomUtils.readAttribute(node, "src"), ItemPropValue.Type.Link);
 315  
         }
 316  0
         if( HREF_TAGS.contains(nodeName) ) {
 317  0
             return new ItemPropValue( DomUtils.readAttribute(node, "href"), ItemPropValue.Type.Link);
 318  
         }
 319  
 
 320  0
         if( "object".equals(nodeName) ) {
 321  0
             return new ItemPropValue( DomUtils.readAttribute(node, "data"), ItemPropValue.Type.Link);
 322  
         }
 323  0
         if( "time".equals(nodeName) ) {
 324  0
             final String dateTimeStr = DomUtils.readAttribute(node, "datetime");
 325  
             final Date dateTime;
 326  
             try {
 327  0
                 dateTime = ItemPropValue.parseDateTime(dateTimeStr);
 328  0
             } catch (ParseException pe) {
 329  0
                 throw new MicrodataParserException(
 330  
                         String.format("Invalid format for datetime '%s'", dateTimeStr),
 331  
                         node
 332  
                 );
 333  0
             }
 334  0
             return new ItemPropValue(dateTime, ItemPropValue.Type.Date);
 335  
         }
 336  
 
 337  0
         if( isItemScope(node) ) {
 338  0
             return new ItemPropValue( getItemScope(node), ItemPropValue.Type.Nested );
 339  
         }
 340  
 
 341  0
         final ItemPropValue newItemPropValue = new ItemPropValue( node.getTextContent(), ItemPropValue.Type.Plain);
 342  0
         itemPropValues.put(node, newItemPropValue);
 343  0
         return newItemPropValue;
 344  
     }
 345  
 
 346  
     /**
 347  
      * Returns all the <b>itemprop</b>s for the given <b>itemscope</b> node.
 348  
      *
 349  
      * @param node node representing the <b>itemscope</>
 350  
      * @param skipRoot if <code>true</code> the given root <code>node</node>
 351  
      *        will be not read as a property, even if it contains the <b>itemprop</b> attribute.
 352  
      * @return the list of <b>itemprop<b>s detected within the given <b>itemscope</b>.
 353  
      * @throws MicrodataParserException if an error occurs while retrieving an property value.
 354  
      */
 355  
     public List<ItemProp> getItemProps(Node node, boolean skipRoot) throws MicrodataParserException {
 356  0
         final List<Node> itemPropNodes = getItemPropNodes(node);
 357  
 
 358  
         // Skipping itemScopes nested to this item prop.
 359  0
         final List<Node> subItemScopes = getItemScopeNodes(node);
 360  0
         subItemScopes.remove(node);
 361  0
         final List<Node> accepted = new ArrayList<Node>();
 362  
         String subItemScopeXpath;
 363  
         String subItemPropXPath;
 364  0
         for(Node itemPropNode : itemPropNodes) {
 365  0
             boolean skip = false;
 366  0
             for(Node subItemScope : subItemScopes) {
 367  0
                 subItemScopeXpath = DomUtils.getXPathForNode(subItemScope);
 368  0
                 subItemPropXPath  = DomUtils.getXPathForNode(itemPropNode);
 369  0
                 if(
 370  
                     StringUtils.isPrefix(subItemScopeXpath, subItemPropXPath)
 371  
                             &&
 372  
                     // This prevent removal of itemprop that is also itemscope
 373  
                     subItemScopeXpath.length() < subItemPropXPath.length()
 374  
                 ) {
 375  0
                     skip = true;
 376  0
                     break;
 377  
                 }
 378  
             }
 379  0
             if(!skip) accepted.add(itemPropNode);
 380  0
         }
 381  
 
 382  0
         final List<ItemProp> result = new ArrayList<ItemProp>();
 383  0
         for(Node itemPropNode :  accepted) {
 384  0
             if(itemPropNode.equals(node) && skipRoot) {
 385  0
                 continue;
 386  
             }
 387  0
             final String itemProp = DomUtils.readAttribute(itemPropNode, ITEMPROP_ATTRIBUTE, null);
 388  0
             final String[] propertyNames = itemProp.split(" ");
 389  
             ItemPropValue itemPropValue;
 390  0
             for (String propertyName : propertyNames) {
 391  
                 try {
 392  0
                     itemPropValue = getPropertyValue(itemPropNode);
 393  0
                 } catch (MicrodataParserException mpe) {
 394  0
                     manageError(mpe);
 395  0
                     continue;
 396  0
                 }
 397  0
                 result.add(
 398  
                         new ItemProp(
 399  
                                 DomUtils.getXPathForNode(itemPropNode),
 400  
                                 propertyName,
 401  
                                 itemPropValue
 402  
                         )
 403  
                 );
 404  
             }
 405  0
         }
 406  0
         return result;
 407  
     }
 408  
 
 409  
     /**
 410  
      * Given a document and a list of <b>itemprop</b> names this method will return
 411  
      * such <b>itemprops</b>.
 412  
      *
 413  
      * @param refs list of references.
 414  
      * @return list of retrieved <b>itemprop</b>s.
 415  
      * @throws MicrodataParserException if a loop is detected or a property name is missing.
 416  
      */
 417  
     public ItemProp[] deferProperties(String... refs) throws MicrodataParserException {
 418  0
         dereferenceRecursionCounter++;
 419  0
         final List<ItemProp> result = new ArrayList<ItemProp>();
 420  
         try {
 421  0
             for (String ref : refs) {
 422  0
                 if (loopDetectorSet.contains(ref)) {
 423  0
                         throw new MicrodataParserException(
 424  
                                 String.format(
 425  
                                         "Loop detected with depth %d while dereferencing itemProp '%s' .",
 426  
                                         dereferenceRecursionCounter - 1, ref
 427  
                                 ),
 428  
                                 null
 429  
                         );
 430  
                 }
 431  0
                 loopDetectorSet.add(ref);
 432  0
                 final Element element = document.getElementById(ref);
 433  0
                 if (element == null) {
 434  0
                     manageError(
 435  
                             new MicrodataParserException( String.format("Unknown itemProp id '%s'", ref ), null )
 436  
                     );
 437  0
                     continue;
 438  
                 }
 439  0
                 result.addAll(getItemProps(element, false));
 440  
             }
 441  0
         } catch (MicrodataParserException mpe) {
 442  0
             if(dereferenceRecursionCounter == 1)
 443  0
                 manageError(mpe); else throw mpe;  // Recursion end, this the the top call.
 444  
         } finally {
 445  0
             dereferenceRecursionCounter--;
 446  0
             if(dereferenceRecursionCounter == 0) { // Recursion end, this the the top call.
 447  0
                 loopDetectorSet.clear();
 448  
             }
 449  
         }
 450  0
         return result.toArray( new ItemProp[result.size()] );
 451  
     }
 452  
 
 453  
     /**
 454  
      * Returns the {@link ItemScope} instance described within the specified <code>node</code>.
 455  
      *
 456  
      * @param node node describing an <i>itemscope</i>.
 457  
      * @return instance of ItemScope object.
 458  
      * @throws MicrodataParserException if an error occurs while dereferencing properties.
 459  
      */
 460  
     public ItemScope getItemScope(Node node) throws MicrodataParserException {
 461  0
         final ItemScope itemScope = itemScopes.get(node);
 462  0
         if(itemScope != null) return itemScope;
 463  
 
 464  0
         final String id       = DomUtils.readAttribute(node, "id"      , null);
 465  0
         final String itemref  = DomUtils.readAttribute(node, "itemref" , null);
 466  0
         final String itemType = DomUtils.readAttribute(node, "itemtype", null);
 467  0
         final String itemId   = DomUtils.readAttribute(node, "itemid"  , null);
 468  
 
 469  0
         final List<ItemProp> itemProps = getItemProps(node, true);
 470  0
         final String[] itemrefIDs = itemref == null ? new String[0] : itemref.split(" ");
 471  
         final ItemProp[] deferredProperties;
 472  
         try {
 473  0
             deferredProperties = deferProperties(itemrefIDs);
 474  0
         } catch (MicrodataParserException mpe) {
 475  0
             mpe.setErrorNode(node);
 476  0
             throw mpe;
 477  0
         }
 478  0
         for(ItemProp deferredProperty : deferredProperties) {
 479  0
             if( itemProps.contains(deferredProperty) ) {
 480  0
                 manageError(
 481  
                         new MicrodataParserException(
 482  
                             String.format("Duplicated deferred itemProp '%s'.", deferredProperty.getName() ),
 483  
                             node
 484  
                         )
 485  
                 );
 486  0
                 continue;
 487  
             }
 488  0
             itemProps.add(deferredProperty);
 489  
         }
 490  
 
 491  0
         final ItemScope newItemScope = new ItemScope(
 492  
                 DomUtils.getXPathForNode(node),
 493  
                 itemProps.toArray(new ItemProp[itemProps.size()]),
 494  
                 id,
 495  
                 itemrefIDs,
 496  
                 itemType,
 497  
                 itemId
 498  
         );
 499  0
         itemScopes.put(node, newItemScope);
 500  0
         return newItemScope;
 501  
     }
 502  
 
 503  
     private void manageError(MicrodataParserException mpe) throws MicrodataParserException {
 504  0
         if(errorMode == ErrorMode.StopAtFirstError) {
 505  0
             throw mpe;
 506  
         }
 507  0
         if(errorMode != ErrorMode.FullReport) throw new IllegalStateException("Unsupported mode " + errorMode);
 508  0
         if(errors == null) {
 509  0
             errors = new ArrayList<MicrodataParserException>();
 510  
         }
 511  0
         errors.add(mpe);
 512  0
     }
 513  
 
 514  
 }