Coverage Report - org.apache.any23.extractor.html.DomUtils
 
Classes in this File Line Coverage Branch Coverage Complexity
DomUtils
0%
0/119
0%
0/68
3.591
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.w3c.dom.NamedNodeMap;
 21  
 import org.w3c.dom.Node;
 22  
 import org.w3c.dom.NodeList;
 23  
 
 24  
 import javax.xml.transform.OutputKeys;
 25  
 import javax.xml.transform.Transformer;
 26  
 import javax.xml.transform.TransformerException;
 27  
 import javax.xml.transform.TransformerFactory;
 28  
 import javax.xml.transform.dom.DOMSource;
 29  
 import javax.xml.transform.stream.StreamResult;
 30  
 import javax.xml.xpath.XPath;
 31  
 import javax.xml.xpath.XPathConstants;
 32  
 import javax.xml.xpath.XPathExpressionException;
 33  
 import javax.xml.xpath.XPathFactory;
 34  
 import java.io.IOException;
 35  
 import java.io.StringWriter;
 36  
 import java.util.ArrayList;
 37  
 import java.util.List;
 38  
 
 39  
 /**
 40  
  * This class provides utility methods for DOM manipulation.
 41  
  * It is separated from {@link HTMLDocument} so that its methods
 42  
  * can be run on single DOM nodes without having to wrap them
 43  
  * into an HTMLDocument.
 44  
  * We use a mix of XPath and DOM manipulation.
 45  
  * <p/>
 46  
  * This is likely to be a performance bottleneck but at least
 47  
  * everything is localized here.
 48  
  * <p/>
 49  
  */
 50  
 public class DomUtils {
 51  
 
 52  0
     private static final String[] EMPTY_STRING_ARRAY = new String[0];
 53  
         
 54  0
     private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
 55  
 
 56  0
     private DomUtils(){}
 57  
 
 58  
     /**
 59  
      * Given a node this method returns the index corresponding to such node
 60  
      * within the list of the children of its parent node.
 61  
      *
 62  
     * @param n the node whose index is to be returned.
 63  
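     * @return the zero-based index of the node among the children of its parent that share its node type and name, or <code>0</code> if the node has no parent.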
 64  
      */
 65  
     public static int getIndexInParent(Node n) {
 66  0
         Node parent = n.getParentNode();
 67  0
         if(parent == null) {
 68  0
             return 0;
 69  
         }
 70  0
         NodeList nodes = parent.getChildNodes();
 71  0
         int counter = -1;
 72  0
         for(int i = 0; i < nodes.getLength(); i++) {
 73  0
             Node current = nodes.item(i);
 74  0
             if ( current.getNodeType() == n.getNodeType() && current.getNodeName().equals( n.getNodeName() ) ) {
 75  0
                 counter++;
 76  
             }
 77  0
             if( current.equals(n) ) {
 78  0
                 return counter;
 79  
             }
 80  
         }
 81  0
         throw new IllegalStateException("Cannot find a child within its parent node list.");
 82  
     }
 83  
 
 84  
     /**
 85  
      * Does a reverse walking of the DOM tree to generate a unique XPath
 86  
      * expression leading to this node. The XPath generated is the canonical
 87  
     * one based on sibling index: /html[1]/body[1]/div[2]/span[3] etc.
 88  
      *
 89  
      * @param node the input node.
 90  
      * @return the XPath location of node as String.
 91  
      */
 92  
     public static String getXPathForNode(Node node) {
 93  0
         final StringBuilder sb = new StringBuilder();
 94  0
         Node parent = node;
 95  0
         while(parent != null && parent.getNodeType() != Node.DOCUMENT_NODE) {
 96  0
             sb.insert(0, "]");
 97  0
             sb.insert(0, getIndexInParent(parent) + 1);
 98  0
             sb.insert(0, "[");
 99  0
             sb.insert(0, parent.getNodeName());
 100  0
             sb.insert(0, "/");
 101  0
             parent = parent.getParentNode();
 102  
         }
 103  0
         return sb.toString();
 104  
     }
 105  
 
 106  
     /**
 107  
      * Returns a list of tag names representing the path from
 108  
      * the document root to the given node <i>n</i>.
 109  
      *
 110  
     * @param n the node for which to retrieve the path.
 111  
      * @return a sequence of HTML tag names.
 112  
      */
 113  
     public static String[] getXPathListForNode(Node n) {
 114  0
         if(n == null) {
 115  0
             return EMPTY_STRING_ARRAY;
 116  
         }
 117  0
         List<String> ancestors = new ArrayList<String>();
 118  0
         ancestors.add( String.format("%s[%s]", n.getNodeName(), getIndexInParent(n) ) );
 119  0
         Node parent = n.getParentNode();
 120  0
         while(parent != null) {
 121  0
             ancestors.add(0, String.format("%s[%s]", parent.getNodeName(), getIndexInParent(parent) ) );
 122  0
             parent = parent.getParentNode();
 123  
         }
 124  0
         return ancestors.toArray( new String[ancestors.size()] );
 125  
     }
 126  
 
 127  
     /**
 128  
      * Returns the row/col location of the given node.
 129  
      *
 130  
      * @param n input node.
 131  
     * @return an array of four elements of the form
 132  
     *         <code>[&lt;begin-row&gt;, &lt;begin-col&gt;, &lt;end-row&gt;, &lt;end-col&gt;]</code>
 133  
      *         or <code>null</code> if not possible to extract such data.
 134  
      */
 135  
     public static int[] getNodeLocation(Node n) {
 136  0
         if(n == null) throw new NullPointerException("node cannot be null.");
 137  0
         final TagSoupParser.ElementLocation elementLocation =
 138  
             (TagSoupParser.ElementLocation) n.getUserData( TagSoupParser.ELEMENT_LOCATION );
 139  0
         if(elementLocation == null) return null;
 140  0
         return new int[]{
 141  
                 elementLocation.getBeginLineNumber(),
 142  
                 elementLocation.getBeginColumnNumber(),
 143  
                 elementLocation.getEndLineNumber(),
 144  
                 elementLocation.getEndColumnNumber()
 145  
         };
 146  
     }
 147  
 
 148  
     /**
 149  
     * Checks whether a node is an ancestor of, or the same node as, another node.
 150  
      *
 151  
      * @param candidateAncestor the candidate ancestor node.
 152  
     * @param candidateSibling the candidate descendant node, i.e. the node whose chain of parents is walked.
 153  
     * @param strict if <code>true</code>, a node is not considered to be an ancestor of itself.
 154  
     * @return <code>true</code> if <code>candidateAncestor</code> is an ancestor of <code>candidateSibling</code>,
 155  
      *         <code>false</code> otherwise.
 156  
      */
 157  
     public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling, boolean strict) {
 158  0
         if(candidateAncestor == null) throw new NullPointerException("candidate ancestor cannot be null null.");
 159  0
         if(candidateSibling  == null) throw new NullPointerException("candidate sibling cannot be null null." );
 160  0
         if(strict && candidateAncestor.equals(candidateSibling)) return false;
 161  0
         Node parent = candidateSibling;
 162  0
         while(parent != null) {
 163  0
             if(parent.equals(candidateAncestor)) return true;
 164  0
             parent = parent.getParentNode();
 165  
         }
 166  0
         return false;
 167  
     }
 168  
 
 169  
     /**
 170  
     * Checks whether a node is an ancestor of, or the same node as, another node. Same as
 171  
      * {@link #isAncestorOf(org.w3c.dom.Node, org.w3c.dom.Node, boolean)} with <code>strict=false</code>.
 172  
      *
 173  
      * @param candidateAncestor the candidate ancestor node.
 174  
     * @param candidateSibling the candidate descendant node, i.e. the node whose chain of parents is walked.
 175  
     * @return <code>true</code> if <code>candidateAncestor</code> is an ancestor of (or the same node as) <code>candidateSibling</code>,
 176  
      *         <code>false</code> otherwise.
 177  
      */
 178  
     public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling) {
 179  0
         return isAncestorOf(candidateAncestor, candidateSibling, false);
 180  
     }
 181  
 
 182  
     /**
 183  
      * Finds all nodes that have a declared class.
 184  
      * Note that the className is transformed to lower case before being
 185  
      * matched against the DOM.
 186  
     * @param root the root node from which to start searching.
 187  
      * @param className the name of the filtered class.
 188  
      * @return list of matching nodes or an empty list.
 189  
      */
 190  
     public static List<Node> findAllByClassName(Node root, String className) {
 191  0
         return findAllByTagAndClassName(root, "*", className.toLowerCase());
 192  
     }
 193  
 
 194  
     /**
 195  
      * Finds all nodes that have a declared attribute.
 196  
     * Note that the attribute name is used exactly as given (no case conversion) when being
 197  
      * matched against the DOM.
 198  
     * @param root the root node from which to start searching.
 199  
     * @param attrName the name of the filtered attribute.
 200  
      * @return list of matching nodes or an empty list.
 201  
      */
 202  
     public static List<Node> findAllByAttributeName(Node root, String attrName) {
 203  0
         List<Node> result = new ArrayList<Node>();
 204  0
         for (Node node : findAll(root, String.format("./descendant-or-self::*[@%s]", attrName) ) ) {
 205  0
                 result.add(node);
 206  
         }
 207  0
         return result;
 208  
     }
 209  
 
 210  
     public static List<Node> findAllByTag(Node root, String tagName) {
 211  0
         List<Node> result = new ArrayList<Node>();
 212  0
         for (Node node : findAll(root, "./descendant-or-self::" + tagName)) {
 213  0
             result.add(node);
 214  
         }
 215  0
         return result;
 216  
     }
 217  
 
 218  
     public static List<Node> findAllByTagAndClassName(Node root, String tagName, String className) {
 219  0
         List<Node> result = new ArrayList<Node>();
 220  0
         for (Node node : findAll(
 221  
                 root,
 222  
                 "./descendant-or-self::" +
 223  
                 tagName +
 224  
                 "[contains(translate(@class,'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'" +
 225  
                 className + "')]")
 226  
         ) {
 227  0
             if (DomUtils.hasClassName(node, className)) {
 228  0
                 result.add(node);
 229  
             }
 230  
         }
 231  0
         return result;
 232  
     }
 233  
 
 234  
     /**
 235  
      * Mimics the JS DOM API, or prototype's $()
 236  
      */
 237  
     public static Node findNodeById(Node root, String id) {
 238  
         Node node;
 239  
         try {
 240  0
             String xpath = "//*[@id='" + id + "']";
 241  0
             node = (Node) xPathEngine.evaluate(xpath, root, XPathConstants.NODE);
 242  0
         } catch (XPathExpressionException ex) {
 243  0
             throw new RuntimeException("Should not happen", ex);
 244  0
         }
 245  0
         return node;
 246  
     }
 247  
 
 248  
     /**
 249  
     * Returns a list composed of all the nodes that match an XPath
 250  
      * expression, which must be valid.
 251  
      */
 252  
     public static List<Node> findAll(Node node, String xpath) {
 253  0
         if(node == null) {
 254  0
             throw new NullPointerException("node cannot be null.");
 255  
         }
 256  
         try {
 257  0
             NodeList nodes = (NodeList) xPathEngine.evaluate(xpath, node, XPathConstants.NODESET);
 258  0
             List<Node> result = new ArrayList<Node>(nodes.getLength());
 259  0
             for (int i = 0; i < nodes.getLength(); i++) {
 260  0
                 result.add(nodes.item(i));
 261  
             }
 262  0
             return result;
 263  0
         } catch (XPathExpressionException ex) {
 264  0
             throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
 265  
         }
 266  
     }
 267  
 
 268  
     /**
 269  
      * Gets the string value of an XPath expression.
 270  
      */
 271  
     public static String find(Node node, String xpath) {
 272  
         try {
 273  0
             String val = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
 274  0
             if (null == val)
 275  0
                 return "";
 276  0
             return val;
 277  0
         } catch (XPathExpressionException ex) {
 278  0
             throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
 279  
         }
 280  
     }
 281  
 
 282  
     /**
 283  
      * Tells if an element has a class name <b>not checking the parents
 284  
      * in the hierarchy</b> mimicking the <i>CSS</i> .foo match.
 285  
      */
 286  
     public static boolean hasClassName(Node node, String className) {
 287  0
         return hasAttribute(node, "class", className);
 288  
     }
 289  
 
 290  
     /**
 291  
      * Checks the presence of an attribute value in attributes that
 292  
      * contain whitespace-separated lists of values. The semantic is the
 293  
     * CSS classes' ones: "foo" matches "bar foo", "foo" but not "foob". Matching is case-insensitive.
 294  
      */
 295  
     public static boolean hasAttribute(Node node, String attributeName, String className) {
 296  
         // regex love, maybe faster but less easy to understand
 297  
         // Pattern pattern = Pattern.compile("(^|\\s+)"+className+"(\\s+|$)");
 298  0
         String attr = readAttribute(node, attributeName);
 299  0
         for (String c : attr.split("\\s+"))
 300  0
             if (c.equalsIgnoreCase(className))
 301  0
                 return true;
 302  0
         return false;
 303  
     }
 304  
 
 305  
      /**
 306  
      * Checks the presence of an attribute in the given <code>node</code>.
 307  
       *
 308  
       * @param node the node container.
 309  
       * @param attributeName the name of the attribute.
 310  
       */
 311  
     public static boolean hasAttribute(Node node, String attributeName) {
 312  0
         return readAttribute(node, attributeName, null) != null;
 313  
     }
 314  
 
 315  
     /**
 316  
      * Verifies if the given target node is an element.
 317  
      *
 318  
     * @param target the node to check.
 319  
     * @return <code>true</code> if the node is an element,
 320  
      *         <code>false</code> otherwise.
 321  
      */
 322  
     public static boolean isElementNode(Node target) {
 323  0
         return Node.ELEMENT_NODE == target.getNodeType();
 324  
     }
 325  
 
 326  
     /**
 327  
      * Reads the value of the specified <code>attribute</code>, returning the
 328  
      * <code>defaultValue</code> string if not present.
 329  
      *
 330  
      * @param node node to read the attribute.
 331  
      * @param attribute attribute name.
 332  
      * @param defaultValue the default value to return if attribute is not found.
 333  
      * @return the attribute value or <code>defaultValue</code> if not found.
 334  
      */
 335  
     public static String readAttribute(Node node, String attribute, String defaultValue) {
 336  0
         NamedNodeMap attributes = node.getAttributes();
 337  0
         if (null == attributes)
 338  0
             return defaultValue;
 339  0
         Node attr = attributes.getNamedItem(attribute);
 340  0
         if (null==attr)
 341  0
             return defaultValue;
 342  0
         return attr.getNodeValue();
 343  
     }
 344  
 
 345  
     /**
 346  
     * Reads the value of the first <i>attribute</i> whose name starts with the specified <code>attributePrefix</code>.
 347  
      * Returns the <code>defaultValue</code> if not found.
 348  
      *
 349  
      * @param node node to look for attributes.
 350  
      * @param attributePrefix attribute prefix.
 351  
      * @param defaultValue default returned value.
 352  
      * @return the value found or default.
 353  
      */
 354  
     public static String readAttributeWithPrefix(Node node, String attributePrefix, String defaultValue) {
 355  0
         final NamedNodeMap attributes = node.getAttributes();
 356  0
         if (null == attributes) {
 357  0
             return defaultValue;
 358  
         }
 359  
         Node attribute;
 360  0
         for (int a = 0; a < attributes.getLength(); a++) {
 361  0
             attribute = attributes.item(a);
 362  0
             if (attribute.getNodeName().startsWith(attributePrefix)) {
 363  0
                 return attribute.getNodeValue();
 364  
             }
 365  
         }
 366  0
         return defaultValue;
 367  
     }
 368  
 
 369  
     /**
 370  
      * Reads the value of an <code>attribute</code>, returning the
 371  
      * empty string if not present.
 372  
      *
 373  
      * @param node node to read the attribute.
 374  
      * @param attribute attribute name.
 375  
      * @return the attribute value or <code>""</code> if not found.
 376  
      */
 377  
     public static String readAttribute(Node node, String attribute) {
 378  0
         return readAttribute(node, attribute, "");
 379  
     }
 380  
 
 381  
     /**
 382  
      * Given a <i>DOM</i> {@link Node} produces the <i>XML</i> serialization
 383  
      * omitting the <i>XML declaration</i>.
 384  
      *
 385  
      * @param node node to be serialized.
 386  
      * @param indent if <code>true</code> the output is indented.
 387  
      * @return the XML serialization.
 388  
      * @throws TransformerException if an error occurs during the
 389  
     *         serializer initialization and activation.
 390  
     * @throws java.io.IOException if an error occurs while closing the output writer.
 391  
      */
 392  
     public static String serializeToXML(Node node, boolean indent) throws TransformerException, IOException {
 393  0
         final DOMSource domSource = new DOMSource(node);
 394  0
         final Transformer transformer = TransformerFactory.newInstance().newTransformer();
 395  0
         transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
 396  0
         transformer.setOutputProperty(OutputKeys.METHOD, "xml");
 397  0
         transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
 398  0
         if(indent) {
 399  0
             transformer.setOutputProperty(OutputKeys.INDENT, "yes");
 400  0
             transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
 401  
         }
 402  0
         final StringWriter sw = new StringWriter();
 403  0
         final StreamResult sr = new StreamResult(sw);
 404  0
         transformer.transform(domSource, sr);
 405  0
         sw.close();
 406  0
         return sw.toString();
 407  
     }
 408  
 
 409  
 }