Coverage Report - org.apache.any23.extractor.html.HTMLDocument
 
Classes in this File Line Coverage Branch Coverage Complexity
HTMLDocument
0%
0/129
0%
0/66
3
HTMLDocument$TextField
0%
0/6
N/A
3
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.apache.any23.extractor.ExtractionException;
 21  
 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
 22  
 import org.apache.any23.rdf.RDFUtils;
 23  
 import org.openrdf.model.URI;
 24  
 import org.openrdf.model.impl.ValueFactoryImpl;
 25  
 import org.slf4j.Logger;
 26  
 import org.slf4j.LoggerFactory;
 27  
 import org.w3c.dom.NamedNodeMap;
 28  
 import org.w3c.dom.Node;
 29  
 import org.w3c.dom.NodeList;
 30  
 import org.w3c.dom.Text;
 31  
 
 32  
 import javax.xml.xpath.XPath;
 33  
 import javax.xml.xpath.XPathConstants;
 34  
 import javax.xml.xpath.XPathExpressionException;
 35  
 import javax.xml.xpath.XPathFactory;
 36  
 import java.net.URISyntaxException;
 37  
 import java.util.ArrayList;
 38  
 import java.util.List;
 39  
 
 40  
 /**
 41  
  * A wrapper around the DOM representation of an HTML document.
 42  
  * Provides convenience access to various parts of the document.
 43  
  *
 44  
  * @author Gabriele Renzi
 45  
  * @author Michele Mostarda
 46  
  */
 47  
 public class HTMLDocument {
 48  
 
 49  0
     private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
 50  0
     private final static Logger log        = LoggerFactory.getLogger(HTMLDocument.class);
 51  
 
 52  
     private Node         document;
 53  
     private java.net.URI baseURI;
 54  
 
 55  0
     private final Any23ValueFactoryWrapper valueFactory =
 56  
             new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
 57  
 
 58  
     /**
 59  
      * Reads a text field from the given node adding the content to the given <i>res</i> list.
 60  
      *
 61  
      * @param node the node from which read the content.
 62  
      * @return a valid TextField
 63  
      */
 64  
     public static TextField readTextField(Node node) {
 65  
         TextField result;
 66  0
         final String name = node.getNodeName();
 67  0
         final NamedNodeMap attributes = node.getAttributes();
 68  
         // excess of safety check, should be impossible
 69  0
         if (attributes == null ) {
 70  0
             return new TextField( node.getTextContent(), node);
 71  
         }
 72  
         // first check if there are values inside
 73  0
         List<Node> values = DomUtils.findAllByClassName(node, "value");
 74  0
         if (!values.isEmpty()) {
 75  0
             String val = "";
 76  0
             for (Node n : values)
 77  0
                 val += n.getTextContent();
 78  0
             return new TextField( val.trim(), node);
 79  
         }
 80  0
         if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) {
 81  0
             result = new TextField(attributes.getNamedItem("title").getNodeValue(), node);
 82  0
         } else if ("A".equals(name)) {
 83  0
             if (DomUtils.hasAttribute(node, "rel", "tag")) {
 84  0
                 String href = extractRelTag(attributes);
 85  0
                 result = new TextField(href, node);
 86  0
             } else
 87  0
                 result = new TextField(node.getTextContent(), node);
 88  0
         } else if ("IMG".equals(name) || "AREA".equals(name)) {
 89  0
             result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node);
 90  
         } else {
 91  0
             result = new TextField(node.getTextContent(), node);
 92  
         }
 93  0
         return result;
 94  
     }
 95  
 
 96  
     /**
 97  
      * Reads an URL field from the given node adding the content to the given <i>res</i> list.
 98  
      *
 99  
      * @param res
 100  
      * @param node
 101  
      */
 102  
     public static void readUrlField(List<TextField> res, Node node) {
 103  0
         String name = node.getNodeName();
 104  0
         NamedNodeMap attributes = node.getAttributes();
 105  0
         if (null == attributes) {
 106  0
             res.add( new TextField(node.getTextContent(), node) );
 107  0
             return;
 108  
         }
 109  0
         if ("A".equals(name) || "AREA".equals(name)) {
 110  0
             Node n = attributes.getNamedItem("href");
 111  0
             res.add( new TextField(n.getNodeValue(), n) );
 112  0
         } else if ("ABBR".equals(name)) {
 113  0
             Node n = attributes.getNamedItem("title");
 114  0
             res.add( new TextField(n.getNodeValue(), n) );
 115  0
         } else if ("IMG".equals(name)) {
 116  0
             Node n = attributes.getNamedItem("src");
 117  0
             res.add( new TextField(n.getNodeValue(), n) );
 118  0
         } else if ("OBJECT".equals(name)) {
 119  0
             Node n = attributes.getNamedItem("data");
 120  0
             res.add( new TextField(n.getNodeValue(), n) );
 121  0
         } else {
 122  0
             res.add( new TextField(node.getTextContent().trim(), node) );
 123  
         }
 124  0
     }
 125  
 
 126  
     /**
 127  
      * Extracts the href specific rel-tag string.
 128  
      * See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
 129  
      *
 130  
      * @param hrefAttributeContent the content of the <i>href</i> attribute.
 131  
      * @return the rel-tag specification.
 132  
      */
 133  
     public static String extractRelTag(String hrefAttributeContent) {
 134  0
         String[] all = hrefAttributeContent.split("[#?]");
 135  
         // Cleanup spurious segments.
 136  0
         String path = all[0];
 137  0
         int pathLenghtMin1 = path.length() - 1;
 138  0
         if( '/' == path.charAt(pathLenghtMin1) ) {
 139  0
             path = path.substring(0, pathLenghtMin1);
 140  
         }
 141  0
         return path;
 142  
     }
 143  
 
 144  
     /**
 145  
      * Extracts the href specific rel-tag string.
 146  
      * See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
 147  
      *
 148  
      * @param attributes the list of attributes of a node.
 149  
      * @return the rel-tag specification.
 150  
      */
 151  
     public static String extractRelTag(NamedNodeMap attributes) {
 152  0
         return extractRelTag(attributes.getNamedItem("href").getNodeValue());
 153  
     }
 154  
 
 155  
     /**
 156  
      * Reads the text content of the given node and returns it.
 157  
      * If the <code>prettify</code> flag is <code>true</code>
 158  
      * the text is cleaned up.
 159  
      *
 160  
      * @param node node to read content.
 161  
      * @param prettify if <code>true</code> blank chars will be removed.
 162  
      * @return the read text.
 163  
      */
 164  
     public static String readNodeContent(Node node, boolean prettify) {
 165  0
         final String content = node.getTextContent();
 166  0
         return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content;
 167  
     }
 168  
 
 169  
     /**
 170  
      * Constructor accepting the root node.
 171  
      * 
 172  
      * @param document
 173  
      */
 174  0
     public HTMLDocument(Node document) {
 175  0
         if (null == document)
 176  0
             throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
 177  0
         this.document = document;
 178  0
     }
 179  
 
 180  
     /**
 181  
      * @return An absolute URI, or null if the URI is not fixable
 182  
      * @throws org.apache.any23.extractor.ExtractionException If the base URI is invalid
 183  
      */
 184  
     public URI resolveURI(String uri) throws ExtractionException {
 185  0
         return valueFactory.resolveURI(uri, getBaseURI());
 186  
     }
 187  
 
 188  
     public String find(String xpath) {
 189  0
         return DomUtils.find(getDocument(), xpath);
 190  
     }
 191  
 
 192  
     public Node findNodeById(String id) {
 193  0
         return DomUtils.findNodeById(getDocument(), id);
 194  
     }
 195  
 
 196  
     public List<Node> findAll(String xpath) {
 197  0
         return DomUtils.findAll(getDocument(), xpath);
 198  
     }
 199  
 
 200  
     public String findMicroformattedValue(
 201  
             String objectTag,
 202  
             String object,
 203  
             String fieldTag,
 204  
             String field,
 205  
             String key
 206  
     ) {
 207  0
         Node node = findMicroformattedObjectNode(objectTag, object);
 208  0
         if (null == node)
 209  0
             return "";
 210  
         // try to check if it is inline
 211  0
         if (DomUtils.hasClassName(node, field))
 212  0
             return node.getTextContent();
 213  
 
 214  
         // failed, try to find it in a child
 215  
         try {
 216  0
             String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key;
 217  0
             String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
 218  0
             if (null == value) {
 219  0
                 return "";
 220  
             }
 221  0
             return value;
 222  0
         } catch (XPathExpressionException ex) {
 223  0
             throw new RuntimeException("Should not happen, XPath expression is built locally", ex);
 224  
         }
 225  
 
 226  
     }
 227  
 
 228  
     public Node getDocument() {
 229  0
         return document;
 230  
     }
 231  
 
 232  
     /**
 233  
      * Returns a singular text field. 
 234  
      *
 235  
      * @param className name of class containing text.
 236  
      * @return if multiple values are found just the first is returned,
 237  
      * if we want to check that there are no n-ary values use plural finder
 238  
      */
 239  
     public TextField getSingularTextField(String className) {
 240  0
         TextField[] res = getPluralTextField(className);
 241  0
         if (res.length == 0)
 242  0
             return new TextField("", null);
 243  0
         return res[0];
 244  
     }
 245  
 
 246  
     /**
 247  
      * Returns a plural text field.
 248  
      * 
 249  
      * @param className name of class node containing text.
 250  
      * @return list of fields.
 251  
      */
 252  
     public TextField[] getPluralTextField(String className) {
 253  0
         List<TextField> res = new ArrayList<TextField>();
 254  0
         List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
 255  0
         for (Node node : nodes) {
 256  0
             res.add( readTextField(node) );
 257  
         }
 258  0
         return res.toArray( new TextField[res.size()] );
 259  
     }
 260  
 
 261  
     /**
 262  
      * Returns the URL associated to the field marked with class <i>className</i>.
 263  
      *
 264  
      * @param className name of node class containing the URL field.
 265  
      * @return if multiple values are found just the first is returned,
 266  
      *  if we want to check that there are no n-ary values use plural finder
 267  
      */
 268  
     public TextField getSingularUrlField(String className) {
 269  0
         TextField[] res = getPluralUrlField(className);
 270  0
         if (res.length < 1)
 271  0
             return new TextField("", null);
 272  0
         return res[0];
 273  
     }
 274  
 
 275  
     /**
 276  
      * Returns the list of URLs associated to the fields marked with class <i>className</i>.
 277  
      *
 278  
      * @param className name of node class containing the URL field.
 279  
      * @return the list of {@link HTMLDocument.TextField} found.
 280  
      */
 281  
     public TextField[] getPluralUrlField(String className) {
 282  0
         List<TextField> res = new ArrayList<TextField>();
 283  0
         List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
 284  0
         for (Node node : nodes)
 285  0
             readUrlField(res, node);
 286  0
         return res.toArray( new TextField[res.size()] );
 287  
     }
 288  
 
 289  
     public Node findMicroformattedObjectNode(String objectTag, String name) {
 290  0
         List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name);
 291  0
         if (nodes.isEmpty())
 292  0
             return null;
 293  0
         return nodes.get(0);
 294  
     }
 295  
 
 296  
     /**
 297  
      * Read an attribute avoiding NullPointerExceptions, if the attr is
 298  
      * missing it just returns an empty string.
 299  
      *
 300  
      * @param attribute the attribute name.
 301  
      * @return the string representing the attribute.
 302  
      */
 303  
     public String readAttribute(String attribute) {
 304  0
         return DomUtils.readAttribute(getDocument(), attribute);
 305  
     }
 306  
 
 307  
     /**
 308  
      * Finds all the nodes by class name.
 309  
      *
 310  
      * @param clazz the class name.
 311  
      * @return list of matching nodes.
 312  
      */
 313  
     public List<Node> findAllByClassName(String clazz) {
 314  0
         return DomUtils.findAllByClassName(getDocument(), clazz);
 315  
     }
 316  
 
 317  
     /**
 318  
      * Returns the text contained inside a node if leaf,
 319  
      * <code>null</code> otherwise.
 320  
      *
 321  
      * @return the text of a leaf node.
 322  
      */
 323  
     public String getText() {
 324  0
         NodeList children = getDocument().getChildNodes();
 325  0
         if(children.getLength() == 1 && children.item(0) instanceof Text) {
 326  0
             return children.item(0).getTextContent();
 327  
         }
 328  0
         return null;
 329  
     }
 330  
 
 331  
     /**
 332  
      * Returns the document default language.
 333  
      *
 334  
      * @return default language if any, <code>null</code> otherwise.
 335  
      */
 336  
     public String getDefaultLanguage() {
 337  0
         final String xpathLanguageSelector = "/HTML";
 338  
         Node html;
 339  
         try {
 340  0
             html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE);
 341  0
         } catch (XPathExpressionException xpeee) {
 342  0
             throw new IllegalStateException();
 343  0
         }
 344  0
         if (html == null) {
 345  0
             return null;
 346  
         }
 347  0
         Node langAttribute = html.getAttributes().getNamedItem("xml:lang");
 348  0
         return langAttribute == null ? null : langAttribute.getTextContent();
 349  
     }
 350  
 
 351  
     /**
 352  
      * Returns the sequence of ancestors from the document root to the local root (document).
 353  
      *
 354  
      * @return a sequence of node names.
 355  
      */
 356  
     public String[] getPathToLocalRoot() {
 357  0
         return DomUtils.getXPathListForNode(document);
 358  
     }
 359  
 
 360  
     /**
 361  
      * Extracts all the <code>rel</code> tag nodes.
 362  
      *
 363  
      * @return list of rel tag nodes.
 364  
      */
 365  
     public TextField[] extractRelTagNodes() {
 366  0
         final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel");
 367  0
         final List<TextField> result = new ArrayList<TextField>();
 368  0
         for(Node relTagNode : relTagNodes) {
 369  0
             readUrlField(result, relTagNode);
 370  
         }
 371  0
         return result.toArray( new TextField[result.size()] );
 372  
     }
 373  
 
 374  
     private java.net.URI getBaseURI() throws ExtractionException {
 375  0
         if (baseURI == null) {
 376  
             try {
 377  0
                 if (document.getBaseURI() == null) {
 378  0
                     log.warn("document.getBaseURI() is null, this should not happen");
 379  
                 }
 380  0
                 baseURI = new java.net.URI(RDFUtils.fixAbsoluteURI(document.getBaseURI()));
 381  0
             } catch (IllegalArgumentException ex) {
 382  0
                 throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex);
 383  0
             } catch (URISyntaxException ex) {
 384  0
                 throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex);
 385  0
             }
 386  
         }
 387  0
         return baseURI;
 388  
     }
 389  
 
 390  
     /**
 391  
      * This class represents a text extracted from the <i>HTML</i> DOM related
 392  
      * to the node from which such test has been retrieved.
 393  
      */
 394  
     public static class TextField {
 395  
         private String value;
 396  
         private Node   source;
 397  
 
 398  0
         public TextField(String value, Node source) {
 399  0
             this.value = value;
 400  0
             this.source = source;
 401  0
         }
 402  
 
 403  
         public String value() {
 404  0
             return value;
 405  
         }
 406  
 
 407  
         public Node source() {
 408  0
             return source;
 409  
         }
 410  
     }
 411  
 
 412  
 }