Coverage Report

Coverage Report - org.apache.any23.extractor.html.HTMLDocument

Classes in this File

Line Coverage

Branch Coverage

Complexity

HTMLDocument

0/129

0/66

HTMLDocument$TextField

0/6

N/A

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.html;
 
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
 import org.apache.any23.rdf.RDFUtils;
 import org.openrdf.model.URI;
 import org.openrdf.model.impl.ValueFactoryImpl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.w3c.dom.Text;
 
 import javax.xml.xpath.XPath;
 import javax.xml.xpath.XPathConstants;
 import javax.xml.xpath.XPathExpressionException;
 import javax.xml.xpath.XPathFactory;
 import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.List;
 
 /**
  * A wrapper around the DOM representation of an HTML document.
  * Provides convenience access to various parts of the document.
  *
  * @author Gabriele Renzi
  * @author Michele Mostarda
  */
 public class HTMLDocument {
 
     private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
     private final static Logger log        = LoggerFactory.getLogger(HTMLDocument.class);
 
     private Node         document;
     private java.net.URI baseURI;
 
     private final Any23ValueFactoryWrapper valueFactory =
             new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
 
     /**
      * Reads a text field from the given node adding the content to the given <i>res</i> list.
      *
      * @param node the node from which read the content.
      * @return a valid TextField
      */
     public static TextField readTextField(Node node) {
         TextField result;
         final String name = node.getNodeName();
         final NamedNodeMap attributes = node.getAttributes();
         // excess of safety check, should be impossible
         if (attributes == null ) {
             return new TextField( node.getTextContent(), node);
         }
         // first check if there are values inside
         List<Node> values = DomUtils.findAllByClassName(node, "value");
         if (!values.isEmpty()) {
             String val = "";
             for (Node n : values)
                 val += n.getTextContent();
             return new TextField( val.trim(), node);
         }
         if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) {
             result = new TextField(attributes.getNamedItem("title").getNodeValue(), node);
         } else if ("A".equals(name)) {
             if (DomUtils.hasAttribute(node, "rel", "tag")) {
                 String href = extractRelTag(attributes);
                 result = new TextField(href, node);
             } else
                 result = new TextField(node.getTextContent(), node);
         } else if ("IMG".equals(name) || "AREA".equals(name)) {
             result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node);
         } else {
             result = new TextField(node.getTextContent(), node);
         }
         return result;
     }
 
     /**
      * Reads an URL field from the given node adding the content to the given <i>res</i> list.
      *
      * @param res
      * @param node
      */
     public static void readUrlField(List<TextField> res, Node node) {
         String name = node.getNodeName();
         NamedNodeMap attributes = node.getAttributes();
         if (null == attributes) {
             res.add( new TextField(node.getTextContent(), node) );
             return;
         }
         if ("A".equals(name) || "AREA".equals(name)) {
             Node n = attributes.getNamedItem("href");
             res.add( new TextField(n.getNodeValue(), n) );
         } else if ("ABBR".equals(name)) {
             Node n = attributes.getNamedItem("title");
             res.add( new TextField(n.getNodeValue(), n) );
         } else if ("IMG".equals(name)) {
             Node n = attributes.getNamedItem("src");
             res.add( new TextField(n.getNodeValue(), n) );
         } else if ("OBJECT".equals(name)) {
             Node n = attributes.getNamedItem("data");
             res.add( new TextField(n.getNodeValue(), n) );
         } else {
             res.add( new TextField(node.getTextContent().trim(), node) );
         }
     }
 
     /**
      * Extracts the href specific rel-tag string.
      * See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
      *
      * @param hrefAttributeContent the content of the <i>href</i> attribute.
      * @return the rel-tag specification.
      */
     public static String extractRelTag(String hrefAttributeContent) {
         String[] all = hrefAttributeContent.split("[#?]");
         // Cleanup spurious segments.
         String path = all[0];
         int pathLenghtMin1 = path.length() - 1;
         if( '/' == path.charAt(pathLenghtMin1) ) {
             path = path.substring(0, pathLenghtMin1);
         }
         return path;
     }
 
     /**
      * Extracts the href specific rel-tag string.
      * See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
      *
      * @param attributes the list of attributes of a node.
      * @return the rel-tag specification.
      */
     public static String extractRelTag(NamedNodeMap attributes) {
         return extractRelTag(attributes.getNamedItem("href").getNodeValue());
     }
 
     /**
      * Reads the text content of the given node and returns it.
      * If the <code>prettify</code> flag is <code>true</code>
      * the text is cleaned up.
      *
      * @param node node to read content.
      * @param prettify if <code>true</code> blank chars will be removed.
      * @return the read text.
      */
     public static String readNodeContent(Node node, boolean prettify) {
         final String content = node.getTextContent();
         return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content;
     }
 
     /**
      * Constructor accepting the root node.
      * 
      * @param document
      */
     public HTMLDocument(Node document) {
         if (null == document)
             throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
         this.document = document;
     }
 
     /**
      * @return An absolute URI, or null if the URI is not fixable
      * @throws org.apache.any23.extractor.ExtractionException If the base URI is invalid
      */
     public URI resolveURI(String uri) throws ExtractionException {
         return valueFactory.resolveURI(uri, getBaseURI());
     }
 
     public String find(String xpath) {
         return DomUtils.find(getDocument(), xpath);
     }
 
     public Node findNodeById(String id) {
         return DomUtils.findNodeById(getDocument(), id);
     }
 
     public List<Node> findAll(String xpath) {
         return DomUtils.findAll(getDocument(), xpath);
     }
 
     public String findMicroformattedValue(
             String objectTag,
             String object,
             String fieldTag,
             String field,
             String key
     ) {
         Node node = findMicroformattedObjectNode(objectTag, object);
         if (null == node)
             return "";
         // try to check if it is inline
         if (DomUtils.hasClassName(node, field))
             return node.getTextContent();
 
         // failed, try to find it in a child
         try {
             String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key;
             String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
             if (null == value) {
                 return "";
             }
             return value;
         } catch (XPathExpressionException ex) {
             throw new RuntimeException("Should not happen, XPath expression is built locally", ex);
         }
 
     }
 
     public Node getDocument() {
         return document;
     }
 
     /**
      * Returns a singular text field. 
      *
      * @param className name of class containing text.
      * @return if multiple values are found just the first is returned,
      * if we want to check that there are no n-ary values use plural finder
      */
     public TextField getSingularTextField(String className) {
         TextField[] res = getPluralTextField(className);
         if (res.length == 0)
             return new TextField("", null);
         return res[0];
     }
 
     /**
      * Returns a plural text field.
      * 
      * @param className name of class node containing text.
      * @return list of fields.
      */
     public TextField[] getPluralTextField(String className) {
         List<TextField> res = new ArrayList<TextField>();
         List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
         for (Node node : nodes) {
             res.add( readTextField(node) );
         }
         return res.toArray( new TextField[res.size()] );
     }
 
     /**
      * Returns the URL associated to the field marked with class <i>className</i>.
      *
      * @param className name of node class containing the URL field.
      * @return if multiple values are found just the first is returned,
      *  if we want to check that there are no n-ary values use plural finder
      */
     public TextField getSingularUrlField(String className) {
         TextField[] res = getPluralUrlField(className);
         if (res.length < 1)
             return new TextField("", null);
         return res[0];
     }
 
     /**
      * Returns the list of URLs associated to the fields marked with class <i>className</i>.
      *
      * @param className name of node class containing the URL field.
      * @return the list of {@link HTMLDocument.TextField} found.
      */
     public TextField[] getPluralUrlField(String className) {
         List<TextField> res = new ArrayList<TextField>();
         List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
         for (Node node : nodes)
             readUrlField(res, node);
         return res.toArray( new TextField[res.size()] );
     }
 
     public Node findMicroformattedObjectNode(String objectTag, String name) {
         List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name);
         if (nodes.isEmpty())
             return null;
         return nodes.get(0);
     }
 
     /**
      * Read an attribute avoiding NullPointerExceptions, if the attr is
      * missing it just returns an empty string.
      *
      * @param attribute the attribute name.
      * @return the string representing the attribute.
      */
     public String readAttribute(String attribute) {
         return DomUtils.readAttribute(getDocument(), attribute);
     }
 
     /**
      * Finds all the nodes by class name.
      *
      * @param clazz the class name.
      * @return list of matching nodes.
      */
     public List<Node> findAllByClassName(String clazz) {
         return DomUtils.findAllByClassName(getDocument(), clazz);
     }
 
     /**
      * Returns the text contained inside a node if leaf,
      * <code>null</code> otherwise.
      *
      * @return the text of a leaf node.
      */
     public String getText() {
         NodeList children = getDocument().getChildNodes();
         if(children.getLength() == 1 && children.item(0) instanceof Text) {
             return children.item(0).getTextContent();
         }
         return null;
     }
 
     /**
      * Returns the document default language.
      *
      * @return default language if any, <code>null</code> otherwise.
      */
     public String getDefaultLanguage() {
         final String xpathLanguageSelector = "/HTML";
         Node html;
         try {
             html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE);
         } catch (XPathExpressionException xpeee) {
             throw new IllegalStateException();
         }
         if (html == null) {
             return null;
         }
         Node langAttribute = html.getAttributes().getNamedItem("xml:lang");
         return langAttribute == null ? null : langAttribute.getTextContent();
     }
 
     /**
      * Returns the sequence of ancestors from the document root to the local root (document).
      *
      * @return a sequence of node names.
      */
     public String[] getPathToLocalRoot() {
         return DomUtils.getXPathListForNode(document);
     }
 
     /**
      * Extracts all the <code>rel</code> tag nodes.
      *
      * @return list of rel tag nodes.
      */
     public TextField[] extractRelTagNodes() {
         final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel");
         final List<TextField> result = new ArrayList<TextField>();
         for(Node relTagNode : relTagNodes) {
             readUrlField(result, relTagNode);
         }
         return result.toArray( new TextField[result.size()] );
     }
 
     private java.net.URI getBaseURI() throws ExtractionException {
         if (baseURI == null) {
             try {
                 if (document.getBaseURI() == null) {
                     log.warn("document.getBaseURI() is null, this should not happen");
                 }
                 baseURI = new java.net.URI(RDFUtils.fixAbsoluteURI(document.getBaseURI()));
             } catch (IllegalArgumentException ex) {
                 throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex);
             } catch (URISyntaxException ex) {
                 throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex);
             }
         }
         return baseURI;
     }
 
     /**
      * This class represents a text extracted from the <i>HTML</i> DOM related
      * to the node from which such test has been retrieved.
      */
     public static class TextField {
         private String value;
         private Node   source;
 
         public TextField(String value, Node source) {
             this.value = value;
             this.source = source;
         }
 
         public String value() {
             return value;
         }
 
         public Node source() {
             return source;
         }
     }
 
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.html;
19
20		import org.apache.any23.extractor.ExtractionException;
21		import org.apache.any23.rdf.Any23ValueFactoryWrapper;
22		import org.apache.any23.rdf.RDFUtils;
23		import org.openrdf.model.URI;
24		import org.openrdf.model.impl.ValueFactoryImpl;
25		import org.slf4j.Logger;
26		import org.slf4j.LoggerFactory;
27		import org.w3c.dom.NamedNodeMap;
28		import org.w3c.dom.Node;
29		import org.w3c.dom.NodeList;
30		import org.w3c.dom.Text;
31
32		import javax.xml.xpath.XPath;
33		import javax.xml.xpath.XPathConstants;
34		import javax.xml.xpath.XPathExpressionException;
35		import javax.xml.xpath.XPathFactory;
36		import java.net.URISyntaxException;
37		import java.util.ArrayList;
38		import java.util.List;
39
40		/**
41		* A wrapper around the DOM representation of an HTML document.
42		* Provides convenience access to various parts of the document.
43		*
44		* @author Gabriele Renzi
45		* @author Michele Mostarda
46		*/
47		public class HTMLDocument {
48
49	0	private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
50	0	private final static Logger log = LoggerFactory.getLogger(HTMLDocument.class);
51
52		private Node document;
53		private java.net.URI baseURI;
54
55	0	private final Any23ValueFactoryWrapper valueFactory =
56		new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
57
58		/**
59		* Reads a text field from the given node adding the content to the given <i>res</i> list.
60		*
61		* @param node the node from which read the content.
62		* @return a valid TextField
63		*/
64		public static TextField readTextField(Node node) {
65		TextField result;
66	0	final String name = node.getNodeName();
67	0	final NamedNodeMap attributes = node.getAttributes();
68		// excess of safety check, should be impossible
69	0	if (attributes == null ) {
70	0	return new TextField( node.getTextContent(), node);
71		}
72		// first check if there are values inside
73	0	List<Node> values = DomUtils.findAllByClassName(node, "value");
74	0	if (!values.isEmpty()) {
75	0	String val = "";
76	0	for (Node n : values)
77	0	val += n.getTextContent();
78	0	return new TextField( val.trim(), node);
79		}
80	0	if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) {
81	0	result = new TextField(attributes.getNamedItem("title").getNodeValue(), node);
82	0	} else if ("A".equals(name)) {
83	0	if (DomUtils.hasAttribute(node, "rel", "tag")) {
84	0	String href = extractRelTag(attributes);
85	0	result = new TextField(href, node);
86	0	} else
87	0	result = new TextField(node.getTextContent(), node);
88	0	} else if ("IMG".equals(name) || "AREA".equals(name)) {
89	0	result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node);
90		} else {
91	0	result = new TextField(node.getTextContent(), node);
92		}
93	0	return result;
94		}
95
96		/**
97		* Reads an URL field from the given node adding the content to the given <i>res</i> list.
98		*
99		* @param res
100		* @param node
101		*/
102		public static void readUrlField(List<TextField> res, Node node) {
103	0	String name = node.getNodeName();
104	0	NamedNodeMap attributes = node.getAttributes();
105	0	if (null == attributes) {
106	0	res.add( new TextField(node.getTextContent(), node) );
107	0	return;
108		}
109	0	if ("A".equals(name) || "AREA".equals(name)) {
110	0	Node n = attributes.getNamedItem("href");
111	0	res.add( new TextField(n.getNodeValue(), n) );
112	0	} else if ("ABBR".equals(name)) {
113	0	Node n = attributes.getNamedItem("title");
114	0	res.add( new TextField(n.getNodeValue(), n) );
115	0	} else if ("IMG".equals(name)) {
116	0	Node n = attributes.getNamedItem("src");
117	0	res.add( new TextField(n.getNodeValue(), n) );
118	0	} else if ("OBJECT".equals(name)) {
119	0	Node n = attributes.getNamedItem("data");
120	0	res.add( new TextField(n.getNodeValue(), n) );
121	0	} else {
122	0	res.add( new TextField(node.getTextContent().trim(), node) );
123		}
124	0	}
125
126		/**
127		* Extracts the href specific rel-tag string.
128		* See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
129		*
130		* @param hrefAttributeContent the content of the <i>href</i> attribute.
131		* @return the rel-tag specification.
132		*/
133		public static String extractRelTag(String hrefAttributeContent) {
134	0	String[] all = hrefAttributeContent.split("[#?]");
135		// Cleanup spurious segments.
136	0	String path = all[0];
137	0	int pathLenghtMin1 = path.length() - 1;
138	0	if( '/' == path.charAt(pathLenghtMin1) ) {
139	0	path = path.substring(0, pathLenghtMin1);
140		}
141	0	return path;
142		}
143
144		/**
145		* Extracts the href specific rel-tag string.
146		* See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
147		*
148		* @param attributes the list of attributes of a node.
149		* @return the rel-tag specification.
150		*/
151		public static String extractRelTag(NamedNodeMap attributes) {
152	0	return extractRelTag(attributes.getNamedItem("href").getNodeValue());
153		}
154
155		/**
156		* Reads the text content of the given node and returns it.
157		* If the <code>prettify</code> flag is <code>true</code>
158		* the text is cleaned up.
159		*
160		* @param node node to read content.
161		* @param prettify if <code>true</code> blank chars will be removed.
162		* @return the read text.
163		*/
164		public static String readNodeContent(Node node, boolean prettify) {
165	0	final String content = node.getTextContent();
166	0	return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content;
167		}
168
169		/**
170		* Constructor accepting the root node.
171		*
172		* @param document
173		*/
174	0	public HTMLDocument(Node document) {
175	0	if (null == document)
176	0	throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
177	0	this.document = document;
178	0	}
179
180		/**
181		* @return An absolute URI, or null if the URI is not fixable
182		* @throws org.apache.any23.extractor.ExtractionException If the base URI is invalid
183		*/
184		public URI resolveURI(String uri) throws ExtractionException {
185	0	return valueFactory.resolveURI(uri, getBaseURI());
186		}
187
188		public String find(String xpath) {
189	0	return DomUtils.find(getDocument(), xpath);
190		}
191
192		public Node findNodeById(String id) {
193	0	return DomUtils.findNodeById(getDocument(), id);
194		}
195
196		public List<Node> findAll(String xpath) {
197	0	return DomUtils.findAll(getDocument(), xpath);
198		}
199
200		public String findMicroformattedValue(
201		String objectTag,
202		String object,
203		String fieldTag,
204		String field,
205		String key
206		) {
207	0	Node node = findMicroformattedObjectNode(objectTag, object);
208	0	if (null == node)
209	0	return "";
210		// try to check if it is inline
211	0	if (DomUtils.hasClassName(node, field))
212	0	return node.getTextContent();
213
214		// failed, try to find it in a child
215		try {
216	0	String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key;
217	0	String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
218	0	if (null == value) {
219	0	return "";
220		}
221	0	return value;
222	0	} catch (XPathExpressionException ex) {
223	0	throw new RuntimeException("Should not happen, XPath expression is built locally", ex);
224		}
225
226		}
227
228		public Node getDocument() {
229	0	return document;
230		}
231
232		/**
233		* Returns a singular text field.
234		*
235		* @param className name of class containing text.
236		* @return if multiple values are found just the first is returned,
237		* if we want to check that there are no n-ary values use plural finder
238		*/
239		public TextField getSingularTextField(String className) {
240	0	TextField[] res = getPluralTextField(className);
241	0	if (res.length == 0)
242	0	return new TextField("", null);
243	0	return res[0];
244		}
245
246		/**
247		* Returns a plural text field.
248		*
249		* @param className name of class node containing text.
250		* @return list of fields.
251		*/
252		public TextField[] getPluralTextField(String className) {
253	0	List<TextField> res = new ArrayList<TextField>();
254	0	List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
255	0	for (Node node : nodes) {
256	0	res.add( readTextField(node) );
257		}
258	0	return res.toArray( new TextField[res.size()] );
259		}
260
261		/**
262		* Returns the URL associated to the field marked with class <i>className</i>.
263		*
264		* @param className name of node class containing the URL field.
265		* @return if multiple values are found just the first is returned,
266		* if we want to check that there are no n-ary values use plural finder
267		*/
268		public TextField getSingularUrlField(String className) {
269	0	TextField[] res = getPluralUrlField(className);
270	0	if (res.length < 1)
271	0	return new TextField("", null);
272	0	return res[0];
273		}
274
275		/**
276		* Returns the list of URLs associated to the fields marked with class <i>className</i>.
277		*
278		* @param className name of node class containing the URL field.
279		* @return the list of {@link HTMLDocument.TextField} found.
280		*/
281		public TextField[] getPluralUrlField(String className) {
282	0	List<TextField> res = new ArrayList<TextField>();
283	0	List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
284	0	for (Node node : nodes)
285	0	readUrlField(res, node);
286	0	return res.toArray( new TextField[res.size()] );
287		}
288
289		public Node findMicroformattedObjectNode(String objectTag, String name) {
290	0	List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name);
291	0	if (nodes.isEmpty())
292	0	return null;
293	0	return nodes.get(0);
294		}
295
296		/**
297		* Read an attribute avoiding NullPointerExceptions, if the attr is
298		* missing it just returns an empty string.
299		*
300		* @param attribute the attribute name.
301		* @return the string representing the attribute.
302		*/
303		public String readAttribute(String attribute) {
304	0	return DomUtils.readAttribute(getDocument(), attribute);
305		}
306
307		/**
308		* Finds all the nodes by class name.
309		*
310		* @param clazz the class name.
311		* @return list of matching nodes.
312		*/
313		public List<Node> findAllByClassName(String clazz) {
314	0	return DomUtils.findAllByClassName(getDocument(), clazz);
315		}
316
317		/**
318		* Returns the text contained inside a node if leaf,
319		* <code>null</code> otherwise.
320		*
321		* @return the text of a leaf node.
322		*/
323		public String getText() {
324	0	NodeList children = getDocument().getChildNodes();
325	0	if(children.getLength() == 1 && children.item(0) instanceof Text) {
326	0	return children.item(0).getTextContent();
327		}
328	0	return null;
329		}
330
331		/**
332		* Returns the document default language.
333		*
334		* @return default language if any, <code>null</code> otherwise.
335		*/
336		public String getDefaultLanguage() {
337	0	final String xpathLanguageSelector = "/HTML";
338		Node html;
339		try {
340	0	html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE);
341	0	} catch (XPathExpressionException xpeee) {
342	0	throw new IllegalStateException();
343	0	}
344	0	if (html == null) {
345	0	return null;
346		}
347	0	Node langAttribute = html.getAttributes().getNamedItem("xml:lang");
348	0	return langAttribute == null ? null : langAttribute.getTextContent();
349		}
350
351		/**
352		* Returns the sequence of ancestors from the document root to the local root (document).
353		*
354		* @return a sequence of node names.
355		*/
356		public String[] getPathToLocalRoot() {
357	0	return DomUtils.getXPathListForNode(document);
358		}
359
360		/**
361		* Extracts all the <code>rel</code> tag nodes.
362		*
363		* @return list of rel tag nodes.
364		*/
365		public TextField[] extractRelTagNodes() {
366	0	final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel");
367	0	final List<TextField> result = new ArrayList<TextField>();
368	0	for(Node relTagNode : relTagNodes) {
369	0	readUrlField(result, relTagNode);
370		}
371	0	return result.toArray( new TextField[result.size()] );
372		}
373
374		private java.net.URI getBaseURI() throws ExtractionException {
375	0	if (baseURI == null) {
376		try {
377	0	if (document.getBaseURI() == null) {
378	0	log.warn("document.getBaseURI() is null, this should not happen");
379		}
380	0	baseURI = new java.net.URI(RDFUtils.fixAbsoluteURI(document.getBaseURI()));
381	0	} catch (IllegalArgumentException ex) {
382	0	throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex);
383	0	} catch (URISyntaxException ex) {
384	0	throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex);
385	0	}
386		}
387	0	return baseURI;
388		}
389
390		/**
391		* This class represents a text extracted from the <i>HTML</i> DOM related
392		* to the node from which such test has been retrieved.
393		*/
394		public static class TextField {
395		private String value;
396		private Node source;
397
398	0	public TextField(String value, Node source) {
399	0	this.value = value;
400	0	this.source = source;
401	0	}
402
403		public String value() {
404	0	return value;
405		}
406
407		public Node source() {
408	0	return source;
409		}
410		}
411
412		}