1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.extractor.html; |
19 | |
|
20 | |
import org.w3c.dom.NamedNodeMap; |
21 | |
import org.w3c.dom.Node; |
22 | |
import org.w3c.dom.NodeList; |
23 | |
|
24 | |
import javax.xml.transform.OutputKeys; |
25 | |
import javax.xml.transform.Transformer; |
26 | |
import javax.xml.transform.TransformerException; |
27 | |
import javax.xml.transform.TransformerFactory; |
28 | |
import javax.xml.transform.dom.DOMSource; |
29 | |
import javax.xml.transform.stream.StreamResult; |
30 | |
import javax.xml.xpath.XPath; |
31 | |
import javax.xml.xpath.XPathConstants; |
32 | |
import javax.xml.xpath.XPathExpressionException; |
33 | |
import javax.xml.xpath.XPathFactory; |
34 | |
import java.io.IOException; |
35 | |
import java.io.StringWriter; |
36 | |
import java.util.ArrayList; |
37 | |
import java.util.List; |
38 | |
|
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
public class DomUtils { |
51 | |
|
/** Shared zero-length array, returned by {@code getXPathListForNode} for a {@code null} node. */
private static final String[] EMPTY_STRING_ARRAY = new String[0];

/**
 * XPath evaluator used by every XPath-based lookup in this class.
 * NOTE(review): the JAXP documentation describes XPath objects as not
 * thread-safe, and this single instance is shared by all static methods;
 * confirm that callers never use this class from several threads at once.
 */
private static final XPath xPathEngine = XPathFactory.newInstance().newXPath();

/** Holder of static helpers only; never instantiated. */
private DomUtils() {
}
57 | |
|
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | |
public static int getIndexInParent(Node n) { |
66 | 0 | Node parent = n.getParentNode(); |
67 | 0 | if(parent == null) { |
68 | 0 | return 0; |
69 | |
} |
70 | 0 | NodeList nodes = parent.getChildNodes(); |
71 | 0 | int counter = -1; |
72 | 0 | for(int i = 0; i < nodes.getLength(); i++) { |
73 | 0 | Node current = nodes.item(i); |
74 | 0 | if ( current.getNodeType() == n.getNodeType() && current.getNodeName().equals( n.getNodeName() ) ) { |
75 | 0 | counter++; |
76 | |
} |
77 | 0 | if( current.equals(n) ) { |
78 | 0 | return counter; |
79 | |
} |
80 | |
} |
81 | 0 | throw new IllegalStateException("Cannot find a child within its parent node list."); |
82 | |
} |
83 | |
|
84 | |
|
85 | |
|
86 | |
|
87 | |
|
88 | |
|
89 | |
|
90 | |
|
91 | |
|
92 | |
public static String getXPathForNode(Node node) { |
93 | 0 | final StringBuilder sb = new StringBuilder(); |
94 | 0 | Node parent = node; |
95 | 0 | while(parent != null && parent.getNodeType() != Node.DOCUMENT_NODE) { |
96 | 0 | sb.insert(0, "]"); |
97 | 0 | sb.insert(0, getIndexInParent(parent) + 1); |
98 | 0 | sb.insert(0, "["); |
99 | 0 | sb.insert(0, parent.getNodeName()); |
100 | 0 | sb.insert(0, "/"); |
101 | 0 | parent = parent.getParentNode(); |
102 | |
} |
103 | 0 | return sb.toString(); |
104 | |
} |
105 | |
|
106 | |
|
107 | |
|
108 | |
|
109 | |
|
110 | |
|
111 | |
|
112 | |
|
113 | |
public static String[] getXPathListForNode(Node n) { |
114 | 0 | if(n == null) { |
115 | 0 | return EMPTY_STRING_ARRAY; |
116 | |
} |
117 | 0 | List<String> ancestors = new ArrayList<String>(); |
118 | 0 | ancestors.add( String.format("%s[%s]", n.getNodeName(), getIndexInParent(n) ) ); |
119 | 0 | Node parent = n.getParentNode(); |
120 | 0 | while(parent != null) { |
121 | 0 | ancestors.add(0, String.format("%s[%s]", parent.getNodeName(), getIndexInParent(parent) ) ); |
122 | 0 | parent = parent.getParentNode(); |
123 | |
} |
124 | 0 | return ancestors.toArray( new String[ancestors.size()] ); |
125 | |
} |
126 | |
|
127 | |
|
128 | |
|
129 | |
|
130 | |
|
131 | |
|
132 | |
|
133 | |
|
134 | |
|
135 | |
public static int[] getNodeLocation(Node n) { |
136 | 0 | if(n == null) throw new NullPointerException("node cannot be null."); |
137 | 0 | final TagSoupParser.ElementLocation elementLocation = |
138 | |
(TagSoupParser.ElementLocation) n.getUserData( TagSoupParser.ELEMENT_LOCATION ); |
139 | 0 | if(elementLocation == null) return null; |
140 | 0 | return new int[]{ |
141 | |
elementLocation.getBeginLineNumber(), |
142 | |
elementLocation.getBeginColumnNumber(), |
143 | |
elementLocation.getEndLineNumber(), |
144 | |
elementLocation.getEndColumnNumber() |
145 | |
}; |
146 | |
} |
147 | |
|
148 | |
|
149 | |
|
150 | |
|
151 | |
|
152 | |
|
153 | |
|
154 | |
|
155 | |
|
156 | |
|
157 | |
public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling, boolean strict) { |
158 | 0 | if(candidateAncestor == null) throw new NullPointerException("candidate ancestor cannot be null null."); |
159 | 0 | if(candidateSibling == null) throw new NullPointerException("candidate sibling cannot be null null." ); |
160 | 0 | if(strict && candidateAncestor.equals(candidateSibling)) return false; |
161 | 0 | Node parent = candidateSibling; |
162 | 0 | while(parent != null) { |
163 | 0 | if(parent.equals(candidateAncestor)) return true; |
164 | 0 | parent = parent.getParentNode(); |
165 | |
} |
166 | 0 | return false; |
167 | |
} |
168 | |
|
169 | |
|
170 | |
|
171 | |
|
172 | |
|
173 | |
|
174 | |
|
175 | |
|
176 | |
|
177 | |
|
178 | |
public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling) { |
179 | 0 | return isAncestorOf(candidateAncestor, candidateSibling, false); |
180 | |
} |
181 | |
|
182 | |
|
183 | |
|
184 | |
|
185 | |
|
186 | |
|
187 | |
|
188 | |
|
189 | |
|
190 | |
public static List<Node> findAllByClassName(Node root, String className) { |
191 | 0 | return findAllByTagAndClassName(root, "*", className.toLowerCase()); |
192 | |
} |
193 | |
|
194 | |
|
195 | |
|
196 | |
|
197 | |
|
198 | |
|
199 | |
|
200 | |
|
201 | |
|
202 | |
public static List<Node> findAllByAttributeName(Node root, String attrName) { |
203 | 0 | List<Node> result = new ArrayList<Node>(); |
204 | 0 | for (Node node : findAll(root, String.format("./descendant-or-self::*[@%s]", attrName) ) ) { |
205 | 0 | result.add(node); |
206 | |
} |
207 | 0 | return result; |
208 | |
} |
209 | |
|
210 | |
public static List<Node> findAllByTag(Node root, String tagName) { |
211 | 0 | List<Node> result = new ArrayList<Node>(); |
212 | 0 | for (Node node : findAll(root, "./descendant-or-self::" + tagName)) { |
213 | 0 | result.add(node); |
214 | |
} |
215 | 0 | return result; |
216 | |
} |
217 | |
|
218 | |
public static List<Node> findAllByTagAndClassName(Node root, String tagName, String className) { |
219 | 0 | List<Node> result = new ArrayList<Node>(); |
220 | 0 | for (Node node : findAll( |
221 | |
root, |
222 | |
"./descendant-or-self::" + |
223 | |
tagName + |
224 | |
"[contains(translate(@class,'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'),'" + |
225 | |
className + "')]") |
226 | |
) { |
227 | 0 | if (DomUtils.hasClassName(node, className)) { |
228 | 0 | result.add(node); |
229 | |
} |
230 | |
} |
231 | 0 | return result; |
232 | |
} |
233 | |
|
234 | |
|
235 | |
|
236 | |
|
237 | |
public static Node findNodeById(Node root, String id) { |
238 | |
Node node; |
239 | |
try { |
240 | 0 | String xpath = "//*[@id='" + id + "']"; |
241 | 0 | node = (Node) xPathEngine.evaluate(xpath, root, XPathConstants.NODE); |
242 | 0 | } catch (XPathExpressionException ex) { |
243 | 0 | throw new RuntimeException("Should not happen", ex); |
244 | 0 | } |
245 | 0 | return node; |
246 | |
} |
247 | |
|
248 | |
|
249 | |
|
250 | |
|
251 | |
|
252 | |
public static List<Node> findAll(Node node, String xpath) { |
253 | 0 | if(node == null) { |
254 | 0 | throw new NullPointerException("node cannot be null."); |
255 | |
} |
256 | |
try { |
257 | 0 | NodeList nodes = (NodeList) xPathEngine.evaluate(xpath, node, XPathConstants.NODESET); |
258 | 0 | List<Node> result = new ArrayList<Node>(nodes.getLength()); |
259 | 0 | for (int i = 0; i < nodes.getLength(); i++) { |
260 | 0 | result.add(nodes.item(i)); |
261 | |
} |
262 | 0 | return result; |
263 | 0 | } catch (XPathExpressionException ex) { |
264 | 0 | throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex); |
265 | |
} |
266 | |
} |
267 | |
|
268 | |
|
269 | |
|
270 | |
|
271 | |
public static String find(Node node, String xpath) { |
272 | |
try { |
273 | 0 | String val = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING); |
274 | 0 | if (null == val) |
275 | 0 | return ""; |
276 | 0 | return val; |
277 | 0 | } catch (XPathExpressionException ex) { |
278 | 0 | throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex); |
279 | |
} |
280 | |
} |
281 | |
|
282 | |
|
283 | |
|
284 | |
|
285 | |
|
286 | |
public static boolean hasClassName(Node node, String className) { |
287 | 0 | return hasAttribute(node, "class", className); |
288 | |
} |
289 | |
|
290 | |
|
291 | |
|
292 | |
|
293 | |
|
294 | |
|
295 | |
public static boolean hasAttribute(Node node, String attributeName, String className) { |
296 | |
|
297 | |
|
298 | 0 | String attr = readAttribute(node, attributeName); |
299 | 0 | for (String c : attr.split("\\s+")) |
300 | 0 | if (c.equalsIgnoreCase(className)) |
301 | 0 | return true; |
302 | 0 | return false; |
303 | |
} |
304 | |
|
305 | |
|
306 | |
|
307 | |
|
308 | |
|
309 | |
|
310 | |
|
311 | |
public static boolean hasAttribute(Node node, String attributeName) { |
312 | 0 | return readAttribute(node, attributeName, null) != null; |
313 | |
} |
314 | |
|
315 | |
|
316 | |
|
317 | |
|
318 | |
|
319 | |
|
320 | |
|
321 | |
|
322 | |
public static boolean isElementNode(Node target) { |
323 | 0 | return Node.ELEMENT_NODE == target.getNodeType(); |
324 | |
} |
325 | |
|
326 | |
|
327 | |
|
328 | |
|
329 | |
|
330 | |
|
331 | |
|
332 | |
|
333 | |
|
334 | |
|
335 | |
public static String readAttribute(Node node, String attribute, String defaultValue) { |
336 | 0 | NamedNodeMap attributes = node.getAttributes(); |
337 | 0 | if (null == attributes) |
338 | 0 | return defaultValue; |
339 | 0 | Node attr = attributes.getNamedItem(attribute); |
340 | 0 | if (null==attr) |
341 | 0 | return defaultValue; |
342 | 0 | return attr.getNodeValue(); |
343 | |
} |
344 | |
|
345 | |
|
346 | |
|
347 | |
|
348 | |
|
349 | |
|
350 | |
|
351 | |
|
352 | |
|
353 | |
|
354 | |
public static String readAttributeWithPrefix(Node node, String attributePrefix, String defaultValue) { |
355 | 0 | final NamedNodeMap attributes = node.getAttributes(); |
356 | 0 | if (null == attributes) { |
357 | 0 | return defaultValue; |
358 | |
} |
359 | |
Node attribute; |
360 | 0 | for (int a = 0; a < attributes.getLength(); a++) { |
361 | 0 | attribute = attributes.item(a); |
362 | 0 | if (attribute.getNodeName().startsWith(attributePrefix)) { |
363 | 0 | return attribute.getNodeValue(); |
364 | |
} |
365 | |
} |
366 | 0 | return defaultValue; |
367 | |
} |
368 | |
|
369 | |
|
370 | |
|
371 | |
|
372 | |
|
373 | |
|
374 | |
|
375 | |
|
376 | |
|
377 | |
public static String readAttribute(Node node, String attribute) { |
378 | 0 | return readAttribute(node, attribute, ""); |
379 | |
} |
380 | |
|
381 | |
|
382 | |
|
383 | |
|
384 | |
|
385 | |
|
386 | |
|
387 | |
|
388 | |
|
389 | |
|
390 | |
|
391 | |
|
392 | |
public static String serializeToXML(Node node, boolean indent) throws TransformerException, IOException { |
393 | 0 | final DOMSource domSource = new DOMSource(node); |
394 | 0 | final Transformer transformer = TransformerFactory.newInstance().newTransformer(); |
395 | 0 | transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); |
396 | 0 | transformer.setOutputProperty(OutputKeys.METHOD, "xml"); |
397 | 0 | transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); |
398 | 0 | if(indent) { |
399 | 0 | transformer.setOutputProperty(OutputKeys.INDENT, "yes"); |
400 | 0 | transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4"); |
401 | |
} |
402 | 0 | final StringWriter sw = new StringWriter(); |
403 | 0 | final StreamResult sr = new StreamResult(sw); |
404 | 0 | transformer.transform(domSource, sr); |
405 | 0 | sw.close(); |
406 | 0 | return sw.toString(); |
407 | |
} |
408 | |
|
409 | |
} |