1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.extractor.html; |
19 | |
|
20 | |
import org.apache.any23.extractor.ExtractionException; |
21 | |
import org.apache.any23.rdf.Any23ValueFactoryWrapper; |
22 | |
import org.apache.any23.rdf.RDFUtils; |
23 | |
import org.openrdf.model.URI; |
24 | |
import org.openrdf.model.impl.ValueFactoryImpl; |
25 | |
import org.slf4j.Logger; |
26 | |
import org.slf4j.LoggerFactory; |
27 | |
import org.w3c.dom.NamedNodeMap; |
28 | |
import org.w3c.dom.Node; |
29 | |
import org.w3c.dom.NodeList; |
30 | |
import org.w3c.dom.Text; |
31 | |
|
32 | |
import javax.xml.xpath.XPath; |
33 | |
import javax.xml.xpath.XPathConstants; |
34 | |
import javax.xml.xpath.XPathExpressionException; |
35 | |
import javax.xml.xpath.XPathFactory; |
36 | |
import java.net.URISyntaxException; |
37 | |
import java.util.ArrayList; |
38 | |
import java.util.List; |
39 | |
|
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
public class HTMLDocument { |
48 | |
|
49 | 0 | private final static XPath xPathEngine = XPathFactory.newInstance().newXPath(); |
50 | 0 | private final static Logger log = LoggerFactory.getLogger(HTMLDocument.class); |
51 | |
|
52 | |
private Node document; |
53 | |
private java.net.URI baseURI; |
54 | |
|
55 | 0 | private final Any23ValueFactoryWrapper valueFactory = |
56 | |
new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance()); |
57 | |
|
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | |
|
63 | |
|
64 | |
public static TextField readTextField(Node node) { |
65 | |
TextField result; |
66 | 0 | final String name = node.getNodeName(); |
67 | 0 | final NamedNodeMap attributes = node.getAttributes(); |
68 | |
|
69 | 0 | if (attributes == null ) { |
70 | 0 | return new TextField( node.getTextContent(), node); |
71 | |
} |
72 | |
|
73 | 0 | List<Node> values = DomUtils.findAllByClassName(node, "value"); |
74 | 0 | if (!values.isEmpty()) { |
75 | 0 | String val = ""; |
76 | 0 | for (Node n : values) |
77 | 0 | val += n.getTextContent(); |
78 | 0 | return new TextField( val.trim(), node); |
79 | |
} |
80 | 0 | if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) { |
81 | 0 | result = new TextField(attributes.getNamedItem("title").getNodeValue(), node); |
82 | 0 | } else if ("A".equals(name)) { |
83 | 0 | if (DomUtils.hasAttribute(node, "rel", "tag")) { |
84 | 0 | String href = extractRelTag(attributes); |
85 | 0 | result = new TextField(href, node); |
86 | 0 | } else |
87 | 0 | result = new TextField(node.getTextContent(), node); |
88 | 0 | } else if ("IMG".equals(name) || "AREA".equals(name)) { |
89 | 0 | result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node); |
90 | |
} else { |
91 | 0 | result = new TextField(node.getTextContent(), node); |
92 | |
} |
93 | 0 | return result; |
94 | |
} |
95 | |
|
96 | |
|
97 | |
|
98 | |
|
99 | |
|
100 | |
|
101 | |
|
102 | |
public static void readUrlField(List<TextField> res, Node node) { |
103 | 0 | String name = node.getNodeName(); |
104 | 0 | NamedNodeMap attributes = node.getAttributes(); |
105 | 0 | if (null == attributes) { |
106 | 0 | res.add( new TextField(node.getTextContent(), node) ); |
107 | 0 | return; |
108 | |
} |
109 | 0 | if ("A".equals(name) || "AREA".equals(name)) { |
110 | 0 | Node n = attributes.getNamedItem("href"); |
111 | 0 | res.add( new TextField(n.getNodeValue(), n) ); |
112 | 0 | } else if ("ABBR".equals(name)) { |
113 | 0 | Node n = attributes.getNamedItem("title"); |
114 | 0 | res.add( new TextField(n.getNodeValue(), n) ); |
115 | 0 | } else if ("IMG".equals(name)) { |
116 | 0 | Node n = attributes.getNamedItem("src"); |
117 | 0 | res.add( new TextField(n.getNodeValue(), n) ); |
118 | 0 | } else if ("OBJECT".equals(name)) { |
119 | 0 | Node n = attributes.getNamedItem("data"); |
120 | 0 | res.add( new TextField(n.getNodeValue(), n) ); |
121 | 0 | } else { |
122 | 0 | res.add( new TextField(node.getTextContent().trim(), node) ); |
123 | |
} |
124 | 0 | } |
125 | |
|
126 | |
|
127 | |
|
128 | |
|
129 | |
|
130 | |
|
131 | |
|
132 | |
|
133 | |
public static String extractRelTag(String hrefAttributeContent) { |
134 | 0 | String[] all = hrefAttributeContent.split("[#?]"); |
135 | |
|
136 | 0 | String path = all[0]; |
137 | 0 | int pathLenghtMin1 = path.length() - 1; |
138 | 0 | if( '/' == path.charAt(pathLenghtMin1) ) { |
139 | 0 | path = path.substring(0, pathLenghtMin1); |
140 | |
} |
141 | 0 | return path; |
142 | |
} |
143 | |
|
144 | |
|
145 | |
|
146 | |
|
147 | |
|
148 | |
|
149 | |
|
150 | |
|
151 | |
public static String extractRelTag(NamedNodeMap attributes) { |
152 | 0 | return extractRelTag(attributes.getNamedItem("href").getNodeValue()); |
153 | |
} |
154 | |
|
155 | |
|
156 | |
|
157 | |
|
158 | |
|
159 | |
|
160 | |
|
161 | |
|
162 | |
|
163 | |
|
164 | |
public static String readNodeContent(Node node, boolean prettify) { |
165 | 0 | final String content = node.getTextContent(); |
166 | 0 | return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content; |
167 | |
} |
168 | |
|
169 | |
|
170 | |
|
171 | |
|
172 | |
|
173 | |
|
/**
 * Creates a document wrapper around the given DOM node.
 *
 * @param document root node to wrap.
 * @throws IllegalArgumentException if <code>document</code> is <code>null</code>.
 */
public HTMLDocument(Node document) {
    if (document == null) {
        throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
    }
    this.document = document;
}
179 | |
|
180 | |
|
181 | |
|
182 | |
|
183 | |
|
184 | |
public URI resolveURI(String uri) throws ExtractionException { |
185 | 0 | return valueFactory.resolveURI(uri, getBaseURI()); |
186 | |
} |
187 | |
|
188 | |
public String find(String xpath) { |
189 | 0 | return DomUtils.find(getDocument(), xpath); |
190 | |
} |
191 | |
|
192 | |
public Node findNodeById(String id) { |
193 | 0 | return DomUtils.findNodeById(getDocument(), id); |
194 | |
} |
195 | |
|
196 | |
public List<Node> findAll(String xpath) { |
197 | 0 | return DomUtils.findAll(getDocument(), xpath); |
198 | |
} |
199 | |
|
200 | |
public String findMicroformattedValue( |
201 | |
String objectTag, |
202 | |
String object, |
203 | |
String fieldTag, |
204 | |
String field, |
205 | |
String key |
206 | |
) { |
207 | 0 | Node node = findMicroformattedObjectNode(objectTag, object); |
208 | 0 | if (null == node) |
209 | 0 | return ""; |
210 | |
|
211 | 0 | if (DomUtils.hasClassName(node, field)) |
212 | 0 | return node.getTextContent(); |
213 | |
|
214 | |
|
215 | |
try { |
216 | 0 | String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key; |
217 | 0 | String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING); |
218 | 0 | if (null == value) { |
219 | 0 | return ""; |
220 | |
} |
221 | 0 | return value; |
222 | 0 | } catch (XPathExpressionException ex) { |
223 | 0 | throw new RuntimeException("Should not happen, XPath expression is built locally", ex); |
224 | |
} |
225 | |
|
226 | |
} |
227 | |
|
228 | |
public Node getDocument() { |
229 | 0 | return document; |
230 | |
} |
231 | |
|
232 | |
|
233 | |
|
234 | |
|
235 | |
|
236 | |
|
237 | |
|
238 | |
|
239 | |
public TextField getSingularTextField(String className) { |
240 | 0 | TextField[] res = getPluralTextField(className); |
241 | 0 | if (res.length == 0) |
242 | 0 | return new TextField("", null); |
243 | 0 | return res[0]; |
244 | |
} |
245 | |
|
246 | |
|
247 | |
|
248 | |
|
249 | |
|
250 | |
|
251 | |
|
252 | |
public TextField[] getPluralTextField(String className) { |
253 | 0 | List<TextField> res = new ArrayList<TextField>(); |
254 | 0 | List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className); |
255 | 0 | for (Node node : nodes) { |
256 | 0 | res.add( readTextField(node) ); |
257 | |
} |
258 | 0 | return res.toArray( new TextField[res.size()] ); |
259 | |
} |
260 | |
|
261 | |
|
262 | |
|
263 | |
|
264 | |
|
265 | |
|
266 | |
|
267 | |
|
268 | |
public TextField getSingularUrlField(String className) { |
269 | 0 | TextField[] res = getPluralUrlField(className); |
270 | 0 | if (res.length < 1) |
271 | 0 | return new TextField("", null); |
272 | 0 | return res[0]; |
273 | |
} |
274 | |
|
275 | |
|
276 | |
|
277 | |
|
278 | |
|
279 | |
|
280 | |
|
281 | |
public TextField[] getPluralUrlField(String className) { |
282 | 0 | List<TextField> res = new ArrayList<TextField>(); |
283 | 0 | List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className); |
284 | 0 | for (Node node : nodes) |
285 | 0 | readUrlField(res, node); |
286 | 0 | return res.toArray( new TextField[res.size()] ); |
287 | |
} |
288 | |
|
289 | |
public Node findMicroformattedObjectNode(String objectTag, String name) { |
290 | 0 | List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name); |
291 | 0 | if (nodes.isEmpty()) |
292 | 0 | return null; |
293 | 0 | return nodes.get(0); |
294 | |
} |
295 | |
|
296 | |
|
297 | |
|
298 | |
|
299 | |
|
300 | |
|
301 | |
|
302 | |
|
303 | |
public String readAttribute(String attribute) { |
304 | 0 | return DomUtils.readAttribute(getDocument(), attribute); |
305 | |
} |
306 | |
|
307 | |
|
308 | |
|
309 | |
|
310 | |
|
311 | |
|
312 | |
|
313 | |
public List<Node> findAllByClassName(String clazz) { |
314 | 0 | return DomUtils.findAllByClassName(getDocument(), clazz); |
315 | |
} |
316 | |
|
317 | |
|
318 | |
|
319 | |
|
320 | |
|
321 | |
|
322 | |
|
323 | |
public String getText() { |
324 | 0 | NodeList children = getDocument().getChildNodes(); |
325 | 0 | if(children.getLength() == 1 && children.item(0) instanceof Text) { |
326 | 0 | return children.item(0).getTextContent(); |
327 | |
} |
328 | 0 | return null; |
329 | |
} |
330 | |
|
331 | |
|
332 | |
|
333 | |
|
334 | |
|
335 | |
|
336 | |
public String getDefaultLanguage() { |
337 | 0 | final String xpathLanguageSelector = "/HTML"; |
338 | |
Node html; |
339 | |
try { |
340 | 0 | html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE); |
341 | 0 | } catch (XPathExpressionException xpeee) { |
342 | 0 | throw new IllegalStateException(); |
343 | 0 | } |
344 | 0 | if (html == null) { |
345 | 0 | return null; |
346 | |
} |
347 | 0 | Node langAttribute = html.getAttributes().getNamedItem("xml:lang"); |
348 | 0 | return langAttribute == null ? null : langAttribute.getTextContent(); |
349 | |
} |
350 | |
|
351 | |
|
352 | |
|
353 | |
|
354 | |
|
355 | |
|
356 | |
public String[] getPathToLocalRoot() { |
357 | 0 | return DomUtils.getXPathListForNode(document); |
358 | |
} |
359 | |
|
360 | |
|
361 | |
|
362 | |
|
363 | |
|
364 | |
|
365 | |
public TextField[] extractRelTagNodes() { |
366 | 0 | final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel"); |
367 | 0 | final List<TextField> result = new ArrayList<TextField>(); |
368 | 0 | for(Node relTagNode : relTagNodes) { |
369 | 0 | readUrlField(result, relTagNode); |
370 | |
} |
371 | 0 | return result.toArray( new TextField[result.size()] ); |
372 | |
} |
373 | |
|
374 | |
private java.net.URI getBaseURI() throws ExtractionException { |
375 | 0 | if (baseURI == null) { |
376 | |
try { |
377 | 0 | if (document.getBaseURI() == null) { |
378 | 0 | log.warn("document.getBaseURI() is null, this should not happen"); |
379 | |
} |
380 | 0 | baseURI = new java.net.URI(RDFUtils.fixAbsoluteURI(document.getBaseURI())); |
381 | 0 | } catch (IllegalArgumentException ex) { |
382 | 0 | throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex); |
383 | 0 | } catch (URISyntaxException ex) { |
384 | 0 | throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex); |
385 | 0 | } |
386 | |
} |
387 | 0 | return baseURI; |
388 | |
} |
389 | |
|
390 | |
|
391 | |
|
392 | |
|
393 | |
|
394 | |
public static class TextField { |
395 | |
private String value; |
396 | |
private Node source; |
397 | |
|
398 | 0 | public TextField(String value, Node source) { |
399 | 0 | this.value = value; |
400 | 0 | this.source = source; |
401 | 0 | } |
402 | |
|
403 | |
public String value() { |
404 | 0 | return value; |
405 | |
} |
406 | |
|
407 | |
public Node source() { |
408 | 0 | return source; |
409 | |
} |
410 | |
} |
411 | |
|
412 | |
} |