1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.extractor.microdata; |
19 | |
|
20 | |
import org.apache.any23.extractor.ErrorReporter; |
21 | |
import org.apache.any23.extractor.ExtractionContext; |
22 | |
import org.apache.any23.extractor.ExtractionException; |
23 | |
import org.apache.any23.extractor.ExtractionParameters; |
24 | |
import org.apache.any23.extractor.ExtractionResult; |
25 | |
import org.apache.any23.extractor.Extractor; |
26 | |
import org.apache.any23.extractor.ExtractorDescription; |
27 | |
import org.apache.any23.extractor.ExtractorFactory; |
28 | |
import org.apache.any23.extractor.SimpleExtractorFactory; |
29 | |
import org.apache.any23.extractor.html.DomUtils; |
30 | |
import org.apache.any23.rdf.PopularPrefixes; |
31 | |
import org.apache.any23.rdf.RDFUtils; |
32 | |
import org.apache.any23.vocab.DCTERMS; |
33 | |
import org.apache.any23.vocab.XHTML; |
34 | |
import org.openrdf.model.Literal; |
35 | |
import org.openrdf.model.Resource; |
36 | |
import org.openrdf.model.URI; |
37 | |
import org.openrdf.model.Value; |
38 | |
import org.openrdf.model.vocabulary.RDF; |
39 | |
import org.openrdf.model.vocabulary.XMLSchema; |
40 | |
import org.w3c.dom.Document; |
41 | |
import org.w3c.dom.Node; |
42 | |
import org.w3c.dom.NodeList; |
43 | |
|
44 | |
import java.io.IOException; |
45 | |
import java.net.MalformedURLException; |
46 | |
import java.net.URL; |
47 | |
import java.util.Arrays; |
48 | |
import java.util.Date; |
49 | |
import java.util.HashMap; |
50 | |
import java.util.HashSet; |
51 | |
import java.util.List; |
52 | |
import java.util.Map; |
53 | |
import java.util.Set; |
54 | |
|
55 | |
|
56 | |
|
57 | |
|
58 | |
|
59 | |
|
60 | |
|
61 | |
|
62 | 0 | public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor { |
63 | |
|
64 | 0 | private static final URI MICRODATA_ITEM |
65 | |
= RDFUtils.uri("http://www.w3.org/1999/xhtml/microdata#item"); |
66 | |
|
67 | 0 | public final static ExtractorFactory<MicrodataExtractor> factory = |
68 | |
SimpleExtractorFactory.create( |
69 | |
"html-microdata", |
70 | |
PopularPrefixes.createSubset("rdf", "doac", "foaf"), |
71 | |
Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), |
72 | |
"example-microdata.html", |
73 | |
MicrodataExtractor.class |
74 | |
); |
75 | |
|
76 | |
private String documentLanguage; |
77 | |
|
78 | |
private boolean isStrict; |
79 | |
|
80 | |
private String defaultNamespace; |
81 | |
|
82 | |
public ExtractorDescription getDescription() { |
83 | 0 | return factory; |
84 | |
} |
85 | |
|
86 | |
|
87 | |
|
88 | |
|
89 | |
|
90 | |
|
91 | |
|
92 | |
|
93 | |
public void run( |
94 | |
ExtractionParameters extractionParameters, |
95 | |
ExtractionContext extractionContext, |
96 | |
Document in, |
97 | |
ExtractionResult out |
98 | |
) throws IOException, ExtractionException { |
99 | |
|
100 | 0 | final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in); |
101 | 0 | if(parserReport.getErrors().length > 0) { |
102 | 0 | notifyError(parserReport.getErrors(), out); |
103 | |
} |
104 | 0 | final ItemScope[] itemScopes = parserReport.getDetectedItemScopes(); |
105 | 0 | if (itemScopes.length == 0) { |
106 | 0 | return; |
107 | |
} |
108 | |
|
109 | 0 | isStrict = extractionParameters.getFlag("any23.microdata.strict"); |
110 | 0 | if (!isStrict) { |
111 | 0 | defaultNamespace = extractionParameters.getProperty("any23.microdata.ns.default"); |
112 | |
} |
113 | |
|
114 | 0 | documentLanguage = getDocumentLanguage(in); |
115 | |
|
116 | |
|
117 | |
|
118 | |
|
119 | 0 | final URI documentURI = extractionContext.getDocumentURI(); |
120 | 0 | final Map<ItemScope, Resource> mappings = new HashMap<ItemScope, Resource>(); |
121 | 0 | for (ItemScope itemScope : itemScopes) { |
122 | 0 | Resource subject = processType(itemScope, documentURI, out, mappings); |
123 | 0 | out.writeTriple( |
124 | |
documentURI, |
125 | |
MICRODATA_ITEM, |
126 | |
subject |
127 | |
); |
128 | |
} |
129 | |
|
130 | |
|
131 | |
|
132 | |
|
133 | 0 | processTitle(in, documentURI, out); |
134 | |
|
135 | |
|
136 | |
|
137 | 0 | processHREFElements(in, documentURI, out); |
138 | |
|
139 | |
|
140 | |
|
141 | 0 | processMetaElements(in, documentURI, out); |
142 | |
|
143 | |
|
144 | |
|
145 | |
|
146 | 0 | processCiteElements(in, documentURI, out); |
147 | 0 | } |
148 | |
|
149 | |
|
150 | |
|
151 | |
|
152 | |
|
153 | |
|
154 | |
|
155 | |
private String getDocumentLanguage(Document in) { |
156 | 0 | String lang = DomUtils.find(in, "string(/HTML/@lang)"); |
157 | 0 | if (lang.equals("")) { |
158 | 0 | return null; |
159 | |
} |
160 | 0 | return lang; |
161 | |
} |
162 | |
|
163 | |
|
164 | |
|
165 | |
|
166 | |
|
167 | |
|
168 | |
|
169 | |
|
170 | |
private String getLanguage(Node node) { |
171 | 0 | Node nodeLang = node.getAttributes().getNamedItem("lang"); |
172 | 0 | if (nodeLang == null) { |
173 | |
|
174 | 0 | return documentLanguage; |
175 | |
} |
176 | 0 | return nodeLang.getTextContent(); |
177 | |
} |
178 | |
|
179 | |
|
180 | |
|
181 | |
|
182 | |
|
183 | |
|
184 | |
|
185 | |
|
186 | |
|
187 | |
private void processTitle(Document in, URI documentURI, ExtractionResult out) { |
188 | 0 | NodeList titles = in.getElementsByTagName("title"); |
189 | |
|
190 | 0 | if (titles.getLength() == 1) { |
191 | 0 | Node title = titles.item(0); |
192 | 0 | String titleValue = title.getTextContent(); |
193 | |
Literal object; |
194 | 0 | String lang = getLanguage(title); |
195 | 0 | if (lang == null) { |
196 | |
|
197 | 0 | object = RDFUtils.literal(titleValue); |
198 | |
} else { |
199 | 0 | object = RDFUtils.literal(titleValue, lang); |
200 | |
} |
201 | 0 | out.writeTriple( |
202 | |
documentURI, |
203 | |
DCTERMS.getInstance().title, |
204 | |
object |
205 | |
); |
206 | |
} |
207 | 0 | } |
208 | |
|
209 | |
|
210 | |
|
211 | |
|
212 | |
|
213 | |
|
214 | |
|
215 | |
|
216 | |
|
217 | |
private void processHREFElements(Document in, URI documentURI, ExtractionResult out) { |
218 | 0 | NodeList anchors = in.getElementsByTagName("a"); |
219 | 0 | for (int i = 0; i < anchors.getLength(); i++) { |
220 | 0 | processHREFElement(anchors.item(i), documentURI, out); |
221 | |
} |
222 | 0 | NodeList areas = in.getElementsByTagName("area"); |
223 | 0 | for (int i = 0; i < areas.getLength(); i++) { |
224 | 0 | processHREFElement(areas.item(i), documentURI, out); |
225 | |
} |
226 | 0 | NodeList links = in.getElementsByTagName("link"); |
227 | 0 | for (int i = 0; i < links.getLength(); i++) { |
228 | 0 | processHREFElement(links.item(i), documentURI, out); |
229 | |
} |
230 | 0 | } |
231 | |
|
232 | |
|
233 | |
|
234 | |
|
235 | |
|
236 | |
|
237 | |
|
238 | |
|
239 | |
|
240 | |
private void processHREFElement(Node item, URI documentURI, ExtractionResult out) { |
241 | 0 | Node rel = item.getAttributes().getNamedItem("rel"); |
242 | 0 | if (rel == null) { |
243 | 0 | return; |
244 | |
} |
245 | 0 | Node href = item.getAttributes().getNamedItem("href"); |
246 | 0 | if (href == null) { |
247 | 0 | return; |
248 | |
} |
249 | |
URL absoluteURL; |
250 | 0 | if (!isAbsoluteURL(href.getTextContent())) { |
251 | |
try { |
252 | 0 | absoluteURL = toAbsoluteURL( |
253 | |
documentURI.toString(), |
254 | |
href.getTextContent(), |
255 | |
'/' |
256 | |
); |
257 | 0 | } catch (MalformedURLException e) { |
258 | |
|
259 | 0 | return; |
260 | 0 | } |
261 | |
} else { |
262 | |
try { |
263 | 0 | absoluteURL = new URL(href.getTextContent()); |
264 | 0 | } catch (MalformedURLException e) { |
265 | |
|
266 | 0 | return; |
267 | 0 | } |
268 | |
} |
269 | 0 | String[] relTokens = rel.getTextContent().split(" "); |
270 | 0 | Set<String> tokensWithNoDuplicates = new HashSet<String>(); |
271 | 0 | for (String relToken : relTokens) { |
272 | 0 | if (relToken.contains(":")) { |
273 | |
|
274 | 0 | continue; |
275 | |
} |
276 | 0 | if (relToken.equals("alternate") || relToken.equals("stylesheet")) { |
277 | 0 | tokensWithNoDuplicates.add("ALTERNATE-STYLESHEET"); |
278 | 0 | continue; |
279 | |
} |
280 | 0 | tokensWithNoDuplicates.add(relToken.toLowerCase()); |
281 | |
} |
282 | 0 | for (String token : tokensWithNoDuplicates) { |
283 | |
URI predicate; |
284 | 0 | if (isAbsoluteURL(token)) { |
285 | 0 | predicate = RDFUtils.uri(token); |
286 | |
} else { |
287 | 0 | predicate = RDFUtils.uri(XHTML.NS + token); |
288 | |
} |
289 | 0 | out.writeTriple( |
290 | |
documentURI, |
291 | |
predicate, |
292 | |
RDFUtils.uri(absoluteURL.toString()) |
293 | |
); |
294 | 0 | } |
295 | 0 | } |
296 | |
|
297 | |
|
298 | |
|
299 | |
|
300 | |
|
301 | |
|
302 | |
|
303 | |
|
304 | |
|
305 | |
private void processMetaElements(Document in, URI documentURI, ExtractionResult out) { |
306 | 0 | NodeList metas = in.getElementsByTagName("meta"); |
307 | 0 | for (int i = 0; i < metas.getLength(); i++) { |
308 | 0 | Node meta = metas.item(i); |
309 | 0 | String name = DomUtils.readAttribute(meta, "name" , null); |
310 | 0 | String content = DomUtils.readAttribute(meta, "content", null); |
311 | 0 | if (name != null && content != null) { |
312 | 0 | if (isAbsoluteURL(name)) { |
313 | 0 | processMetaElement( |
314 | |
RDFUtils.uri(name), |
315 | |
content, |
316 | |
getLanguage(meta), |
317 | |
documentURI, |
318 | |
out |
319 | |
); |
320 | |
} else { |
321 | 0 | processMetaElement( |
322 | |
name, |
323 | |
content, |
324 | |
getLanguage(meta), |
325 | |
documentURI, |
326 | |
out |
327 | |
); |
328 | |
} |
329 | |
} |
330 | |
} |
331 | 0 | } |
332 | |
|
333 | |
|
334 | |
|
335 | |
|
336 | |
|
337 | |
|
338 | |
|
339 | |
|
340 | |
|
341 | |
|
342 | |
|
343 | |
private void processMetaElement( |
344 | |
URI uri, |
345 | |
String content, |
346 | |
String language, |
347 | |
URI documentURI, |
348 | |
ExtractionResult out |
349 | |
) { |
350 | 0 | if (content.contains(":")) { |
351 | |
|
352 | 0 | return; |
353 | |
} |
354 | |
Literal subject; |
355 | 0 | if (language == null) { |
356 | |
|
357 | 0 | subject = RDFUtils.literal(content); |
358 | |
} else { |
359 | 0 | subject = RDFUtils.literal(content, language); |
360 | |
} |
361 | 0 | out.writeTriple( |
362 | |
documentURI, |
363 | |
uri, |
364 | |
subject |
365 | |
); |
366 | 0 | } |
367 | |
|
368 | |
|
369 | |
|
370 | |
|
371 | |
|
372 | |
|
373 | |
|
374 | |
|
375 | |
|
376 | |
|
377 | |
|
378 | |
private void processMetaElement( |
379 | |
String name, |
380 | |
String content, |
381 | |
String language, |
382 | |
URI documentURI, |
383 | |
ExtractionResult out) { |
384 | |
Literal subject; |
385 | 0 | if (language == null) { |
386 | |
|
387 | 0 | subject = RDFUtils.literal(content); |
388 | |
} else { |
389 | 0 | subject = RDFUtils.literal(content, language); |
390 | |
} |
391 | 0 | out.writeTriple( |
392 | |
documentURI, |
393 | |
RDFUtils.uri(XHTML.NS + name.toLowerCase()), |
394 | |
subject |
395 | |
); |
396 | 0 | } |
397 | |
|
398 | |
|
399 | |
|
400 | |
|
401 | |
|
402 | |
|
403 | |
|
404 | |
|
405 | |
|
406 | |
private void processCiteElements(Document in, URI documentURI, ExtractionResult out) { |
407 | 0 | NodeList blockQuotes = in.getElementsByTagName("blockquote"); |
408 | 0 | for (int i = 0; i < blockQuotes.getLength(); i++) { |
409 | 0 | processCiteElement(blockQuotes.item(i), documentURI, out); |
410 | |
} |
411 | 0 | NodeList quotes = in.getElementsByTagName("q"); |
412 | 0 | for (int i = 0; i < quotes.getLength(); i++) { |
413 | 0 | processCiteElement(quotes.item(i), documentURI, out); |
414 | |
} |
415 | 0 | } |
416 | |
|
417 | |
private void processCiteElement(Node item, URI documentURI, ExtractionResult out) { |
418 | 0 | if (item.getAttributes().getNamedItem("cite") != null) { |
419 | 0 | out.writeTriple( |
420 | |
documentURI, |
421 | |
DCTERMS.getInstance().source, |
422 | |
RDFUtils.uri(item.getAttributes().getNamedItem("cite").getTextContent()) |
423 | |
); |
424 | |
} |
425 | 0 | } |
426 | |
|
427 | |
|
428 | |
|
429 | |
|
430 | |
|
431 | |
|
432 | |
|
433 | |
|
434 | |
|
435 | |
|
436 | |
|
437 | |
|
438 | |
|
439 | |
private Resource processType( |
440 | |
ItemScope itemScope, |
441 | |
URI documentURI, ExtractionResult out, |
442 | |
Map<ItemScope, Resource> mappings |
443 | |
) throws ExtractionException { |
444 | |
Resource subject; |
445 | 0 | if (mappings.containsKey(itemScope)) { |
446 | 0 | subject = mappings.get(itemScope); |
447 | 0 | } else if (isAbsoluteURL(itemScope.getItemId())) { |
448 | 0 | subject = RDFUtils.uri(itemScope.getItemId()); |
449 | |
} else { |
450 | 0 | subject = RDFUtils.getBNode(Integer.toString(itemScope.hashCode())); |
451 | |
} |
452 | 0 | mappings.put(itemScope, subject); |
453 | |
|
454 | |
|
455 | 0 | String itemScopeType = ""; |
456 | 0 | if (itemScope.getType() != null) { |
457 | |
String itemType; |
458 | 0 | itemType = itemScope.getType().toString(); |
459 | 0 | out.writeTriple(subject, RDF.TYPE, RDFUtils.uri(itemType)); |
460 | 0 | itemScopeType = itemScope.getType().toString(); |
461 | |
} |
462 | 0 | for (String propName : itemScope.getProperties().keySet()) { |
463 | 0 | List<ItemProp> itemProps = itemScope.getProperties().get(propName); |
464 | 0 | for (ItemProp itemProp : itemProps) { |
465 | |
try { |
466 | 0 | processProperty( |
467 | |
subject, |
468 | |
propName, |
469 | |
itemProp, |
470 | |
itemScopeType, |
471 | |
documentURI, |
472 | |
mappings, |
473 | |
out |
474 | |
); |
475 | 0 | } catch (MalformedURLException e) { |
476 | 0 | throw new ExtractionException( |
477 | |
"Error while processing on subject '" + subject + |
478 | |
"' the itemProp: '" + itemProp + "' " |
479 | |
); |
480 | 0 | } |
481 | |
} |
482 | 0 | } |
483 | 0 | return subject; |
484 | |
} |
485 | |
|
486 | |
private void processProperty( |
487 | |
Resource subject, |
488 | |
String propName, |
489 | |
ItemProp itemProp, |
490 | |
String itemScopeType, |
491 | |
URI documentURI, |
492 | |
Map<ItemScope, Resource> mappings, |
493 | |
ExtractionResult out |
494 | |
) throws MalformedURLException, ExtractionException { |
495 | |
URI predicate; |
496 | 0 | if (!isAbsoluteURL(propName) && itemScopeType.equals("") && isStrict) { |
497 | 0 | return; |
498 | 0 | } else if (!isAbsoluteURL(propName) && itemScopeType.equals("") && !isStrict) { |
499 | 0 | predicate = RDFUtils.uri( |
500 | |
toAbsoluteURL( |
501 | |
defaultNamespace, |
502 | |
propName, |
503 | |
'/' |
504 | |
).toString() |
505 | |
); |
506 | |
} else { |
507 | 0 | predicate = RDFUtils.uri( |
508 | |
toAbsoluteURL( |
509 | |
itemScopeType, |
510 | |
propName, |
511 | |
'/' |
512 | |
).toString()); |
513 | |
} |
514 | |
Value value; |
515 | 0 | Object propValue = itemProp.getValue().getContent(); |
516 | 0 | ItemPropValue.Type propType = itemProp.getValue().getType(); |
517 | 0 | if (propType.equals(ItemPropValue.Type.Nested)) { |
518 | 0 | value = processType((ItemScope) propValue, documentURI, out, mappings); |
519 | 0 | } else if (propType.equals(ItemPropValue.Type.Plain)) { |
520 | 0 | value = RDFUtils.literal((String) propValue, documentLanguage); |
521 | 0 | } else if (propType.equals(ItemPropValue.Type.Link)) { |
522 | 0 | value = RDFUtils.uri( |
523 | |
toAbsoluteURL( |
524 | |
documentURI.toString(), |
525 | |
(String) propValue, |
526 | |
'/' |
527 | |
).toString() |
528 | |
); |
529 | 0 | } else if (propType.equals(ItemPropValue.Type.Date)) { |
530 | 0 | value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE); |
531 | |
} else { |
532 | 0 | throw new RuntimeException("Invalid Type '" + |
533 | |
propType + "' for ItemPropValue with name: '" + propName + "'"); |
534 | |
} |
535 | 0 | out.writeTriple(subject, predicate, value); |
536 | 0 | } |
537 | |
|
538 | |
private boolean isAbsoluteURL(String urlString) { |
539 | 0 | boolean result = false; |
540 | |
try { |
541 | 0 | URL url = new URL(urlString); |
542 | 0 | String protocol = url.getProtocol(); |
543 | 0 | if (protocol != null && protocol.trim().length() > 0) |
544 | 0 | result = true; |
545 | 0 | } catch (MalformedURLException e) { |
546 | 0 | return false; |
547 | 0 | } |
548 | 0 | return result; |
549 | |
} |
550 | |
|
551 | |
private URL toAbsoluteURL(String ns, String part, char trailing) |
552 | |
throws MalformedURLException { |
553 | 0 | if (isAbsoluteURL(part)) { |
554 | 0 | return new URL(part); |
555 | |
} |
556 | 0 | char lastChar = ns.charAt(ns.length() - 1); |
557 | 0 | if (lastChar == '#' || lastChar == '/') |
558 | 0 | return new URL(ns + part); |
559 | 0 | return new URL(ns + trailing + part); |
560 | |
} |
561 | |
|
562 | |
private void notifyError(MicrodataParserException[] errors, ExtractionResult out) { |
563 | 0 | for(MicrodataParserException mpe : errors) { |
564 | 0 | out.notifyError( |
565 | |
ErrorReporter.ErrorLevel.ERROR, |
566 | |
mpe.toJSON(), |
567 | |
mpe.getErrorLocationBeginRow() , |
568 | |
mpe.getErrorLocationBeginCol() |
569 | |
); |
570 | |
} |
571 | 0 | } |
572 | |
|
573 | |
} |