1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.extractor.rdfa; |
19 | |
|
20 | |
import org.apache.any23.extractor.ErrorReporter; |
21 | |
import org.apache.any23.extractor.ExtractionResult; |
22 | |
import org.apache.any23.extractor.html.DomUtils; |
23 | |
import org.apache.any23.rdf.RDFUtils; |
24 | |
import org.openrdf.model.Literal; |
25 | |
import org.openrdf.model.Resource; |
26 | |
import org.openrdf.model.URI; |
27 | |
import org.openrdf.model.Value; |
28 | |
import org.openrdf.model.vocabulary.RDF; |
29 | |
import org.slf4j.Logger; |
30 | |
import org.slf4j.LoggerFactory; |
31 | |
import org.w3c.dom.Document; |
32 | |
import org.w3c.dom.NamedNodeMap; |
33 | |
import org.w3c.dom.Node; |
34 | |
import org.w3c.dom.NodeList; |
35 | |
|
36 | |
import javax.xml.transform.TransformerException; |
37 | |
import java.io.IOException; |
38 | |
import java.net.MalformedURLException; |
39 | |
import java.net.URISyntaxException; |
40 | |
import java.net.URL; |
41 | |
import java.util.ArrayList; |
42 | |
import java.util.HashMap; |
43 | |
import java.util.List; |
44 | |
import java.util.Map; |
45 | |
import java.util.Stack; |
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
|
51 | |
|
52 | |
|
53 | 0 | public class RDFa11Parser { |
54 | |
|
55 | 0 | private static final Logger logger = LoggerFactory.getLogger(RDFa11Parser.class); |
56 | |
|
57 | |
public static final String CURIE_SEPARATOR = ":"; |
58 | |
public static final char URI_PREFIX_SEPARATOR = ':'; |
59 | |
public static final String URI_SCHEMA_SEPARATOR = "://"; |
60 | |
public static final String URI_PATH_SEPARATOR = "/"; |
61 | |
|
62 | |
public static final String HEAD_TAG = "HEAD"; |
63 | |
public static final String BODY_TAG = "BODY"; |
64 | |
|
65 | |
public static final String XMLNS_ATTRIBUTE = "xmlns"; |
66 | |
public static final String XML_LANG_ATTRIBUTE = "xml:lang"; |
67 | |
|
68 | |
public static final String REL_ATTRIBUTE = "rel"; |
69 | |
public static final String REV_ATTRIBUTE = "rev"; |
70 | |
|
71 | |
public static final String ABOUT_ATTRIBUTE = "about"; |
72 | |
public static final String RESOURCE_ATTRIBUTE = "resource"; |
73 | |
public static final String SRC_ATTRIBUTE = "src"; |
74 | |
public static final String HREF_ATTRIBUTE = "href"; |
75 | |
|
76 | 0 | public static final String[] SUBJECT_ATTRIBUTES = { |
77 | |
ABOUT_ATTRIBUTE, |
78 | |
SRC_ATTRIBUTE, |
79 | |
RESOURCE_ATTRIBUTE, |
80 | |
HREF_ATTRIBUTE |
81 | |
}; |
82 | |
|
83 | |
public static final String PREFIX_ATTRIBUTE = "prefix"; |
84 | |
public static final String TYPEOF_ATTRIBUTE = "typeof"; |
85 | |
public static final String PROPERTY_ATTRIBUTE = "property"; |
86 | |
public static final String DATATYPE_ATTRIBUTE = "datatype"; |
87 | |
public static final String CONTENT_ATTRIBUTE = "content"; |
88 | |
public static final String VOCAB_ATTRIBUTE = "vocab"; |
89 | |
|
90 | |
public static final String PROFILE_ATTRIBUTE = "profile"; |
91 | |
|
92 | |
public static final String XML_LITERAL_DATATYPE = "rdf:XMLLiteral"; |
93 | |
|
94 | |
public static final String XMLNS_DEFAULT = "http://www.w3.org/1999/xhtml"; |
95 | |
|
96 | |
private ErrorReporter errorReporter; |
97 | |
|
98 | |
private URL documentBase; |
99 | |
|
100 | 0 | private final Stack<URIMapping> uriMappingStack = new Stack<URIMapping>(); |
101 | |
|
102 | 0 | private final Stack<Vocabulary> vocabularyStack = new Stack<Vocabulary>(); |
103 | |
|
104 | 0 | private final List<IncompleteTriple> listOfIncompleteTriples = new ArrayList<IncompleteTriple>(); |
105 | |
|
106 | 0 | private final Stack<EvaluationContext> evaluationContextStack = new Stack<EvaluationContext>(); |
107 | |
|
108 | |
protected static URL getDocumentBase(URL documentURL, Document document) throws MalformedURLException { |
109 | |
String base; |
110 | 0 | base = DomUtils.find(document, "/HTML/HEAD/BASE/@href"); |
111 | 0 | if( ! "".equals(base) ) return new URL(base); |
112 | 0 | base = DomUtils.find(document, "//*/h:head/h:base[position()=1]/@href"); |
113 | 0 | if( ! "".equals(base) ) return new URL(base); |
114 | 0 | return documentURL; |
115 | |
} |
116 | |
|
117 | |
|
118 | |
|
119 | |
|
120 | |
|
121 | |
|
122 | |
|
123 | |
|
124 | |
protected static String[] extractPrefixSections(String prefixesDeclaration) { |
125 | 0 | final String[] parts = prefixesDeclaration.split("\\s"); |
126 | 0 | final List<String> out = new ArrayList<String>(); |
127 | 0 | int i = 0; |
128 | 0 | while(i < parts.length) { |
129 | 0 | final String part = parts[i]; |
130 | 0 | if(part.length() == 0) { |
131 | 0 | i++; |
132 | 0 | continue; |
133 | |
} |
134 | 0 | if(part.charAt( part.length() -1 ) == URI_PREFIX_SEPARATOR) { |
135 | 0 | i++; |
136 | 0 | while(i < parts.length && parts[i].length() == 0) i++; |
137 | 0 | out.add( part + (i < parts.length ? parts[i] : "") ); |
138 | 0 | i++; |
139 | |
} else { |
140 | 0 | out.add(parts[i]); |
141 | 0 | i++; |
142 | |
} |
143 | 0 | } |
144 | 0 | return out.toArray( new String[out.size()] ); |
145 | |
} |
146 | |
|
147 | |
protected static boolean isAbsoluteURI(String uri) { |
148 | 0 | return uri.contains(URI_SCHEMA_SEPARATOR); |
149 | |
} |
150 | |
|
151 | |
protected static boolean isCURIE(String curie) { |
152 | 0 | if(curie == null) { |
153 | 0 | throw new NullPointerException("curie string cannot be null."); |
154 | |
} |
155 | 0 | if(curie.trim().length() == 0) return false; |
156 | |
|
157 | |
|
158 | 0 | if( curie.charAt(0) != '[' || curie.charAt(curie.length() -1) != ']') return false; |
159 | 0 | int separatorIndex = curie.indexOf(CURIE_SEPARATOR); |
160 | 0 | return separatorIndex > 0 && curie.indexOf(CURIE_SEPARATOR, separatorIndex + 1) == -1; |
161 | |
} |
162 | |
|
163 | |
protected static boolean isCURIEBNode(String curie) { |
164 | 0 | return isCURIE(curie) && curie.substring(1, curie.length() -1).split(CURIE_SEPARATOR)[0].equals("_"); |
165 | |
} |
166 | |
|
167 | |
protected static boolean isRelativeNode(Node node) { |
168 | 0 | return DomUtils.hasAttribute(node, REL_ATTRIBUTE) || DomUtils.hasAttribute(node, REV_ATTRIBUTE); |
169 | |
} |
170 | |
|
171 | |
|
172 | |
protected static Literal getAsPlainLiteral(Node node, String currentLanguage) { |
173 | 0 | final String content = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null); |
174 | 0 | if(content != null) return RDFUtils.literal(content, currentLanguage); |
175 | |
|
176 | 0 | if(! node.hasChildNodes() ) return RDFUtils.literal("", currentLanguage); |
177 | |
|
178 | 0 | final String nodeTextContent = node.getTextContent(); |
179 | 0 | return nodeTextContent == null ? null : RDFUtils.literal(nodeTextContent.trim(), currentLanguage); |
180 | |
} |
181 | |
|
182 | |
protected static Literal getAsXMLLiteral(Node node) throws IOException, TransformerException { |
183 | 0 | final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null); |
184 | 0 | if(! XML_LITERAL_DATATYPE.equals(datatype)) return null; |
185 | |
|
186 | 0 | final String xmlSerializedNode = DomUtils.serializeToXML(node, false); |
187 | 0 | return RDFUtils.literal(xmlSerializedNode, RDF.XMLLITERAL); |
188 | |
} |
189 | |
|
190 | |
protected static boolean isXMLNSDeclared(Document document) { |
191 | 0 | final String attributeValue = document.getDocumentElement().getAttribute(XMLNS_ATTRIBUTE); |
192 | 0 | if(attributeValue.length() == 0) return false; |
193 | 0 | return XMLNS_DEFAULT.equals(attributeValue); |
194 | |
} |
195 | |
|
196 | 0 | public RDFa11Parser() {} |
197 | |
|
198 | |
|
199 | |
|
200 | |
|
201 | |
|
202 | |
|
203 | |
|
204 | |
|
205 | |
public void processDocument(URL documentURL, Document document, ExtractionResult extractionResult) |
206 | |
throws RDFa11ParserException { |
207 | |
try { |
208 | 0 | this.errorReporter = extractionResult; |
209 | |
|
210 | |
|
211 | 0 | if( ! isXMLNSDeclared(document)) { |
212 | 0 | reportError( |
213 | |
document.getDocumentElement(), |
214 | |
String.format( |
215 | |
"The default %s namespace is expected to be declared and equal to '%s' .", |
216 | |
XMLNS_ATTRIBUTE, XMLNS_DEFAULT |
217 | |
) |
218 | |
); |
219 | |
} |
220 | |
|
221 | |
try { |
222 | 0 | documentBase = getDocumentBase(documentURL, document); |
223 | 0 | } catch (MalformedURLException murle) { |
224 | 0 | throw new RDFa11ParserException("Invalid document base URL.", murle); |
225 | 0 | } |
226 | |
|
227 | |
|
228 | 0 | pushContext(document, new EvaluationContext(documentBase)); |
229 | |
|
230 | 0 | depthFirstNode(document, extractionResult); |
231 | |
|
232 | |
assert listOfIncompleteTriples.isEmpty() |
233 | |
: |
234 | 0 | "The list of incomplete triples is expected to be empty at the end of processing."; |
235 | |
} finally { |
236 | 0 | reset(); |
237 | 0 | } |
238 | 0 | } |
239 | |
|
240 | |
|
241 | |
|
242 | |
|
243 | |
public void reset() { |
244 | 0 | errorReporter = null; |
245 | 0 | documentBase = null; |
246 | 0 | uriMappingStack.clear(); |
247 | 0 | listOfIncompleteTriples.clear(); |
248 | 0 | evaluationContextStack.clear(); |
249 | 0 | } |
250 | |
|
251 | |
|
252 | |
|
253 | |
|
254 | |
|
255 | |
|
256 | |
protected void updateVocabulary(Node currentNode) { |
257 | 0 | final String vocabularyStr = DomUtils.readAttribute(currentNode, VOCAB_ATTRIBUTE, null); |
258 | 0 | if(vocabularyStr == null) return; |
259 | |
try { |
260 | 0 | pushVocabulary(currentNode, RDFUtils.uri(vocabularyStr)); |
261 | 0 | } catch (Exception e) { |
262 | 0 | reportError(currentNode, String.format("Invalid vocabulary [%s], must be a URI.", vocabularyStr)); |
263 | 0 | } |
264 | 0 | } |
265 | |
|
266 | |
|
267 | |
|
268 | |
|
269 | |
|
270 | |
|
271 | |
protected void updateURIMapping(Node node) { |
272 | 0 | final NamedNodeMap attributes = node.getAttributes(); |
273 | 0 | if (null == attributes) return; |
274 | |
|
275 | |
Node attribute; |
276 | 0 | final List<PrefixMap> prefixMapList = new ArrayList<PrefixMap>(); |
277 | 0 | final String namespacePrefix = XMLNS_ATTRIBUTE + URI_PREFIX_SEPARATOR; |
278 | 0 | for (int a = 0; a < attributes.getLength(); a++) { |
279 | 0 | attribute = attributes.item(a); |
280 | 0 | if (attribute.getNodeName().startsWith(namespacePrefix)) { |
281 | 0 | prefixMapList.add( |
282 | |
new PrefixMap( |
283 | |
attribute.getNodeName().substring(namespacePrefix.length()), |
284 | |
resolveURI(attribute.getNodeValue()) |
285 | |
) |
286 | |
); |
287 | |
} |
288 | |
} |
289 | |
|
290 | 0 | extractPrefixes(node, prefixMapList); |
291 | |
|
292 | 0 | if(prefixMapList.size() == 0) return; |
293 | 0 | pushMappings( |
294 | |
node, |
295 | |
prefixMapList |
296 | |
); |
297 | 0 | } |
298 | |
|
299 | |
|
300 | |
|
301 | |
|
302 | |
|
303 | |
|
304 | |
|
305 | |
protected URI getMapping(String prefix) { |
306 | 0 | for (URIMapping uriMapping : uriMappingStack) { |
307 | 0 | final URI mapping = uriMapping.map.get(prefix); |
308 | 0 | if (mapping != null) { |
309 | 0 | return mapping; |
310 | |
} |
311 | 0 | } |
312 | 0 | return null; |
313 | |
} |
314 | |
|
315 | |
|
316 | |
|
317 | |
|
318 | |
|
319 | |
|
320 | |
|
321 | |
|
322 | |
|
323 | |
protected URI[] resolveCurieOrURIList(Node n, String curieOrURIList, boolean termAllowed) |
324 | |
throws URISyntaxException { |
325 | 0 | if(curieOrURIList == null || curieOrURIList.trim().length() == 0) return new URI[0]; |
326 | |
|
327 | 0 | final String[] curieOrURIListParts = curieOrURIList.split("\\s"); |
328 | 0 | final List<URI> result = new ArrayList<URI>(); |
329 | |
Resource curieOrURI; |
330 | 0 | for(String curieORURIListPart : curieOrURIListParts) { |
331 | 0 | curieOrURI = resolveCURIEOrURI(curieORURIListPart, termAllowed); |
332 | 0 | if(curieOrURI != null && curieOrURI instanceof URI) { |
333 | 0 | result.add((URI) curieOrURI); |
334 | |
} else { |
335 | 0 | reportError(n, String.format("Invalid CURIE '%s' : expected URI, found BNode.", curieORURIListPart)); |
336 | |
} |
337 | |
} |
338 | 0 | return result.toArray(new URI[result.size()]); |
339 | |
} |
340 | |
|
341 | |
|
342 | |
|
343 | |
|
344 | |
|
345 | |
|
346 | |
|
347 | |
protected URI resolveURI(String uriStr) { |
348 | 0 | return RDFUtils.uri(uriStr); |
349 | |
} |
350 | |
|
351 | |
|
352 | |
|
353 | |
|
354 | |
|
355 | |
|
356 | |
|
357 | |
|
358 | |
protected Resource resolveCURIEOrURI(String curieOrURI, boolean termAllowed) { |
359 | 0 | if( isCURIE(curieOrURI) ) { |
360 | 0 | return resolveNamespacedURI(curieOrURI.substring(1, curieOrURI.length() - 1), ResolutionPolicy.NSRequired); |
361 | |
} |
362 | 0 | if(isAbsoluteURI(curieOrURI)) return resolveURI(curieOrURI); |
363 | 0 | return resolveNamespacedURI( |
364 | |
curieOrURI, |
365 | |
termAllowed ? ResolutionPolicy.TermAllowed : ResolutionPolicy.NSNotRequired |
366 | |
); |
367 | |
} |
368 | |
|
369 | |
|
370 | |
|
371 | |
|
372 | |
|
373 | |
|
374 | |
|
375 | |
private void pushContext(Node current, EvaluationContext ec) { |
376 | 0 | ec.node = current; |
377 | 0 | evaluationContextStack.push(ec); |
378 | 0 | } |
379 | |
|
380 | |
|
381 | |
|
382 | |
|
383 | |
private EvaluationContext getContext() { |
384 | 0 | return evaluationContextStack.peek(); |
385 | |
} |
386 | |
|
387 | |
|
388 | |
|
389 | |
|
390 | |
|
391 | |
|
392 | |
private void popContext(Node current) { |
393 | 0 | final Node peekNode = evaluationContextStack.peek().node; |
394 | 0 | if(DomUtils.isAncestorOf(peekNode, current)) { |
395 | 0 | evaluationContextStack.pop(); |
396 | |
} |
397 | 0 | } |
398 | |
|
399 | |
|
400 | |
|
401 | |
|
402 | |
|
403 | |
|
404 | |
|
405 | |
private void pushVocabulary(Node currentNode, URI vocab) { |
406 | 0 | vocabularyStack.push( new Vocabulary(currentNode, vocab) ); |
407 | 0 | } |
408 | |
|
409 | |
|
410 | |
|
411 | |
|
412 | |
private URI getVocabulary() { |
413 | 0 | if(vocabularyStack.isEmpty()) return null; |
414 | 0 | return vocabularyStack.peek().prefix; |
415 | |
} |
416 | |
|
417 | |
|
418 | |
|
419 | |
|
420 | |
|
421 | |
|
422 | |
private void popVocabulary(Node current) { |
423 | 0 | if(vocabularyStack.isEmpty()) return; |
424 | 0 | if(DomUtils.isAncestorOf(current, vocabularyStack.peek().originatingNode)) { |
425 | 0 | vocabularyStack.pop(); |
426 | |
} |
427 | 0 | } |
428 | |
|
429 | |
|
430 | |
|
431 | |
|
432 | |
|
433 | |
|
434 | |
private void purgeIncompleteTriples(Node current) { |
435 | 0 | final List<IncompleteTriple> toBePurged = new ArrayList<IncompleteTriple>(); |
436 | 0 | for(IncompleteTriple incompleteTriple : listOfIncompleteTriples) { |
437 | 0 | if( DomUtils.isAncestorOf(current, incompleteTriple.originatingNode, true) ) { |
438 | 0 | toBePurged.add(incompleteTriple); |
439 | |
} |
440 | |
} |
441 | 0 | listOfIncompleteTriples.removeAll(toBePurged); |
442 | 0 | toBePurged.clear(); |
443 | 0 | } |
444 | |
|
445 | |
|
446 | |
|
447 | |
|
448 | |
|
449 | |
|
450 | |
|
451 | |
private void reportError(Node n, String msg) { |
452 | 0 | final String errorMsg = String.format( |
453 | |
"Error while processing node [%s] : '%s'", |
454 | |
DomUtils.getXPathForNode(n), msg |
455 | |
); |
456 | 0 | final int[] errorLocation = DomUtils.getNodeLocation(n); |
457 | 0 | this.errorReporter.notifyError( |
458 | |
ErrorReporter.ErrorLevel.WARN, |
459 | |
errorMsg, |
460 | |
errorLocation == null ? -1 : errorLocation[0], |
461 | |
errorLocation == null ? -1 : errorLocation[1] |
462 | |
); |
463 | 0 | } |
464 | |
|
465 | |
|
466 | |
|
467 | |
|
468 | |
|
469 | |
|
470 | |
|
471 | |
private void depthFirstNode(Node node, ExtractionResult extractionResult) { |
472 | |
try { |
473 | 0 | processNode(node, extractionResult); |
474 | 0 | } catch (Exception e) { |
475 | 0 | if(logger.isDebugEnabled()) logger.debug("Error while processing node.", e); |
476 | 0 | reportError(node, e.getMessage()); |
477 | |
|
478 | 0 | } |
479 | 0 | depthFirstChildren(node.getChildNodes(), extractionResult); |
480 | 0 | purgeIncompleteTriples(node); |
481 | 0 | } |
482 | |
|
483 | |
|
484 | |
|
485 | |
|
486 | |
|
487 | |
|
488 | |
|
489 | |
private void depthFirstChildren(NodeList nodeList, ExtractionResult extractionResult) { |
490 | 0 | for(int i = 0; i < nodeList.getLength(); i++) { |
491 | 0 | final Node child = nodeList.item(i); |
492 | 0 | depthFirstNode(child, extractionResult); |
493 | 0 | popMappings(child); |
494 | 0 | popVocabulary(child); |
495 | 0 | popContext(child); |
496 | |
} |
497 | 0 | } |
498 | |
|
499 | |
|
500 | |
|
501 | |
|
502 | |
|
503 | |
|
504 | |
|
505 | |
|
506 | |
|
507 | |
private void writeTriple(Resource s, URI p, Value o, ExtractionResult extractionResult) { |
508 | 0 | if(logger.isTraceEnabled()) logger.trace(String.format("writeTriple(%s %s %s)" , s, p, o)); |
509 | 0 | assert s != null : "subject is null."; |
510 | 0 | assert p != null : "predicate is null."; |
511 | 0 | assert o != null : "object is null."; |
512 | 0 | extractionResult.writeTriple(s, p, o); |
513 | 0 | } |
514 | |
|
515 | |
|
516 | |
|
517 | |
|
518 | |
|
519 | |
|
520 | |
|
521 | |
|
522 | |
|
523 | |
|
524 | |
|
525 | |
|
526 | |
private void processNode(Node currentElement, ExtractionResult extractionResult) throws Exception { |
527 | 0 | if(logger.isTraceEnabled()) logger.trace("processNode(" + DomUtils.getXPathForNode(currentElement) + ")"); |
528 | 0 | final EvaluationContext currentEvaluationContext = getContext(); |
529 | |
try { |
530 | 0 | if( |
531 | |
currentElement.getNodeType() != Node.DOCUMENT_NODE |
532 | |
&& |
533 | |
currentElement.getNodeType() != Node.ELEMENT_NODE |
534 | |
) return; |
535 | |
|
536 | |
|
537 | 0 | updateVocabulary(currentElement); |
538 | |
|
539 | |
|
540 | |
|
541 | 0 | updateURIMapping(currentElement); |
542 | |
|
543 | |
|
544 | 0 | updateLanguage(currentElement, currentEvaluationContext); |
545 | |
|
546 | 0 | if(! isRelativeNode(currentElement)) { |
547 | |
|
548 | 0 | establishNewSubject(currentElement, currentEvaluationContext); |
549 | |
} else { |
550 | |
|
551 | 0 | establishNewSubjectCurrentObjectResource( |
552 | |
currentElement, |
553 | |
currentEvaluationContext |
554 | |
); |
555 | |
} |
556 | |
|
557 | |
|
558 | |
|
559 | |
|
560 | |
|
561 | |
|
562 | |
|
563 | 0 | if(currentEvaluationContext.newSubject == null) return; |
564 | 0 | if(logger.isDebugEnabled()) logger.debug("newSubject: " + currentEvaluationContext.newSubject); |
565 | |
|
566 | |
|
567 | 0 | final URI[] types = getTypes(currentElement); |
568 | 0 | for(URI type : types) { |
569 | 0 | writeTriple(currentEvaluationContext.newSubject, RDF.TYPE, type, extractionResult); |
570 | |
} |
571 | |
|
572 | |
|
573 | 0 | final URI[] rels = getRels(currentElement); |
574 | 0 | final URI[] revs = getRevs(currentElement); |
575 | 0 | if(currentEvaluationContext.currentObjectResource != null) { |
576 | 0 | for (URI rel : rels) { |
577 | 0 | writeTriple( |
578 | |
currentEvaluationContext.newSubject, |
579 | |
rel, |
580 | |
currentEvaluationContext.currentObjectResource, |
581 | |
extractionResult |
582 | |
); |
583 | |
} |
584 | 0 | for (URI rev : revs) { |
585 | 0 | writeTriple( |
586 | |
currentEvaluationContext.currentObjectResource, |
587 | |
rev, |
588 | |
currentEvaluationContext.newSubject, extractionResult |
589 | |
); |
590 | |
} |
591 | |
} else { |
592 | 0 | for(URI rel : rels) { |
593 | 0 | listOfIncompleteTriples.add( |
594 | |
new IncompleteTriple( |
595 | |
currentElement, |
596 | |
currentEvaluationContext.newSubject, |
597 | |
rel, |
598 | |
IncompleteTripleDirection.Forward |
599 | |
) |
600 | |
); |
601 | |
} |
602 | 0 | for(URI rev : revs) { |
603 | 0 | listOfIncompleteTriples.add( |
604 | |
new IncompleteTriple( |
605 | |
currentElement, |
606 | |
currentEvaluationContext.newSubject, |
607 | |
rev, |
608 | |
IncompleteTripleDirection.Reverse |
609 | |
) |
610 | |
); |
611 | |
} |
612 | |
} |
613 | |
|
614 | |
|
615 | 0 | final Value currentObject = getCurrentObject(currentElement); |
616 | 0 | final URI[] predicates = getPredicate(currentElement); |
617 | 0 | if (currentObject != null && predicates != null) { |
618 | 0 | for (URI predicate : predicates) { |
619 | 0 | writeTriple(currentEvaluationContext.newSubject, predicate, currentObject, extractionResult); |
620 | |
} |
621 | |
} |
622 | |
|
623 | |
|
624 | 0 | if(!currentEvaluationContext.skipElem && currentEvaluationContext.newSubject != null) { |
625 | 0 | for (IncompleteTriple incompleteTriple : listOfIncompleteTriples) { |
626 | 0 | incompleteTriple.produceTriple( |
627 | |
currentElement, |
628 | |
currentEvaluationContext.newSubject, |
629 | |
extractionResult |
630 | |
); |
631 | |
} |
632 | |
} |
633 | 0 | } catch (Exception e) { |
634 | 0 | throw e; |
635 | |
} finally { |
636 | |
|
637 | 0 | if(currentEvaluationContext.recourse) { |
638 | 0 | EvaluationContext newEvaluationContext = new EvaluationContext(currentEvaluationContext.base); |
639 | 0 | if(currentEvaluationContext.skipElem) { |
640 | 0 | newEvaluationContext.language = currentEvaluationContext.language; |
641 | |
} else { |
642 | 0 | newEvaluationContext.base = currentEvaluationContext.base; |
643 | |
|
644 | 0 | if(currentEvaluationContext.newSubject != null) { |
645 | 0 | newEvaluationContext.parentSubject = currentEvaluationContext.newSubject; |
646 | |
} else { |
647 | 0 | newEvaluationContext.parentSubject = currentEvaluationContext.parentSubject; |
648 | |
} |
649 | |
|
650 | 0 | if(currentEvaluationContext.currentObjectResource != null) { |
651 | 0 | newEvaluationContext.parentObject = currentEvaluationContext.currentObjectResource; |
652 | 0 | } else if(currentEvaluationContext.newSubject != null) { |
653 | 0 | newEvaluationContext.parentObject = currentEvaluationContext.newSubject; |
654 | |
} else { |
655 | 0 | newEvaluationContext.parentObject = currentEvaluationContext.parentSubject; |
656 | |
} |
657 | |
|
658 | 0 | newEvaluationContext.language = currentEvaluationContext.language; |
659 | |
} |
660 | 0 | pushContext(currentElement, newEvaluationContext); |
661 | 0 | } |
662 | |
} |
663 | 0 | } |
664 | |
|
665 | |
|
666 | |
|
667 | |
|
668 | |
|
669 | |
|
670 | |
|
671 | |
private void extractPrefixes(Node node, List<PrefixMap> prefixMapList) { |
672 | 0 | final String prefixAttribute = DomUtils.readAttribute(node, PREFIX_ATTRIBUTE, null); |
673 | 0 | if(prefixAttribute == null) return; |
674 | 0 | final String[] prefixParts = extractPrefixSections(prefixAttribute); |
675 | 0 | for(String prefixPart : prefixParts) { |
676 | 0 | int splitPoint = prefixPart.indexOf(URI_PREFIX_SEPARATOR); |
677 | 0 | final String prefix = prefixPart.substring(0, splitPoint); |
678 | 0 | if(prefix.length() == 0) { |
679 | 0 | reportError(node, String.format("Invalid prefix length in prefix attribute '%s'", prefixAttribute)); |
680 | 0 | continue; |
681 | |
} |
682 | |
final URI uri; |
683 | 0 | final String uriStr = prefixPart.substring(splitPoint + 1); |
684 | |
try { |
685 | 0 | uri = resolveURI(uriStr); |
686 | 0 | } catch (Exception e) { |
687 | 0 | reportError( |
688 | |
node, |
689 | |
String.format( |
690 | |
"Resolution of prefix '%s' defines an invalid URI: '%s'", |
691 | |
prefixAttribute, uriStr |
692 | |
) |
693 | |
); |
694 | 0 | continue; |
695 | 0 | } |
696 | 0 | prefixMapList.add( new PrefixMap(prefix, uri) ); |
697 | |
} |
698 | 0 | } |
699 | |
|
700 | |
|
701 | |
|
702 | |
|
703 | |
|
704 | |
|
705 | |
|
706 | |
private void updateLanguage(Node node, EvaluationContext currentEvaluationContext) { |
707 | 0 | final String candidateLanguage = DomUtils.readAttribute(node, XML_LANG_ATTRIBUTE, null); |
708 | 0 | if(candidateLanguage != null) currentEvaluationContext.language = candidateLanguage; |
709 | 0 | } |
710 | |
|
711 | |
|
712 | |
|
713 | |
|
714 | |
|
715 | |
|
716 | |
|
717 | |
|
718 | |
|
719 | |
private void establishNewSubject(Node node, EvaluationContext currentEvaluationContext) |
720 | |
throws URISyntaxException { |
721 | |
String candidateURIOrCURIE; |
722 | 0 | for(String subjectAttribute : SUBJECT_ATTRIBUTES) { |
723 | 0 | candidateURIOrCURIE = DomUtils.readAttribute(node, subjectAttribute, null); |
724 | 0 | if(candidateURIOrCURIE != null) { |
725 | 0 | currentEvaluationContext.newSubject = resolveCURIEOrURI(candidateURIOrCURIE, false); |
726 | 0 | return; |
727 | |
} |
728 | |
} |
729 | |
|
730 | 0 | if(node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) { |
731 | 0 | currentEvaluationContext.newSubject = resolveURI(currentEvaluationContext.base.toString()); |
732 | 0 | return; |
733 | |
} |
734 | |
|
735 | 0 | if(DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) { |
736 | 0 | currentEvaluationContext.newSubject = RDFUtils.bnode(); |
737 | 0 | return; |
738 | |
} |
739 | |
|
740 | 0 | if(DomUtils.hasAttribute(node, PROPERTY_ATTRIBUTE)) { |
741 | 0 | currentEvaluationContext.skipElem = true; |
742 | |
} |
743 | 0 | if(currentEvaluationContext.parentObject != null) { |
744 | 0 | currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject; |
745 | 0 | return; |
746 | |
} |
747 | |
|
748 | 0 | currentEvaluationContext.newSubject = null; |
749 | 0 | } |
750 | |
|
751 | |
|
752 | |
|
753 | |
|
754 | |
|
755 | |
|
756 | |
|
757 | |
|
758 | |
|
759 | |
|
760 | |
private void establishNewSubjectCurrentObjectResource(Node node, EvaluationContext currentEvaluationContext) |
761 | |
throws URISyntaxException { |
762 | |
|
763 | |
String candidateURIOrCURIE; |
764 | 0 | candidateURIOrCURIE = DomUtils.readAttribute(node, ABOUT_ATTRIBUTE, null); |
765 | 0 | if(candidateURIOrCURIE != null) { |
766 | 0 | currentEvaluationContext.newSubject = resolveCURIEOrURI(candidateURIOrCURIE, false); |
767 | |
} else { |
768 | 0 | candidateURIOrCURIE = DomUtils.readAttribute(node, SRC_ATTRIBUTE, null); |
769 | 0 | if (candidateURIOrCURIE != null) { |
770 | 0 | currentEvaluationContext.newSubject = resolveURI(candidateURIOrCURIE); |
771 | |
} else { |
772 | 0 | if (node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) { |
773 | 0 | currentEvaluationContext.newSubject = resolveURI(currentEvaluationContext.base.toString()); |
774 | |
} else { |
775 | 0 | if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) { |
776 | 0 | currentEvaluationContext.newSubject = RDFUtils.bnode(); |
777 | |
} else { |
778 | 0 | if (currentEvaluationContext.parentObject != null) { |
779 | 0 | currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject; |
780 | |
} |
781 | |
} |
782 | |
} |
783 | |
} |
784 | |
} |
785 | |
|
786 | |
|
787 | 0 | candidateURIOrCURIE = DomUtils.readAttribute(node, RESOURCE_ATTRIBUTE, null); |
788 | 0 | if(candidateURIOrCURIE != null) { |
789 | 0 | currentEvaluationContext.currentObjectResource = resolveCURIEOrURI(candidateURIOrCURIE, false); |
790 | 0 | return; |
791 | |
} |
792 | |
|
793 | 0 | candidateURIOrCURIE = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null); |
794 | 0 | if(candidateURIOrCURIE != null) { |
795 | 0 | currentEvaluationContext.currentObjectResource = resolveURI(candidateURIOrCURIE); |
796 | 0 | return; |
797 | |
} |
798 | 0 | currentEvaluationContext.currentObjectResource = null; |
799 | 0 | } |
800 | |
|
801 | |
private URI[] getTypes(Node node) throws URISyntaxException { |
802 | 0 | final String typeOf = DomUtils.readAttribute(node, TYPEOF_ATTRIBUTE, null); |
803 | 0 | return resolveCurieOrURIList(node, typeOf, true); |
804 | |
} |
805 | |
|
806 | |
private URI[] getRels(Node node) throws URISyntaxException { |
807 | 0 | final String rel = DomUtils.readAttribute(node, REL_ATTRIBUTE, null); |
808 | 0 | return resolveCurieOrURIList(node, rel, true); |
809 | |
} |
810 | |
|
811 | |
private URI[] getRevs(Node node) throws URISyntaxException { |
812 | 0 | final String rev = DomUtils.readAttribute(node, REV_ATTRIBUTE, null); |
813 | 0 | return resolveCurieOrURIList(node, rev, true); |
814 | |
} |
815 | |
|
816 | |
private URI[] getPredicate(Node node) throws URISyntaxException { |
817 | 0 | final String candidateURI = DomUtils.readAttribute(node, PROPERTY_ATTRIBUTE, null); |
818 | 0 | if(candidateURI == null) return null; |
819 | 0 | return resolveCurieOrURIList(node, candidateURI, true); |
820 | |
} |
821 | |
|
822 | |
|
823 | |
|
824 | |
|
825 | |
|
826 | |
|
827 | |
|
828 | |
|
829 | |
|
830 | |
|
831 | |
|
832 | |
private Value getCurrentObject(Node node) |
833 | |
throws URISyntaxException, IOException, TransformerException { |
834 | 0 | final String candidateObject = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null); |
835 | 0 | if(candidateObject != null) { |
836 | 0 | return resolveURI(candidateObject); |
837 | |
} else { |
838 | 0 | return gerCurrentObjectLiteral(node); |
839 | |
} |
840 | |
} |
841 | |
|
842 | |
private Literal gerCurrentObjectLiteral(Node node) |
843 | |
throws URISyntaxException, IOException, TransformerException { |
844 | 0 | final EvaluationContext currentEvaluationContext = getContext(); |
845 | |
Literal literal; |
846 | |
|
847 | 0 | literal = getAsTypedLiteral(node); |
848 | 0 | if(literal != null) return literal; |
849 | |
|
850 | 0 | literal = getAsXMLLiteral(node); |
851 | 0 | if(literal != null) { |
852 | 0 | currentEvaluationContext.recourse = false; |
853 | 0 | return literal; |
854 | |
} |
855 | |
|
856 | 0 | literal = getAsPlainLiteral(node, currentEvaluationContext.language); |
857 | 0 | if(literal != null) return literal; |
858 | |
|
859 | 0 | return null; |
860 | |
} |
861 | |
|
862 | |
private static String getNodeContent(Node node) { |
863 | 0 | final String candidateContent = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null); |
864 | 0 | if(candidateContent != null) return candidateContent; |
865 | 0 | return node.getTextContent(); |
866 | |
} |
867 | |
|
868 | |
|
869 | |
|
870 | |
|
871 | |
|
872 | |
|
873 | |
|
874 | |
|
875 | |
|
876 | |
private Literal getAsTypedLiteral(Node node) throws URISyntaxException { |
877 | 0 | final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null); |
878 | 0 | if (datatype == null || datatype.trim().length() == 0 || XML_LITERAL_DATATYPE.equals(datatype.trim()) ) { |
879 | 0 | return null; |
880 | |
} |
881 | 0 | final Resource curieOrURI = resolveCURIEOrURI(datatype, true); |
882 | 0 | return RDFUtils.literal(getNodeContent(node), curieOrURI instanceof URI ? (URI) curieOrURI : null); |
883 | |
} |
884 | |
|
885 | |
private void pushMappings(Node sourceNode, List<PrefixMap> prefixMapList) { |
886 | 0 | logger.trace("pushMappings()"); |
887 | |
|
888 | 0 | final Map<String, URI> mapping = new HashMap<String, URI>(); |
889 | 0 | for (PrefixMap prefixMap : prefixMapList) { |
890 | 0 | mapping.put(prefixMap.prefix, prefixMap.uri); |
891 | |
} |
892 | 0 | uriMappingStack.push( new URIMapping(sourceNode, mapping) ); |
893 | 0 | } |
894 | |
|
895 | |
private void popMappings(Node node) { |
896 | 0 | if(uriMappingStack.isEmpty()) return; |
897 | 0 | final URIMapping peek = uriMappingStack.peek(); |
898 | 0 | if( ! DomUtils.isAncestorOf(peek.sourceNode, node) ) { |
899 | 0 | logger.trace("popMappings()"); |
900 | 0 | uriMappingStack.pop(); |
901 | |
} |
902 | 0 | } |
903 | |
|
904 | |
|
905 | |
|
906 | |
|
907 | |
|
908 | |
|
909 | |
|
910 | |
|
911 | |
|
912 | |
private Resource resolveNamespacedURI(String mapping, ResolutionPolicy resolutionPolicy) { |
913 | 0 | if(mapping.indexOf(URI_PATH_SEPARATOR) == 0) { |
914 | 0 | mapping = mapping.substring(1); |
915 | |
} |
916 | |
|
917 | 0 | final int prefixSeparatorIndex = mapping.indexOf(':'); |
918 | 0 | if(prefixSeparatorIndex == -1) { |
919 | 0 | if(resolutionPolicy == ResolutionPolicy.NSRequired) { |
920 | 0 | throw new IllegalArgumentException( |
921 | |
String.format("Invalid mapping string [%s], must declare a prefix.", mapping) |
922 | |
); |
923 | |
} |
924 | 0 | if (resolutionPolicy == ResolutionPolicy.TermAllowed) { |
925 | 0 | final URI currentVocabulary = getVocabulary(); |
926 | |
|
927 | 0 | if (currentVocabulary != null) { |
928 | 0 | return resolveURI(currentVocabulary.toString() + mapping); |
929 | |
} |
930 | |
} |
931 | 0 | return resolveURI(documentBase.toString() + mapping); |
932 | |
} |
933 | |
|
934 | 0 | final String prefix = mapping.substring(0, prefixSeparatorIndex); |
935 | 0 | final URI curieMapping = getMapping(prefix); |
936 | 0 | if(curieMapping == null) { |
937 | 0 | throw new IllegalArgumentException( String.format("Cannot map prefix '%s'", prefix) ); |
938 | |
} |
939 | 0 | final String candidateCURIEStr = curieMapping.toString() + mapping.substring(prefixSeparatorIndex + 1); |
940 | |
final java.net.URI candidateCURIE; |
941 | |
try { |
942 | 0 | candidateCURIE = new java.net.URI(candidateCURIEStr); |
943 | 0 | } catch (URISyntaxException urise) { |
944 | 0 | throw new IllegalArgumentException(String.format("Invalid CURIE '%s'", candidateCURIEStr) ); |
945 | 0 | } |
946 | 0 | return resolveURI( |
947 | |
candidateCURIE.isAbsolute() |
948 | |
? |
949 | |
candidateCURIE.toString() |
950 | |
: |
951 | |
documentBase.toString() + candidateCURIE.toString() |
952 | |
); |
953 | |
} |
954 | |
|
955 | |
|
956 | |
|
957 | |
|
/**
 * Policies accepted by {@code resolveNamespacedURI} for handling a value
 * that does not declare a prefix.
 */
enum ResolutionPolicy {

    /** A missing prefix is accepted; the value is appended to the document base. */
    NSNotRequired,

    /** A missing prefix is rejected with an {@link IllegalArgumentException}. */
    NSRequired,

    /**
     * A missing prefix is accepted; the value is appended to the current vocabulary
     * when one is available, to the document base otherwise.
     */
    TermAllowed
}
963 | |
|
964 | |
|
965 | |
|
966 | |
|
967 | 0 | private class EvaluationContext { |
968 | |
private Node node; |
969 | |
private URL base; |
970 | |
private Resource parentSubject; |
971 | |
private Value parentObject; |
972 | |
private String language; |
973 | |
private boolean recourse; |
974 | |
private boolean skipElem; |
975 | |
private Resource newSubject; |
976 | |
private Resource currentObjectResource; |
977 | |
|
978 | |
|
979 | |
|
980 | |
|
981 | |
|
982 | |
|
983 | 0 | EvaluationContext(URL base) { |
984 | 0 | this.base = base; |
985 | 0 | this.parentSubject = resolveURI( base.toExternalForm() ); |
986 | 0 | this.parentObject = null; |
987 | 0 | this.language = null; |
988 | 0 | this.recourse = true; |
989 | 0 | this.skipElem = false; |
990 | 0 | this.newSubject = null; |
991 | 0 | this.currentObjectResource = null; |
992 | 0 | } |
993 | |
} |
994 | |
|
995 | |
|
996 | |
|
997 | |
|
998 | |
private class PrefixMap { |
999 | |
final String prefix; |
1000 | |
final URI uri; |
1001 | 0 | public PrefixMap(String prefix, URI uri) { |
1002 | 0 | this.prefix = prefix; |
1003 | 0 | this.uri = uri; |
1004 | 0 | } |
1005 | |
} |
1006 | |
|
1007 | |
|
1008 | |
|
1009 | |
|
1010 | |
private class URIMapping { |
1011 | |
final Node sourceNode; |
1012 | |
final Map<String, URI> map; |
1013 | |
|
1014 | 0 | public URIMapping(Node sourceNode, Map<String, URI> map) { |
1015 | 0 | this.sourceNode = sourceNode; |
1016 | 0 | this.map = map; |
1017 | 0 | } |
1018 | |
} |
1019 | |
|
1020 | |
|
1021 | |
|
1022 | |
|
1023 | 0 | private enum IncompleteTripleDirection { |
1024 | 0 | Forward, |
1025 | 0 | Reverse |
1026 | |
} |
1027 | |
|
1028 | |
|
1029 | |
|
1030 | |
|
1031 | |
private class IncompleteTriple { |
1032 | |
final Node originatingNode; |
1033 | |
final Resource subject; |
1034 | |
final URI predicate; |
1035 | |
final IncompleteTripleDirection direction; |
1036 | |
|
1037 | |
public IncompleteTriple( |
1038 | |
Node originatingNode, |
1039 | |
Resource subject, |
1040 | |
URI predicate, |
1041 | |
IncompleteTripleDirection direction |
1042 | 0 | ) { |
1043 | 0 | if(originatingNode == null || subject == null || predicate == null || direction == null) |
1044 | 0 | throw new IllegalArgumentException(); |
1045 | |
|
1046 | 0 | this.originatingNode = originatingNode; |
1047 | 0 | this.subject = subject; |
1048 | 0 | this.predicate = predicate; |
1049 | 0 | this.direction = direction; |
1050 | 0 | } |
1051 | |
|
1052 | |
public boolean produceTriple(Node resourceNode, Resource r, ExtractionResult extractionResult) { |
1053 | 0 | if( ! DomUtils.isAncestorOf(originatingNode, resourceNode, true) ) return false; |
1054 | |
|
1055 | 0 | if(r == null) throw new IllegalArgumentException(); |
1056 | 0 | switch (direction) { |
1057 | |
case Forward: |
1058 | 0 | extractionResult.writeTriple(subject, predicate, r); |
1059 | 0 | break; |
1060 | |
case Reverse: |
1061 | 0 | extractionResult.writeTriple(r, predicate, subject); |
1062 | 0 | break; |
1063 | |
default: |
1064 | 0 | throw new IllegalStateException(); |
1065 | |
} |
1066 | 0 | return true; |
1067 | |
} |
1068 | |
} |
1069 | |
|
1070 | |
|
1071 | |
|
1072 | |
|
1073 | |
private class Vocabulary { |
1074 | |
final Node originatingNode; |
1075 | |
final URI prefix; |
1076 | |
|
1077 | 0 | public Vocabulary(Node originatingNode, URI prefix) { |
1078 | 0 | this.originatingNode = originatingNode; |
1079 | 0 | this.prefix = prefix; |
1080 | 0 | } |
1081 | |
} |
1082 | |
|
1083 | |
} |