1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.io.nquads; |
19 | |
|
20 | |
import org.apache.any23.util.ReaderInputStream; |
21 | |
import org.openrdf.model.BNode; |
22 | |
import org.openrdf.model.Resource; |
23 | |
import org.openrdf.model.Statement; |
24 | |
import org.openrdf.model.URI; |
25 | |
import org.openrdf.model.Value; |
26 | |
import org.openrdf.model.datatypes.XMLDatatypeUtil; |
27 | |
import org.openrdf.model.impl.URIImpl; |
28 | |
import org.openrdf.rio.ParseLocationListener; |
29 | |
import org.openrdf.rio.RDFFormat; |
30 | |
import org.openrdf.rio.RDFHandler; |
31 | |
import org.openrdf.rio.RDFHandlerException; |
32 | |
import org.openrdf.rio.RDFParseException; |
33 | |
import org.openrdf.rio.helpers.RDFParserBase; |
34 | |
import org.openrdf.rio.ntriples.NTriplesUtil; |
35 | |
|
36 | |
import java.io.BufferedReader; |
37 | |
import java.io.IOException; |
38 | |
import java.io.InputStream; |
39 | |
import java.io.InputStreamReader; |
40 | |
import java.io.Reader; |
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
public class NQuadsParser extends RDFParserBase { |
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
private ParseLocationListener locationListener; |
56 | |
|
57 | |
|
58 | |
|
59 | |
|
60 | |
private RDFHandler rdfHandler; |
61 | |
|
62 | |
|
63 | |
|
64 | |
|
65 | |
private int row, col, mark; |
66 | |
|
67 | 0 | public NQuadsParser() {} |
68 | |
|
69 | |
public RDFFormat getRDFFormat() { |
70 | 0 | return NQuads.FORMAT; |
71 | |
} |
72 | |
|
73 | |
public void parse(Reader reader, String s) |
74 | |
throws IOException, RDFParseException, RDFHandlerException { |
75 | 0 | ReaderInputStream readerInputStream = new ReaderInputStream(reader); |
76 | 0 | parse(readerInputStream, s); |
77 | 0 | } |
78 | |
|
79 | |
public synchronized void parse(InputStream is, String baseURI) |
80 | |
throws IOException, RDFParseException, RDFHandlerException { |
81 | 0 | if(is == null) { |
82 | 0 | throw new NullPointerException("inputStream cannot be null."); |
83 | |
} |
84 | 0 | if(baseURI == null) { |
85 | 0 | throw new NullPointerException("baseURI cannot be null."); |
86 | |
} |
87 | |
|
88 | |
try { |
89 | 0 | row = col = 1; |
90 | |
|
91 | 0 | locationListener = getParseLocationListener(); |
92 | 0 | rdfHandler = getRDFHandler(); |
93 | |
|
94 | 0 | setBaseURI(baseURI); |
95 | |
|
96 | 0 | final BufferedReader br = new BufferedReader( new InputStreamReader(is) ); |
97 | 0 | if( rdfHandler != null ) { |
98 | 0 | rdfHandler.startRDF(); |
99 | |
} |
100 | 0 | while( parseLine(br) ) { |
101 | 0 | nextRow(); |
102 | |
} |
103 | |
} finally { |
104 | 0 | if(rdfHandler != null) { |
105 | 0 | rdfHandler.endRDF(); |
106 | |
} |
107 | 0 | clear(); |
108 | 0 | clearBNodeIDMap(); |
109 | 0 | } |
110 | 0 | } |
111 | |
|
112 | |
|
113 | |
|
114 | |
|
115 | |
private void nextRow() { |
116 | 0 | col = 0; |
117 | 0 | row++; |
118 | 0 | if(locationListener != null) { |
119 | 0 | locationListener.parseLocationUpdate(row, col); |
120 | |
} |
121 | 0 | } |
122 | |
|
123 | |
|
124 | |
|
125 | |
|
126 | |
private void nextCol() { |
127 | 0 | col++; |
128 | 0 | if(locationListener != null) { |
129 | 0 | locationListener.parseLocationUpdate(row, col); |
130 | |
} |
131 | 0 | } |
132 | |
|
133 | |
|
134 | |
|
135 | |
|
136 | |
|
137 | |
|
138 | |
|
139 | |
|
140 | |
private char readChar(BufferedReader br) throws IOException { |
141 | 0 | final int c = br.read(); |
142 | 0 | if(c == -1) { |
143 | 0 | throw new EOS(); |
144 | |
} |
145 | 0 | nextCol(); |
146 | 0 | return (char) c; |
147 | |
} |
148 | |
|
149 | |
|
150 | |
|
151 | |
|
152 | |
|
153 | |
|
154 | |
|
155 | |
|
156 | |
|
157 | |
private char readUnicode(BufferedReader br) throws IOException, RDFParseException { |
158 | 0 | final char[] unicodeSequence = new char[4]; |
159 | 0 | for(int i = 0; i < unicodeSequence.length; i++) { |
160 | 0 | unicodeSequence[i] = readChar(br); |
161 | |
} |
162 | 0 | final String unicodeCharStr = new String(unicodeSequence); |
163 | |
try { |
164 | 0 | return (char) Integer.parseInt(unicodeCharStr, 16); |
165 | 0 | } catch (NumberFormatException nfe) { |
166 | 0 | reportError("Error while converting unicode char '\\u" + unicodeCharStr + "'", row, col); |
167 | 0 | throw new IllegalStateException(); |
168 | |
} |
169 | |
} |
170 | |
|
171 | |
|
172 | |
|
173 | |
|
174 | |
|
175 | |
|
176 | |
private void mark(BufferedReader br) throws IOException { |
177 | 0 | mark = col; |
178 | 0 | br.mark(5); |
179 | 0 | } |
180 | |
|
181 | |
|
182 | |
|
183 | |
|
184 | |
|
185 | |
|
186 | |
|
187 | |
private void reset(BufferedReader br) throws IOException { |
188 | 0 | col = mark; |
189 | 0 | br.reset(); |
190 | 0 | if(locationListener != null) { |
191 | 0 | locationListener.parseLocationUpdate(row, col); |
192 | |
} |
193 | 0 | } |
194 | |
|
195 | |
|
196 | |
|
197 | |
|
198 | |
|
199 | |
|
200 | |
|
201 | |
|
202 | |
private void assertChar(BufferedReader br, char c) throws IOException { |
203 | 0 | if( readChar(br) != c) { |
204 | 0 | throw new IllegalArgumentException( |
205 | |
String.format("Unexpected char at location %s %s, expected '%s'", row, col, c) |
206 | |
); |
207 | |
} |
208 | 0 | } |
209 | |
|
210 | |
|
211 | |
|
212 | |
|
213 | |
|
214 | |
|
215 | |
|
216 | |
|
217 | |
|
218 | |
|
219 | |
private boolean parseLine(BufferedReader br) |
220 | |
throws IOException, RDFParseException, RDFHandlerException { |
221 | |
|
222 | 0 | if(!consumeSpacesAndNotEOS(br)) { |
223 | 0 | return false; |
224 | |
} |
225 | |
|
226 | |
|
227 | |
try { |
228 | 0 | if(consumeEmptyLine(br)) return true; |
229 | 0 | if( consumeComment(br) ) return true; |
230 | 0 | } catch (EOS eos) { |
231 | 0 | return false; |
232 | 0 | } |
233 | |
|
234 | |
final Resource sub; |
235 | |
final URI pred; |
236 | |
final Value obj; |
237 | |
final URI graph; |
238 | |
try { |
239 | 0 | sub = parseSubject(br); |
240 | 0 | consumeSpaces(br); |
241 | 0 | pred = parsePredicate(br); |
242 | 0 | consumeSpaces(br); |
243 | 0 | obj = parseObject(br); |
244 | 0 | consumeSpaces(br); |
245 | 0 | graph = parseGraph(br); |
246 | 0 | consumeSpaces(br); |
247 | 0 | parseDot(br); |
248 | 0 | } catch (EOS eos) { |
249 | 0 | reportFatalError("Unexpected end of line.", row, col); |
250 | 0 | throw new IllegalStateException(); |
251 | 0 | } |
252 | |
|
253 | 0 | notifyStatement(sub, pred, obj, graph); |
254 | |
|
255 | 0 | if(!consumeSpacesAndNotEOS(br)) { |
256 | 0 | return false; |
257 | |
} |
258 | 0 | return readChar(br) == '\n'; |
259 | |
} |
260 | |
|
261 | |
|
262 | |
|
263 | |
|
264 | |
|
265 | |
|
266 | |
|
267 | |
|
268 | |
private boolean consumeEmptyLine(BufferedReader br) throws IOException { |
269 | |
char c; |
270 | 0 | mark(br); |
271 | 0 | c = readChar(br); |
272 | 0 | if (c == '\n') { |
273 | 0 | return true; |
274 | |
} else { |
275 | 0 | reset(br); |
276 | 0 | return false; |
277 | |
} |
278 | |
} |
279 | |
|
280 | |
|
281 | |
|
282 | |
|
283 | |
|
284 | |
|
285 | |
|
286 | |
private boolean consumeSpacesAndNotEOS(BufferedReader br) throws IOException { |
287 | |
try { |
288 | 0 | consumeSpaces(br); |
289 | 0 | return true; |
290 | 0 | } catch (EOS eos) { |
291 | 0 | return false; |
292 | |
} |
293 | |
} |
294 | |
|
295 | |
|
296 | |
|
297 | |
|
298 | |
|
299 | |
|
300 | |
|
301 | |
|
302 | |
private boolean consumeComment(BufferedReader br) throws IOException { |
303 | |
char c; |
304 | 0 | mark(br); |
305 | 0 | c = readChar(br); |
306 | 0 | if (c == '#') { |
307 | 0 | mark(br); |
308 | 0 | while (readChar(br) != '\n'); |
309 | 0 | mark(br); |
310 | 0 | return true; |
311 | |
} else { |
312 | 0 | reset(br); |
313 | 0 | return false; |
314 | |
} |
315 | |
} |
316 | |
|
317 | |
|
318 | |
|
319 | |
|
320 | |
|
321 | |
|
322 | |
|
323 | |
|
324 | |
|
325 | |
|
326 | |
|
327 | |
private void notifyStatement(Resource sub, URI pred, Value obj, URI graph) |
328 | |
throws RDFParseException, RDFHandlerException { |
329 | 0 | Statement statement = createStatement(sub, pred, obj, graph); |
330 | 0 | if (rdfHandler != null) { |
331 | |
try { |
332 | 0 | rdfHandler.handleStatement(statement); |
333 | 0 | } catch (RDFHandlerException rdfhe) { |
334 | 0 | reportFatalError(rdfhe); |
335 | 0 | throw rdfhe; |
336 | 0 | } |
337 | |
} |
338 | 0 | } |
339 | |
|
340 | |
|
341 | |
|
342 | |
|
343 | |
|
344 | |
|
345 | |
|
346 | |
private void consumeSpaces(BufferedReader br) throws IOException { |
347 | |
char c; |
348 | |
while(true) { |
349 | 0 | mark(br); |
350 | 0 | c = readChar(br); |
351 | 0 | if(c == ' ' || c == '\r' || c == '\f' || c == '\t') { |
352 | 0 | mark(br); |
353 | |
} else { |
354 | |
break; |
355 | |
} |
356 | |
} |
357 | 0 | reset(br); |
358 | 0 | } |
359 | |
|
360 | |
|
361 | |
|
362 | |
|
363 | |
|
364 | |
|
365 | |
|
366 | |
private void parseDot(BufferedReader br) throws IOException { |
367 | 0 | assertChar(br, '.'); |
368 | 0 | } |
369 | |
|
370 | |
|
371 | |
|
372 | |
|
373 | |
|
374 | |
|
375 | |
|
376 | |
|
377 | |
private URI parseURI(BufferedReader br) throws IOException, RDFParseException { |
378 | 0 | assertChar(br, '<'); |
379 | |
|
380 | 0 | StringBuilder sb = new StringBuilder(); |
381 | |
char c; |
382 | |
while(true) { |
383 | 0 | c = readChar(br); |
384 | 0 | if(c != '>') { |
385 | 0 | sb.append(c); |
386 | |
} else { |
387 | |
break; |
388 | |
} |
389 | |
} |
390 | 0 | mark(br); |
391 | |
|
392 | |
try { |
393 | |
|
394 | 0 | String uriStr = NTriplesUtil.unescapeString( sb.toString() ); |
395 | |
URI uri; |
396 | 0 | if(uriStr.charAt(0) == '#') { |
397 | 0 | uri = resolveURI(uriStr); |
398 | |
} else { |
399 | 0 | uri = createURI(uriStr); |
400 | |
} |
401 | 0 | return uri; |
402 | 0 | } catch (RDFParseException rdfpe) { |
403 | 0 | reportFatalError(rdfpe, row, col); |
404 | 0 | throw rdfpe; |
405 | |
} |
406 | |
} |
407 | |
|
408 | |
|
409 | |
|
410 | |
|
411 | |
|
412 | |
|
413 | |
|
414 | |
|
415 | |
|
416 | |
private BNode parseBNode(BufferedReader br) throws IOException, RDFParseException { |
417 | 0 | assertChar(br, '_'); |
418 | 0 | assertChar(br, ':'); |
419 | |
|
420 | |
char c; |
421 | 0 | StringBuilder sb = new StringBuilder(); |
422 | |
while(true) { |
423 | 0 | c = readChar(br); |
424 | 0 | if(c != ' ' && c != '<') { |
425 | 0 | sb.append(c); |
426 | 0 | mark(br); |
427 | |
} else { |
428 | |
break; |
429 | |
} |
430 | |
} |
431 | 0 | reset(br); |
432 | |
|
433 | |
try { |
434 | 0 | return createBNode( sb.toString() ); |
435 | 0 | } catch (RDFParseException rdfpe) { |
436 | 0 | reportFatalError(rdfpe, row, col); |
437 | 0 | throw rdfpe; |
438 | |
} |
439 | |
} |
440 | |
|
441 | |
|
442 | |
|
443 | |
|
444 | |
|
445 | |
|
446 | |
|
447 | |
|
448 | |
private LiteralAttribute parseLiteralAttribute(BufferedReader br) throws IOException { |
449 | 0 | char c = readChar(br); |
450 | 0 | if(c != '^' && c != '@') { |
451 | 0 | reset(br); |
452 | 0 | return null; |
453 | |
} |
454 | |
|
455 | 0 | boolean isLang = true; |
456 | 0 | if(c == '^') { |
457 | 0 | isLang = false; |
458 | 0 | assertChar(br, '^'); |
459 | |
} |
460 | |
|
461 | |
|
462 | 0 | mark(br); |
463 | 0 | c = readChar(br); |
464 | 0 | if(c != '<') { |
465 | 0 | reset(br); |
466 | |
} |
467 | |
|
468 | 0 | StringBuilder sb = new StringBuilder(); |
469 | |
while(true) { |
470 | 0 | c = readChar(br); |
471 | 0 | if(c == '>') { |
472 | 0 | mark(br); |
473 | 0 | continue; |
474 | |
} |
475 | 0 | if(c != ' ' && c != '<') { |
476 | 0 | mark(br); |
477 | 0 | sb.append(c); |
478 | |
} else { |
479 | |
break; |
480 | |
} |
481 | |
} |
482 | 0 | reset(br); |
483 | 0 | return new LiteralAttribute( isLang, sb.toString() ); |
484 | |
} |
485 | |
|
486 | |
|
487 | |
|
488 | |
|
489 | |
|
490 | |
|
491 | |
|
492 | |
|
493 | |
|
494 | |
|
495 | |
private String validateAndNormalizeLiteral(String value, URI datatype) throws RDFParseException { |
496 | 0 | DatatypeHandling dh = datatypeHandling(); |
497 | 0 | if(dh.equals( DatatypeHandling.IGNORE )) { |
498 | 0 | return value; |
499 | |
} |
500 | |
|
501 | 0 | if ( dh.equals(DatatypeHandling.VERIFY) ) { |
502 | 0 | if( ! XMLDatatypeUtil.isBuiltInDatatype(datatype)){ |
503 | 0 | return value; |
504 | |
} |
505 | 0 | if( ! XMLDatatypeUtil.isValidValue(value, datatype) ) { |
506 | 0 | throw new RDFParseException( |
507 | |
String.format("Illegal literal value '%s' with datatype %s", value, datatype.stringValue() ), |
508 | |
row, col |
509 | |
); |
510 | |
} |
511 | 0 | return value; |
512 | 0 | } else if( dh.equals(DatatypeHandling.NORMALIZE) ) { |
513 | 0 | return XMLDatatypeUtil.normalize(value, datatype); |
514 | |
} else { |
515 | 0 | throw new IllegalArgumentException( String.format("Unsupported datatype handling: %s", dh) ); |
516 | |
} |
517 | |
} |
518 | |
|
519 | |
|
520 | |
|
521 | |
|
522 | |
|
523 | |
|
524 | |
|
525 | |
private void printEscaped(char c, StringBuilder sb) { |
526 | 0 | if(c == 'b') { |
527 | 0 | sb.append('\b'); |
528 | 0 | return; |
529 | |
} |
530 | 0 | if(c == 'f') { |
531 | 0 | sb.append('\f'); |
532 | 0 | return; |
533 | |
} |
534 | 0 | if(c == 'n') { |
535 | 0 | sb.append('\n'); |
536 | 0 | return; |
537 | |
} |
538 | 0 | if(c == 'r') { |
539 | 0 | sb.append('\r'); |
540 | 0 | return; |
541 | |
} |
542 | 0 | if(c == 't') { |
543 | 0 | sb.append('\t'); |
544 | 0 | return; |
545 | |
} |
546 | 0 | } |
547 | |
|
548 | |
|
549 | |
|
550 | |
|
551 | |
|
552 | |
|
553 | |
|
554 | |
|
555 | |
|
556 | |
private Value parseLiteral(BufferedReader br) throws IOException, RDFParseException { |
557 | 0 | assertChar(br, '"'); |
558 | |
|
559 | |
char c; |
560 | 0 | boolean escaped = false; |
561 | 0 | StringBuilder sb = new StringBuilder(); |
562 | |
while(true) { |
563 | 0 | c = readChar(br); |
564 | 0 | if( c == '\\' ) { |
565 | 0 | if(escaped) { |
566 | 0 | escaped = false; |
567 | 0 | sb.append(c); |
568 | |
} else { |
569 | 0 | escaped = true; |
570 | |
} |
571 | 0 | continue; |
572 | 0 | } else if(c == '"' && !escaped) { |
573 | 0 | break; |
574 | |
} |
575 | 0 | if(escaped) { |
576 | 0 | if(c == 'u') { |
577 | 0 | char unicodeChar = readUnicode(br); |
578 | 0 | sb.append(unicodeChar); |
579 | 0 | } else { |
580 | 0 | printEscaped(c, sb); |
581 | |
} |
582 | 0 | escaped = false; |
583 | |
} else { |
584 | 0 | sb.append(c); |
585 | |
} |
586 | |
} |
587 | 0 | mark(br); |
588 | |
|
589 | 0 | LiteralAttribute lt = parseLiteralAttribute(br); |
590 | |
|
591 | 0 | final String value = sb.toString(); |
592 | 0 | if(lt == null) { |
593 | 0 | return createLiteral(value, null, null); |
594 | 0 | }else if(lt.isLang) { |
595 | 0 | return createLiteral( |
596 | |
value, |
597 | |
lt.value, |
598 | |
null |
599 | |
); |
600 | |
} else { |
601 | 0 | URI literalType = null; |
602 | |
try { |
603 | 0 | literalType = new URIImpl(lt.value); |
604 | 0 | } catch (Exception e) { |
605 | 0 | reportError( String.format("Error while parsing literal type '%s'", lt.value), row, col ); |
606 | 0 | } |
607 | 0 | return createLiteral( |
608 | |
validateAndNormalizeLiteral(value, literalType), |
609 | |
null, |
610 | |
literalType |
611 | |
); |
612 | |
} |
613 | |
} |
614 | |
|
615 | |
|
616 | |
|
617 | |
|
618 | |
|
619 | |
|
620 | |
|
621 | |
|
622 | |
|
623 | |
private Resource parseSubject(BufferedReader br) throws IOException, RDFParseException { |
624 | 0 | mark(br); |
625 | 0 | char c = readChar(br); |
626 | 0 | reset(br); |
627 | 0 | if( c == '<' ) { |
628 | 0 | return parseURI(br); |
629 | |
} else { |
630 | 0 | return parseBNode(br); |
631 | |
} |
632 | |
} |
633 | |
|
634 | |
|
635 | |
|
636 | |
|
637 | |
|
638 | |
|
639 | |
|
640 | |
|
641 | |
|
642 | |
private URI parsePredicate(BufferedReader br) throws IOException, RDFParseException { |
643 | 0 | return parseURI(br); |
644 | |
} |
645 | |
|
646 | |
|
647 | |
|
648 | |
|
649 | |
|
650 | |
|
651 | |
|
652 | |
|
653 | |
|
654 | |
private Value parseObject(BufferedReader br) throws IOException, RDFParseException { |
655 | 0 | mark(br); |
656 | 0 | char c = readChar(br); |
657 | 0 | reset(br); |
658 | 0 | if( c == '<' ) { |
659 | 0 | return parseURI(br); |
660 | 0 | } else if( c == '_') { |
661 | 0 | return parseBNode(br); |
662 | |
} else { |
663 | 0 | return parseLiteral(br); |
664 | |
} |
665 | |
} |
666 | |
|
667 | |
|
668 | |
|
669 | |
|
670 | |
class LiteralAttribute { |
671 | |
final boolean isLang; |
672 | |
final String value; |
673 | |
|
674 | 0 | LiteralAttribute(boolean lang, String value) { |
675 | 0 | isLang = lang; |
676 | 0 | this.value = value; |
677 | 0 | } |
678 | |
} |
679 | |
|
680 | |
|
681 | |
|
682 | |
|
683 | |
|
684 | |
|
685 | |
|
686 | |
|
687 | |
|
688 | |
private URI parseGraph(BufferedReader br) throws IOException, RDFParseException { |
689 | 0 | return parseURI(br); |
690 | |
} |
691 | |
|
692 | |
|
693 | |
|
694 | |
|
695 | 0 | class EOS extends IOException {} |
696 | |
|
697 | |
} |