View Javadoc
1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements. See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership. The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.commons.rdf.experimental;
19  
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.nio.charset.StandardCharsets;
23  import java.nio.file.Path;
24  import java.util.Optional;
25  import java.util.concurrent.Future;
26  import java.util.function.Consumer;
27  
28  import org.apache.commons.rdf.api.BlankNode;
29  import org.apache.commons.rdf.api.Dataset;
30  import org.apache.commons.rdf.api.Graph;
31  import org.apache.commons.rdf.api.IRI;
32  import org.apache.commons.rdf.api.Quad;
33  import org.apache.commons.rdf.api.RDFSyntax;
34  import org.apache.commons.rdf.api.RDFTerm;
35  import org.apache.commons.rdf.api.RDF;
36  import org.apache.commons.rdf.api.Triple;
37  
38  /**
39   * Parse an RDF source into a target (e.g. a Graph/Dataset).
40   * <h2>Experimental</h2> This interface (and its implementations) should be
41   * considered <strong>at risk</strong>; they might change or be removed in the
42   * next minor update of Commons RDF. It may move to the
43   * {@link org.apache.commons.rdf.api} package when it has stabilized.
44   * <h2>Description</h2>
45   * <p>
46   * This interface follows the
47   * <a href="https://en.wikipedia.org/wiki/Builder_pattern">Builder pattern</a>,
48   * allowing to set parser settings like {@link #contentType(RDFSyntax)} and
49   * {@link #base(IRI)}. A caller MUST call one of the <code>source</code> methods
50   * (e.g. {@link #source(IRI)}, {@link #source(Path)},
51   * {@link #source(InputStream)}), and MUST call one of the <code>target</code>
52   * methods (e.g. {@link #target(Consumer)}, {@link #target(Dataset)},
53   * {@link #target(Graph)}) before calling {@link #parse()} on the returned
54   * RDFParser - however methods can be called in any order.
55   * <p>
56   * The call to {@link #parse()} returns a {@link Future}, allowing asynchronous
57   * parse operations. Callers are recommended to check {@link Future#get()} to
58   * ensure parsing completed successfully, or catch exceptions thrown during
59   * parsing.
60   * <p>
61   * Setting a method that has already been set will override any existing value
62   * in the returned builder - regardless of the parameter type (e.g.
63   * {@link #source(IRI)} will override a previous {@link #source(Path)}). Settings
64   * can be unset by passing <code>null</code> - note that this may require
65   * casting, e.g. <code>contentType( (RDFSyntax) null )</code> to undo a previous
66   * call to {@link #contentType(RDFSyntax)}.
67   * <p>
68   * It is undefined if a RDFParser is mutable or thread-safe, so callers should
69   * always use the returned modified RDFParser from the builder methods. The
70   * builder may return itself after modification, or a cloned builder with the
71   * modified settings applied. Implementations are however encouraged to be
72   * immutable, thread-safe and document this. As an example starting point, see
73   * <code>org.apache.commons.rdf.simple.AbstractRDFParser</code>.
74   * <p>
75   * Example usage:
76   * </p>
77   * 
78   * <pre>
79   * Graph g1 = rDFTermFactory.createGraph();
80   * new ExampleRDFParserBuilder().source(Paths.get("/tmp/graph.ttl")).contentType(RDFSyntax.TURTLE).target(g1).parse()
81   *         .get(30, TimeUnit.SECONDS);
82   * </pre>
83   *
84   */
85  public interface RDFParser {
86  
87      /**
88       * The result of {@link RDFParser#parse()}, indicating that parsing completed.
89       * <p>
90       * This is a marker interface declaring no methods; implementations may
91       * extend it to expose parser details, e.g. warning messages or triple counts.
92       */
93      public interface ParseResult {
94      }
95  
96      /**
97       * Specify which {@link RDF} to use for generating {@link RDFTerm}s.
98       * <p>
99       * This option may be used together with {@link #target(Graph)} to override
100      * the implementation's default factory and graph.
101      * <p>
102      * <strong>Warning:</strong> Using the same {@link RDF} for multiple
103      * {@link #parse()} calls may accidentally merge {@link BlankNode}s having
104      * the same label, as the parser may pass the blank node labels found in
105      * the source directly to {@link RDF#createBlankNode(String)}.
106      * 
107      * @param rdfTermFactory
108      *            {@link RDF} to use for generating RDFTerms.
109      * @return An {@link RDFParser} that will use the specified
110      *         rdfTermFactory.
111      * @see #target(Graph)
112      */
113     RDFParser rdfTermFactory(RDF rdfTermFactory);
114 
115     /**
116      * Specify the content type of the RDF syntax to parse.
117      * <p>
118      * This option can be used to select the RDFSyntax of the source, overriding
119      * any <code>Content-Type</code> headers or equivalent.
120      * <p>
121      * The character set of the RDFSyntax is assumed to be
122      * {@link StandardCharsets#UTF_8} unless overridden within the document
123      * (e.g. {@code <?xml version="1.0" encoding="iso-8859-1"?>} in
124      * {@link RDFSyntax#RDFXML}).
125      * <p>
126      * This method will override any contentType set with
127      * {@link #contentType(String)}.
128      * 
129      * @param rdfSyntax
130      *            An {@link RDFSyntax} to parse the source according to, e.g.
131      *            {@link RDFSyntax#TURTLE}.
132      * @return An {@link RDFParser} that will use the specified content type.
133      * @throws IllegalArgumentException
134      *             If this RDFParser does not support the specified RDFSyntax.
135      * @see #contentType(String)
136      */
137     RDFParser contentType(RDFSyntax rdfSyntax) throws IllegalArgumentException;
138 
139     /**
140      * Specify the content type of the RDF syntax to parse.
141      * <p>
142      * This option can be used to select the RDFSyntax of the source, overriding
143      * any <code>Content-Type</code> headers or equivalent.
144      * <p>
145      * The content type MAY include a <code>charset</code> parameter if the RDF
146      * media type permits it; the default charset is
147      * {@link StandardCharsets#UTF_8} unless overridden within the document.
148      * <p>
149      * This method will override any contentType set with
150      * {@link #contentType(RDFSyntax)}.
151      * 
152      * @param contentType
153      *            A content-type string, e.g. <code>application/ld+json</code>
154      *            or <code>text/turtle;charset="UTF-8"</code> as specified by
155      *            <a href="https://tools.ietf.org/html/rfc7231#section-3.1.1.1">
156      *            RFC7231</a>.
157      * @return An {@link RDFParser} that will use the specified content type.
158      * @throws IllegalArgumentException
159      *             If the contentType has an invalid syntax, or this RDFParser
160      *             does not support the specified contentType.
161      * @see #contentType(RDFSyntax)
162      */
163     RDFParser contentType(String contentType) throws IllegalArgumentException;
164 
165     /**
166      * Specify a {@link Graph} to add parsed triples to.
167      * <p>
168      * If the source supports datasets (e.g. the {@link #contentType(RDFSyntax)}
169      * set has {@link RDFSyntax#supportsDataset} is true)), then only quads in
170      * the <em>default graph</em> will be added to the Graph as {@link Triple}s.
171      * <p>
172      * It is undefined if any triples are added to the specified {@link Graph}
173      * if {@link #parse()} throws any exceptions. (However implementations are
174      * free to prevent this using transaction mechanisms or similar). If
175      * {@link Future#get()} does not indicate an exception, the parser
176      * implementation SHOULD have inserted all parsed triples to the specified
177      * graph.
178      * <p>
179      * Calling this method will override any earlier targets set with
180      * {@link #target(Graph)}, {@link #target(Consumer)} or
181      * {@link #target(Dataset)}.
182      * <p>
183      * The default implementation of this method calls {@link #target(Consumer)}
184      * with a {@link Consumer} that does {@link Graph#add(Triple)} with
185      * {@link Quad#asTriple()} if the quad is in the default graph.
186      * 
187      * @param graph
188      *            The {@link Graph} to add triples to.
189      * @return An {@link RDFParser} that will insert triples into the specified
190      *         graph.
191      */
192     default RDFParser target(final Graph graph) {
193         return target(q -> {
194             if (!q.getGraphName().isPresent()) {
195                 graph.add(q.asTriple());
196             }
197         });
198     }
199 
200     /**
201      * Specify a {@link Dataset} to add parsed quads to.
202      * <p>
203      * It is undefined if any quads are added to the specified {@link Dataset}
204      * if {@link #parse()} throws any exceptions. (However implementations are
205      * free to prevent this using transaction mechanisms or similar). On the
206      * other hand, if {@link #parse()} does not indicate an exception, the
207      * implementation SHOULD have inserted all parsed quads to the specified
208      * dataset.
209      * <p>
210      * Calling this method will override any earlier targets set with
211      * {@link #target(Graph)}, {@link #target(Consumer)} or
212      * {@link #target(Dataset)}.
213      * <p>
214      * The default implementation of this method calls {@link #target(Consumer)}
215      * with a {@link Consumer} that does {@link Dataset#add(Quad)}.
216      * 
217      * @param dataset
218      *            The {@link Dataset} to add quads to.
219      * @return An {@link RDFParser} that will insert triples into the specified
220      *         dataset.
221      */
222     default RDFParser target(final Dataset dataset) {
223         return target(dataset::add);
224     }
225 
226     /**
227      * Specify a consumer for parsed quads.
228      * <p>
229      * The quads will include triples in all named graphs of the parsed source,
230      * including any triples in the default graph. When parsing a source format
231      * which does not support datasets, all quads delivered to the consumer will
232      * be in the default graph (i.e. their {@link Quad#getGraphName()} will be
233      * {@link Optional#empty()}).
234      * <p>
235      * It is undefined if any quads are consumed if {@link #parse()} throws any
236      * exceptions. On the other hand, if {@link #parse()} does not indicate an
237      * exception, the implementation SHOULD have produced all parsed quads to
238      * the specified consumer.
239      * <p>
240      * Calling this method will override any earlier targets set with
241      * {@link #target(Graph)}, {@link #target(Consumer)} or
242      * {@link #target(Dataset)}.
243      * <p>
244      * The consumer is not assumed to be thread safe - only one
245      * {@link Consumer#accept(Object)} is delivered at a time for a given
246      * {@link RDFParser#parse()} call.
247      * <p>
248      * This method is typically called with a functional consumer, for example:
249      * 
250      * <pre>
251      * {@code
252      * List<Quad> quads = new ArrayList<>();
253      * parserBuilder.target(quads::add).parse();
254      * }
255      * </pre>
256      * 
257      * @param consumer
258      *            A {@link Consumer} of {@link Quad}s
259      * @return An {@link RDFParser} that will deliver the parsed quads to the
260      *         specified consumer.
261      */
262     RDFParser target(Consumer<Quad> consumer);
263 
264     /**
265      * Specify a base IRI to use for parsing any relative IRI references.
266      * <p>
267      * Setting this option will override any protocol-specific base IRI (e.g.
268      * <code>Content-Location</code> header) or the {@link #source(IRI)} IRI,
269      * but does not override any base IRIs set within the source document (e.g.
270      * <code>@base</code> in Turtle documents).
271      * <p>
272      * If the source is in a syntax that does not support relative IRI
273      * references (e.g. {@link RDFSyntax#NTRIPLES}), setting the
274      * <code>base</code> has no effect.
275      * <p>
276      * This method will override any base IRI set with {@link #base(String)}.
277      *
278      * @param base
279      *            An absolute IRI to use as a base.
280      * @return An {@link RDFParser} that will use the specified base IRI.
281      * @see #base(String)
282      */
283     RDFParser base(IRI base);
284 
285     /**
286      * Specify a base IRI to use for parsing any relative IRI references.
287      * <p>
288      * Setting this option will override any protocol-specific base IRI (e.g.
289      * <code>Content-Location</code> header) or the {@link #source(IRI)} IRI,
290      * but does not override any base IRIs set within the source document (e.g.
291      * <code>@base</code> in Turtle documents).
292      * <p>
293      * If the source is in a syntax that does not support relative IRI
294      * references (e.g. {@link RDFSyntax#NTRIPLES}), setting the
295      * <code>base</code> has no effect.
296      * <p>
297      * This method will override any base IRI set with {@link #base(IRI)}.
298      *
299      * @param base
300      *            An absolute IRI string to use as a base.
301      * @return An {@link RDFParser} that will use the specified base IRI.
302      * @throws IllegalArgumentException
303      *             If the base is not a valid absolute IRI string
304      * @see #base(IRI)
305      */
306     RDFParser base(String base) throws IllegalArgumentException;
307 
308     /**
309      * Specify a source {@link InputStream} to parse.
310      * <p>
311      * The stream will not be read before the call to {@link #parse()}.
312      * <p>
313      * The InputStream will not be closed after parsing. The InputStream does
314      * not need to support mark/reset (see {@link InputStream#markSupported()}).
315      * <p>
316      * The parser might not consume the complete stream (e.g. an RDF/XML parser
317      * may not read beyond the closing tag of
318      * <code>&lt;/rdf:Description&gt;</code>).
319      * <p>
320      * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)}
321      * SHOULD be set before calling {@link #parse()}.
322      * <p>
323      * The character set is assumed to be {@link StandardCharsets#UTF_8} unless
324      * the {@link #contentType(String)} specifies otherwise or the document
325      * declares its own charset (e.g. RDF/XML with a
326      * <code>&lt;?xml encoding="iso-8859-1"&gt;</code> header).
327      * <p>
328      * The {@link #base(IRI)} or {@link #base(String)} MUST be set before
329      * calling {@link #parse()}, unless the RDF syntax does not permit relative
330      * IRIs (e.g. {@link RDFSyntax#NTRIPLES}).
331      * <p>
332      * This method will override any source set with {@link #source(IRI)},
333      * {@link #source(Path)} or {@link #source(String)}.
334      * 
335      * @param inputStream
336      *            An InputStream to consume
337      * @return An {@link RDFParser} that will use the specified source.
338      */
339     RDFParser source(InputStream inputStream);
340 
341     /**
342      * Specify a source file {@link Path} to parse.
343      * <p>
344      * The file will not be read before the call to {@link #parse()}.
345      * <p>
346      * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)}
347      * SHOULD be set before calling {@link #parse()}.
348      * <p>
349      * The character set is assumed to be {@link StandardCharsets#UTF_8} unless
350      * the {@link #contentType(String)} specifies otherwise or the document
351      * declares its own charset (e.g. RDF/XML with a
352      * <code>&lt;?xml encoding="iso-8859-1"&gt;</code> header).
353      * <p>
354      * The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling
355      * {@link #parse()}, otherwise {@link Path#toUri()} will be used as the base
356      * IRI.
357      * <p>
358      * This method will override any source set with {@link #source(IRI)},
359      * {@link #source(InputStream)} or {@link #source(String)}.
360      * 
361      * @param file
362      *            A Path for a file to parse
363      * @return An {@link RDFParser} that will use the specified source.
364      */
365     RDFParser source(Path file);
366 
367     /**
368      * Specify an absolute source {@link IRI} to retrieve and parse.
369      * <p>
370      * The source will not be retrieved before the call to {@link #parse()}.
371      * <p>
372      * If this builder does not support the given IRI protocol (e.g.
373      * <code>urn:uuid:ce667463-c5ab-4c23-9b64-701d055c4890</code>), this method
374      * should succeed, while the {@link #parse()} should throw an
375      * {@link IOException}.
376      * <p>
377      * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)} MAY
378      * be set before calling {@link #parse()}, in which case that type MAY be
379      * used for content negotiation (e.g. <code>Accept</code> header in HTTP),
380      * and SHOULD be used for selecting the RDFSyntax.
381      * <p>
382      * The character set is assumed to be {@link StandardCharsets#UTF_8} unless
383      * the protocol's equivalent of <code>Content-Type</code> specifies
384      * otherwise or the document declares its own charset (e.g. RDF/XML with a
385      * <code>&lt;?xml encoding="iso-8859-1"&gt;</code> header).
386      * <p>
387      * The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling
388      * {@link #parse()}, otherwise the source IRI will be used as the base IRI.
389      * <p>
390      * This method will override any source set with {@link #source(Path)},
391      * {@link #source(InputStream)} or {@link #source(String)}.
392      * 
393      * @param iri
394      *            An IRI to retrieve and parse
395      * @return An {@link RDFParser} that will use the specified source.
396      */
397     RDFParser source(IRI iri);
398 
399     /**
400      * Specify an absolute source IRI to retrieve and parse.
401      * <p>
402      * The source will not be retrieved before the call to {@link #parse()}.
403      * <p>
404      * If this builder does not support the given IRI protocol (e.g.
405      * <code>urn:uuid:ce667463-c5ab-4c23-9b64-701d055c4890</code>), this method
406      * should succeed, while the {@link #parse()} should throw an
407      * {@link IOException}.
408      * <p>
409      * The {@link #contentType(RDFSyntax)} or {@link #contentType(String)} MAY
410      * be set before calling {@link #parse()}, in which case that type MAY be
411      * used for content negotiation (e.g. <code>Accept</code> header in HTTP),
412      * and SHOULD be used for selecting the RDFSyntax.
413      * <p>
414      * The character set is assumed to be {@link StandardCharsets#UTF_8} unless
415      * the protocol's equivalent of <code>Content-Type</code> specifies
416      * otherwise or the document declares its own charset (e.g. RDF/XML with a
417      * <code>&lt;?xml encoding="iso-8859-1"&gt;</code> header).
418      * <p>
419      * The {@link #base(IRI)} or {@link #base(String)} MAY be set before calling
420      * {@link #parse()}, otherwise the source IRI will be used as the base IRI.
421      * <p>
422      * This method will override any source set with {@link #source(Path)},
423      * {@link #source(InputStream)} or {@link #source(IRI)}.
424      * 
425      * @param iri
426      *            An absolute IRI string to retrieve and parse
427      * @return An {@link RDFParser} that will use the specified source.
428      * @throws IllegalArgumentException
429      *             If the iri is not a valid absolute IRI string
430      * @see #source(IRI)
431      */
432     RDFParser source(String iri) throws IllegalArgumentException;
433 
434     /**
435      * Parse the specified source.
436      * <p>
437      * A source method (e.g. {@link #source(InputStream)}, {@link #source(IRI)},
438      * {@link #source(Path)}, {@link #source(String)} or an equivalent subclass
439      * method) MUST have been called before calling this method, otherwise an
440      * {@link IllegalStateException} will be thrown.
441      * <p>
442      * A target method (e.g. {@link #target(Consumer)},
443      * {@link #target(Dataset)}, {@link #target(Graph)} or an equivalent
444      * subclass method) MUST have been called before calling parse(), otherwise
445      * an {@link IllegalStateException} will be thrown.
446      * <p>
447      * It is undefined if this method is thread-safe, however the
448      * {@link RDFParser} may be reused (e.g. setting a different source) as soon
449      * as the {@link Future} has been returned from this method.
450      * <p>
451      * The RDFParser SHOULD perform the parsing as an asynchronous operation,
452      * and return the {@link Future} as soon as preliminary checks (such as
453      * validity of the {@link #source(IRI)} and {@link #contentType(RDFSyntax)}
454      * settings) have finished. The future SHOULD NOT mark
455      * {@link Future#isDone()} before parsing is complete. A synchronous
456      * implementation MAY be blocking on the <code>parse()</code> call and
457      * return a Future that is already {@link Future#isDone()}.
458      * <p>
459      * The returned {@link Future} contains a {@link ParseResult}.
460      * Implementations may subclass this interface to provide any parser
461      * details, e.g. list of warnings. <code>null</code> is a possible return
462      * value if no details are available, but parsing succeeded.
463      * <p>
464      * If an exception occurs during parsing, (e.g. {@link IOException} or
465      * <code>org.apache.commons.rdf.simple.experimental.RDFParseException</code>),
466      * it should be indicated as the
467      * {@link java.util.concurrent.ExecutionException#getCause()} in the
468      * {@link java.util.concurrent.ExecutionException} thrown on
469      * {@link Future#get()}.
470      * 
471      * @return A {@link Future} that will return the {@link ParseResult}
472      *         (possibly <code>null</code>) when the parsing has finished.
473      * @throws IOException
474      *             If an error occurred while starting to read the source (e.g.
475      *             file not found, unsupported IRI protocol). Note that IO
476      *             errors during parsing would instead be the
477      *             {@link java.util.concurrent.ExecutionException#getCause()} of
478      *             the {@link java.util.concurrent.ExecutionException} thrown on
479      *             {@link Future#get()}.
480      * @throws IllegalStateException
481      *             If the builder is in an invalid state, e.g. a
482      *             <code>source</code> has not been set.
483      */
484     Future<? extends ParseResult> parse() throws IOException, IllegalStateException;
485 }