1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.extractor; |
19 | |
|
20 | |
import org.apache.any23.configuration.Configuration; |
21 | |
import org.apache.any23.configuration.DefaultConfiguration; |
22 | |
import org.apache.any23.encoding.EncodingDetector; |
23 | |
import org.apache.any23.encoding.TikaEncodingDetector; |
24 | |
import org.apache.any23.extractor.html.DocumentReport; |
25 | |
import org.apache.any23.extractor.html.HTMLDocument; |
26 | |
import org.apache.any23.extractor.html.MicroformatExtractor; |
27 | |
import org.apache.any23.extractor.html.TagSoupParser; |
28 | |
import org.apache.any23.mime.MIMEType; |
29 | |
import org.apache.any23.mime.MIMETypeDetector; |
30 | |
import org.apache.any23.rdf.Any23ValueFactoryWrapper; |
31 | |
import org.apache.any23.rdf.RDFUtils; |
32 | |
import org.apache.any23.source.DocumentSource; |
33 | |
import org.apache.any23.source.LocalCopyFactory; |
34 | |
import org.apache.any23.source.MemCopyFactory; |
35 | |
import org.apache.any23.validator.EmptyValidationReport; |
36 | |
import org.apache.any23.validator.ValidatorException; |
37 | |
import org.apache.any23.vocab.SINDICE; |
38 | |
import org.apache.any23.writer.CompositeTripleHandler; |
39 | |
import org.apache.any23.writer.CountingTripleHandler; |
40 | |
import org.apache.any23.writer.TripleHandler; |
41 | |
import org.apache.any23.writer.TripleHandlerException; |
42 | |
import org.apache.any23.extractor.Extractor.BlindExtractor; |
43 | |
import org.apache.any23.extractor.Extractor.ContentExtractor; |
44 | |
import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor; |
45 | |
import org.openrdf.model.BNode; |
46 | |
import org.openrdf.model.URI; |
47 | |
import org.openrdf.model.impl.URIImpl; |
48 | |
import org.openrdf.model.impl.ValueFactoryImpl; |
49 | |
import org.slf4j.Logger; |
50 | |
import org.slf4j.LoggerFactory; |
51 | |
|
52 | |
import java.io.BufferedInputStream; |
53 | |
import java.io.ByteArrayOutputStream; |
54 | |
import java.io.IOException; |
55 | |
import java.io.InputStream; |
56 | |
import java.io.PrintStream; |
57 | |
import java.net.URISyntaxException; |
58 | |
import java.util.ArrayList; |
59 | |
import java.util.Collection; |
60 | |
import java.util.Collections; |
61 | |
import java.util.Date; |
62 | |
import java.util.HashMap; |
63 | |
import java.util.List; |
64 | |
import java.util.Map; |
65 | |
import java.util.UUID; |
66 | |
|
67 | |
import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath; |
68 | |
import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot; |
69 | |
|
70 | |
|
71 | |
|
72 | |
|
73 | |
public class SingleDocumentExtraction { |
74 | |
|
75 | |
public static final String EXTRACTION_CONTEXT_URI_PROPERTY = "any23.extraction.context.uri"; |
76 | |
|
77 | |
public static final String METADATA_TIMESIZE_FLAG = "any23.extraction.metadata.timesize"; |
78 | |
public static final String METADATA_NESTING_FLAG = "any23.extraction.metadata.nesting"; |
79 | |
public static final String METADATA_DOMAIN_PER_ENTITY_FLAG = "any23.extraction.metadata.domain.per.entity"; |
80 | |
|
81 | 0 | private static final SINDICE vSINDICE = SINDICE.getInstance(); |
82 | |
|
83 | 0 | private final static Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class); |
84 | |
|
85 | |
private final Configuration configuration; |
86 | |
|
87 | |
private final DocumentSource in; |
88 | |
|
89 | |
private URI documentURI; |
90 | |
|
91 | |
private final ExtractorGroup extractors; |
92 | |
|
93 | |
private final TripleHandler output; |
94 | |
|
95 | |
private final EncodingDetector encoderDetector; |
96 | |
|
97 | 0 | private LocalCopyFactory copyFactory = null; |
98 | |
|
99 | 0 | private DocumentSource localDocumentSource = null; |
100 | |
|
101 | 0 | private MIMETypeDetector detector = null; |
102 | |
|
103 | 0 | private ExtractorGroup matchingExtractors = null; |
104 | |
|
105 | 0 | private MIMEType detectedMIMEType = null; |
106 | |
|
107 | 0 | private DocumentReport documentReport = null; |
108 | |
|
109 | 0 | private ExtractionParameters tagSoupDOMRelatedParameters = null; |
110 | |
|
111 | 0 | private String parserEncoding = null; |
112 | |
|
113 | |
|
114 | |
|
115 | |
|
116 | |
|
117 | |
|
118 | |
|
119 | |
|
120 | |
|
121 | |
|
122 | |
public SingleDocumentExtraction( |
123 | |
Configuration configuration, DocumentSource in, ExtractorGroup extractors, TripleHandler output |
124 | 0 | ) { |
125 | 0 | if(configuration == null) throw new NullPointerException("configuration cannot be null."); |
126 | 0 | if(in == null) throw new NullPointerException("in cannot be null."); |
127 | 0 | this.configuration = configuration; |
128 | 0 | this.in = in; |
129 | 0 | this.extractors = extractors; |
130 | |
|
131 | 0 | List<TripleHandler> tripleHandlers = new ArrayList<TripleHandler>(); |
132 | 0 | tripleHandlers.add(output); |
133 | 0 | tripleHandlers.add(new CountingTripleHandler()); |
134 | 0 | this.output = new CompositeTripleHandler(tripleHandlers); |
135 | 0 | this.encoderDetector = new TikaEncodingDetector(); |
136 | 0 | } |
137 | |
|
138 | |
|
139 | |
|
140 | |
|
141 | |
|
142 | |
|
143 | |
|
144 | |
|
145 | |
|
146 | |
|
147 | |
/**
 * Creates an extraction applying a single extractor, with an explicit configuration.
 * The MIME type detector is set to <code>null</code>.
 *
 * @param configuration configuration to be used, cannot be <code>null</code>.
 * @param in the source of the document to be processed, cannot be <code>null</code>.
 * @param factory the factory of the only extractor to be applied.
 * @param output the handler receiving the generated triples.
 */
public SingleDocumentExtraction(
        Configuration configuration, DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
) {
    this(
            configuration,
            in,
            new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
            output
    );
    // A single explicit extractor: no MIME type based selection.
    this.setMIMETypeDetector(null);
}
158 | |
|
159 | |
|
160 | |
|
161 | |
|
162 | |
|
163 | |
|
164 | |
|
165 | |
|
166 | |
|
167 | |
|
168 | |
/**
 * Creates an extraction applying a single extractor, using the
 * {@link DefaultConfiguration} singleton as configuration.
 * The MIME type detector is set to <code>null</code>.
 *
 * @param in the source of the document to be processed, cannot be <code>null</code>.
 * @param factory the factory of the only extractor to be applied.
 * @param output the handler receiving the generated triples.
 */
public SingleDocumentExtraction(
        DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
) {
    // The configuration-aware single factory constructor wraps the factory into
    // a singleton group and resets the MIME type detector.
    this(DefaultConfiguration.singleton(), in, factory, output);
}
179 | |
|
180 | |
|
181 | |
|
182 | |
|
183 | |
|
184 | |
|
185 | |
|
186 | |
|
187 | |
public void setLocalCopyFactory(LocalCopyFactory copyFactory) { |
188 | 0 | this.copyFactory = copyFactory; |
189 | 0 | } |
190 | |
|
191 | |
|
192 | |
|
193 | |
|
194 | |
|
195 | |
|
196 | |
|
197 | |
|
198 | |
public void setMIMETypeDetector(MIMETypeDetector detector) { |
199 | 0 | this.detector = detector; |
200 | 0 | } |
201 | |
|
202 | |
|
203 | |
|
204 | |
|
205 | |
|
206 | |
|
207 | |
|
208 | |
|
209 | |
|
210 | |
|
211 | |
public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters) |
212 | |
throws ExtractionException, IOException { |
213 | 0 | if(extractionParameters == null) { |
214 | 0 | extractionParameters = ExtractionParameters.newDefault(configuration); |
215 | |
} |
216 | |
|
217 | 0 | final String contextURI = extractionParameters.getProperty(EXTRACTION_CONTEXT_URI_PROPERTY); |
218 | 0 | ensureHasLocalCopy(); |
219 | |
try { |
220 | 0 | this.documentURI = new Any23ValueFactoryWrapper( |
221 | |
ValueFactoryImpl.getInstance() |
222 | |
).createURI( "?".equals(contextURI) ? in.getDocumentURI() : contextURI); |
223 | 0 | } catch (Exception ex) { |
224 | 0 | throw new IllegalArgumentException("Invalid URI: " + in.getDocumentURI(), ex); |
225 | 0 | } |
226 | 0 | if(log.isInfoEnabled()) { |
227 | 0 | log.info("Processing " + this.documentURI); |
228 | |
} |
229 | 0 | filterExtractorsByMIMEType(); |
230 | |
|
231 | 0 | if(log.isDebugEnabled()) { |
232 | 0 | StringBuffer sb = new StringBuffer("Extractors "); |
233 | 0 | for (ExtractorFactory<?> factory : matchingExtractors) { |
234 | 0 | sb.append(factory.getExtractorName()); |
235 | 0 | sb.append(' '); |
236 | |
} |
237 | 0 | sb.append("match ").append(documentURI); |
238 | 0 | log.debug(sb.toString()); |
239 | |
} |
240 | |
|
241 | |
|
242 | |
try { |
243 | 0 | output.startDocument(documentURI); |
244 | 0 | } catch (TripleHandlerException e) { |
245 | 0 | log.error(String.format("Error starting document with URI %s", documentURI)); |
246 | 0 | throw new ExtractionException(String.format("Error starting document with URI %s", documentURI), |
247 | |
e |
248 | |
); |
249 | 0 | } |
250 | 0 | output.setContentLength(in.getContentLength()); |
251 | |
|
252 | 0 | final List<ResourceRoot> resourceRoots = new ArrayList<ResourceRoot>(); |
253 | 0 | final List<PropertyPath> propertyPaths = new ArrayList<PropertyPath>(); |
254 | 0 | final Map<String,Collection<ErrorReporter.Error>> extractorToErrors = |
255 | |
new HashMap<String,Collection<ErrorReporter.Error>>(); |
256 | |
try { |
257 | 0 | final String documentLanguage = extractDocumentLanguage(extractionParameters); |
258 | 0 | for (ExtractorFactory<?> factory : matchingExtractors) { |
259 | 0 | final Extractor extractor = factory.createExtractor(); |
260 | 0 | final SingleExtractionReport er = runExtractor( |
261 | |
extractionParameters, |
262 | |
documentLanguage, |
263 | |
extractor |
264 | |
); |
265 | 0 | resourceRoots.addAll( er.resourceRoots ); |
266 | 0 | propertyPaths.addAll( er.propertyPaths ); |
267 | 0 | extractorToErrors.put(factory.getExtractorName(), er.errors); |
268 | 0 | } |
269 | 0 | } catch(ValidatorException ve) { |
270 | 0 | throw new ExtractionException("An error occurred during the validation phase.", ve); |
271 | 0 | } |
272 | |
|
273 | |
|
274 | 0 | final boolean addDomainTriples = extractionParameters.getFlag(METADATA_DOMAIN_PER_ENTITY_FLAG); |
275 | |
final ExtractionContext consolidationContext; |
276 | 0 | if(extractionParameters.getFlag(METADATA_NESTING_FLAG)) { |
277 | |
|
278 | 0 | consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output); |
279 | |
} else { |
280 | 0 | consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output); |
281 | |
} |
282 | |
|
283 | |
|
284 | 0 | if (extractionParameters.getFlag(METADATA_TIMESIZE_FLAG)) { |
285 | |
try { |
286 | 0 | addExtractionTimeSizeMetaTriples(consolidationContext); |
287 | 0 | } catch (TripleHandlerException e) { |
288 | 0 | throw new ExtractionException( |
289 | |
String.format( |
290 | |
"Error while adding extraction metadata triples document with URI %s", documentURI |
291 | |
), |
292 | |
e |
293 | |
); |
294 | 0 | } |
295 | |
} |
296 | |
|
297 | |
try { |
298 | 0 | output.endDocument(documentURI); |
299 | 0 | } catch (TripleHandlerException e) { |
300 | 0 | log.error(String.format("Error ending document with URI %s", documentURI)); |
301 | 0 | throw new ExtractionException(String.format("Error ending document with URI %s", documentURI), |
302 | |
e |
303 | |
); |
304 | 0 | } |
305 | |
|
306 | 0 | return new SingleDocumentExtractionReport( |
307 | |
documentReport == null |
308 | |
? |
309 | |
EmptyValidationReport.getInstance() : documentReport.getReport(), |
310 | |
extractorToErrors |
311 | |
); |
312 | |
} |
313 | |
|
314 | |
|
315 | |
|
316 | |
|
317 | |
|
318 | |
|
319 | |
|
320 | |
|
321 | |
|
322 | |
public SingleDocumentExtractionReport run() throws IOException, ExtractionException { |
323 | 0 | return run(ExtractionParameters.newDefault(configuration)); |
324 | |
} |
325 | |
|
326 | |
|
327 | |
|
328 | |
|
329 | |
|
330 | |
|
331 | |
|
332 | |
public String getDetectedMIMEType() throws IOException { |
333 | 0 | filterExtractorsByMIMEType(); |
334 | 0 | return detectedMIMEType == null ? null : detectedMIMEType.toString(); |
335 | |
} |
336 | |
|
337 | |
|
338 | |
|
339 | |
|
340 | |
|
341 | |
|
342 | |
|
343 | |
public boolean hasMatchingExtractors() throws IOException { |
344 | 0 | filterExtractorsByMIMEType(); |
345 | 0 | return !matchingExtractors.isEmpty(); |
346 | |
} |
347 | |
|
348 | |
|
349 | |
|
350 | |
|
351 | |
public List<Extractor> getMatchingExtractors() { |
352 | 0 | final List<Extractor> extractorsList = new ArrayList<Extractor>(); |
353 | 0 | for(ExtractorFactory extractorFactory : matchingExtractors) { |
354 | 0 | extractorsList.add( extractorFactory.createExtractor() ); |
355 | |
} |
356 | 0 | return extractorsList; |
357 | |
} |
358 | |
|
359 | |
|
360 | |
|
361 | |
|
362 | |
public String getParserEncoding() { |
363 | 0 | if(this.parserEncoding == null) { |
364 | 0 | this.parserEncoding = detectEncoding(); |
365 | |
} |
366 | 0 | return this.parserEncoding; |
367 | |
} |
368 | |
|
369 | |
|
370 | |
|
371 | |
|
372 | |
|
373 | |
|
374 | |
public void setParserEncoding(String encoding) { |
375 | 0 | this.parserEncoding = encoding; |
376 | 0 | documentReport = null; |
377 | 0 | } |
378 | |
|
379 | |
|
380 | |
|
381 | |
|
382 | |
|
383 | |
|
384 | |
|
385 | |
private boolean isHTMLDocument() throws IOException { |
386 | 0 | filterExtractorsByMIMEType(); |
387 | 0 | return ! matchingExtractors.filterByMIMEType( MIMEType.parse("text/html") ).isEmpty(); |
388 | |
} |
389 | |
|
390 | |
|
391 | |
|
392 | |
|
393 | |
|
394 | |
|
395 | |
|
396 | |
|
397 | |
|
398 | |
private String extractDocumentLanguage(ExtractionParameters extractionParameters) |
399 | |
throws IOException, ValidatorException { |
400 | 0 | if( ! isHTMLDocument() ) { |
401 | 0 | return null; |
402 | |
} |
403 | |
final HTMLDocument document; |
404 | |
try { |
405 | 0 | document = new HTMLDocument( getTagSoupDOM(extractionParameters).getDocument() ); |
406 | 0 | } catch (IOException ioe) { |
407 | 0 | log.debug("Cannot extract language from document.", ioe); |
408 | 0 | return null; |
409 | 0 | } |
410 | 0 | return document.getDefaultLanguage(); |
411 | |
} |
412 | |
|
413 | |
|
414 | |
|
415 | |
|
416 | |
|
417 | |
|
418 | |
private void filterExtractorsByMIMEType() |
419 | |
throws IOException { |
420 | 0 | if (matchingExtractors != null) return; |
421 | |
|
422 | 0 | if (detector == null || extractors.allExtractorsSupportAllContentTypes()) { |
423 | 0 | matchingExtractors = extractors; |
424 | 0 | return; |
425 | |
} |
426 | 0 | ensureHasLocalCopy(); |
427 | 0 | detectedMIMEType = detector.guessMIMEType( |
428 | |
java.net.URI.create(documentURI.stringValue()).getPath(), |
429 | |
localDocumentSource.openInputStream(), |
430 | |
MIMEType.parse(localDocumentSource.getContentType()) |
431 | |
); |
432 | 0 | log.debug("detected media type: " + detectedMIMEType); |
433 | 0 | matchingExtractors = extractors.filterByMIMEType(detectedMIMEType); |
434 | 0 | } |
435 | |
|
436 | |
|
437 | |
|
438 | |
|
439 | |
|
440 | |
|
441 | |
|
442 | |
|
443 | |
|
444 | |
|
445 | |
|
446 | |
private SingleExtractionReport runExtractor( |
447 | |
final ExtractionParameters extractionParameters, |
448 | |
final String documentLanguage, |
449 | |
final Extractor<?> extractor |
450 | |
) throws ExtractionException, IOException, ValidatorException { |
451 | 0 | if(log.isDebugEnabled()) { |
452 | 0 | log.debug("Running " + extractor.getDescription().getExtractorName() + " on " + documentURI); |
453 | |
} |
454 | 0 | long startTime = System.currentTimeMillis(); |
455 | 0 | final ExtractionContext extractionContext = new ExtractionContext( |
456 | |
extractor.getDescription().getExtractorName(), |
457 | |
documentURI, |
458 | |
documentLanguage |
459 | |
); |
460 | 0 | final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output); |
461 | |
try { |
462 | 0 | if (extractor instanceof BlindExtractor) { |
463 | 0 | final BlindExtractor blindExtractor = (BlindExtractor) extractor; |
464 | 0 | blindExtractor.run(extractionParameters, extractionContext, documentURI, extractionResult); |
465 | 0 | } else if (extractor instanceof ContentExtractor) { |
466 | 0 | ensureHasLocalCopy(); |
467 | 0 | final ContentExtractor contentExtractor = (ContentExtractor) extractor; |
468 | 0 | contentExtractor.run( |
469 | |
extractionParameters, |
470 | |
extractionContext, |
471 | |
localDocumentSource.openInputStream(), |
472 | |
extractionResult |
473 | |
); |
474 | 0 | } else if (extractor instanceof TagSoupDOMExtractor) { |
475 | 0 | final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor; |
476 | 0 | final DocumentReport documentReport = getTagSoupDOM(extractionParameters); |
477 | 0 | tagSoupDOMExtractor.run( |
478 | |
extractionParameters, |
479 | |
extractionContext, |
480 | |
documentReport.getDocument(), |
481 | |
extractionResult |
482 | |
); |
483 | 0 | } else { |
484 | 0 | throw new IllegalStateException("Extractor type not supported: " + extractor.getClass()); |
485 | |
} |
486 | 0 | return |
487 | |
new SingleExtractionReport( |
488 | |
extractionResult.getErrors(), |
489 | |
new ArrayList<ResourceRoot>( extractionResult.getResourceRoots() ), |
490 | |
new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() ) |
491 | |
); |
492 | 0 | } catch (ExtractionException ex) { |
493 | 0 | if(log.isDebugEnabled()) { |
494 | 0 | log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage()); |
495 | |
} |
496 | 0 | throw ex; |
497 | |
} finally { |
498 | |
|
499 | 0 | if( log.isDebugEnabled() && extractionResult.hasErrors() ) { |
500 | 0 | ByteArrayOutputStream baos = new ByteArrayOutputStream(); |
501 | 0 | extractionResult.printErrorsReport( new PrintStream(baos) ); |
502 | 0 | log.debug(baos.toString()); |
503 | |
} |
504 | 0 | extractionResult.close(); |
505 | |
|
506 | 0 | long elapsed = System.currentTimeMillis() - startTime; |
507 | 0 | if(log.isDebugEnabled()) { |
508 | 0 | log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms"); |
509 | |
} |
510 | 0 | } |
511 | |
} |
512 | |
|
513 | |
|
514 | |
|
515 | |
|
516 | |
|
517 | |
|
518 | |
private void ensureHasLocalCopy() throws IOException { |
519 | 0 | if (localDocumentSource != null) return; |
520 | 0 | if (in.isLocal()) { |
521 | 0 | localDocumentSource = in; |
522 | 0 | return; |
523 | |
} |
524 | 0 | if (copyFactory == null) { |
525 | 0 | copyFactory = new MemCopyFactory(); |
526 | |
} |
527 | 0 | localDocumentSource = copyFactory.createLocalCopy(in); |
528 | 0 | } |
529 | |
|
530 | |
|
531 | |
|
532 | |
|
533 | |
|
534 | |
|
535 | |
|
536 | |
|
537 | |
|
538 | |
|
539 | |
private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters) |
540 | |
throws IOException, ValidatorException { |
541 | 0 | if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) { |
542 | 0 | ensureHasLocalCopy(); |
543 | 0 | final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() ); |
544 | 0 | is.mark(Integer.MAX_VALUE); |
545 | 0 | final String candidateEncoding = getParserEncoding(); |
546 | 0 | is.reset(); |
547 | 0 | final TagSoupParser tagSoupParser = new TagSoupParser( |
548 | |
is, |
549 | |
documentURI.stringValue(), |
550 | |
candidateEncoding |
551 | |
); |
552 | 0 | if(extractionParameters.isValidate()) { |
553 | 0 | documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() ); |
554 | |
} else { |
555 | 0 | documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() ); |
556 | |
} |
557 | 0 | tagSoupDOMRelatedParameters = extractionParameters; |
558 | |
} |
559 | 0 | return documentReport; |
560 | |
} |
561 | |
|
562 | |
|
563 | |
|
564 | |
|
565 | |
|
566 | |
|
567 | |
private String detectEncoding() { |
568 | |
try { |
569 | 0 | ensureHasLocalCopy(); |
570 | 0 | InputStream is = new BufferedInputStream(localDocumentSource.openInputStream()); |
571 | 0 | String encoding = this.encoderDetector.guessEncoding(is); |
572 | 0 | is.close(); |
573 | 0 | return encoding; |
574 | 0 | } catch (Exception e) { |
575 | 0 | throw new RuntimeException("An error occurred while trying to detect the input encoding.", e); |
576 | |
} |
577 | |
} |
578 | |
|
579 | |
|
580 | |
|
581 | |
|
582 | |
|
583 | |
|
584 | |
|
585 | |
|
586 | |
|
587 | |
|
588 | |
private boolean subPath(String[] list, String[] candidateSub) { |
589 | 0 | if(candidateSub.length > list.length) { |
590 | 0 | return false; |
591 | |
} |
592 | 0 | for(int i = 0; i < candidateSub.length; i++) { |
593 | 0 | if( ! candidateSub[i].equals(list[i])) { |
594 | 0 | return false; |
595 | |
} |
596 | |
} |
597 | 0 | return true; |
598 | |
} |
599 | |
|
600 | |
|
601 | |
|
602 | |
|
603 | |
|
604 | |
|
605 | |
|
606 | |
|
607 | |
private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context) |
608 | |
throws ExtractionException { |
609 | |
try { |
610 | |
|
611 | |
String domain; |
612 | |
try { |
613 | 0 | domain = new java.net.URI(in.getDocumentURI()).getHost(); |
614 | 0 | } catch (URISyntaxException urise) { |
615 | 0 | throw new IllegalArgumentException( |
616 | |
"An error occurred while extracting the host from the document URI.", |
617 | |
urise |
618 | |
); |
619 | 0 | } |
620 | 0 | if (domain != null) { |
621 | 0 | for (ResourceRoot resourceRoot : resourceRoots) { |
622 | 0 | output.receiveTriple( |
623 | |
resourceRoot.getRoot(), |
624 | |
vSINDICE.getProperty(SINDICE.DOMAIN), |
625 | |
ValueFactoryImpl.getInstance().createLiteral(domain), |
626 | |
null, |
627 | |
context |
628 | |
); |
629 | |
} |
630 | |
} |
631 | 0 | } catch (TripleHandlerException e) { |
632 | 0 | throw new ExtractionException("Error while writing triple triple.", e); |
633 | |
} finally { |
634 | 0 | try { |
635 | 0 | output.closeContext(context); |
636 | 0 | } catch (TripleHandlerException e) { |
637 | 0 | throw new ExtractionException("Error while closing context.", e); |
638 | 0 | } |
639 | |
} |
640 | 0 | } |
641 | |
|
642 | |
|
643 | |
|
644 | |
|
645 | |
private ExtractionContext createExtractionContext() { |
646 | 0 | return new ExtractionContext( |
647 | |
"consolidation-extractor", |
648 | |
documentURI, |
649 | |
UUID.randomUUID().toString() |
650 | |
); |
651 | |
} |
652 | |
|
653 | |
|
654 | |
|
655 | |
|
656 | |
|
657 | |
|
658 | |
|
659 | |
|
660 | |
|
661 | |
|
662 | |
private void addNestingRelationship( |
663 | |
List<ResourceRoot> resourceRoots, |
664 | |
List<PropertyPath> propertyPaths, |
665 | |
ExtractionContext context |
666 | |
) throws TripleHandlerException { |
667 | |
ResourceRoot currentResourceRoot; |
668 | |
PropertyPath currentPropertyPath; |
669 | 0 | for (int r = 0; r < resourceRoots.size(); r++) { |
670 | 0 | currentResourceRoot = resourceRoots.get(r); |
671 | 0 | for (int p = 0; p < propertyPaths.size(); p++) { |
672 | 0 | currentPropertyPath = propertyPaths.get(p); |
673 | 0 | Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor(); |
674 | 0 | Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor(); |
675 | |
|
676 | 0 | if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) { |
677 | 0 | continue; |
678 | |
} |
679 | |
|
680 | 0 | if(MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) { |
681 | 0 | continue; |
682 | |
} |
683 | 0 | if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) { |
684 | 0 | createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context); |
685 | |
} |
686 | |
} |
687 | |
} |
688 | 0 | } |
689 | |
|
690 | |
|
691 | |
|
692 | |
|
693 | |
|
694 | |
|
695 | |
|
696 | |
|
697 | |
|
698 | |
|
699 | |
|
700 | |
|
701 | |
|
702 | |
|
703 | |
|
704 | |
|
705 | |
|
706 | |
|
707 | |
|
708 | |
private ExtractionContext consolidateResources( |
709 | |
List<ResourceRoot> resourceRoots, |
710 | |
List<PropertyPath> propertyPaths, |
711 | |
boolean addDomainTriples, |
712 | |
TripleHandler output |
713 | |
) throws ExtractionException { |
714 | 0 | final ExtractionContext context = createExtractionContext(); |
715 | |
|
716 | |
try { |
717 | 0 | output.openContext(context); |
718 | 0 | } catch (TripleHandlerException e) { |
719 | 0 | throw new ExtractionException( |
720 | |
String.format("Error starting document with URI %s", documentURI), |
721 | |
e |
722 | |
); |
723 | 0 | } |
724 | |
|
725 | |
try { |
726 | 0 | if(addDomainTriples) { |
727 | 0 | addDomainTriplesPerResourceRoots(resourceRoots, context); |
728 | |
} |
729 | 0 | addNestingRelationship(resourceRoots, propertyPaths, context); |
730 | 0 | } catch (TripleHandlerException the) { |
731 | 0 | throw new ExtractionException("Error while writing triple triple.", the); |
732 | |
} finally { |
733 | 0 | try { |
734 | 0 | output.closeContext(context); |
735 | 0 | } catch (TripleHandlerException e) { |
736 | 0 | throw new ExtractionException("Error while closing context.", e); |
737 | 0 | } |
738 | |
} |
739 | |
|
740 | 0 | return context; |
741 | |
} |
742 | |
|
743 | |
|
744 | |
|
745 | |
|
746 | |
|
747 | |
|
748 | |
|
749 | |
|
750 | |
|
751 | |
|
752 | |
|
753 | |
|
754 | |
|
755 | |
|
756 | |
|
757 | |
private ExtractionContext consolidateResources( |
758 | |
List<ResourceRoot> resourceRoots, |
759 | |
boolean addDomainTriples, |
760 | |
TripleHandler output |
761 | |
) throws ExtractionException { |
762 | 0 | final ExtractionContext context = createExtractionContext(); |
763 | |
|
764 | |
try { |
765 | 0 | output.openContext(context); |
766 | 0 | } catch (TripleHandlerException e) { |
767 | 0 | throw new ExtractionException( |
768 | |
String.format("Error starting document with URI %s", documentURI), |
769 | |
e |
770 | |
); |
771 | 0 | } |
772 | |
|
773 | |
try { |
774 | 0 | if(addDomainTriples) { |
775 | 0 | addDomainTriplesPerResourceRoots(resourceRoots, context); |
776 | |
} |
777 | |
} finally { |
778 | 0 | try { |
779 | 0 | output.closeContext(context); |
780 | 0 | } catch (TripleHandlerException the) { |
781 | 0 | throw new ExtractionException("Error while closing context.", the); |
782 | 0 | } |
783 | |
} |
784 | |
|
785 | 0 | return context; |
786 | |
} |
787 | |
|
788 | |
|
789 | |
|
790 | |
|
791 | |
|
792 | |
|
793 | |
|
794 | |
|
795 | |
private void addExtractionTimeSizeMetaTriples(ExtractionContext context) |
796 | |
throws TripleHandlerException { |
797 | |
|
798 | 0 | String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date()); |
799 | 0 | output.receiveTriple( |
800 | |
new URIImpl(documentURI.toString()), |
801 | |
vSINDICE.getProperty(SINDICE.DATE), |
802 | |
ValueFactoryImpl.getInstance().createLiteral(xsdDateTimeNow), |
803 | |
null, |
804 | |
context |
805 | |
); |
806 | |
|
807 | |
|
808 | 0 | int numberOfTriples = 0; |
809 | 0 | CompositeTripleHandler cth = (CompositeTripleHandler) output; |
810 | 0 | for (TripleHandler th : cth.getChilds()) { |
811 | 0 | if (th instanceof CountingTripleHandler) { |
812 | 0 | numberOfTriples = ((CountingTripleHandler) th).getCount(); |
813 | |
} |
814 | |
} |
815 | 0 | output.receiveTriple( |
816 | |
new URIImpl(documentURI.toString()), |
817 | |
vSINDICE.getProperty(SINDICE.SIZE), |
818 | |
ValueFactoryImpl.getInstance().createLiteral(numberOfTriples + 1), |
819 | |
null, |
820 | |
context |
821 | |
); |
822 | 0 | } |
823 | |
|
824 | |
|
825 | |
|
826 | |
|
827 | |
|
828 | |
|
829 | |
|
830 | |
|
831 | |
|
832 | |
|
833 | |
private void createNestingRelationship( |
834 | |
PropertyPath from, |
835 | |
ResourceRoot to, |
836 | |
TripleHandler th, |
837 | |
ExtractionContext ec |
838 | |
) throws TripleHandlerException { |
839 | 0 | final BNode fromObject = from.getObject(); |
840 | 0 | final String bNodeHash = from.getProperty().stringValue() + ( fromObject == null ? "" : fromObject.getID() ); |
841 | 0 | BNode bnode = RDFUtils.getBNode(bNodeHash); |
842 | 0 | th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec ); |
843 | 0 | th.receiveTriple( |
844 | |
bnode, |
845 | |
vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), |
846 | |
from.getObject() == null ? to.getRoot() : from.getObject(), |
847 | |
null, |
848 | |
ec |
849 | |
); |
850 | 0 | th.receiveTriple( |
851 | |
from.getSubject(), |
852 | |
vSINDICE.getProperty(SINDICE.NESTING), |
853 | |
bnode, |
854 | |
null, |
855 | |
ec |
856 | |
); |
857 | 0 | } |
858 | |
|
859 | |
|
860 | |
|
861 | |
|
862 | 0 | private class SingleExtractionReport { |
863 | |
private final Collection<ErrorReporter.Error> errors; |
864 | |
private final List<ResourceRoot> resourceRoots; |
865 | |
private final List<PropertyPath> propertyPaths; |
866 | |
|
867 | |
public SingleExtractionReport( |
868 | |
Collection<ErrorReporter.Error> errors, |
869 | |
List<ResourceRoot> resourceRoots, |
870 | |
List<PropertyPath> propertyPaths |
871 | 0 | ) { |
872 | 0 | this.errors = errors; |
873 | 0 | this.resourceRoots = resourceRoots; |
874 | 0 | this.propertyPaths = propertyPaths; |
875 | 0 | } |
876 | |
} |
877 | |
|
878 | |
} |