Coverage Report - org.apache.any23.extractor.SingleDocumentExtraction
 
Classes in this File Line Coverage Branch Coverage Complexity
SingleDocumentExtraction
0%
0/258
0%
0/100
4.286
SingleDocumentExtraction$SingleExtractionReport
0%
0/6
N/A
4.286
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor;
 19  
 
 20  
 import org.apache.any23.configuration.Configuration;
 21  
 import org.apache.any23.configuration.DefaultConfiguration;
 22  
 import org.apache.any23.encoding.EncodingDetector;
 23  
 import org.apache.any23.encoding.TikaEncodingDetector;
 24  
 import org.apache.any23.extractor.html.DocumentReport;
 25  
 import org.apache.any23.extractor.html.HTMLDocument;
 26  
 import org.apache.any23.extractor.html.MicroformatExtractor;
 27  
 import org.apache.any23.extractor.html.TagSoupParser;
 28  
 import org.apache.any23.mime.MIMEType;
 29  
 import org.apache.any23.mime.MIMETypeDetector;
 30  
 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
 31  
 import org.apache.any23.rdf.RDFUtils;
 32  
 import org.apache.any23.source.DocumentSource;
 33  
 import org.apache.any23.source.LocalCopyFactory;
 34  
 import org.apache.any23.source.MemCopyFactory;
 35  
 import org.apache.any23.validator.EmptyValidationReport;
 36  
 import org.apache.any23.validator.ValidatorException;
 37  
 import org.apache.any23.vocab.SINDICE;
 38  
 import org.apache.any23.writer.CompositeTripleHandler;
 39  
 import org.apache.any23.writer.CountingTripleHandler;
 40  
 import org.apache.any23.writer.TripleHandler;
 41  
 import org.apache.any23.writer.TripleHandlerException;
 42  
 import org.apache.any23.extractor.Extractor.BlindExtractor;
 43  
 import org.apache.any23.extractor.Extractor.ContentExtractor;
 44  
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
 45  
 import org.openrdf.model.BNode;
 46  
 import org.openrdf.model.URI;
 47  
 import org.openrdf.model.impl.URIImpl;
 48  
 import org.openrdf.model.impl.ValueFactoryImpl;
 49  
 import org.slf4j.Logger;
 50  
 import org.slf4j.LoggerFactory;
 51  
 
 52  
 import java.io.BufferedInputStream;
 53  
 import java.io.ByteArrayOutputStream;
 54  
 import java.io.IOException;
 55  
 import java.io.InputStream;
 56  
 import java.io.PrintStream;
 57  
 import java.net.URISyntaxException;
 58  
 import java.util.ArrayList;
 59  
 import java.util.Collection;
 60  
 import java.util.Collections;
 61  
 import java.util.Date;
 62  
 import java.util.HashMap;
 63  
 import java.util.List;
 64  
 import java.util.Map;
 65  
 import java.util.UUID;
 66  
 
 67  
 import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
 68  
 import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
 69  
 
 70  
 /**
 71  
 * This class acts as a facade where all the extractors are called on a single document.
 72  
  */
 73  
 public class SingleDocumentExtraction {
 74  
 
 75  
     public static final String EXTRACTION_CONTEXT_URI_PROPERTY = "any23.extraction.context.uri";
 76  
 
 77  
     public static final String METADATA_TIMESIZE_FLAG           = "any23.extraction.metadata.timesize";
 78  
     public static final String METADATA_NESTING_FLAG            = "any23.extraction.metadata.nesting";
 79  
     public static final String METADATA_DOMAIN_PER_ENTITY_FLAG  = "any23.extraction.metadata.domain.per.entity";
 80  
 
 81  0
     private static final SINDICE vSINDICE = SINDICE.getInstance();
 82  
 
 83  0
     private final static Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);
 84  
 
 85  
     private final Configuration configuration;
 86  
 
 87  
     private final DocumentSource in;
 88  
 
 89  
     private URI documentURI;
 90  
     
 91  
     private final ExtractorGroup extractors;
 92  
 
 93  
     private final TripleHandler output;
 94  
 
 95  
     private final EncodingDetector encoderDetector;
 96  
 
 97  0
     private LocalCopyFactory copyFactory = null;
 98  
 
 99  0
     private DocumentSource localDocumentSource = null;
 100  
 
 101  0
     private MIMETypeDetector detector = null;
 102  
 
 103  0
     private ExtractorGroup matchingExtractors = null;
 104  
 
 105  0
     private MIMEType detectedMIMEType = null;
 106  
 
 107  0
     private DocumentReport documentReport = null;
 108  
 
 109  0
     private ExtractionParameters tagSoupDOMRelatedParameters = null;
 110  
 
 111  0
     private String parserEncoding = null;
 112  
 
 113  
     /**
 114  
      * Builds an extractor by the specification of document source,
 115  
      * list of extractors and output triple handler.
 116  
      *
 117  
      * @param configuration configuration applied during extraction.
 118  
      * @param in input document source.
 119  
      * @param extractors list of extractors to be applied.
 120  
      * @param output output triple handler.
 121  
      */
 122  
     public SingleDocumentExtraction(
 123  
             Configuration configuration, DocumentSource in, ExtractorGroup extractors, TripleHandler output
 124  0
     ) {
 125  0
         if(configuration == null) throw new NullPointerException("configuration cannot be null.");
 126  0
         if(in == null)            throw new NullPointerException("in cannot be null.");
 127  0
         this.configuration = configuration;
 128  0
         this.in = in;
 129  0
         this.extractors = extractors;
 130  
 
 131  0
         List<TripleHandler> tripleHandlers = new ArrayList<TripleHandler>();
 132  0
         tripleHandlers.add(output);
 133  0
         tripleHandlers.add(new CountingTripleHandler());
 134  0
         this.output = new CompositeTripleHandler(tripleHandlers);
 135  0
         this.encoderDetector = new TikaEncodingDetector();
 136  0
     }
 137  
 
 138  
     /**
 139  
      * Builds an extractor by the specification of document source,
 140  
      * extractors factory and output triple handler.
 141  
      *
 142  
      * @param configuration configuration applied during extraction.
 143  
      * @param in input document source.
 144  
      * @param factory the extractors factory.
 145  
      * @param output output triple handler.
 146  
      */
 147  
     public SingleDocumentExtraction(
 148  
             Configuration configuration, DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
 149  
     ) {
 150  0
         this(
 151  
                 configuration,
 152  
                 in,
 153  
                 new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
 154  
                 output
 155  
         );
 156  0
         this.setMIMETypeDetector(null);
 157  0
     }
 158  
 
 159  
     /**
 160  
      * Builds an extractor by the specification of document source,
 161  
      * extractors factory and output triple handler, using the
 162  
      * {@link org.apache.any23.configuration.DefaultConfiguration}.
 163  
      *
 164  
      * @param in input document source.
 165  
      * @param factory the extractors factory.
 166  
      * @param output output triple handler.
 167  
      */
 168  
     public SingleDocumentExtraction(
 169  
         DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
 170  
     ) {
 171  0
         this(
 172  
                 DefaultConfiguration.singleton(),
 173  
                 in,
 174  
                 new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
 175  
                 output
 176  
         );
 177  0
         this.setMIMETypeDetector(null);
 178  0
     }
 179  
 
 180  
     /**
 181  
      * Sets the internal factory for generating the document local copy,
 182  
      * if <code>null</code> the {@link org.apache.any23.source.MemCopyFactory} will be used.
 183  
      *
 184  
      * @param copyFactory local copy factory.
 185  
      * @see org.apache.any23.source.DocumentSource
 186  
      */
 187  
     public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
 188  0
         this.copyFactory = copyFactory;
 189  0
     }
 190  
 
 191  
     /**
 192  
      * Sets the internal mime type detector,
 193  
      * if <code>null</code> mimetype detection will
 194  
      * be skipped and all extractors will be activated.
 195  
      *
 196  
      * @param detector detector instance.
 197  
      */
 198  
     public void setMIMETypeDetector(MIMETypeDetector detector) {
 199  0
         this.detector = detector;
 200  0
     }
 201  
 
 202  
     /**
 203  
      * Triggers the execution of all the {@link Extractor}
 204  
      * registered to this class using the specified extraction parameters.
 205  
      *
 206  
      * @param extractionParameters the parameters applied to the run execution.
 207  
      * @return the report generated by the extraction.
 208  
      * @throws ExtractionException if an error occurred during the data extraction.
 209  
      * @throws IOException if an error occurred during the data access.
 210  
      */
 211  
     public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
 212  
     throws ExtractionException, IOException {
 213  0
         if(extractionParameters == null) {
 214  0
             extractionParameters = ExtractionParameters.newDefault(configuration);
 215  
         }
 216  
 
 217  0
         final String contextURI = extractionParameters.getProperty(EXTRACTION_CONTEXT_URI_PROPERTY);
 218  0
         ensureHasLocalCopy();
 219  
         try {
 220  0
             this.documentURI = new Any23ValueFactoryWrapper(
 221  
                     ValueFactoryImpl.getInstance()
 222  
             ).createURI( "?".equals(contextURI) ? in.getDocumentURI() : contextURI);
 223  0
         } catch (Exception ex) {
 224  0
             throw new IllegalArgumentException("Invalid URI: " + in.getDocumentURI(), ex);
 225  0
         }
 226  0
         if(log.isInfoEnabled()) {
 227  0
             log.info("Processing " + this.documentURI);
 228  
         }
 229  0
         filterExtractorsByMIMEType();
 230  
 
 231  0
         if(log.isDebugEnabled()) {
 232  0
             StringBuffer sb = new StringBuffer("Extractors ");
 233  0
             for (ExtractorFactory<?> factory : matchingExtractors) {
 234  0
                 sb.append(factory.getExtractorName());
 235  0
                 sb.append(' ');
 236  
             }
 237  0
             sb.append("match ").append(documentURI);
 238  0
             log.debug(sb.toString());
 239  
         }
 240  
 
 241  
         // Invoke all extractors.
 242  
         try {
 243  0
             output.startDocument(documentURI);
 244  0
         } catch (TripleHandlerException e) {
 245  0
             log.error(String.format("Error starting document with URI %s", documentURI));
 246  0
             throw new ExtractionException(String.format("Error starting document with URI %s", documentURI),
 247  
                     e
 248  
             );
 249  0
         }
 250  0
         output.setContentLength(in.getContentLength());
 251  
         // Create the document context.
 252  0
         final List<ResourceRoot> resourceRoots = new ArrayList<ResourceRoot>();
 253  0
         final List<PropertyPath> propertyPaths = new ArrayList<PropertyPath>();
 254  0
         final Map<String,Collection<ErrorReporter.Error>> extractorToErrors =
 255  
             new HashMap<String,Collection<ErrorReporter.Error>>();
 256  
         try {
 257  0
             final String documentLanguage = extractDocumentLanguage(extractionParameters);
 258  0
             for (ExtractorFactory<?> factory : matchingExtractors) {
 259  0
                 final Extractor extractor = factory.createExtractor();
 260  0
                 final SingleExtractionReport er = runExtractor(
 261  
                         extractionParameters,
 262  
                         documentLanguage,
 263  
                         extractor
 264  
                 );
 265  0
                 resourceRoots.addAll( er.resourceRoots );
 266  0
                 propertyPaths.addAll( er.propertyPaths );
 267  0
                 extractorToErrors.put(factory.getExtractorName(), er.errors);
 268  0
             }
 269  0
         } catch(ValidatorException ve) {
 270  0
             throw new ExtractionException("An error occurred during the validation phase.", ve);
 271  0
         }
 272  
 
 273  
         // Resource consolidation.
 274  0
         final boolean addDomainTriples = extractionParameters.getFlag(METADATA_DOMAIN_PER_ENTITY_FLAG);
 275  
         final ExtractionContext consolidationContext;
 276  0
         if(extractionParameters.getFlag(METADATA_NESTING_FLAG)) {
 277  
             // Consolidation with nesting.
 278  0
             consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output);
 279  
         } else {
 280  0
             consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output);
 281  
         }
 282  
 
 283  
         // Adding time/size meta triples.
 284  0
         if (extractionParameters.getFlag(METADATA_TIMESIZE_FLAG)) {
 285  
             try {
 286  0
                 addExtractionTimeSizeMetaTriples(consolidationContext);
 287  0
             } catch (TripleHandlerException e) {
 288  0
                 throw new ExtractionException(
 289  
                         String.format(
 290  
                                 "Error while adding extraction metadata triples document with URI %s", documentURI
 291  
                         ),
 292  
                         e
 293  
                 );
 294  0
             }
 295  
         }
 296  
 
 297  
         try {
 298  0
             output.endDocument(documentURI);
 299  0
         } catch (TripleHandlerException e) {
 300  0
             log.error(String.format("Error ending document with URI %s", documentURI));
 301  0
             throw new ExtractionException(String.format("Error ending document with URI %s", documentURI),
 302  
                     e
 303  
             );
 304  0
         }
 305  
 
 306  0
         return new SingleDocumentExtractionReport(
 307  
                 documentReport == null
 308  
                         ?
 309  
                 EmptyValidationReport.getInstance() : documentReport.getReport(),
 310  
                 extractorToErrors
 311  
         );
 312  
     }
 313  
 
 314  
     /**
 315  
      * Triggers the execution of all the {@link Extractor}
 316  
      * registered to this class using the <i>default</i> extraction parameters.
 317  
      *
 318  
      * @throws IOException
 319  
      * @throws ExtractionException
 320  
      * @return the extraction report.
 321  
      */
 322  
     public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
 323  0
         return run(ExtractionParameters.newDefault(configuration));
 324  
     }
 325  
 
 326  
     /**
 327  
      * Returns the detected mimetype for the given {@link org.apache.any23.source.DocumentSource}.
 328  
      *
 329  
      * @return string containing the detected mimetype.
 330  
      * @throws IOException if an error occurred while accessing the data.
 331  
      */
 332  
     public String getDetectedMIMEType() throws IOException {
 333  0
         filterExtractorsByMIMEType();
 334  0
         return  detectedMIMEType == null ? null : detectedMIMEType.toString();
 335  
     }
 336  
 
 337  
     /**
 338  
      * Check whether the given {@link org.apache.any23.source.DocumentSource} content activates of not at least an extractor.
 339  
      *
 340  
      * @return <code>true</code> if at least an extractor is activated, <code>false</code> otherwise.
 341  
      * @throws IOException
 342  
      */
 343  
     public boolean hasMatchingExtractors() throws IOException {
 344  0
         filterExtractorsByMIMEType();
 345  0
         return !matchingExtractors.isEmpty();
 346  
     }
 347  
 
 348  
     /**
 349  
      * @return the list of all the activated extractors for the given {@link org.apache.any23.source.DocumentSource}.
 350  
      */
 351  
     public List<Extractor> getMatchingExtractors() {
 352  0
         final List<Extractor> extractorsList = new ArrayList<Extractor>();
 353  0
         for(ExtractorFactory extractorFactory : matchingExtractors) {
 354  0
             extractorsList.add( extractorFactory.createExtractor() );
 355  
         }
 356  0
         return extractorsList;
 357  
     }
 358  
 
 359  
     /**
 360  
      * @return the configured parsing encoding.
 361  
      */
 362  
     public String getParserEncoding() {
 363  0
         if(this.parserEncoding == null) {
 364  0
             this.parserEncoding = detectEncoding();
 365  
         }
 366  0
         return this.parserEncoding;
 367  
     }
 368  
 
 369  
     /**
 370  
      * Sets the document parser encoding.
 371  
      *
 372  
      * @param encoding parser encoding.
 373  
      */
 374  
     public void setParserEncoding(String encoding) {
 375  0
         this.parserEncoding = encoding;
 376  0
         documentReport = null;
 377  0
     }
 378  
 
 379  
     /**
 380  
      * Chech whether the given {@link org.apache.any23.source.DocumentSource} is an <b>HTML</b> document.
 381  
      *
 382  
      * @return <code>true</code> if the document source is an HTML document.
 383  
      * @throws IOException if an error occurs while accessing data.
 384  
      */
 385  
     private boolean isHTMLDocument() throws IOException {
 386  0
         filterExtractorsByMIMEType();
 387  0
         return ! matchingExtractors.filterByMIMEType( MIMEType.parse("text/html") ).isEmpty();
 388  
     }
 389  
 
 390  
     /**
 391  
      * Extracts the document language where possible.
 392  
      *
 393  
      * @param extractionParameters extraction parameters to be applied to determine the document language.
 394  
      * @return the document language if any, <code>null</code> otherwise.
 395  
      * @throws java.io.IOException if an error occurs during the document analysis.
 396  
      * @throws org.apache.any23.validator.ValidatorException
 397  
      */
 398  
     private String extractDocumentLanguage(ExtractionParameters extractionParameters)
 399  
     throws IOException, ValidatorException {
 400  0
         if( ! isHTMLDocument() ) {
 401  0
             return null;
 402  
         }
 403  
         final HTMLDocument document;
 404  
         try {
 405  0
             document = new HTMLDocument( getTagSoupDOM(extractionParameters).getDocument() );
 406  0
         } catch (IOException ioe) {
 407  0
             log.debug("Cannot extract language from document.", ioe);
 408  0
             return null;
 409  0
         }
 410  0
         return document.getDefaultLanguage();
 411  
     }
 412  
 
 413  
     /**
 414  
      * Generates a list of extractors that can be applied to the given document.
 415  
      *
 416  
      * @throws IOException
 417  
      */
 418  
     private void filterExtractorsByMIMEType()
 419  
     throws IOException {
 420  0
         if (matchingExtractors != null) return;  // has already been run.
 421  
 
 422  0
         if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
 423  0
             matchingExtractors = extractors;
 424  0
             return;
 425  
         }
 426  0
         ensureHasLocalCopy();
 427  0
         detectedMIMEType = detector.guessMIMEType(
 428  
                 java.net.URI.create(documentURI.stringValue()).getPath(),
 429  
                 localDocumentSource.openInputStream(),
 430  
                 MIMEType.parse(localDocumentSource.getContentType())
 431  
         );
 432  0
         log.debug("detected media type: " + detectedMIMEType);
 433  0
         matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
 434  0
     }
 435  
 
 436  
     /**
 437  
      * Triggers the execution of a specific {@link Extractor}.
 438  
      * 
 439  
      * @param extractionParameters the parameters used for the extraction.
 440  
      * @param extractor the {@link Extractor} to be executed.
 441  
      * @throws ExtractionException if an error specific to an extractor happens.
 442  
      * @throws IOException if an IO error occurs during the extraction.
 443  
      * @return the roots of the resources that have been extracted.
 444  
      * @throws org.apache.any23.validator.ValidatorException if an error occurs during validation.
 445  
      */
 446  
     private SingleExtractionReport runExtractor(
 447  
             final ExtractionParameters extractionParameters,
 448  
             final String documentLanguage,
 449  
             final Extractor<?> extractor
 450  
     ) throws ExtractionException, IOException, ValidatorException {
 451  0
         if(log.isDebugEnabled()) {
 452  0
             log.debug("Running " + extractor.getDescription().getExtractorName() + " on " + documentURI);
 453  
         }
 454  0
         long startTime = System.currentTimeMillis();
 455  0
         final ExtractionContext extractionContext = new ExtractionContext(
 456  
                 extractor.getDescription().getExtractorName(),
 457  
                 documentURI,
 458  
                 documentLanguage
 459  
         );
 460  0
         final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
 461  
         try {
 462  0
             if (extractor instanceof BlindExtractor) {
 463  0
                 final BlindExtractor blindExtractor = (BlindExtractor) extractor;
 464  0
                 blindExtractor.run(extractionParameters, extractionContext, documentURI, extractionResult);
 465  0
             } else if (extractor instanceof ContentExtractor) {
 466  0
                 ensureHasLocalCopy();
 467  0
                 final ContentExtractor contentExtractor = (ContentExtractor) extractor;
 468  0
                 contentExtractor.run(
 469  
                         extractionParameters,
 470  
                         extractionContext,
 471  
                         localDocumentSource.openInputStream(),
 472  
                         extractionResult
 473  
                 );
 474  0
             } else if (extractor instanceof TagSoupDOMExtractor) {
 475  0
                 final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
 476  0
                 final DocumentReport documentReport = getTagSoupDOM(extractionParameters);
 477  0
                 tagSoupDOMExtractor.run(
 478  
                         extractionParameters,
 479  
                         extractionContext,
 480  
                         documentReport.getDocument(),
 481  
                         extractionResult
 482  
                 );
 483  0
             } else {
 484  0
                 throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
 485  
             }
 486  0
             return
 487  
                 new SingleExtractionReport(
 488  
                     extractionResult.getErrors(),
 489  
                     new ArrayList<ResourceRoot>( extractionResult.getResourceRoots() ),
 490  
                     new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() )
 491  
                 );
 492  0
         } catch (ExtractionException ex) {
 493  0
             if(log.isDebugEnabled()) {
 494  0
                 log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
 495  
             }
 496  0
             throw ex;
 497  
         } finally {
 498  
             // Logging result error report.
 499  0
             if( log.isDebugEnabled() && extractionResult.hasErrors() ) {
 500  0
                 ByteArrayOutputStream baos = new ByteArrayOutputStream();
 501  0
                 extractionResult.printErrorsReport( new PrintStream(baos) );
 502  0
                 log.debug(baos.toString());
 503  
             }
 504  0
             extractionResult.close();
 505  
 
 506  0
             long elapsed = System.currentTimeMillis() - startTime;
 507  0
             if(log.isDebugEnabled()) {
 508  0
                 log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
 509  
             }
 510  0
         }
 511  
     }
 512  
 
 513  
     /**
 514  
      * Forces the retrieval of the document data.
 515  
      *
 516  
      * @throws IOException
 517  
      */
 518  
     private void ensureHasLocalCopy() throws IOException {
 519  0
         if (localDocumentSource != null) return;
 520  0
         if (in.isLocal()) {
 521  0
             localDocumentSource = in;
 522  0
             return;
 523  
         }
 524  0
         if (copyFactory == null) {
 525  0
             copyFactory = new MemCopyFactory();
 526  
         }
 527  0
         localDocumentSource = copyFactory.createLocalCopy(in);
 528  0
     }
 529  
 
 530  
     /**
 531  
      * Returns the DOM of the given document source (that must be an HTML stream)
 532  
      * and the report of eventual fixes applied on it.
 533  
      *
 534  
      * @param extractionParameters parameters to be used during extraction.
 535  
      * @return document report.
 536  
      * @throws IOException if an error occurs during data access.
 537  
      * @throws ValidatorException if an error occurs during validation.
 538  
      */
 539  
     private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
 540  
     throws IOException, ValidatorException {
 541  0
         if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
 542  0
             ensureHasLocalCopy();
 543  0
             final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
 544  0
             is.mark(Integer.MAX_VALUE);
 545  0
             final String candidateEncoding = getParserEncoding();
 546  0
             is.reset();
 547  0
             final TagSoupParser tagSoupParser = new TagSoupParser(
 548  
                     is,
 549  
                     documentURI.stringValue(),
 550  
                     candidateEncoding
 551  
             );
 552  0
             if(extractionParameters.isValidate()) {
 553  0
                 documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
 554  
             } else {
 555  0
                 documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
 556  
             }
 557  0
             tagSoupDOMRelatedParameters = extractionParameters;
 558  
         }
 559  0
         return documentReport;
 560  
     }
 561  
 
 562  
     /**
 563  
      * Detects the encoding of the local document source input stream.
 564  
      * 
 565  
      * @return a valid encoding value.
 566  
      */
 567  
     private String detectEncoding() {
 568  
         try {
 569  0
             ensureHasLocalCopy();
 570  0
             InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
 571  0
             String encoding = this.encoderDetector.guessEncoding(is);
 572  0
             is.close();
 573  0
             return encoding;
 574  0
         } catch (Exception e) {
 575  0
             throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
 576  
         }
 577  
     }
 578  
 
 579  
     /**
 580  
      * This function verifies if the <i>candidateSub</i> list of strings
 581  
      * is a prefix of <i>list</i>.
 582  
      *
 583  
      * @param list a list of strings.
 584  
      * @param candidateSub a list of strings.
 585  
      * @return <code>true</code> if <i>candidateSub</i> is a sub path of <i>list</i>,
 586  
      *         <code>false</code> otherwise.
 587  
      */
 588  
     private boolean subPath(String[] list, String[] candidateSub) {
 589  0
         if(candidateSub.length > list.length) {
 590  0
             return false;
 591  
         }
 592  0
         for(int i = 0; i < candidateSub.length; i++) {
 593  0
             if( ! candidateSub[i].equals(list[i])) {
 594  0
                 return false;
 595  
             }
 596  
         }
 597  0
         return true;
 598  
     }
 599  
 
 600  
     /**
 601  
      * Adds for every resource root node a page domain triple.
 602  
      *
 603  
      * @param resourceRoots list of resource roots.
 604  
      * @param context extraction context to produce triples.
 605  
      * @throws ExtractionException
 606  
      */
 607  
     private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
 608  
     throws ExtractionException {
 609  
         try {
 610  
             // Add source Web domains to every resource root.
 611  
             String domain;
 612  
             try {
 613  0
                 domain = new java.net.URI(in.getDocumentURI()).getHost();
 614  0
             } catch (URISyntaxException urise) {
 615  0
                 throw new IllegalArgumentException(
 616  
                         "An error occurred while extracting the host from the document URI.",
 617  
                         urise
 618  
                 );
 619  0
             }
 620  0
             if (domain != null) {
 621  0
                 for (ResourceRoot resourceRoot : resourceRoots) {
 622  0
                     output.receiveTriple(
 623  
                             resourceRoot.getRoot(),
 624  
                             vSINDICE.getProperty(SINDICE.DOMAIN),
 625  
                             ValueFactoryImpl.getInstance().createLiteral(domain),
 626  
                             null,
 627  
                             context
 628  
                     );
 629  
                 }
 630  
             }
 631  0
         } catch (TripleHandlerException e) {
 632  0
             throw new ExtractionException("Error while writing triple triple.", e);
 633  
         } finally {
 634  0
             try {
 635  0
                 output.closeContext(context);
 636  0
             } catch (TripleHandlerException e) {
 637  0
                 throw new ExtractionException("Error while closing context.", e);
 638  0
             }
 639  
         }
 640  0
     }
 641  
 
 642  
     /**
 643  
      * @return an extraction context specific for consolidation triples.
 644  
      */
 645  
     private ExtractionContext createExtractionContext() {
 646  0
         return new ExtractionContext(
 647  
                 "consolidation-extractor",
 648  
                 documentURI,
 649  
                 UUID.randomUUID().toString()
 650  
         );
 651  
     }
 652  
 
 653  
     /**
 654  
      * Detect the nesting relationship among different
 655  
      * Microformats and explicit them adding connection triples.
 656  
      *
 657  
      * @param resourceRoots
 658  
      * @param propertyPaths
 659  
      * @param context
 660  
      * @throws TripleHandlerException
 661  
      */
 662  
     private void addNestingRelationship(
 663  
             List<ResourceRoot> resourceRoots,
 664  
             List<PropertyPath> propertyPaths,
 665  
             ExtractionContext context
 666  
     ) throws TripleHandlerException {
 667  
         ResourceRoot currentResourceRoot;
 668  
         PropertyPath currentPropertyPath;
 669  0
         for (int r = 0; r < resourceRoots.size(); r++) {
 670  0
             currentResourceRoot = resourceRoots.get(r);
 671  0
             for (int p = 0; p < propertyPaths.size(); p++) {
 672  0
                 currentPropertyPath = propertyPaths.get(p);
 673  0
                 Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor();
 674  0
                 Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor();
 675  
                 // Avoid wrong nesting relationships.
 676  0
                 if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) {
 677  0
                     continue;
 678  
                 }
 679  
                 // Avoid self declaring relationships
 680  0
                 if(MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) {
 681  0
                     continue;
 682  
                 }
 683  0
                 if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) {
 684  0
                     createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context);
 685  
                 }
 686  
             }
 687  
         }
 688  0
     }
 689  
 
 690  
     /**
 691  
      * This method consolidates the graphs extracted from the same document.
 692  
      * In particular it adds:
 693  
      * <ul>
 694  
      *   <li>for every microformat root node a triple indicating the original Web page domain;</li>
 695  
      *   <li>triples indicating the nesting relationship among a microformat root and property paths of
 696  
      *       other nested microformats.
 697  
      *   </li>
 698  
      * </ul>
 699  
      * @param resourceRoots list of RDF nodes representing roots of
 700  
      *        extracted microformat graphs and the corresponding HTML paths.
 701  
      * @param propertyPaths list of RDF nodes representing property subjects, property URIs and the HTML paths
 702  
      *        from which such properties have been extracted. 
 703  
      * @param addDomainTriples
 704  
      * @param output a triple handler event collector.
 705  
      * @return
 706  
      * @throws ExtractionException
 707  
      */
 708  
     private ExtractionContext consolidateResources(
 709  
             List<ResourceRoot> resourceRoots,
 710  
             List<PropertyPath> propertyPaths,
 711  
             boolean addDomainTriples,
 712  
             TripleHandler output
 713  
     ) throws ExtractionException {
 714  0
         final ExtractionContext context = createExtractionContext();
 715  
 
 716  
         try {
 717  0
             output.openContext(context);
 718  0
         } catch (TripleHandlerException e) {
 719  0
             throw new ExtractionException(
 720  
                     String.format("Error starting document with URI %s", documentURI),
 721  
                     e
 722  
             );
 723  0
         }
 724  
 
 725  
         try {
 726  0
             if(addDomainTriples) {
 727  0
                 addDomainTriplesPerResourceRoots(resourceRoots, context);
 728  
             }
 729  0
             addNestingRelationship(resourceRoots, propertyPaths, context);
 730  0
         } catch (TripleHandlerException the) {
 731  0
             throw new ExtractionException("Error while writing triple triple.", the);
 732  
         } finally {
 733  0
             try {
 734  0
                 output.closeContext(context);
 735  0
             } catch (TripleHandlerException e) {
 736  0
                 throw new ExtractionException("Error while closing context.", e);
 737  0
             }
 738  
         }
 739  
 
 740  0
         return context;
 741  
     }
 742  
 
 743  
     /**
 744  
      * This method consolidates the graphs extracted from the same document.
 745  
      * In particular it adds:
 746  
      * <ul>
 747  
      *   <li>for every microformat root node a triple indicating the original Web page domain;</li>
 748  
      * </ul>
 749  
      * @param resourceRoots list of RDF nodes representing roots of
 750  
      *        extracted microformat graphs and the corresponding HTML paths.
 751  
      *        from which such properties have been extracted.
 752  
      * @param addDomainTriples
 753  
      * @param output a triple handler event collector.
 754  
      * @return
 755  
      * @throws ExtractionException
 756  
      */
 757  
     private ExtractionContext consolidateResources(
 758  
             List<ResourceRoot> resourceRoots,
 759  
             boolean addDomainTriples,
 760  
             TripleHandler output
 761  
     ) throws ExtractionException {
 762  0
         final ExtractionContext context = createExtractionContext();
 763  
 
 764  
         try {
 765  0
             output.openContext(context);
 766  0
         } catch (TripleHandlerException e) {
 767  0
             throw new ExtractionException(
 768  
                     String.format("Error starting document with URI %s", documentURI),
 769  
                     e
 770  
             );
 771  0
         }
 772  
 
 773  
         try {
 774  0
             if(addDomainTriples) {
 775  0
                 addDomainTriplesPerResourceRoots(resourceRoots, context);
 776  
             }
 777  
         } finally {
 778  0
             try {
 779  0
                 output.closeContext(context);
 780  0
             } catch (TripleHandlerException the) {
 781  0
                 throw new ExtractionException("Error while closing context.", the);
 782  0
             }
 783  
         }
 784  
 
 785  0
         return context;
 786  
     }
 787  
 
 788  
     /**
 789  
      * Adds metadata triples containing the number of extracted triples
 790  
      * and the extraction timestamp.
 791  
      *
 792  
      * @param context
 793  
      * @throws TripleHandlerException
 794  
      */
 795  
     private void addExtractionTimeSizeMetaTriples(ExtractionContext context)
 796  
     throws TripleHandlerException {
 797  
         // adding extraction date
 798  0
         String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date());
 799  0
         output.receiveTriple(
 800  
                 new URIImpl(documentURI.toString()),
 801  
                 vSINDICE.getProperty(SINDICE.DATE),
 802  
                 ValueFactoryImpl.getInstance().createLiteral(xsdDateTimeNow),
 803  
                 null,
 804  
                 context
 805  
         );
 806  
 
 807  
         // adding number of extracted triples
 808  0
         int numberOfTriples = 0;
 809  0
         CompositeTripleHandler cth = (CompositeTripleHandler) output;
 810  0
         for (TripleHandler th : cth.getChilds()) {
 811  0
             if (th instanceof CountingTripleHandler) {
 812  0
                 numberOfTriples = ((CountingTripleHandler) th).getCount();
 813  
             }
 814  
         }
 815  0
         output.receiveTriple(
 816  
                 new URIImpl(documentURI.toString()),
 817  
                 vSINDICE.getProperty(SINDICE.SIZE),
 818  
                 ValueFactoryImpl.getInstance().createLiteral(numberOfTriples + 1), // the number of triples plus itself
 819  
                 null,
 820  
                 context
 821  
         );
 822  0
     }
 823  
 
 824  
     /**
 825  
      * Creates a nesting relationship triple.
 826  
      * 
 827  
      * @param from the property containing the nested microformat.
 828  
      * @param to the root to the nested microformat.
 829  
      * @param th the triple handler.
 830  
      * @param ec the extraction context used to add such information.
 831  
      * @throws org.apache.any23.writer.TripleHandlerException
 832  
      */
 833  
     private void createNestingRelationship(
 834  
             PropertyPath from,
 835  
             ResourceRoot to,
 836  
             TripleHandler th,
 837  
             ExtractionContext ec
 838  
     ) throws TripleHandlerException {
 839  0
         final BNode fromObject = from.getObject();
 840  0
         final String bNodeHash = from.getProperty().stringValue() + ( fromObject == null ? "" : fromObject.getID() );
 841  0
         BNode bnode = RDFUtils.getBNode(bNodeHash);
 842  0
         th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec );
 843  0
         th.receiveTriple(
 844  
                 bnode,
 845  
                 vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
 846  
                 from.getObject() == null ? to.getRoot() : from.getObject(),
 847  
                 null,
 848  
                 ec
 849  
         );
 850  0
         th.receiveTriple(
 851  
                 from.getSubject(),
 852  
                 vSINDICE.getProperty(SINDICE.NESTING),
 853  
                 bnode,
 854  
                 null,
 855  
                 ec
 856  
         );
 857  0
     }
 858  
 
 859  
     /**
 860  
      * Entity detection report.
 861  
      */
 862  0
     private class SingleExtractionReport {
 863  
         private final Collection<ErrorReporter.Error> errors;
 864  
         private final List<ResourceRoot> resourceRoots;
 865  
         private final List<PropertyPath> propertyPaths;
 866  
 
 867  
         public SingleExtractionReport(
 868  
                 Collection<ErrorReporter.Error>  errors,
 869  
                 List<ResourceRoot> resourceRoots,
 870  
                 List<PropertyPath> propertyPaths
 871  0
         ) {
 872  0
             this.errors        = errors;
 873  0
             this.resourceRoots = resourceRoots;
 874  0
             this.propertyPaths = propertyPaths;
 875  0
         }
 876  
     }
 877  
 
 878  
 }