Coverage Report - org.apache.any23.extractor.html.TurtleHTMLExtractor
 
Classes in this File Line Coverage Branch Coverage Complexity
TurtleHTMLExtractor
0%
0/26
0%
0/8
2.5
 
 1  
 /*
 2  
  * Licensed to the Apache Software Foundation (ASF) under one or more
 3  
  * contributor license agreements.  See the NOTICE file distributed with
 4  
  * this work for additional information regarding copyright ownership.
 5  
  * The ASF licenses this file to You under the Apache License, Version 2.0
 6  
  * (the "License"); you may not use this file except in compliance with
 7  
  * the License.  You may obtain a copy of the License at
 8  
  *
 9  
  *  http://www.apache.org/licenses/LICENSE-2.0
 10  
  *
 11  
  * Unless required by applicable law or agreed to in writing, software
 12  
  * distributed under the License is distributed on an "AS IS" BASIS,
 13  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14  
  * See the License for the specific language governing permissions and
 15  
  * limitations under the License.
 16  
  */
 17  
 
 18  
 package org.apache.any23.extractor.html;
 19  
 
 20  
 import org.apache.any23.extractor.ErrorReporter;
 21  
 import org.apache.any23.extractor.ExtractionContext;
 22  
 import org.apache.any23.extractor.ExtractionException;
 23  
 import org.apache.any23.extractor.ExtractionParameters;
 24  
 import org.apache.any23.extractor.ExtractionResult;
 25  
 import org.apache.any23.extractor.Extractor;
 26  
 import org.apache.any23.extractor.ExtractorDescription;
 27  
 import org.apache.any23.extractor.ExtractorFactory;
 28  
 import org.apache.any23.extractor.SimpleExtractorFactory;
 29  
 import org.apache.any23.extractor.rdf.RDFParserFactory;
 30  
 import org.apache.any23.rdf.PopularPrefixes;
 31  
 import org.openrdf.model.URI;
 32  
 import org.openrdf.rio.RDFParseException;
 33  
 import org.openrdf.rio.turtle.TurtleParser;
 34  
 import org.w3c.dom.Document;
 35  
 import org.w3c.dom.Node;
 36  
 
 37  
 import java.io.IOException;
 38  
 import java.io.StringReader;
 39  
 import java.util.Arrays;
 40  
 import java.util.List;
 41  
 
 42  
 /**
 43  
  * Extractor for <i>Turtle/N3</i> format embedded within <i>HTML</i>
 44  
  * <i>script</i> tags.
 45  
  *
 46  
  * See specification draft <a href="http://esw.w3.org/N3inHTML">here</a>. 
 47  
  *
 48  
  * @author Michele Mostarda (mostarda@fbk.eu)
 49  
  */
 50  0
 public class TurtleHTMLExtractor implements Extractor.TagSoupDOMExtractor {
 51  
 
 52  
     public final static String NAME = "html-script-turtle";
 53  
 
 54  0
     public final static ExtractorFactory<TurtleHTMLExtractor> factory =
 55  
             SimpleExtractorFactory.create(
 56  
                     NAME,
 57  
                     PopularPrefixes.get(),
 58  
                     Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
 59  
                     "example-script-turtle.html",
 60  
                     TurtleHTMLExtractor.class
 61  
             );
 62  
 
 63  
     private TurtleParser turtleParser;
 64  
 
 65  
     public void run(
 66  
             ExtractionParameters extractionParameters,
 67  
             ExtractionContext extractionContext,
 68  
             Document in,
 69  
             ExtractionResult out
 70  
     ) throws IOException, ExtractionException {
 71  
         List<Node> scriptNodes;
 72  0
         HTMLDocument htmlDocument = new HTMLDocument(in);
 73  0
         final URI documentURI = extractionContext.getDocumentURI();
 74  
 
 75  0
         scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/turtle')]");
 76  0
         processScriptNodes(documentURI, extractionContext, out, scriptNodes);
 77  
 
 78  0
         scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/n3')]");
 79  0
         processScriptNodes(documentURI, extractionContext, out, scriptNodes);
 80  
 
 81  0
         scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/plain')]");
 82  0
         processScriptNodes(documentURI, extractionContext,out, scriptNodes);
 83  0
     }
 84  
 
 85  
     public ExtractorDescription getDescription() {
 86  0
         return factory;
 87  
     }
 88  
 
 89  
     /**
 90  
      * Processes a list of <i>html script</i> nodes retrieving the N3 / Turtle content.
 91  
      *
 92  
      * @param documentURI the URI of the original HTML document.
 93  
      * @param er the extraction result used to store triples.
 94  
      * @param ns the list of script nodes.
 95  
      */
 96  
     private void processScriptNodes(URI documentURI, ExtractionContext ec, ExtractionResult er, List<Node> ns) {
 97  0
         if(ns.size() > 0 && turtleParser == null) {
 98  0
             turtleParser = RDFParserFactory.getInstance().getTurtleParserInstance(true, false, ec, er);
 99  
         }
 100  0
         for(Node n : ns) {
 101  0
             processScriptNode(turtleParser, documentURI, n, er);
 102  
         }
 103  0
     }
 104  
 
 105  
     /**
 106  
      * Processes a single <i>html script</i> node.
 107  
      *
 108  
      * @param turtleParser the parser used to digest node content.
 109  
      * @param documentURI the URI of the original HTML document.
 110  
      * @param n the script node.
 111  
      * @param er the extraction result used to store triples.
 112  
      */
 113  
     private void processScriptNode(TurtleParser turtleParser, URI documentURI, Node n, ExtractionResult er) {
 114  0
         final Node idAttribute = n.getAttributes().getNamedItem("id");
 115  0
         final String graphName =
 116  
                 documentURI.stringValue() +
 117  
                 ( idAttribute == null ? "" : "#" +   idAttribute.getTextContent() ); 
 118  
         try {
 119  0
             turtleParser.parse( new StringReader(n.getTextContent()), graphName );
 120  0
         } catch (RDFParseException rdfpe) {
 121  0
             er.notifyError(
 122  
                     ErrorReporter.ErrorLevel.ERROR,
 123  
                     String.format(
 124  
                             "An error occurred while parsing turtle content within script node: %s",
 125  
                             Arrays.toString( DomUtils.getXPathListForNode(n) )
 126  
                     ),
 127  
                     rdfpe.getLineNumber(), rdfpe.getColumnNumber()
 128  
             );
 129  0
         } catch (Exception e) {
 130  0
             er.notifyError(ErrorReporter.ErrorLevel.ERROR, "An error occurred while processing RDF data.", -1 , -1);
 131  0
         }
 132  0
     }
 133  
 
 134  
 }