Coverage Report

Coverage Report - org.apache.any23.extractor.rdfa.RDFaExtractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

RDFaExtractor

0/37

0/4

1.9

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.rdfa;
 
 import org.apache.any23.configuration.DefaultConfiguration;
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.ExtractorFactory;
 import org.apache.any23.extractor.SimpleExtractorFactory;
 import org.apache.any23.extractor.rdf.RDFParserFactory;
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
 import org.openrdf.rio.RDFHandlerException;
 import org.openrdf.rio.RDFParseException;
 import org.openrdf.rio.RDFParser;
 import org.w3c.dom.Document;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringReader;
 import java.io.StringWriter;
 import java.util.Arrays;
 
 /**
  * Extractor for RDFa in HTML, based on Fabien Gadon's XSLT transform, found
  * <a href="http://ns.inria.fr/grddl/rdfa/">here</a>. It works by first
  * parsing the HTML using a tagsoup parser, then applies the XSLT to the
  * DOM tree, then parses the resulting RDF/XML.
  *
  * @author Gabriele Renzi
  * @author Richard Cyganiak (richard@cyganiak.de)
  */
 public class RDFaExtractor implements TagSoupDOMExtractor {
 
     public final static String NAME = "html-rdfa";
 
     public final static String xsltFilename =
             DefaultConfiguration.singleton().getPropertyOrFail("any23.rdfa.extractor.xslt");
 
     private static XSLTStylesheet xslt = null;
 
     public final static ExtractorFactory<RDFaExtractor> factory =
         SimpleExtractorFactory.create(
                 NAME,
                 null,
                 Arrays.asList("text/html;q=0.3", "application/xhtml+xml;q=0.3"),
                 null,
                 RDFaExtractor.class
         );
 
     /**
      * Returns a {@link XSLTStylesheet} able to distill RDFa from
      * HTML pages.
      *
      * @return returns a not <code>null</code> XSLT instance.
      */
     public static synchronized XSLTStylesheet getXSLT() {
         // Lazily initialized static instance, so we don't parse
         // the XSLT unless really necessary, and only once
         if (xslt == null) {
             InputStream in = RDFaExtractor.class.getResourceAsStream(xsltFilename);
             if (in == null) {
                 throw new RuntimeException("Couldn't load '" + xsltFilename +
                         "', maybe the file is not bundled in the jar?");
             }
             xslt = new XSLTStylesheet(in);
         }
         return xslt;
     }
 
     private boolean verifyDataType;
 
     private boolean stopAtFirstError;
 
     /**
      * Constructor, allows to specify the validation and error handling policies.
      *
      * @param verifyDataType if <code>true</code> the data types will be verified,
      *         if <code>false</code> will be ignored.
      * @param stopAtFirstError if <code>true</code> the parser will stop at first parsing error,
      *        if <code>false</code> will ignore non blocking errors.
      */
     public RDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) {
         this.verifyDataType   = verifyDataType;
         this.stopAtFirstError = stopAtFirstError;
     }
 
     /**
      * Default constructor, with no verification of data types and not stop at first error.
      */    
     public RDFaExtractor() {
         this(false, false);
     }
 
     public boolean isVerifyDataType() {
         return verifyDataType;
     }
 
     public void setVerifyDataType(boolean verifyDataType) {
         this.verifyDataType = verifyDataType;
     }
 
     public boolean isStopAtFirstError() {
         return stopAtFirstError;
     }
 
     public void setStopAtFirstError(boolean stopAtFirstError) {
         this.stopAtFirstError = stopAtFirstError;
     }
 
     public void run(
             ExtractionParameters extractionParameters,
             ExtractionContext extractionContext,
             Document in,
             ExtractionResult out
     ) throws IOException, ExtractionException {
 
         StringWriter buffer = new StringWriter();
         try {
             getXSLT().applyTo(in, buffer);
         } catch (XSLTStylesheetException xslte) {
             throw new ExtractionException("An error occurred during the XSLT application.", xslte);
         }
 
         try {
             RDFParser parser
                     = RDFParserFactory.getInstance().getRDFXMLParser(
                         verifyDataType, stopAtFirstError, extractionContext, out
                     );
             parser.parse(
                     new StringReader(buffer.getBuffer().toString()),
                     extractionContext.getDocumentURI().stringValue()
             );
         } catch (RDFHandlerException ex) {
             throw new IllegalStateException(
                     "Should not happen, RDFHandlerAdapter does not throw RDFHandlerException", ex
             );
         } catch (RDFParseException ex) {
             throw new ExtractionException(
                     "Invalid RDF/XML produced by RDFa transform.", ex, out
             );
         }
     }
 
     private String getDocType(Document in) {
         return in.getDoctype().getPublicId();
     }
 
     /**
      * @return the {@link org.apache.any23.extractor.ExtractorDescription} of this extractor
      */
     public ExtractorDescription getDescription() {
         return factory;
     }
 
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.rdfa;
19
20		import org.apache.any23.configuration.DefaultConfiguration;
21		import org.apache.any23.extractor.ExtractionContext;
22		import org.apache.any23.extractor.ExtractionException;
23		import org.apache.any23.extractor.ExtractionParameters;
24		import org.apache.any23.extractor.ExtractionResult;
25		import org.apache.any23.extractor.ExtractorDescription;
26		import org.apache.any23.extractor.ExtractorFactory;
27		import org.apache.any23.extractor.SimpleExtractorFactory;
28		import org.apache.any23.extractor.rdf.RDFParserFactory;
29		import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
30		import org.openrdf.rio.RDFHandlerException;
31		import org.openrdf.rio.RDFParseException;
32		import org.openrdf.rio.RDFParser;
33		import org.w3c.dom.Document;
34
35		import java.io.IOException;
36		import java.io.InputStream;
37		import java.io.StringReader;
38		import java.io.StringWriter;
39		import java.util.Arrays;
40
41		/**
42		* Extractor for RDFa in HTML, based on Fabien Gandon's XSLT transform, found
43		* <a href="http://ns.inria.fr/grddl/rdfa/">here</a>. It works by first
44		* parsing the HTML using a tagsoup parser, then applies the XSLT to the
45		* DOM tree, then parses the resulting RDF/XML.
46		*
47		* @author Gabriele Renzi
48		* @author Richard Cyganiak (richard@cyganiak.de)
49		*/
50	0	public class RDFaExtractor implements TagSoupDOMExtractor {
51
52		public final static String NAME = "html-rdfa";
53
54	0	public final static String xsltFilename =
55		DefaultConfiguration.singleton().getPropertyOrFail("any23.rdfa.extractor.xslt");
56
57	0	private static XSLTStylesheet xslt = null;
58
59	0	public final static ExtractorFactory<RDFaExtractor> factory =
60		SimpleExtractorFactory.create(
61		NAME,
62		null,
63		Arrays.asList("text/html;q=0.3", "application/xhtml+xml;q=0.3"),
64		null,
65		RDFaExtractor.class
66		);
67
68		/**
69		* Returns a {@link XSLTStylesheet} able to distill RDFa from
70		* HTML pages.
71		*
72		* @return returns a not <code>null</code> XSLT instance.
73		*/
74		public static synchronized XSLTStylesheet getXSLT() {
75		// Lazily initialized static instance, so we don't parse
76		// the XSLT unless really necessary, and only once
77	0	if (xslt == null) {
78	0	InputStream in = RDFaExtractor.class.getResourceAsStream(xsltFilename);
79	0	if (in == null) {
80	0	throw new RuntimeException("Couldn't load '" + xsltFilename +
81		"', maybe the file is not bundled in the jar?");
82		}
83	0	xslt = new XSLTStylesheet(in);
84		}
85	0	return xslt;
86		}
87
88		private boolean verifyDataType;
89
90		private boolean stopAtFirstError;
91
92		/**
93		* Constructor, allows to specify the validation and error handling policies.
94		*
95		* @param verifyDataType if <code>true</code> the data types will be verified,
96		* if <code>false</code> will be ignored.
97		* @param stopAtFirstError if <code>true</code> the parser will stop at first parsing error,
98		* if <code>false</code> will ignore non blocking errors.
99		*/
100	0	public RDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) {
101	0	this.verifyDataType = verifyDataType;
102	0	this.stopAtFirstError = stopAtFirstError;
103	0	}
104
105		/**
106		* Default constructor, with no verification of data types and not stop at first error.
107		*/
108		public RDFaExtractor() {
109	0	this(false, false);
110	0	}
111
112		public boolean isVerifyDataType() {
113	0	return verifyDataType;
114		}
115
116		public void setVerifyDataType(boolean verifyDataType) {
117	0	this.verifyDataType = verifyDataType;
118	0	}
119
120		public boolean isStopAtFirstError() {
121	0	return stopAtFirstError;
122		}
123
124		public void setStopAtFirstError(boolean stopAtFirstError) {
125	0	this.stopAtFirstError = stopAtFirstError;
126	0	}
127
128		public void run(
129		ExtractionParameters extractionParameters,
130		ExtractionContext extractionContext,
131		Document in,
132		ExtractionResult out
133		) throws IOException, ExtractionException {
134
135	0	StringWriter buffer = new StringWriter();
136		try {
137	0	getXSLT().applyTo(in, buffer);
138	0	} catch (XSLTStylesheetException xslte) {
139	0	throw new ExtractionException("An error occurred during the XSLT application.", xslte);
140	0	}
141
142		try {
143	0	RDFParser parser
144		= RDFParserFactory.getInstance().getRDFXMLParser(
145		verifyDataType, stopAtFirstError, extractionContext, out
146		);
147	0	parser.parse(
148		new StringReader(buffer.getBuffer().toString()),
149		extractionContext.getDocumentURI().stringValue()
150		);
151	0	} catch (RDFHandlerException ex) {
152	0	throw new IllegalStateException(
153		"Should not happen, RDFHandlerAdapter does not throw RDFHandlerException", ex
154		);
155	0	} catch (RDFParseException ex) {
156	0	throw new ExtractionException(
157		"Invalid RDF/XML produced by RDFa transform.", ex, out
158		);
159	0	}
160	0	}
161
162		private String getDocType(Document in) {
163	0	return in.getDoctype().getPublicId();
164		}
165
166		/**
167		* @return the {@link org.apache.any23.extractor.ExtractorDescription} of this extractor
168		*/
169		public ExtractorDescription getDescription() {
170	0	return factory;
171		}
172
173		}