Coverage Report

Coverage Report - org.apache.any23.extractor.html.MicroformatExtractor

Classes in this File

Line Coverage

Branch Coverage

Complexity

MicroformatExtractor

0/53

0/24

1.833

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.html;
 
 import org.apache.any23.extractor.ExtractionContext;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.ExtractionResult;
 import org.apache.any23.extractor.ExtractorDescription;
 import org.apache.any23.extractor.TagSoupExtractionResult;
 import org.apache.any23.extractor.html.annotations.Includes;
 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
 import org.openrdf.model.BNode;
 import org.openrdf.model.Literal;
 import org.openrdf.model.Resource;
 import org.openrdf.model.URI;
 import org.openrdf.model.impl.ValueFactoryImpl;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
 
 import java.io.IOException;
 
 /**
  * The abstract base class for any
  * <a href="microformats.org/">Microformat specification</a> extractor.
  */
 public abstract class MicroformatExtractor implements TagSoupDOMExtractor {
 
     public static final String BEGIN_SCRIPT = "<script>";
     public static final String END_SCRIPT   = "</script>";
 
     private HTMLDocument htmlDocument;
 
     private ExtractionContext context;
 
     private URI documentURI;
 
     private ExtractionResult out;
 
     protected final Any23ValueFactoryWrapper valueFactory =
             new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
 
     /**
      * Returns the description of this extractor.
      *
      * @return a human readable description.
      */
     public abstract ExtractorDescription getDescription();
 
     /**
      * Performs the extraction of the data and writes them to the model.
      * The nodes generated in the model can have any name or implicit label
      * but if possible they </i>SHOULD</i> have names (either URIs or AnonId) that
      * are uniquely derivable from their position in the DOM tree, so that
      * multiple extractors can merge information.
      */
     protected abstract boolean extract() throws ExtractionException;
 
     public HTMLDocument getHTMLDocument() {
         return htmlDocument;
     }
 
     public ExtractionContext getExtractionContext() {
         return context;
     }
 
     public URI getDocumentURI() {
         return documentURI;
     }
 
     public final void run(
             ExtractionParameters extractionParameters,
             ExtractionContext extractionContext,
             Document in,
             ExtractionResult out
     ) throws IOException, ExtractionException {
         this.htmlDocument = new HTMLDocument(in);
         this.context      = extractionContext;
         this.documentURI  = extractionContext.getDocumentURI();
         this.out          = out;
         valueFactory.setErrorReporter(out);
         try {
             extract();
         } finally {
             valueFactory.setErrorReporter(null);
         }
     }
 
     /**
      * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated
      * to the extraction session.
      *
      * @return a valid extraction result.
      */
     protected ExtractionResult getCurrentExtractionResult() {
         return out;
     }
 
     protected ExtractionResult openSubResult(ExtractionContext context) {
         return out.openSubResult(context);
     }
 
     /**
      * Helper method that adds a literal property to a subject only if the value of the property
      * is a valid string.
      *
      * @param n the <i>HTML</i> node from which the property value has been extracted.
      * @param subject the property subject.
      * @param p the property URI.
      * @param value the property value.
      * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
      */
     protected boolean conditionallyAddStringProperty(
             Node n,
             Resource subject, URI p, String value
     ) {
         if (value == null) return false;
         value = value.trim();
         return
                 value.length() > 0 
                         &&
                 conditionallyAddLiteralProperty(
                         n,
                         subject, p, valueFactory.createLiteral(value)
                 );
     }
 
     /**
      * Helper method that adds a literal property to a node.
      *
      * @param n the <i>HTML</i> node from which the property value has been extracted.
      * @param subject subject the property subject.
      * @param property the property URI.
      * @param literal value the property value.
      * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
      */
     protected boolean conditionallyAddLiteralProperty(
             Node n,
             Resource subject,
             URI property,
             Literal literal
     ) {
         final String literalStr = literal.stringValue();
         if( containsScriptBlock(literalStr) ) {
             out.notifyError(
                     ExtractionResult.ErrorLevel.WARN,
                     String.format("Detected script in literal: [%s]", literalStr)
                     ,-1
                     ,-1
             );
             return false;
         }
         out.writeTriple(subject, property, literal);
         TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
         tser.addPropertyPath(this.getClass(), subject, property, null, DomUtils.getXPathListForNode(n) );
         return true;
     }
 
     /**
      * Helper method that adds a URI property to a node.
      * @param subject the property subject.
      * @param property the property URI.
      * @param uri the property object.
      * @return <code>true</code> if the the resource has been added, <code>false</code> otherwise. 
      */
     protected boolean conditionallyAddResourceProperty(Resource subject, URI property, URI uri) {
         if (uri == null) return false;
         out.writeTriple(subject, property, uri);
         return true;
     }
 
     /**
      * Helper method that adds a BNode property to a node.
      *
      * @param n the <i>HTML</i> node used for extracting such property.
      * @param subject the property subject.
      * @param property the property URI.
      * @param bnode the property value.
      */
     protected void addBNodeProperty(Node n, Resource subject, URI property, BNode bnode) {
         out.writeTriple(subject, property, bnode);
         TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
         tser.addPropertyPath(this.getClass(), subject, property, bnode, DomUtils.getXPathListForNode(n) );
     }
 
     /**
      * Helper method that adds a BNode property to a node.
      *
      * @param subject the property subject.
      * @param property the property URI.
      * @param bnode the property value.
      */
     protected void addBNodeProperty( Resource subject, URI property, BNode bnode) {
         out.writeTriple(subject, property, bnode);
     }
 
     /**
      * Helper method that adds a URI property to a node.
      *
      * @param subject
      * @param property
      * @param object
      */
     protected void addURIProperty(Resource subject, URI property, URI object) {
         out.writeTriple(subject, property, object);    
     }
 
     protected URI fixLink(String link) {
         return valueFactory.fixLink(link, null);
     }
 
     protected URI fixLink(String link, String defaultSchema) {
         return valueFactory.fixLink(link, defaultSchema);
     }
 
     private boolean containsScriptBlock(String in) {
         final String inLowerCase = in.toLowerCase();
         final int beginBlock = inLowerCase.indexOf(BEGIN_SCRIPT);
         if(beginBlock == -1) {
             return false;
         }
         return inLowerCase.indexOf(END_SCRIPT, beginBlock + BEGIN_SCRIPT.length()) != -1;
     }
 
         /**
      * This method checks if there is a native nesting relationship between two
      * {@link MicroformatExtractor}.
      *
      * @see {@link org.apache.any23.extractor.html.annotations.Includes}
      * @param including the including {@link MicroformatExtractor}
      * @param included the included {@link MicroformatExtractor}
      * @return <code>true</code> if there is a declared nesting relationship
      */
     public static boolean includes(
             Class<? extends MicroformatExtractor>including,
             Class<? extends MicroformatExtractor> included) {
         Includes includes = including.getAnnotation(Includes.class);
         if (includes != null) {
             Class<? extends MicroformatExtractor>[] extractors = includes.extractors();
             if (extractors != null && extractors.length > 0) {
                 for (Class<? extends MicroformatExtractor> extractor : extractors) {
                     if (extractor.equals(included)) {
                         return true;
                     }
                 }
             }
         }
         return false;
     }
 
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.html;
19
20		import org.apache.any23.extractor.ExtractionContext;
21		import org.apache.any23.extractor.ExtractionException;
22		import org.apache.any23.extractor.ExtractionParameters;
23		import org.apache.any23.extractor.ExtractionResult;
24		import org.apache.any23.extractor.ExtractorDescription;
25		import org.apache.any23.extractor.TagSoupExtractionResult;
26		import org.apache.any23.extractor.html.annotations.Includes;
27		import org.apache.any23.rdf.Any23ValueFactoryWrapper;
28		import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
29		import org.openrdf.model.BNode;
30		import org.openrdf.model.Literal;
31		import org.openrdf.model.Resource;
32		import org.openrdf.model.URI;
33		import org.openrdf.model.impl.ValueFactoryImpl;
34		import org.w3c.dom.Document;
35		import org.w3c.dom.Node;
36
37		import java.io.IOException;
38
39		/**
40		* The abstract base class for any
41		* <a href="http://microformats.org/">Microformat specification</a> extractor.
42		*/
43	0	public abstract class MicroformatExtractor implements TagSoupDOMExtractor {
44
45		public static final String BEGIN_SCRIPT = "<script>";
46		public static final String END_SCRIPT = "</script>";
47
48		private HTMLDocument htmlDocument;
49
50		private ExtractionContext context;
51
52		private URI documentURI;
53
54		private ExtractionResult out;
55
56	0	protected final Any23ValueFactoryWrapper valueFactory =
57		new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
58
59		/**
60		* Returns the description of this extractor.
61		*
62		* @return a human readable description.
63		*/
64		public abstract ExtractorDescription getDescription();
65
66		/**
67		* Performs the extraction of the data and writes them to the model.
68		* The nodes generated in the model can have any name or implicit label
69		* but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that
70		* are uniquely derivable from their position in the DOM tree, so that
71		* multiple extractors can merge information.
72		*/
73		protected abstract boolean extract() throws ExtractionException;
74
75		public HTMLDocument getHTMLDocument() {
76	0	return htmlDocument;
77		}
78
79		public ExtractionContext getExtractionContext() {
80	0	return context;
81		}
82
83		public URI getDocumentURI() {
84	0	return documentURI;
85		}
86
87		public final void run(
88		ExtractionParameters extractionParameters,
89		ExtractionContext extractionContext,
90		Document in,
91		ExtractionResult out
92		) throws IOException, ExtractionException {
93	0	this.htmlDocument = new HTMLDocument(in);
94	0	this.context = extractionContext;
95	0	this.documentURI = extractionContext.getDocumentURI();
96	0	this.out = out;
97	0	valueFactory.setErrorReporter(out);
98		try {
99	0	extract();
100		} finally {
101	0	valueFactory.setErrorReporter(null);
102	0	}
103	0	}
104
105		/**
106		* Returns the {@link org.apache.any23.extractor.ExtractionResult} associated
107		* to the extraction session.
108		*
109		* @return a valid extraction result.
110		*/
111		protected ExtractionResult getCurrentExtractionResult() {
112	0	return out;
113		}
114
115		protected ExtractionResult openSubResult(ExtractionContext context) {
116	0	return out.openSubResult(context);
117		}
118
119		/**
120		* Helper method that adds a literal property to a subject only if the value of the property
121		* is a valid string.
122		*
123		* @param n the <i>HTML</i> node from which the property value has been extracted.
124		* @param subject the property subject.
125		* @param p the property URI.
126		* @param value the property value.
127		* @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
128		*/
129		protected boolean conditionallyAddStringProperty(
130		Node n,
131		Resource subject, URI p, String value
132		) {
133	0	if (value == null) return false;
134	0	value = value.trim();
135	0	return
136		value.length() > 0
137		&&
138		conditionallyAddLiteralProperty(
139		n,
140		subject, p, valueFactory.createLiteral(value)
141		);
142		}
143
144		/**
145		* Helper method that adds a literal property to a node.
146		*
147		* @param n the <i>HTML</i> node from which the property value has been extracted.
148		* @param subject subject the property subject.
149		* @param property the property URI.
150		* @param literal value the property value.
151		* @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
152		*/
153		protected boolean conditionallyAddLiteralProperty(
154		Node n,
155		Resource subject,
156		URI property,
157		Literal literal
158		) {
159	0	final String literalStr = literal.stringValue();
160	0	if( containsScriptBlock(literalStr) ) {
161	0	out.notifyError(
162		ExtractionResult.ErrorLevel.WARN,
163		String.format("Detected script in literal: [%s]", literalStr)
164		,-1
165		,-1
166		);
167	0	return false;
168		}
169	0	out.writeTriple(subject, property, literal);
170	0	TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
171	0	tser.addPropertyPath(this.getClass(), subject, property, null, DomUtils.getXPathListForNode(n) );
172	0	return true;
173		}
174
175		/**
176		* Helper method that adds a URI property to a node.
177		* @param subject the property subject.
178		* @param property the property URI.
179		* @param uri the property object.
180		* @return <code>true</code> if the resource has been added, <code>false</code> otherwise.
181		*/
182		protected boolean conditionallyAddResourceProperty(Resource subject, URI property, URI uri) {
183	0	if (uri == null) return false;
184	0	out.writeTriple(subject, property, uri);
185	0	return true;
186		}
187
188		/**
189		* Helper method that adds a BNode property to a node.
190		*
191		* @param n the <i>HTML</i> node used for extracting such property.
192		* @param subject the property subject.
193		* @param property the property URI.
194		* @param bnode the property value.
195		*/
196		protected void addBNodeProperty(Node n, Resource subject, URI property, BNode bnode) {
197	0	out.writeTriple(subject, property, bnode);
198	0	TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
199	0	tser.addPropertyPath(this.getClass(), subject, property, bnode, DomUtils.getXPathListForNode(n) );
200	0	}
201
202		/**
203		* Helper method that adds a BNode property to a node.
204		*
205		* @param subject the property subject.
206		* @param property the property URI.
207		* @param bnode the property value.
208		*/
209		protected void addBNodeProperty( Resource subject, URI property, BNode bnode) {
210	0	out.writeTriple(subject, property, bnode);
211	0	}
212
213		/**
214		* Helper method that adds a URI property to a node.
215		*
216		* @param subject
217		* @param property
218		* @param object
219		*/
220		protected void addURIProperty(Resource subject, URI property, URI object) {
221	0	out.writeTriple(subject, property, object);
222	0	}
223
224		protected URI fixLink(String link) {
225	0	return valueFactory.fixLink(link, null);
226		}
227
228		protected URI fixLink(String link, String defaultSchema) {
229	0	return valueFactory.fixLink(link, defaultSchema);
230		}
231
232		private boolean containsScriptBlock(String in) {
233	0	final String inLowerCase = in.toLowerCase();
234	0	final int beginBlock = inLowerCase.indexOf(BEGIN_SCRIPT);
235	0	if(beginBlock == -1) {
236	0	return false;
237		}
238	0	return inLowerCase.indexOf(END_SCRIPT, beginBlock + BEGIN_SCRIPT.length()) != -1;
239		}
240
241		/**
242		* This method checks if there is a native nesting relationship between two
243		* {@link MicroformatExtractor}.
244		*
245		* @see {@link org.apache.any23.extractor.html.annotations.Includes}
246		* @param including the including {@link MicroformatExtractor}
247		* @param included the included {@link MicroformatExtractor}
248		* @return <code>true</code> if there is a declared nesting relationship
249		*/
250		public static boolean includes(
251		Class<? extends MicroformatExtractor>including,
252		Class<? extends MicroformatExtractor> included) {
253	0	Includes includes = including.getAnnotation(Includes.class);
254	0	if (includes != null) {
255	0	Class<? extends MicroformatExtractor>[] extractors = includes.extractors();
256	0	if (extractors != null && extractors.length > 0) {
257	0	for (Class<? extends MicroformatExtractor> extractor : extractors) {
258	0	if (extractor.equals(included)) {
259	0	return true;
260		}
261		}
262		}
263		}
264	0	return false;
265		}
266
267		}