Coverage Report

Coverage Report - org.apache.any23.extractor.html.TagSoupParser

Classes in this File

Line Coverage

Branch Coverage

Complexity

TagSoupParser

0/46

0/10

2.769

TagSoupParser$1

0/19

0/8

2.769

TagSoupParser$ElementLocation

0/11

N/A

2.769

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.extractor.html;
 
 import org.apache.any23.validator.DefaultValidator;
 import org.apache.any23.validator.Validator;
 import org.apache.any23.validator.ValidatorException;
 import org.apache.xerces.xni.Augmentations;
 import org.apache.xerces.xni.QName;
 import org.apache.xerces.xni.XMLAttributes;
 import org.apache.xerces.xni.XNIException;
 import org.cyberneko.html.parsers.DOMParser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
 import javax.xml.transform.TransformerException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.nio.charset.Charset;
 import java.nio.charset.UnsupportedCharsetException;
 
 /**
  * Parses an {@link java.io.InputStream}
  * into an <io>HTML DOM</i> tree using a <i>TagSoup</i> parser.
  * <p/>
  * <strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace
  * aware, and all element names will be upper case, while attributes
  * will be lower case. This is because the
  * <a href="http://nekohtml.sourceforge.net/">NekoHTML</a> based <i>TagSoup</i> parser
  * by default uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
  * implementation, which doesn't support namespaces and forces uppercase element names. This works
  * with the <i>RDFa XSLT Converter</i> and with </i>XPath</i>, so we left it this way.
  *
  * @author Richard Cyganiak (richard at cyganiak dot de)
  * @author Michele Mostarda (mostarda@fbk.eu)
  * @author Davide Palmisano (palmisano@fbk.eu)
  */
 public class TagSoupParser {
 
     public static final String ELEMENT_LOCATION = "Element-Location";
 
     private static final String AUGMENTATIONS_FEATURE = "http://cyberneko.org/html/features/augmentations";
 
     private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class);
 
     private final InputStream input;
 
     private final String documentURI;
 
     private final String encoding;
     
     private Document result = null;
 
     public TagSoupParser(InputStream input, String documentURI) {
         this.input = input;
         this.documentURI = documentURI;
         this.encoding = null;
     }
 
     public TagSoupParser(InputStream input, String documentURI, String encoding) {
         if(encoding != null && !Charset.isSupported(encoding))
             throw new UnsupportedCharsetException(String.format("Charset %s is not supported", encoding));
 
         this.input = input;
         this.documentURI = documentURI;
         this.encoding = encoding;
     }
 
     /**
      * Returns the DOM of the given document URI. 
      *
      * @return the <i>HTML</i> DOM.
      * @throws IOException
      */
     public Document getDOM() throws IOException {
         if (result == null) {
             long startTime = System.currentTimeMillis();
             try {
                 result = parse();
             } catch (SAXException ex) {
                 // should not happen, it's a tag soup parser
                 throw new RuntimeException("Shouldn not happen, it's a tag soup parser", ex);
             } catch (TransformerException ex) {
                 // should not happen, it's a tag soup parser
                 throw new RuntimeException("Shouldn not happen, it's a tag soup parser", ex);
             } catch (NullPointerException ex) {
                 if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) {
                     throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!", ex);
                 } else {
                     throw ex;
                 }
             } finally {
                 long elapsed = System.currentTimeMillis() - startTime;
                 logger.debug("Parsed " + documentURI + " with NekoHTML, " + elapsed + "ms");
             }
         }
         result.setDocumentURI(documentURI);
         return result;
     }
 
     /**
      * Returns the validated DOM and applies fixes on it if <i>applyFix</i>
      * is set to <code>true</code>.
      *
      * @param applyFix
      * @return a report containing the <i>HTML</i> DOM that has been validated and fixed if <i>applyFix</i>
      *         if <code>true</code>. The reports contains also information about the activated rules and the
      *         the detected issues.
      * @throws IOException
      * @throws org.apache.any23.validator.ValidatorException
      */
     public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException {
         final URI dURI;
         try {
             dURI = new URI(documentURI);
         } catch (URISyntaxException urise) {
             throw new ValidatorException("Error while performing validation, invalid document URI.", urise);
         }
         Validator validator = new DefaultValidator();
         Document document = getDOM();
         return new DocumentReport( validator.validate(dURI, document, applyFix), document );
     }
 
     private Document parse() throws IOException, SAXException, TransformerException {
         final DOMParser parser = new DOMParser() {
 
             private QName currentQName;
             private Augmentations currentAugmentations;
 
             @Override
             protected Element createElementNode(QName qName) {
                 final Element created = super.createElementNode(qName);
                 if (qName.equals(currentQName) && currentAugmentations != null) {
                     final ElementLocation elementLocation = createElementLocation(
                         currentAugmentations.getItem(AUGMENTATIONS_FEATURE)
                     );
                     created.setUserData(ELEMENT_LOCATION, elementLocation, null);
                 }
                 return created;
             }
 
             @Override
             public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augmentations)
             throws XNIException {
                 super.startElement(qName, xmlAttributes, augmentations);
                 currentQName = qName;
                 currentAugmentations = augmentations;
             }
 
             private ElementLocation createElementLocation(Object obj) {
                 if(obj == null) return null;
                 String pattern = null;
                 try {
                     pattern = obj.toString();
                     if( "synthesized".equals(pattern) ) return null;
                     final String[] parts = pattern.split(":");
                     return new ElementLocation(
                             Integer.parseInt(parts[0]),
                             Integer.parseInt(parts[1]),
                             Integer.parseInt(parts[3]),
                             Integer.parseInt(parts[4])
 
                     );
                 } catch (Exception e) {
                     logger.warn(
                             String.format("Unexpected string format for given augmentation: [%s]", pattern),
                             e
                     );
                     return null;
                 }
             }
         };
         parser.setFeature("http://xml.org/sax/features/namespaces", false);
         parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true);
         parser.setFeature(AUGMENTATIONS_FEATURE, true);
         if (this.encoding != null)
             parser.setProperty("http://cyberneko.org/html/properties/default-encoding", this.encoding);
 
         /*
          * NOTE: the SpanCloserInputStream has been added to wrap the stream passed to the CyberNeko
          *       parser. This will ensure the correct handling of inline HTML SPAN tags.
          *       This fix is documented at issue #78.       
          */
         parser.parse(new InputSource( new SpanCloserInputStream(input)));
         return parser.getDocument();
     }
 
     /**
      * Describes a <i>DOM Element</i> location.
      */
     public static class ElementLocation {
 
         private int beginLineNumber;
         private int beginColumnNumber;
         private int endLineNumber;
         private int endColumnNumber;
 
         private ElementLocation(
                 int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber
         ) {
             this.beginLineNumber = beginLineNumber;
             this.beginColumnNumber = beginColumnNumber;
             this.endLineNumber = endLineNumber;
             this.endColumnNumber = endColumnNumber;
         }
 
         public int getBeginLineNumber() {
             return beginLineNumber;
         }
 
         public int getBeginColumnNumber() {
             return beginColumnNumber;
         }
 
         public int getEndLineNumber() {
             return endLineNumber;
         }
 
         public int getEndColumnNumber() {
             return endColumnNumber;
         }
     }
     
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.extractor.html;
19
20		import org.apache.any23.validator.DefaultValidator;
21		import org.apache.any23.validator.Validator;
22		import org.apache.any23.validator.ValidatorException;
23		import org.apache.xerces.xni.Augmentations;
24		import org.apache.xerces.xni.QName;
25		import org.apache.xerces.xni.XMLAttributes;
26		import org.apache.xerces.xni.XNIException;
27		import org.cyberneko.html.parsers.DOMParser;
28		import org.slf4j.Logger;
29		import org.slf4j.LoggerFactory;
30		import org.w3c.dom.Document;
31		import org.w3c.dom.Element;
32		import org.xml.sax.InputSource;
33		import org.xml.sax.SAXException;
34
35		import javax.xml.transform.TransformerException;
36		import java.io.IOException;
37		import java.io.InputStream;
38		import java.net.URI;
39		import java.net.URISyntaxException;
40		import java.nio.charset.Charset;
41		import java.nio.charset.UnsupportedCharsetException;
42
43		/**
44		* Parses an {@link java.io.InputStream}
45		* into an <i>HTML DOM</i> tree using a <i>TagSoup</i> parser.
46		* <p/>
47		* <strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace
48		* aware, and all element names will be upper case, while attributes
49		* will be lower case. This is because the
50		* <a href="http://nekohtml.sourceforge.net/">NekoHTML</a> based <i>TagSoup</i> parser
51		* by default uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
52		* implementation, which doesn't support namespaces and forces uppercase element names. This works
53		* with the <i>RDFa XSLT Converter</i> and with <i>XPath</i>, so we left it this way.
54		*
55		* @author Richard Cyganiak (richard at cyganiak dot de)
56		* @author Michele Mostarda (mostarda@fbk.eu)
57		* @author Davide Palmisano (palmisano@fbk.eu)
58		*/
59	0	public class TagSoupParser {
60
61		public static final String ELEMENT_LOCATION = "Element-Location";
62
63		private static final String AUGMENTATIONS_FEATURE = "http://cyberneko.org/html/features/augmentations";
64
65	0	private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class);
66
67		private final InputStream input;
68
69		private final String documentURI;
70
71		private final String encoding;
72
73	0	private Document result = null;
74
75	0	public TagSoupParser(InputStream input, String documentURI) {
76	0	this.input = input;
77	0	this.documentURI = documentURI;
78	0	this.encoding = null;
79	0	}
80
81	0	public TagSoupParser(InputStream input, String documentURI, String encoding) {
82	0	if(encoding != null && !Charset.isSupported(encoding))
83	0	throw new UnsupportedCharsetException(String.format("Charset %s is not supported", encoding));
84
85	0	this.input = input;
86	0	this.documentURI = documentURI;
87	0	this.encoding = encoding;
88	0	}
89
90		/**
91		* Returns the DOM of the given document URI.
92		*
93		* @return the <i>HTML</i> DOM.
94		* @throws IOException
95		*/
96		public Document getDOM() throws IOException {
97	0	if (result == null) {
98	0	long startTime = System.currentTimeMillis();
99		try {
100	0	result = parse();
101	0	} catch (SAXException ex) {
102		// should not happen, it's a tag soup parser
103	0	throw new RuntimeException("Shouldn not happen, it's a tag soup parser", ex);
104	0	} catch (TransformerException ex) {
105		// should not happen, it's a tag soup parser
106	0	throw new RuntimeException("Shouldn not happen, it's a tag soup parser", ex);
107	0	} catch (NullPointerException ex) {
108	0	if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) {
109	0	throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!", ex);
110		} else {
111	0	throw ex;
112		}
113		} finally {
114	0	long elapsed = System.currentTimeMillis() - startTime;
115	0	logger.debug("Parsed " + documentURI + " with NekoHTML, " + elapsed + "ms");
116	0	}
117		}
118	0	result.setDocumentURI(documentURI);
119	0	return result;
120		}
121
122		/**
123		* Returns the validated DOM and applies fixes on it if <i>applyFix</i>
124		* is set to <code>true</code>.
125		*
126		* @param applyFix
127		* @return a report containing the <i>HTML</i> DOM that has been validated and fixed if <i>applyFix</i>
128		* is <code>true</code>. The report also contains information about the activated rules and
129		* the detected issues.
130		* @throws IOException
131		* @throws org.apache.any23.validator.ValidatorException
132		*/
133		public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException {
134		final URI dURI;
135		try {
136	0	dURI = new URI(documentURI);
137	0	} catch (URISyntaxException urise) {
138	0	throw new ValidatorException("Error while performing validation, invalid document URI.", urise);
139	0	}
140	0	Validator validator = new DefaultValidator();
141	0	Document document = getDOM();
142	0	return new DocumentReport( validator.validate(dURI, document, applyFix), document );
143		}
144
145		private Document parse() throws IOException, SAXException, TransformerException {
146	0	final DOMParser parser = new DOMParser() {
147
148		private QName currentQName;
149		private Augmentations currentAugmentations;
150
151		@Override
152		protected Element createElementNode(QName qName) {
153	0	final Element created = super.createElementNode(qName);
154	0	if (qName.equals(currentQName) && currentAugmentations != null) {
155	0	final ElementLocation elementLocation = createElementLocation(
156		currentAugmentations.getItem(AUGMENTATIONS_FEATURE)
157		);
158	0	created.setUserData(ELEMENT_LOCATION, elementLocation, null);
159		}
160	0	return created;
161		}
162
163		@Override
164		public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augmentations)
165		throws XNIException {
166	0	super.startElement(qName, xmlAttributes, augmentations);
167	0	currentQName = qName;
168	0	currentAugmentations = augmentations;
169	0	}
170
171		private ElementLocation createElementLocation(Object obj) {
172	0	if(obj == null) return null;
173	0	String pattern = null;
174		try {
175	0	pattern = obj.toString();
176	0	if( "synthesized".equals(pattern) ) return null;
177	0	final String[] parts = pattern.split(":");
178	0	return new ElementLocation(
179		Integer.parseInt(parts[0]),
180		Integer.parseInt(parts[1]),
181		Integer.parseInt(parts[3]),
182		Integer.parseInt(parts[4])
183
184		);
185	0	} catch (Exception e) {
186	0	logger.warn(
187		String.format("Unexpected string format for given augmentation: [%s]", pattern),
188		e
189		);
190	0	return null;
191		}
192		}
193		};
194	0	parser.setFeature("http://xml.org/sax/features/namespaces", false);
195	0	parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true);
196	0	parser.setFeature(AUGMENTATIONS_FEATURE, true);
197	0	if (this.encoding != null)
198	0	parser.setProperty("http://cyberneko.org/html/properties/default-encoding", this.encoding);
199
200		/*
201		* NOTE: the SpanCloserInputStream has been added to wrap the stream passed to the CyberNeko
202		* parser. This will ensure the correct handling of inline HTML SPAN tags.
203		* This fix is documented at issue #78.
204		*/
205	0	parser.parse(new InputSource( new SpanCloserInputStream(input)));
206	0	return parser.getDocument();
207		}
208
209		/**
210		* Describes a <i>DOM Element</i> location.
211		*/
212	0	public static class ElementLocation {
213
214		private int beginLineNumber;
215		private int beginColumnNumber;
216		private int endLineNumber;
217		private int endColumnNumber;
218
219		private ElementLocation(
220		int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber
221	0	) {
222	0	this.beginLineNumber = beginLineNumber;
223	0	this.beginColumnNumber = beginColumnNumber;
224	0	this.endLineNumber = endLineNumber;
225	0	this.endColumnNumber = endColumnNumber;
226	0	}
227
228		public int getBeginLineNumber() {
229	0	return beginLineNumber;
230		}
231
232		public int getBeginColumnNumber() {
233	0	return beginColumnNumber;
234		}
235
236		public int getEndLineNumber() {
237	0	return endLineNumber;
238		}
239
240		public int getEndColumnNumber() {
241	0	return endColumnNumber;
242		}
243		}
244
245		}