1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
|
18 | |
package org.apache.any23.extractor.html; |
19 | |
|
20 | |
import org.apache.any23.validator.DefaultValidator; |
21 | |
import org.apache.any23.validator.Validator; |
22 | |
import org.apache.any23.validator.ValidatorException; |
23 | |
import org.apache.xerces.xni.Augmentations; |
24 | |
import org.apache.xerces.xni.QName; |
25 | |
import org.apache.xerces.xni.XMLAttributes; |
26 | |
import org.apache.xerces.xni.XNIException; |
27 | |
import org.cyberneko.html.parsers.DOMParser; |
28 | |
import org.slf4j.Logger; |
29 | |
import org.slf4j.LoggerFactory; |
30 | |
import org.w3c.dom.Document; |
31 | |
import org.w3c.dom.Element; |
32 | |
import org.xml.sax.InputSource; |
33 | |
import org.xml.sax.SAXException; |
34 | |
|
35 | |
import javax.xml.transform.TransformerException; |
36 | |
import java.io.IOException; |
37 | |
import java.io.InputStream; |
38 | |
import java.net.URI; |
39 | |
import java.net.URISyntaxException; |
40 | |
import java.nio.charset.Charset; |
41 | |
import java.nio.charset.UnsupportedCharsetException; |
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
|
49 | |
|
50 | |
|
51 | |
|
52 | |
|
53 | |
|
54 | |
|
55 | |
|
56 | |
|
57 | |
|
58 | |
|
59 | 0 | public class TagSoupParser { |
60 | |
|
61 | |
public static final String ELEMENT_LOCATION = "Element-Location"; |
62 | |
|
63 | |
private static final String AUGMENTATIONS_FEATURE = "http://cyberneko.org/html/features/augmentations"; |
64 | |
|
65 | 0 | private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class); |
66 | |
|
67 | |
private final InputStream input; |
68 | |
|
69 | |
private final String documentURI; |
70 | |
|
71 | |
private final String encoding; |
72 | |
|
73 | 0 | private Document result = null; |
74 | |
|
75 | 0 | public TagSoupParser(InputStream input, String documentURI) { |
76 | 0 | this.input = input; |
77 | 0 | this.documentURI = documentURI; |
78 | 0 | this.encoding = null; |
79 | 0 | } |
80 | |
|
81 | 0 | public TagSoupParser(InputStream input, String documentURI, String encoding) { |
82 | 0 | if(encoding != null && !Charset.isSupported(encoding)) |
83 | 0 | throw new UnsupportedCharsetException(String.format("Charset %s is not supported", encoding)); |
84 | |
|
85 | 0 | this.input = input; |
86 | 0 | this.documentURI = documentURI; |
87 | 0 | this.encoding = encoding; |
88 | 0 | } |
89 | |
|
90 | |
|
91 | |
|
92 | |
|
93 | |
|
94 | |
|
95 | |
|
96 | |
public Document getDOM() throws IOException { |
97 | 0 | if (result == null) { |
98 | 0 | long startTime = System.currentTimeMillis(); |
99 | |
try { |
100 | 0 | result = parse(); |
101 | 0 | } catch (SAXException ex) { |
102 | |
|
103 | 0 | throw new RuntimeException("Shouldn not happen, it's a tag soup parser", ex); |
104 | 0 | } catch (TransformerException ex) { |
105 | |
|
106 | 0 | throw new RuntimeException("Shouldn not happen, it's a tag soup parser", ex); |
107 | 0 | } catch (NullPointerException ex) { |
108 | 0 | if (ex.getStackTrace()[0].getClassName().equals("java.io.Reader")) { |
109 | 0 | throw new RuntimeException("Bug in NekoHTML, try upgrading to newer release!", ex); |
110 | |
} else { |
111 | 0 | throw ex; |
112 | |
} |
113 | |
} finally { |
114 | 0 | long elapsed = System.currentTimeMillis() - startTime; |
115 | 0 | logger.debug("Parsed " + documentURI + " with NekoHTML, " + elapsed + "ms"); |
116 | 0 | } |
117 | |
} |
118 | 0 | result.setDocumentURI(documentURI); |
119 | 0 | return result; |
120 | |
} |
121 | |
|
122 | |
|
123 | |
|
124 | |
|
125 | |
|
126 | |
|
127 | |
|
128 | |
|
129 | |
|
130 | |
|
131 | |
|
132 | |
|
133 | |
public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException { |
134 | |
final URI dURI; |
135 | |
try { |
136 | 0 | dURI = new URI(documentURI); |
137 | 0 | } catch (URISyntaxException urise) { |
138 | 0 | throw new ValidatorException("Error while performing validation, invalid document URI.", urise); |
139 | 0 | } |
140 | 0 | Validator validator = new DefaultValidator(); |
141 | 0 | Document document = getDOM(); |
142 | 0 | return new DocumentReport( validator.validate(dURI, document, applyFix), document ); |
143 | |
} |
144 | |
|
145 | |
private Document parse() throws IOException, SAXException, TransformerException { |
146 | 0 | final DOMParser parser = new DOMParser() { |
147 | |
|
148 | |
private QName currentQName; |
149 | |
private Augmentations currentAugmentations; |
150 | |
|
151 | |
@Override |
152 | |
protected Element createElementNode(QName qName) { |
153 | 0 | final Element created = super.createElementNode(qName); |
154 | 0 | if (qName.equals(currentQName) && currentAugmentations != null) { |
155 | 0 | final ElementLocation elementLocation = createElementLocation( |
156 | |
currentAugmentations.getItem(AUGMENTATIONS_FEATURE) |
157 | |
); |
158 | 0 | created.setUserData(ELEMENT_LOCATION, elementLocation, null); |
159 | |
} |
160 | 0 | return created; |
161 | |
} |
162 | |
|
163 | |
@Override |
164 | |
public void startElement(QName qName, XMLAttributes xmlAttributes, Augmentations augmentations) |
165 | |
throws XNIException { |
166 | 0 | super.startElement(qName, xmlAttributes, augmentations); |
167 | 0 | currentQName = qName; |
168 | 0 | currentAugmentations = augmentations; |
169 | 0 | } |
170 | |
|
171 | |
private ElementLocation createElementLocation(Object obj) { |
172 | 0 | if(obj == null) return null; |
173 | 0 | String pattern = null; |
174 | |
try { |
175 | 0 | pattern = obj.toString(); |
176 | 0 | if( "synthesized".equals(pattern) ) return null; |
177 | 0 | final String[] parts = pattern.split(":"); |
178 | 0 | return new ElementLocation( |
179 | |
Integer.parseInt(parts[0]), |
180 | |
Integer.parseInt(parts[1]), |
181 | |
Integer.parseInt(parts[3]), |
182 | |
Integer.parseInt(parts[4]) |
183 | |
|
184 | |
); |
185 | 0 | } catch (Exception e) { |
186 | 0 | logger.warn( |
187 | |
String.format("Unexpected string format for given augmentation: [%s]", pattern), |
188 | |
e |
189 | |
); |
190 | 0 | return null; |
191 | |
} |
192 | |
} |
193 | |
}; |
194 | 0 | parser.setFeature("http://xml.org/sax/features/namespaces", false); |
195 | 0 | parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-cdata-delims", true); |
196 | 0 | parser.setFeature(AUGMENTATIONS_FEATURE, true); |
197 | 0 | if (this.encoding != null) |
198 | 0 | parser.setProperty("http://cyberneko.org/html/properties/default-encoding", this.encoding); |
199 | |
|
200 | |
|
201 | |
|
202 | |
|
203 | |
|
204 | |
|
205 | 0 | parser.parse(new InputSource( new SpanCloserInputStream(input))); |
206 | 0 | return parser.getDocument(); |
207 | |
} |
208 | |
|
209 | |
|
210 | |
|
211 | |
|
212 | 0 | public static class ElementLocation { |
213 | |
|
214 | |
private int beginLineNumber; |
215 | |
private int beginColumnNumber; |
216 | |
private int endLineNumber; |
217 | |
private int endColumnNumber; |
218 | |
|
219 | |
private ElementLocation( |
220 | |
int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber |
221 | 0 | ) { |
222 | 0 | this.beginLineNumber = beginLineNumber; |
223 | 0 | this.beginColumnNumber = beginColumnNumber; |
224 | 0 | this.endLineNumber = endLineNumber; |
225 | 0 | this.endColumnNumber = endColumnNumber; |
226 | 0 | } |
227 | |
|
228 | |
public int getBeginLineNumber() { |
229 | 0 | return beginLineNumber; |
230 | |
} |
231 | |
|
232 | |
public int getBeginColumnNumber() { |
233 | 0 | return beginColumnNumber; |
234 | |
} |
235 | |
|
236 | |
public int getEndLineNumber() { |
237 | 0 | return endLineNumber; |
238 | |
} |
239 | |
|
240 | |
public int getEndColumnNumber() { |
241 | 0 | return endColumnNumber; |
242 | |
} |
243 | |
} |
244 | |
|
245 | |
} |