1 | |
|
2 | |
|
3 | |
|
4 | |
|
5 | |
|
6 | |
|
7 | |
|
8 | |
|
9 | |
|
10 | |
|
11 | |
|
12 | |
|
13 | |
|
14 | |
|
15 | |
|
16 | |
|
17 | |
package org.apache.commons.feedparser; |
18 | |
|
19 | |
import java.util.HashMap; |
20 | |
import java.util.regex.Matcher; |
21 | |
import java.util.regex.Pattern; |
22 | |
|
23 | |
import org.apache.log4j.Logger; |
24 | |
|
25 | |
|
26 | |
|
27 | |
|
28 | |
|
29 | |
|
30 | 0 | public class FeedFilter { |
31 | |
|
32 | 0 | private static Logger log = Logger.getLogger( FeedFilter.class ); |
33 | |
|
34 | 0 | public static boolean DO_REMOVE_LEADING_PROLOG = true; |
35 | 0 | public static boolean DO_DECODE_ENTITIES = true; |
36 | |
|
37 | 0 | public static HashMap LATIN1_ENTITIES = new HashMap(); |
38 | |
|
39 | 0 | private static Pattern entity_pattern = Pattern.compile( "&([a-zA-Z]+);" ); |
40 | |
|
41 | |
|
42 | |
|
43 | |
|
44 | |
|
45 | |
|
46 | |
|
47 | |
|
48 | |
public static byte[] parse( byte[] bytes ) |
49 | |
throws Exception { |
50 | |
|
51 | 0 | return parse( bytes, "UTF-8" ); |
52 | |
|
53 | |
} |
54 | |
|
55 | |
public static byte[] parse( byte[] bytes, String encoding ) |
56 | |
throws Exception { |
57 | |
|
58 | 0 | String content = new String( bytes, encoding ); |
59 | |
|
60 | 0 | return parse( content, encoding ); |
61 | |
|
62 | |
} |
63 | |
|
64 | |
|
65 | |
|
66 | |
|
67 | |
|
68 | |
|
69 | |
|
70 | |
|
71 | |
public static byte[] parse( String content, String encoding ) |
72 | |
throws Exception { |
73 | |
|
74 | |
|
75 | |
|
76 | |
|
77 | 0 | if ( DO_REMOVE_LEADING_PROLOG ) |
78 | 0 | content = doRemoveLeadingProlog( content, encoding ); |
79 | |
|
80 | |
|
81 | 0 | if ( DO_DECODE_ENTITIES ) |
82 | 0 | content = doDecodeEntities( content ); |
83 | |
|
84 | |
|
85 | |
|
86 | |
|
87 | |
|
88 | |
|
89 | |
|
90 | 0 | return content.getBytes( encoding ); |
91 | |
|
92 | |
} |
93 | |
|
94 | |
|
95 | |
|
96 | |
|
97 | |
|
98 | |
|
99 | |
|
100 | |
private static String doRemoveLeadingProlog( String content, String encoding ) { |
101 | |
|
102 | |
|
103 | |
|
104 | |
|
105 | |
|
106 | |
|
107 | |
|
108 | |
|
109 | |
|
110 | 0 | if ( "UTF-16".equals( encoding ) || |
111 | |
"UTF-32".equals( encoding ) ) |
112 | 0 | return content; |
113 | |
|
114 | |
|
115 | |
|
116 | 0 | int begin = content.indexOf( "<" ); |
117 | |
|
118 | 0 | if ( begin > 0 ) { |
119 | 0 | content = content.substring( begin, content.length() ); |
120 | 0 | log.warn( "Skipped whitespace in prolog and moved towards first element." ); |
121 | |
} |
122 | |
|
123 | |
|
124 | |
|
125 | |
|
126 | 0 | begin = content.indexOf( "<?xml" ); |
127 | |
|
128 | 0 | if ( begin > 0 ) { |
129 | 0 | content = content.substring( begin, content.length() ); |
130 | 0 | log.warn( "Removed prolog towards first processing instruction." ); |
131 | |
} |
132 | |
|
133 | 0 | content = doRemoveElementProlog( content ); |
134 | |
|
135 | 0 | return content; |
136 | |
|
137 | |
} |
138 | |
|
139 | |
|
140 | |
|
141 | |
|
142 | |
|
143 | |
|
144 | |
|
145 | |
|
146 | |
|
147 | |
|
148 | |
|
149 | |
|
150 | |
private static String doRemoveElementProlog( String content ) { |
151 | |
|
152 | 0 | int end = content.lastIndexOf( "?>", 100 ); |
153 | |
|
154 | 0 | if ( end == -1 ) |
155 | 0 | return content; |
156 | |
|
157 | 0 | StringBuffer buff = new StringBuffer( content.length() ); |
158 | 0 | end = end + 2; |
159 | 0 | buff.append( content.substring( 0, end ) ); |
160 | |
|
161 | 0 | int begin = content.indexOf( "<", end ); |
162 | |
|
163 | 0 | if ( begin != -1 ) { |
164 | |
|
165 | 0 | buff.append( "\n" ); |
166 | 0 | buff.append( content.substring( begin, content.length() ) ); |
167 | |
|
168 | |
} |
169 | |
|
170 | 0 | return buff.toString(); |
171 | |
|
172 | |
} |
173 | |
|
174 | |
private static String doDecodeEntities( String content ) { |
175 | |
|
176 | 0 | StringBuffer buff = new StringBuffer( content.length() + 1000 ); |
177 | |
|
178 | 0 | Matcher m = entity_pattern.matcher( content ); |
179 | |
|
180 | 0 | int begin = 0; |
181 | |
|
182 | 0 | boolean hasFilterDecodedEntities = false; |
183 | 0 | boolean hasFilterFoundUnknownEntity = false; |
184 | |
|
185 | |
|
186 | |
|
187 | |
|
188 | 0 | while ( m.find() ) { |
189 | |
|
190 | 0 | buff.append( content.substring( begin, m.start() ) ); |
191 | |
|
192 | 0 | String entity = m.group( 1 ); |
193 | |
|
194 | 0 | String value = (String)LATIN1_ENTITIES.get( entity ); |
195 | |
|
196 | 0 | if ( value != null ) { |
197 | 0 | buff.append( "&#" ); |
198 | 0 | buff.append( value ); |
199 | 0 | buff.append( ";" ); |
200 | |
|
201 | 0 | hasFilterDecodedEntities = true; |
202 | |
|
203 | |
} else { |
204 | |
|
205 | |
|
206 | |
|
207 | |
|
208 | 0 | buff.append( "&" ); |
209 | 0 | buff.append( entity ); |
210 | 0 | buff.append( ";" ); |
211 | |
|
212 | 0 | hasFilterFoundUnknownEntity = true; |
213 | |
} |
214 | |
|
215 | 0 | begin = m.end( 0 ); |
216 | |
|
217 | 0 | } |
218 | |
|
219 | 0 | buff.append( content.substring( begin, content.length() ) ); |
220 | |
|
221 | 0 | if ( hasFilterFoundUnknownEntity ) |
222 | 0 | log.warn( "Filter encountered unknown entities" ); |
223 | |
|
224 | 0 | if ( hasFilterDecodedEntities ) |
225 | 0 | log.warn( "Filter has decoded latin1 entities." ); |
226 | |
|
227 | 0 | return buff.toString(); |
228 | |
|
229 | |
} |
230 | |
|
231 | |
public static void main( String[] args ) throws Exception { |
232 | |
|
233 | 0 | byte[] b = parse( "hello é world".getBytes() ); |
234 | |
|
235 | 0 | String v = new String( b ); |
236 | |
|
237 | 0 | System.out.println( "v: " + v ); |
238 | |
|
239 | 0 | } |
240 | |
|
241 | |
static { |
242 | |
|
243 | |
|
244 | |
|
245 | |
|
246 | |
|
247 | |
|
248 | |
|
249 | |
|
250 | |
|
251 | |
|
252 | |
|
253 | |
|
254 | |
|
255 | |
|
256 | |
|
257 | 0 | LATIN1_ENTITIES.put( "nbsp", "160" ); |
258 | 0 | LATIN1_ENTITIES.put( "iexcl", "161" ); |
259 | 0 | LATIN1_ENTITIES.put( "cent", "162" ); |
260 | 0 | LATIN1_ENTITIES.put( "pound", "163" ); |
261 | 0 | LATIN1_ENTITIES.put( "curren", "164" ); |
262 | 0 | LATIN1_ENTITIES.put( "yen", "165" ); |
263 | 0 | LATIN1_ENTITIES.put( "brvbar", "166" ); |
264 | 0 | LATIN1_ENTITIES.put( "sect", "167" ); |
265 | 0 | LATIN1_ENTITIES.put( "uml", "168" ); |
266 | 0 | LATIN1_ENTITIES.put( "copy", "169" ); |
267 | 0 | LATIN1_ENTITIES.put( "ordf", "170" ); |
268 | 0 | LATIN1_ENTITIES.put( "laquo", "171" ); |
269 | 0 | LATIN1_ENTITIES.put( "not", "172" ); |
270 | 0 | LATIN1_ENTITIES.put( "shy", "173" ); |
271 | 0 | LATIN1_ENTITIES.put( "reg", "174" ); |
272 | 0 | LATIN1_ENTITIES.put( "macr", "175" ); |
273 | 0 | LATIN1_ENTITIES.put( "deg", "176" ); |
274 | 0 | LATIN1_ENTITIES.put( "plusmn", "177" ); |
275 | 0 | LATIN1_ENTITIES.put( "sup2", "178" ); |
276 | 0 | LATIN1_ENTITIES.put( "sup3", "179" ); |
277 | 0 | LATIN1_ENTITIES.put( "acute", "180" ); |
278 | 0 | LATIN1_ENTITIES.put( "micro", "181" ); |
279 | 0 | LATIN1_ENTITIES.put( "para", "182" ); |
280 | 0 | LATIN1_ENTITIES.put( "middot", "183" ); |
281 | 0 | LATIN1_ENTITIES.put( "cedil", "184" ); |
282 | 0 | LATIN1_ENTITIES.put( "sup1", "185" ); |
283 | 0 | LATIN1_ENTITIES.put( "ordm", "186" ); |
284 | 0 | LATIN1_ENTITIES.put( "raquo", "187" ); |
285 | 0 | LATIN1_ENTITIES.put( "frac14", "188" ); |
286 | 0 | LATIN1_ENTITIES.put( "frac12", "189" ); |
287 | 0 | LATIN1_ENTITIES.put( "frac34", "190" ); |
288 | 0 | LATIN1_ENTITIES.put( "iquest", "191" ); |
289 | 0 | LATIN1_ENTITIES.put( "Agrave", "192" ); |
290 | 0 | LATIN1_ENTITIES.put( "Aacute", "193" ); |
291 | 0 | LATIN1_ENTITIES.put( "Acirc", "194" ); |
292 | 0 | LATIN1_ENTITIES.put( "Atilde", "195" ); |
293 | 0 | LATIN1_ENTITIES.put( "Auml", "196" ); |
294 | 0 | LATIN1_ENTITIES.put( "Aring", "197" ); |
295 | 0 | LATIN1_ENTITIES.put( "AElig", "198" ); |
296 | 0 | LATIN1_ENTITIES.put( "Ccedil", "199" ); |
297 | 0 | LATIN1_ENTITIES.put( "Egrave", "200" ); |
298 | 0 | LATIN1_ENTITIES.put( "Eacute", "201" ); |
299 | 0 | LATIN1_ENTITIES.put( "Ecirc", "202" ); |
300 | 0 | LATIN1_ENTITIES.put( "Euml", "203" ); |
301 | 0 | LATIN1_ENTITIES.put( "Igrave", "204" ); |
302 | 0 | LATIN1_ENTITIES.put( "Iacute", "205" ); |
303 | 0 | LATIN1_ENTITIES.put( "Icirc", "206" ); |
304 | 0 | LATIN1_ENTITIES.put( "Iuml", "207" ); |
305 | 0 | LATIN1_ENTITIES.put( "ETH", "208" ); |
306 | 0 | LATIN1_ENTITIES.put( "Ntilde", "209" ); |
307 | 0 | LATIN1_ENTITIES.put( "Ograve", "210" ); |
308 | 0 | LATIN1_ENTITIES.put( "Oacute", "211" ); |
309 | 0 | LATIN1_ENTITIES.put( "Ocirc", "212" ); |
310 | 0 | LATIN1_ENTITIES.put( "Otilde", "213" ); |
311 | 0 | LATIN1_ENTITIES.put( "Ouml", "214" ); |
312 | 0 | LATIN1_ENTITIES.put( "times", "215" ); |
313 | 0 | LATIN1_ENTITIES.put( "Oslash", "216" ); |
314 | 0 | LATIN1_ENTITIES.put( "Ugrave", "217" ); |
315 | 0 | LATIN1_ENTITIES.put( "Uacute", "218" ); |
316 | 0 | LATIN1_ENTITIES.put( "Ucirc", "219" ); |
317 | 0 | LATIN1_ENTITIES.put( "Uuml", "220" ); |
318 | 0 | LATIN1_ENTITIES.put( "Yacute", "221" ); |
319 | 0 | LATIN1_ENTITIES.put( "THORN", "222" ); |
320 | 0 | LATIN1_ENTITIES.put( "szlig", "223" ); |
321 | 0 | LATIN1_ENTITIES.put( "agrave", "224" ); |
322 | 0 | LATIN1_ENTITIES.put( "aacute", "225" ); |
323 | 0 | LATIN1_ENTITIES.put( "acirc", "226" ); |
324 | 0 | LATIN1_ENTITIES.put( "atilde", "227" ); |
325 | 0 | LATIN1_ENTITIES.put( "auml", "228" ); |
326 | 0 | LATIN1_ENTITIES.put( "aring", "229" ); |
327 | 0 | LATIN1_ENTITIES.put( "aelig", "230" ); |
328 | 0 | LATIN1_ENTITIES.put( "ccedil", "231" ); |
329 | 0 | LATIN1_ENTITIES.put( "egrave", "232" ); |
330 | 0 | LATIN1_ENTITIES.put( "eacute", "233" ); |
331 | 0 | LATIN1_ENTITIES.put( "ecirc", "234" ); |
332 | 0 | LATIN1_ENTITIES.put( "euml", "235" ); |
333 | 0 | LATIN1_ENTITIES.put( "igrave", "236" ); |
334 | 0 | LATIN1_ENTITIES.put( "iacute", "237" ); |
335 | 0 | LATIN1_ENTITIES.put( "icirc", "238" ); |
336 | 0 | LATIN1_ENTITIES.put( "iuml", "239" ); |
337 | 0 | LATIN1_ENTITIES.put( "eth", "240" ); |
338 | 0 | LATIN1_ENTITIES.put( "ntilde", "241" ); |
339 | 0 | LATIN1_ENTITIES.put( "ograve", "242" ); |
340 | 0 | LATIN1_ENTITIES.put( "oacute", "243" ); |
341 | 0 | LATIN1_ENTITIES.put( "ocirc", "244" ); |
342 | 0 | LATIN1_ENTITIES.put( "otilde", "245" ); |
343 | 0 | LATIN1_ENTITIES.put( "ouml", "246" ); |
344 | 0 | LATIN1_ENTITIES.put( "divide", "247" ); |
345 | 0 | LATIN1_ENTITIES.put( "oslash", "248" ); |
346 | 0 | LATIN1_ENTITIES.put( "ugrave", "249" ); |
347 | 0 | LATIN1_ENTITIES.put( "uacute", "250" ); |
348 | 0 | LATIN1_ENTITIES.put( "ucirc", "251" ); |
349 | 0 | LATIN1_ENTITIES.put( "uuml", "252" ); |
350 | 0 | LATIN1_ENTITIES.put( "yacute", "253" ); |
351 | 0 | LATIN1_ENTITIES.put( "thorn", "254" ); |
352 | 0 | LATIN1_ENTITIES.put( "yuml", "255" ); |
353 | |
|
354 | 0 | } |
355 | |
|
356 | |
} |