1 package org.apache.maven.doxia.util;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import java.io.UnsupportedEncodingException;
23 import java.util.ArrayList;
24 import java.util.HashMap;
25 import java.util.List;
26 import java.util.Map;
27
28 import javax.swing.text.html.HTML.Tag;
29
30 import org.apache.commons.lang3.StringEscapeUtils;
31 import org.apache.maven.doxia.markup.HtmlMarkup;
32 import org.codehaus.plexus.util.StringUtils;
33
34
35
36
37
38
39
40
41 public class HtmlTools
42 {
43 private static final Tag[] ALL_TAGS =
44 {
45 HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
46 HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO,
47 HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON,
48 HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL,
49 HtmlMarkup.COLGROUP, HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR,
50 HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET,
51 HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1,
52 HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD,
53 HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG,
54 HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX, HtmlMarkup.KBD, HtmlMarkup.LABEL,
55 HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU,
56 HtmlMarkup.META, HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL,
57 HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION, HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE,
58 HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT,
59 HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE,
60 HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD,
61 HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH, HtmlMarkup.THEAD, HtmlMarkup.TITLE,
62 HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR
63 };
64
65 private static final Map<String, Tag> TAG_MAP = new HashMap<String, Tag>( ALL_TAGS.length );
66
67 private static final int ASCII = 0x7E;
68
69 static
70 {
71 for ( Tag tag : ALL_TAGS )
72 {
73 TAG_MAP.put( tag.toString(), tag );
74 }
75 }
76
77
78
79
80
81
82
83
84
85
86
87
88
89 public static Tag getHtmlTag( String tagName )
90 {
91 Object t = TAG_MAP.get( tagName );
92
93 return (Tag) t;
94 }
95
96
97
98
99
100
101
102
103
104
105 public static String escapeHTML( String text )
106 {
107 return escapeHTML( text, true );
108 }
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138 public static String escapeHTML( final String text, final boolean xmlMode )
139 {
140 if ( text == null )
141 {
142 return "";
143 }
144
145 int length = text.length();
146 StringBuilder buffer = new StringBuilder( length );
147
148 for ( int i = 0; i < length; ++i )
149 {
150 char c = text.charAt( i );
151 switch ( c )
152 {
153 case '<':
154 buffer.append( "<" );
155 break;
156 case '>':
157 buffer.append( ">" );
158 break;
159 case '&':
160 buffer.append( "&" );
161 break;
162 case '\"':
163 buffer.append( """ );
164 break;
165 default:
166 if ( xmlMode )
167 {
168 if ( c == '\'' )
169 {
170 buffer.append( "'" );
171 }
172 else
173 {
174 buffer.append( c );
175 }
176 }
177 else
178 {
179 if ( c <= ASCII )
180 {
181
182 buffer.append( c );
183 }
184 else
185 {
186 buffer.append( "&#x" );
187 if ( isHighSurrogate( c ) )
188 {
189 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
190 }
191 else
192 {
193 buffer.append( Integer.toHexString( c ) );
194 }
195 buffer.append( ';' );
196 }
197 }
198 }
199 }
200
201 return buffer.toString();
202 }
203
204
205
206
207
208
209
210
211
212 public static String unescapeHTML( String text )
213 {
214 return unescapeHTML( text, false );
215 }
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238 public static String unescapeHTML( String text, boolean xmlMode )
239 {
240 if ( text == null )
241 {
242 return null;
243 }
244
245 String unescaped;
246 if ( xmlMode )
247 {
248 unescaped = StringEscapeUtils.unescapeXml( text );
249 }
250 else
251 {
252
253 unescaped = StringEscapeUtils.unescapeHtml4( text );
254 }
255
256 String tmp = unescaped;
257 List<String> entities = new ArrayList<String>();
258 while ( true )
259 {
260 int i = tmp.indexOf( "&#x" );
261 if ( i == -1 )
262 {
263 break;
264 }
265
266 tmp = tmp.substring( i + 3 );
267 if ( tmp.indexOf( ';' ) != -1 )
268 {
269 String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
270 try
271 {
272 Integer.parseInt( entity, 16 );
273 entities.add( entity );
274 }
275 catch ( NumberFormatException e )
276 {
277
278 }
279 }
280 }
281
282 for ( String entity : entities )
283 {
284 int codePoint = Integer.parseInt( entity, 16 );
285 unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
286 }
287
288 return unescaped;
289 }
290
291
292
293
294
295
296
297 public static String encodeURL( String url )
298 {
299 if ( url == null )
300 {
301 return null;
302 }
303
304 StringBuilder encoded = new StringBuilder();
305 int length = url.length();
306
307 char[] unicode = new char[1];
308
309 for ( int i = 0; i < length; ++i )
310 {
311 char c = url.charAt( i );
312
313 switch ( c )
314 {
315 case ';':
316 case '/':
317 case '?':
318 case ':':
319 case '@':
320 case '&':
321 case '=':
322 case '+':
323 case '$':
324 case ',':
325 case '[':
326 case ']':
327 case '-':
328 case '_':
329 case '.':
330 case '!':
331 case '~':
332 case '*':
333 case '\'':
334 case '(':
335 case ')':
336 case '#':
337 encoded.append( c );
338 break;
339 default:
340 if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
341 {
342 encoded.append( c );
343 }
344 else
345 {
346 byte[] bytes;
347
348 try
349 {
350 if ( isHighSurrogate( c ) )
351 {
352 int codePoint = toCodePoint( c, url.charAt( ++i ) );
353 unicode = toChars( codePoint );
354 bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( "UTF8" );
355 }
356 else
357 {
358 unicode[0] = c;
359 bytes = ( new String( unicode, 0, 1 ) ).getBytes( "UTF8" );
360 }
361 }
362 catch ( UnsupportedEncodingException cannotHappen )
363 {
364 bytes = new byte[0];
365 }
366
367 for ( int j = 0; j < bytes.length; ++j )
368 {
369 encoded.append( '%' );
370 encoded.append( String.format( "%02X", bytes[j] ) );
371 }
372 }
373 }
374 }
375
376 return encoded.toString();
377 }
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392 public static String encodeId( String id )
393 {
394 return DoxiaUtils.encodeId( id, false );
395 }
396
397
398
399
400
401
402
403
404
405 public static boolean isId( String text )
406 {
407 return DoxiaUtils.isValidId( text );
408 }
409
410 private HtmlTools()
411 {
412
413 }
414
415
416
417
418
419
420 private static final char LUNATE_SIGMA = 0x3FF;
421 private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
422 private static final char LOW_SURROGATE = 0xDC00;
423
424 private static int toCodePoint( char high, char low )
425 {
426
427
428 int h = ( high & LUNATE_SIGMA ) << 10;
429 int l = low & LUNATE_SIGMA;
430 return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
431 }
432
433 private static final char MIN_HIGH_SURROGATE = '\uD800';
434 private static final char MAX_HIGH_SURROGATE = '\uDBFF';
435
436 private static boolean isHighSurrogate( char ch )
437 {
438 return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
439 }
440
441 private static final int MIN_CODE_POINT = 0x000000;
442 private static final int MAX_CODE_POINT = 0x10FFFF;
443 private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
444
445 private static boolean isValidCodePoint( int codePoint )
446 {
447 return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
448 }
449
450 private static boolean isSupplementaryCodePoint( int codePoint )
451 {
452 return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
453 }
454
455
456
457
458
459
460
461
462 public static char[] toChars( int codePoint )
463 {
464 if ( !isValidCodePoint( codePoint ) )
465 {
466 throw new IllegalArgumentException();
467 }
468
469 if ( isSupplementaryCodePoint( codePoint ) )
470 {
471 int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
472 int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
473 int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
474 return new char[] { (char) high, (char) low };
475 }
476 return new char[] { (char) codePoint };
477 }
478 }