1 package org.apache.maven.doxia.util;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import java.io.UnsupportedEncodingException;
23 import java.util.ArrayList;
24 import java.util.HashMap;
25 import java.util.List;
26 import java.util.Map;
27
28 import javax.swing.text.html.HTML.Tag;
29
30 import org.apache.commons.lang.StringEscapeUtils;
31 import org.apache.maven.doxia.markup.HtmlMarkup;
32 import org.codehaus.plexus.util.StringUtils;
33
34
35
36
37
38
39
40
41 public class HtmlTools
42 {
43 private static final Tag[] ALL_TAGS =
44 {
45 HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
46 HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO,
47 HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON,
48 HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL,
49 HtmlMarkup.COLGROUP, HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR,
50 HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET,
51 HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1,
52 HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD,
53 HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG,
54 HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX, HtmlMarkup.KBD, HtmlMarkup.LABEL,
55 HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU,
56 HtmlMarkup.META, HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL,
57 HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION, HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE,
58 HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT,
59 HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE,
60 HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD,
61 HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH, HtmlMarkup.THEAD, HtmlMarkup.TITLE,
62 HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR
63 };
64
65 private static final Map<String, Tag> TAG_MAP = new HashMap<String, Tag>( ALL_TAGS.length );
66
67 private static final int ASCII = 0x7E;
68
69 static
70 {
71 for ( Tag tag : ALL_TAGS )
72 {
73 TAG_MAP.put( tag.toString(), tag );
74 }
75 }
76
77
78
79
80
81
82
83
84
85
86
87
88
89 public static Tag getHtmlTag( String tagName )
90 {
91 Object t = TAG_MAP.get( tagName );
92
93 return (Tag) t;
94 }
95
96
97
98
99
100
101
102
103
104
105 public static String escapeHTML( String text )
106 {
107 return escapeHTML( text, true );
108 }
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138 public static String escapeHTML( final String text, final boolean xmlMode )
139 {
140 if ( text == null )
141 {
142 return "";
143 }
144
145 int length = text.length();
146 StringBuilder buffer = new StringBuilder( length );
147
148 for ( int i = 0; i < length; ++i )
149 {
150 char c = text.charAt( i );
151 switch ( c )
152 {
153 case '<':
154 buffer.append( "<" );
155 break;
156 case '>':
157 buffer.append( ">" );
158 break;
159 case '&':
160 buffer.append( "&" );
161 break;
162 case '\"':
163 buffer.append( """ );
164 break;
165 default:
166 if ( xmlMode )
167 {
168 if ( c == '\'' )
169 {
170 buffer.append( "'" );
171 }
172 else
173 {
174 buffer.append( c );
175 }
176 }
177 else
178 {
179 if ( c <= ASCII )
180 {
181
182 buffer.append( c );
183 }
184 else
185 {
186 buffer.append( "&#x" );
187 if ( isHighSurrogate( c ) )
188 {
189 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
190 }
191 else
192 {
193 buffer.append( Integer.toHexString( c ) );
194 }
195 buffer.append( ';' );
196 }
197 }
198 }
199 }
200
201 return buffer.toString();
202 }
203
204
205
206
207
208
209
210
211
212 public static String unescapeHTML( String text )
213 {
214 return unescapeHTML( text, false );
215 }
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238 public static String unescapeHTML( String text, boolean xmlMode )
239 {
240 if ( text == null )
241 {
242 return null;
243 }
244
245 String unescaped;
246 if ( xmlMode )
247 {
248 unescaped = StringEscapeUtils.unescapeXml( text );
249 }
250 else
251 {
252
253 unescaped = StringEscapeUtils.unescapeHtml( text );
254 }
255
256 String tmp = unescaped;
257 List<String> entities = new ArrayList<String>();
258 while ( true )
259 {
260 int i = tmp.indexOf( "&#x" );
261 if ( i == -1 )
262 {
263 break;
264 }
265
266 tmp = tmp.substring( i + 3 );
267 if ( tmp.indexOf( ';' ) != -1 )
268 {
269 String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
270 try
271 {
272 Integer.parseInt( entity, 16 );
273 entities.add( entity );
274 }
275 catch ( NumberFormatException e )
276 {
277
278 }
279 }
280 }
281
282 for ( String entity : entities )
283 {
284 int codePoint = Integer.parseInt( entity, 16 );
285 unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
286 }
287
288 return unescaped;
289 }
290
291
292
293
294
295
296
297 public static String encodeURL( String url )
298 {
299 if ( url == null )
300 {
301 return null;
302 }
303
304 StringBuilder encoded = new StringBuilder();
305 int length = url.length();
306
307 char[] unicode = new char[1];
308
309 for ( int i = 0; i < length; ++i )
310 {
311 char c = url.charAt( i );
312
313 switch ( c )
314 {
315 case ';':
316 case '/':
317 case '?':
318 case ':':
319 case '@':
320 case '&':
321 case '=':
322 case '+':
323 case '$':
324 case ',':
325 case '[':
326 case ']':
327 case '-':
328 case '_':
329 case '.':
330 case '!':
331 case '~':
332 case '*':
333 case '\'':
334 case '(':
335 case ')':
336 case '#':
337 encoded.append( c );
338 break;
339 default:
340 if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
341 {
342 encoded.append( c );
343 }
344 else
345 {
346 byte[] bytes;
347
348 try
349 {
350 if ( isHighSurrogate( c ) )
351 {
352 int codePoint = toCodePoint( c, url.charAt( ++i ) );
353 unicode = toChars( codePoint );
354 bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( "UTF8" );
355 }
356 else
357 {
358 unicode[0] = c;
359 bytes = ( new String( unicode, 0, 1 ) ).getBytes( "UTF8" );
360 }
361 }
362 catch ( UnsupportedEncodingException cannotHappen )
363 {
364 bytes = new byte[0];
365 }
366
367 for ( int j = 0; j < bytes.length; ++j )
368 {
369 String hex = DoxiaUtils.byteToHex( bytes[j] );
370
371 encoded.append( '%' );
372 if ( hex.length() == 1 )
373 {
374 encoded.append( '0' );
375 }
376 encoded.append( hex );
377 }
378 }
379 }
380 }
381
382 return encoded.toString();
383 }
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398 public static String encodeId( String id )
399 {
400 return DoxiaUtils.encodeId( id, true );
401 }
402
403
404
405
406
407
408
409
410
411 public static boolean isId( String text )
412 {
413 return DoxiaUtils.isValidId( text );
414 }
415
416 private HtmlTools()
417 {
418
419 }
420
421
422
423
424
425
426 private static final char LUNATE_SIGMA = 0x3FF;
427 private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
428 private static final char LOW_SURROGATE = 0xDC00;
429
430 private static int toCodePoint( char high, char low )
431 {
432
433
434 int h = ( high & LUNATE_SIGMA ) << 10;
435 int l = low & LUNATE_SIGMA;
436 return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
437 }
438
439 private static final char MIN_HIGH_SURROGATE = '\uD800';
440 private static final char MAX_HIGH_SURROGATE = '\uDBFF';
441
442 private static boolean isHighSurrogate( char ch )
443 {
444 return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
445 }
446
447 private static final int MIN_CODE_POINT = 0x000000;
448 private static final int MAX_CODE_POINT = 0x10FFFF;
449 private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
450
451 private static boolean isValidCodePoint( int codePoint )
452 {
453 return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
454 }
455
456 private static boolean isSupplementaryCodePoint( int codePoint )
457 {
458 return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
459 }
460
461
462
463
464
465
466
467
468 public static char[] toChars( int codePoint )
469 {
470 if ( !isValidCodePoint( codePoint ) )
471 {
472 throw new IllegalArgumentException();
473 }
474
475 if ( isSupplementaryCodePoint( codePoint ) )
476 {
477 int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
478 int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
479 int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
480 return new char[] { (char) high, (char) low };
481 }
482 return new char[] { (char) codePoint };
483 }
484 }