1 package org.apache.maven.doxia.util;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import java.nio.charset.StandardCharsets;
23 import java.util.ArrayList;
24 import java.util.HashMap;
25 import java.util.List;
26 import java.util.Map;
27
28 import javax.swing.text.html.HTML.Tag;
29
30 import org.apache.commons.lang3.StringEscapeUtils;
31 import org.apache.maven.doxia.markup.HtmlMarkup;
32 import org.codehaus.plexus.util.StringUtils;
33
34
35
36
37
38
39
40
41 public class HtmlTools
42 {
43 private static final Tag[] ALL_TAGS =
44 {
45 HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
46 HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO,
47 HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON,
48 HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL,
49 HtmlMarkup.COLGROUP, HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR,
50 HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET,
51 HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1,
52 HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD,
53 HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG,
54 HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX, HtmlMarkup.KBD, HtmlMarkup.LABEL,
55 HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU,
56 HtmlMarkup.META, HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL,
57 HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION, HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE,
58 HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT,
59 HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE,
60 HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD,
61 HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH, HtmlMarkup.THEAD, HtmlMarkup.TITLE,
62 HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR
63 };
64
65 private static final Map<String, Tag> TAG_MAP = new HashMap<>( ALL_TAGS.length );
66
67 private static final int ASCII = 0x7E;
68
69 static
70 {
71 for ( Tag tag : ALL_TAGS )
72 {
73 TAG_MAP.put( tag.toString(), tag );
74 }
75 }
76
77
78
79
80
81
82
83
84
85
86
87
88
89 public static Tag getHtmlTag( String tagName )
90 {
91 return TAG_MAP.get( tagName );
92 }
93
94
95
96
97
98
99
100
101
102
103 public static String escapeHTML( String text )
104 {
105 return escapeHTML( text, true );
106 }
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136 public static String escapeHTML( final String text, final boolean xmlMode )
137 {
138 if ( text == null )
139 {
140 return "";
141 }
142
143 int length = text.length();
144 StringBuilder buffer = new StringBuilder( length );
145
146 for ( int i = 0; i < length; ++i )
147 {
148 char c = text.charAt( i );
149 switch ( c )
150 {
151 case '<':
152 buffer.append( "<" );
153 break;
154 case '>':
155 buffer.append( ">" );
156 break;
157 case '&':
158 buffer.append( "&" );
159 break;
160 case '\"':
161 buffer.append( """ );
162 break;
163 default:
164 if ( xmlMode )
165 {
166 if ( c == '\'' )
167 {
168 buffer.append( "'" );
169 }
170 else
171 {
172 buffer.append( c );
173 }
174 }
175 else
176 {
177 if ( c <= ASCII )
178 {
179
180 buffer.append( c );
181 }
182 else
183 {
184 buffer.append( "&#x" );
185 if ( isHighSurrogate( c ) )
186 {
187 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
188 }
189 else
190 {
191 buffer.append( Integer.toHexString( c ) );
192 }
193 buffer.append( ';' );
194 }
195 }
196 }
197 }
198
199 return buffer.toString();
200 }
201
202
203
204
205
206
207
208
209
210 public static String unescapeHTML( String text )
211 {
212 return unescapeHTML( text, false );
213 }
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236 public static String unescapeHTML( String text, boolean xmlMode )
237 {
238 if ( text == null )
239 {
240 return null;
241 }
242
243 String unescaped;
244 if ( xmlMode )
245 {
246 unescaped = StringEscapeUtils.unescapeXml( text );
247 }
248 else
249 {
250
251 unescaped = StringEscapeUtils.unescapeHtml4( text );
252 }
253
254 String tmp = unescaped;
255 List<String> entities = new ArrayList<>();
256 while ( true )
257 {
258 int i = tmp.indexOf( "&#x" );
259 if ( i == -1 )
260 {
261 break;
262 }
263
264 tmp = tmp.substring( i + 3 );
265 if ( tmp.indexOf( ';' ) != -1 )
266 {
267 String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
268 try
269 {
270 Integer.parseInt( entity, 16 );
271 entities.add( entity );
272 }
273 catch ( NumberFormatException e )
274 {
275
276 }
277 }
278 }
279
280 for ( String entity : entities )
281 {
282 int codePoint = Integer.parseInt( entity, 16 );
283 unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
284 }
285
286 return unescaped;
287 }
288
289
290
291
292
293
294
295 public static String encodeURL( String url )
296 {
297 if ( url == null )
298 {
299 return null;
300 }
301
302 StringBuilder encoded = new StringBuilder();
303 int length = url.length();
304
305 char[] unicode = new char[1];
306
307 for ( int i = 0; i < length; ++i )
308 {
309 char c = url.charAt( i );
310
311 switch ( c )
312 {
313 case ';':
314 case '/':
315 case '?':
316 case ':':
317 case '@':
318 case '&':
319 case '=':
320 case '+':
321 case '$':
322 case ',':
323 case '[':
324 case ']':
325 case '-':
326 case '_':
327 case '.':
328 case '!':
329 case '~':
330 case '*':
331 case '\'':
332 case '(':
333 case ')':
334 case '#':
335 encoded.append( c );
336 break;
337 default:
338 if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
339 {
340 encoded.append( c );
341 }
342 else
343 {
344 byte[] bytes;
345
346 if ( isHighSurrogate( c ) )
347 {
348 int codePoint = toCodePoint( c, url.charAt( ++i ) );
349 unicode = toChars( codePoint );
350 bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( StandardCharsets.UTF_8 );
351 }
352 else
353 {
354 unicode[0] = c;
355 bytes = ( new String( unicode, 0, 1 ) ).getBytes( StandardCharsets.UTF_8 );
356 }
357
358 for ( byte aByte : bytes )
359 {
360 encoded.append( '%' );
361 encoded.append( String.format( "%02X", aByte ) );
362 }
363 }
364 }
365 }
366
367 return encoded.toString();
368 }
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383 public static String encodeId( String id )
384 {
385 return DoxiaUtils.encodeId( id, false );
386 }
387
388
389
390
391
392
393
394
395
396 public static boolean isId( String text )
397 {
398 return DoxiaUtils.isValidId( text );
399 }
400
401 private HtmlTools()
402 {
403
404 }
405
406
407
408
409
410
411 private static final char LUNATE_SIGMA = 0x3FF;
412 private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
413 private static final char LOW_SURROGATE = 0xDC00;
414
415 private static int toCodePoint( char high, char low )
416 {
417
418
419 int h = ( high & LUNATE_SIGMA ) << 10;
420 int l = low & LUNATE_SIGMA;
421 return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
422 }
423
424 private static final char MIN_HIGH_SURROGATE = '\uD800';
425 private static final char MAX_HIGH_SURROGATE = '\uDBFF';
426
427 private static boolean isHighSurrogate( char ch )
428 {
429 return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
430 }
431
432 private static final int MIN_CODE_POINT = 0x000000;
433 private static final int MAX_CODE_POINT = 0x10FFFF;
434 private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
435
436 private static boolean isValidCodePoint( int codePoint )
437 {
438 return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
439 }
440
441 private static boolean isSupplementaryCodePoint( int codePoint )
442 {
443 return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
444 }
445
446
447
448
449
450
451
452
453 public static char[] toChars( int codePoint )
454 {
455 if ( !isValidCodePoint( codePoint ) )
456 {
457 throw new IllegalArgumentException();
458 }
459
460 if ( isSupplementaryCodePoint( codePoint ) )
461 {
462 int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
463 int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
464 int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
465 return new char[] { (char) high, (char) low };
466 }
467 return new char[] { (char) codePoint };
468 }
469 }