1 package org.apache.maven.doxia.util;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 import java.nio.charset.StandardCharsets;
23 import java.util.ArrayList;
24 import java.util.HashMap;
25 import java.util.List;
26 import java.util.Map;
27
28 import javax.swing.text.html.HTML.Tag;
29
30 import org.apache.commons.text.StringEscapeUtils;
31 import org.apache.maven.doxia.markup.HtmlMarkup;
32 import org.codehaus.plexus.util.StringUtils;
33
34
35
36
37
38
39
40 public class HtmlTools
41 {
42 private static final Tag[] ALL_TAGS =
43 {
44 HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
45 HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO, HtmlMarkup.BIG,
46 HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON, HtmlMarkup.CAPTION,
47 HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL, HtmlMarkup.COLGROUP,
48 HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR, HtmlMarkup.DIV, HtmlMarkup.DL,
49 HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET, HtmlMarkup.FONT, HtmlMarkup.FORM,
50 HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1, HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4,
51 HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD, HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I,
52 HtmlMarkup.IFRAME, HtmlMarkup.IMG, HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX,
53 HtmlMarkup.KBD, HtmlMarkup.KEYGEN, HtmlMarkup.LABEL, HtmlMarkup.LEGEND, HtmlMarkup.LI,
54 HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU, HtmlMarkup.META, HtmlMarkup.NOFRAMES,
55 HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL, HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION,
56 HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE, HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP,
57 HtmlMarkup.SCRIPT, HtmlMarkup.SELECT, HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE,
58 HtmlMarkup.STRONG, HtmlMarkup.STYLE, HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE,
59 HtmlMarkup.TBODY, HtmlMarkup.TD, HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH,
60 HtmlMarkup.THEAD, HtmlMarkup.TITLE, HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL,
61 HtmlMarkup.VAR
62 };
63
64 private static final Map<String, Tag> TAG_MAP = new HashMap<>( ALL_TAGS.length );
65
66 private static final int ASCII = 0x7E;
67
68 static
69 {
70 for ( Tag tag : ALL_TAGS )
71 {
72 TAG_MAP.put( tag.toString(), tag );
73 }
74 }
75
76
77
78
79
80
81
82
83
84
85
86
87
88 public static Tag getHtmlTag( String tagName )
89 {
90 return TAG_MAP.get( tagName );
91 }
92
93
94
95
96
97
98
99
100
101
102 public static String escapeHTML( String text )
103 {
104 return escapeHTML( text, true );
105 }
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135 public static String escapeHTML( final String text, final boolean xmlMode )
136 {
137 if ( text == null )
138 {
139 return "";
140 }
141
142 int length = text.length();
143 StringBuilder buffer = new StringBuilder( length );
144
145 for ( int i = 0; i < length; ++i )
146 {
147 char c = text.charAt( i );
148 switch ( c )
149 {
150 case '<':
151 buffer.append( "<" );
152 break;
153 case '>':
154 buffer.append( ">" );
155 break;
156 case '&':
157 buffer.append( "&" );
158 break;
159 case '\"':
160 buffer.append( """ );
161 break;
162 default:
163 if ( xmlMode )
164 {
165 if ( c == '\'' )
166 {
167 buffer.append( "'" );
168 }
169 else
170 {
171 buffer.append( c );
172 }
173 }
174 else
175 {
176 if ( c <= ASCII )
177 {
178
179 buffer.append( c );
180 }
181 else
182 {
183 buffer.append( "&#x" );
184 if ( isHighSurrogate( c ) )
185 {
186 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
187 }
188 else
189 {
190 buffer.append( Integer.toHexString( c ) );
191 }
192 buffer.append( ';' );
193 }
194 }
195 }
196 }
197
198 return buffer.toString();
199 }
200
201
202
203
204
205
206
207
208
209 public static String unescapeHTML( String text )
210 {
211 return unescapeHTML( text, false );
212 }
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235 public static String unescapeHTML( String text, boolean xmlMode )
236 {
237 if ( text == null )
238 {
239 return null;
240 }
241
242 String unescaped;
243 if ( xmlMode )
244 {
245 unescaped = StringEscapeUtils.unescapeXml( text );
246 }
247 else
248 {
249
250 unescaped = StringEscapeUtils.unescapeHtml4( text );
251 }
252
253 String tmp = unescaped;
254 List<String> entities = new ArrayList<>();
255 while ( true )
256 {
257 int i = tmp.indexOf( "&#x" );
258 if ( i == -1 )
259 {
260 break;
261 }
262
263 tmp = tmp.substring( i + 3 );
264 if ( tmp.indexOf( ';' ) != -1 )
265 {
266 String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
267 try
268 {
269 Integer.parseInt( entity, 16 );
270 entities.add( entity );
271 }
272 catch ( NumberFormatException e )
273 {
274
275 }
276 }
277 }
278
279 for ( String entity : entities )
280 {
281 int codePoint = Integer.parseInt( entity, 16 );
282 unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
283 }
284
285 return unescaped;
286 }
287
288
289
290
291
292
293
294 public static String encodeURL( String url )
295 {
296 if ( url == null )
297 {
298 return null;
299 }
300
301 StringBuilder encoded = new StringBuilder();
302 int length = url.length();
303
304 char[] unicode = new char[1];
305
306 for ( int i = 0; i < length; ++i )
307 {
308 char c = url.charAt( i );
309
310 switch ( c )
311 {
312 case ';':
313 case '/':
314 case '?':
315 case ':':
316 case '@':
317 case '&':
318 case '=':
319 case '+':
320 case '$':
321 case ',':
322 case '[':
323 case ']':
324 case '-':
325 case '_':
326 case '.':
327 case '!':
328 case '~':
329 case '*':
330 case '\'':
331 case '(':
332 case ')':
333 case '#':
334 encoded.append( c );
335 break;
336 default:
337 if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
338 {
339 encoded.append( c );
340 }
341 else
342 {
343 byte[] bytes;
344
345 if ( isHighSurrogate( c ) )
346 {
347 int codePoint = toCodePoint( c, url.charAt( ++i ) );
348 unicode = toChars( codePoint );
349 bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( StandardCharsets.UTF_8 );
350 }
351 else
352 {
353 unicode[0] = c;
354 bytes = ( new String( unicode, 0, 1 ) ).getBytes( StandardCharsets.UTF_8 );
355 }
356
357 for ( byte aByte : bytes )
358 {
359 encoded.append( '%' );
360 encoded.append( String.format( "%02X", aByte ) );
361 }
362 }
363 }
364 }
365
366 return encoded.toString();
367 }
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382 public static String encodeId( String id )
383 {
384 return DoxiaUtils.encodeId( id, false );
385 }
386
387
388
389
390
391
392
393
394
395 public static boolean isId( String text )
396 {
397 return DoxiaUtils.isValidId( text );
398 }
399
400 private HtmlTools()
401 {
402
403 }
404
405
406
407
408
409
410 private static final char LUNATE_SIGMA = 0x3FF;
411 private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
412 private static final char LOW_SURROGATE = 0xDC00;
413
414 private static int toCodePoint( char high, char low )
415 {
416
417
418 int h = ( high & LUNATE_SIGMA ) << 10;
419 int l = low & LUNATE_SIGMA;
420 return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
421 }
422
423 private static final char MIN_HIGH_SURROGATE = '\uD800';
424 private static final char MAX_HIGH_SURROGATE = '\uDBFF';
425
426 private static boolean isHighSurrogate( char ch )
427 {
428 return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
429 }
430
431 private static final int MIN_CODE_POINT = 0x000000;
432 private static final int MAX_CODE_POINT = 0x10FFFF;
433 private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
434
435 private static boolean isValidCodePoint( int codePoint )
436 {
437 return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
438 }
439
440 private static boolean isSupplementaryCodePoint( int codePoint )
441 {
442 return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
443 }
444
445
446
447
448
449
450
451
452 public static char[] toChars( int codePoint )
453 {
454 if ( !isValidCodePoint( codePoint ) )
455 {
456 throw new IllegalArgumentException();
457 }
458
459 if ( isSupplementaryCodePoint( codePoint ) )
460 {
461 int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
462 int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
463 int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
464 return new char[] { (char) high, (char) low };
465 }
466 return new char[] { (char) codePoint };
467 }
468 }