001package org.apache.maven.doxia.util; 002 003/* 004 * Licensed to the Apache Software Foundation (ASF) under one 005 * or more contributor license agreements. See the NOTICE file 006 * distributed with this work for additional information 007 * regarding copyright ownership. The ASF licenses this file 008 * to you under the Apache License, Version 2.0 (the 009 * "License"); you may not use this file except in compliance 010 * with the License. You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, 015 * software distributed under the License is distributed on an 016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 017 * KIND, either express or implied. See the License for the 018 * specific language governing permissions and limitations 019 * under the License. 020 */ 021 022import java.io.UnsupportedEncodingException; 023import java.util.ArrayList; 024import java.util.HashMap; 025import java.util.List; 026import java.util.Map; 027 028import javax.swing.text.html.HTML.Tag; 029 030import org.apache.commons.lang.StringEscapeUtils; 031import org.apache.maven.doxia.markup.HtmlMarkup; 032import org.codehaus.plexus.util.StringUtils; 033 034/** 035 * The <code>HtmlTools</code> class defines methods to HTML handling. 036 * 037 * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a> 038 * @version $Id$ 039 * @since 1.0 040 */ 041public class HtmlTools 042{ 043 private static final Tag[] ALL_TAGS = 044 { 045 HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET, 046 HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO, 047 HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON, 048 HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL, 049 HtmlMarkup.COLGROUP, HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR, 050 HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET, 051 HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1, 052 HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD, 053 HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG, 054 HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX, HtmlMarkup.KBD, HtmlMarkup.LABEL, 055 HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU, 056 HtmlMarkup.META, HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL, 057 HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION, HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE, 058 HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT, 059 HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE, 060 HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD, 061 HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH, HtmlMarkup.THEAD, HtmlMarkup.TITLE, 062 HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR 063 }; 064 065 private static final Map<String, Tag> TAG_MAP = new HashMap<String, Tag>( ALL_TAGS.length ); 066 067 private static final int ASCII = 0x7E; 068 069 static 070 { 071 for ( Tag tag : ALL_TAGS ) 072 { 073 TAG_MAP.put( tag.toString(), tag ); 074 } 075 } 076 077 /** 078 * Returns a tag for a defined HTML tag name. This is one of 079 * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}. 080 * If the given name does not represent one of the defined tags, then 081 * <code>null</code> will be returned. 082 * 083 * @param tagName the <code>String</code> name requested. 084 * @return a tag constant corresponding to the <code>tagName</code>, 085 * or <code>null</code> if not found. 086 * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a> 087 * @since 1.1 088 */ 089 public static Tag getHtmlTag( String tagName ) 090 { 091 Object t = TAG_MAP.get( tagName ); 092 093 return (Tag) t; 094 } 095 096 /** 097 * Escape special HTML characters in a String in <code>xml</code> mode. 098 * 099 * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references. 100 * 101 * @param text the String to escape, may be null. 102 * @return The escaped text or the empty string if text == null. 103 * @see #escapeHTML(String,boolean) 104 */ 105 public static String escapeHTML( String text ) 106 { 107 return escapeHTML( text, true ); 108 } 109 110 /** 111 * Escape special HTML characters in a String. 112 * 113 * <pre> 114 * < becomes <code>&lt;</code> 115 * > becomes <code>&gt;</code> 116 * & becomes <code>&amp;</code> 117 * " becomes <code>&quot;</code> 118 * ' becomes <code>&apos;</code> if xmlMode = true 119 * </pre> 120 * 121 * If <code>xmlMode</code> is true, every other character than the above remains unchanged, 122 * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code. 123 * 124 * <b>Note</b>: all characters are encoded, i.e.: 125 * <pre> 126 * \u0159 = &#x159; 127 * \uD835\uDFED = &#x1d7ed; 128 * </pre> 129 * 130 * @param text The String to escape, may be null. 131 * @param xmlMode <code>true</code> to replace also ' to &apos, <code>false</code> to replace non-ascii 132 * characters by numeric characters references. 133 * @return The escaped text or the empty string if text == null. 134 * @since 1.1 135 * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a> 136 * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a> 137 */ 138 public static String escapeHTML( final String text, final boolean xmlMode ) 139 { 140 if ( text == null ) 141 { 142 return ""; 143 } 144 145 int length = text.length(); 146 StringBuilder buffer = new StringBuilder( length ); 147 148 for ( int i = 0; i < length; ++i ) 149 { 150 char c = text.charAt( i ); 151 switch ( c ) 152 { 153 case '<': 154 buffer.append( "<" ); 155 break; 156 case '>': 157 buffer.append( ">" ); 158 break; 159 case '&': 160 buffer.append( "&" ); 161 break; 162 case '\"': 163 buffer.append( """ ); 164 break; 165 default: 166 if ( xmlMode ) 167 { 168 if ( c == '\'' ) 169 { 170 buffer.append( "'" ); 171 } 172 else 173 { 174 buffer.append( c ); 175 } 176 } 177 else 178 { 179 if ( c <= ASCII ) 180 { 181 // ASCII. 182 buffer.append( c ); 183 } 184 else 185 { 186 buffer.append( "&#x" ); 187 if ( isHighSurrogate( c ) ) 188 { 189 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) ); 190 } 191 else 192 { 193 buffer.append( Integer.toHexString( c ) ); 194 } 195 buffer.append( ';' ); 196 } 197 } 198 } 199 } 200 201 return buffer.toString(); 202 } 203 204 /** 205 * Unescapes HTML entities in a string in non xml mode. 206 * 207 * @param text the <code>String</code> to unescape, may be null. 208 * @return a new unescaped <code>String</code>, <code>null</code> if null string input. 209 * @since 1.1.1. 210 * @see #unescapeHTML(String, boolean) 211 */ 212 public static String unescapeHTML( String text ) 213 { 214 return unescapeHTML( text, false ); 215 } 216 217 /** 218 * Unescapes HTML entities in a string. 219 * 220 * <p> Unescapes a string containing entity escapes to a string 221 * containing the actual Unicode characters corresponding to the 222 * escapes. Supports HTML 4.0 entities.</p> 223 * 224 * <p>For example, the string "&lt;Fran&ccedil;ais&gt;" 225 * will become "<Français>".</p> 226 * 227 * <b>Note</b>: all unicode entities are decoded, i.e.: 228 * <pre> 229 * &#x159; = \u0159 230 * &#x1d7ed; = \uD835\uDFED 231 * </pre> 232 * 233 * @param text the <code>String</code> to unescape, may be null. 234 * @param xmlMode set to <code>true</code> to replace &apos by '. 235 * @return a new unescaped <code>String</code>, <code>null</code> if null string input. 236 * @since 1.1.1. 237 */ 238 public static String unescapeHTML( String text, boolean xmlMode ) 239 { 240 if ( text == null ) 241 { 242 return null; 243 } 244 245 String unescaped; 246 if ( xmlMode ) 247 { 248 unescaped = StringEscapeUtils.unescapeXml( text ); 249 } 250 else 251 { 252 // StringEscapeUtils.unescapeHtml returns entities it doesn't recognize unchanged 253 unescaped = StringEscapeUtils.unescapeHtml( text ); 254 } 255 256 String tmp = unescaped; 257 List<String> entities = new ArrayList<String>(); 258 while ( true ) 259 { 260 int i = tmp.indexOf( "&#x" ); 261 if ( i == -1 ) 262 { 263 break; 264 } 265 266 tmp = tmp.substring( i + 3 ); 267 if ( tmp.indexOf( ';' ) != -1 ) 268 { 269 String entity = tmp.substring( 0, tmp.indexOf( ';' ) ); 270 try 271 { 272 Integer.parseInt( entity, 16 ); 273 entities.add( entity ); 274 } 275 catch ( NumberFormatException e ) 276 { 277 // nop 278 } 279 } 280 } 281 282 for ( String entity : entities ) 283 { 284 int codePoint = Integer.parseInt( entity, 16 ); 285 unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) ); 286 } 287 288 return unescaped; 289 } 290 291 /** 292 * Encode an url 293 * 294 * @param url the String to encode, may be null 295 * @return the text encoded, null if null String input 296 */ 297 public static String encodeURL( String url ) 298 { 299 if ( url == null ) 300 { 301 return null; 302 } 303 304 StringBuilder encoded = new StringBuilder(); 305 int length = url.length(); 306 307 char[] unicode = new char[1]; 308 309 for ( int i = 0; i < length; ++i ) 310 { 311 char c = url.charAt( i ); 312 313 switch ( c ) 314 { 315 case ';': 316 case '/': 317 case '?': 318 case ':': 319 case '@': 320 case '&': 321 case '=': 322 case '+': 323 case '$': 324 case ',': 325 case '[': 326 case ']': // RFC 2732 (IPV6) 327 case '-': 328 case '_': 329 case '.': 330 case '!': 331 case '~': 332 case '*': 333 case '\'': 334 case '(': 335 case ')': 336 case '#': // XLink mark 337 encoded.append( c ); 338 break; 339 default: 340 if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) ) 341 { 342 encoded.append( c ); 343 } 344 else 345 { 346 byte[] bytes; 347 348 try 349 { 350 if ( isHighSurrogate( c ) ) 351 { 352 int codePoint = toCodePoint( c, url.charAt( ++i ) ); 353 unicode = toChars( codePoint ); 354 bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( "UTF8" ); 355 } 356 else 357 { 358 unicode[0] = c; 359 bytes = ( new String( unicode, 0, 1 ) ).getBytes( "UTF8" ); 360 } 361 } 362 catch ( UnsupportedEncodingException cannotHappen ) 363 { 364 bytes = new byte[0]; 365 } 366 367 for ( int j = 0; j < bytes.length; ++j ) 368 { 369 String hex = DoxiaUtils.byteToHex( bytes[j] ); 370 371 encoded.append( '%' ); 372 if ( hex.length() == 1 ) 373 { 374 encoded.append( '0' ); 375 } 376 encoded.append( hex ); 377 } 378 } 379 } 380 } 381 382 return encoded.toString(); 383 } 384 385 /** 386 * Construct a valid id. 387 * 388 * <p> 389 * <b>Note</b>: this method is identical to 390 * {@link DoxiaUtils#encodeId(String,boolean) DoxiaUtils.encodeId( id, true)}, 391 * the rules to encode an id are laid out there. 392 * </p> 393 * 394 * @param id The id to be encoded. 395 * @return The trimmed and encoded id, or null if id is null. 396 * @see DoxiaUtils#encodeId(java.lang.String,boolean) 397 */ 398 public static String encodeId( String id ) 399 { 400 return DoxiaUtils.encodeId( id, true ); 401 } 402 403 /** 404 * Determines if the specified text is a valid id according to the rules 405 * laid out in {@link #encodeId(String)}. 406 * 407 * @param text The text to be tested. 408 * @return <code>true</code> if the text is a valid id, otherwise <code>false</code>. 409 * @see #encodeId(String). 410 */ 411 public static boolean isId( String text ) 412 { 413 return DoxiaUtils.isValidId( text ); 414 } 415 416 private HtmlTools() 417 { 418 // utility class 419 } 420 421// 422// Imported code from ASF Harmony project rev 770909 423// http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java 424// 425 426 private static final char LUNATE_SIGMA = 0x3FF; 427 private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800; 428 private static final char LOW_SURROGATE = 0xDC00; 429 430 private static int toCodePoint( char high, char low ) 431 { 432 // See RFC 2781, Section 2.2 433 // http://www.faqs.org/rfcs/rfc2781.html 434 int h = ( high & LUNATE_SIGMA ) << 10; 435 int l = low & LUNATE_SIGMA; 436 return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT; 437 } 438 439 private static final char MIN_HIGH_SURROGATE = '\uD800'; 440 private static final char MAX_HIGH_SURROGATE = '\uDBFF'; 441 442 private static boolean isHighSurrogate( char ch ) 443 { 444 return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch ); 445 } 446 447 private static final int MIN_CODE_POINT = 0x000000; 448 private static final int MAX_CODE_POINT = 0x10FFFF; 449 private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000; 450 451 private static boolean isValidCodePoint( int codePoint ) 452 { 453 return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint ); 454 } 455 456 private static boolean isSupplementaryCodePoint( int codePoint ) 457 { 458 return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint ); 459 } 460 461 /** 462 * Converts the given code point to an equivalent character array. 463 * 464 * @param codePoint the code point to convert. 465 * @return If codePoint is a supplementary code point, returns a character array of length 2, 466 * otherwise a character array of length 1 containing only the original int as a char. 467 */ 468 public static char[] toChars( int codePoint ) 469 { 470 if ( !isValidCodePoint( codePoint ) ) 471 { 472 throw new IllegalArgumentException(); 473 } 474 475 if ( isSupplementaryCodePoint( codePoint ) ) 476 { 477 int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT; 478 int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA ); 479 int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA ); 480 return new char[] { (char) high, (char) low }; 481 } 482 return new char[] { (char) codePoint }; 483 } 484}