Coverage Report - org.apache.maven.doxia.util.HtmlTools
 
Classes in this File Line Coverage Branch Coverage Complexity
HtmlTools
93%
99/106
36%
58/160
5,357
 
 1  
 package org.apache.maven.doxia.util;
 2  
 
 3  
 /*
 4  
  * Licensed to the Apache Software Foundation (ASF) under one
 5  
  * or more contributor license agreements.  See the NOTICE file
 6  
  * distributed with this work for additional information
 7  
  * regarding copyright ownership.  The ASF licenses this file
 8  
  * to you under the Apache License, Version 2.0 (the
 9  
  * "License"); you may not use this file except in compliance
 10  
  * with the License.  You may obtain a copy of the License at
 11  
  *
 12  
  *   http://www.apache.org/licenses/LICENSE-2.0
 13  
  *
 14  
  * Unless required by applicable law or agreed to in writing,
 15  
  * software distributed under the License is distributed on an
 16  
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 17  
  * KIND, either express or implied.  See the License for the
 18  
  * specific language governing permissions and limitations
 19  
  * under the License.
 20  
  */
 21  
 
 22  
 import java.io.UnsupportedEncodingException;
 23  
 import java.util.ArrayList;
 24  
 import java.util.HashMap;
 25  
 import java.util.List;
 26  
 import java.util.Map;
 27  
 
 28  
 import javax.swing.text.html.HTML.Tag;
 29  
 
 30  
 import org.apache.commons.lang.StringEscapeUtils;
 31  
 import org.apache.maven.doxia.markup.HtmlMarkup;
 32  
 import org.codehaus.plexus.util.StringUtils;
 33  
 
 34  
 /**
 35  
  * The <code>HtmlTools</code> class defines methods to HTML handling.
 36  
  *
 37  
  * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
 38  
  * @version $Id: HtmlTools.java 1185112 2011-10-17 11:33:00Z ltheussl $
 39  
  * @since 1.0
 40  
  */
 41  
 public class HtmlTools
 42  
 {
 43  2
     private static final Tag[] ALL_TAGS  =
 44  
     {
 45  
         HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
 46  
         HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO,
 47  
         HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON,
 48  
         HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL,
 49  
         HtmlMarkup.COLGROUP, HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR,
 50  
         HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET,
 51  
         HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1,
 52  
         HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD,
 53  
         HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG,
 54  
         HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX, HtmlMarkup.KBD, HtmlMarkup.LABEL,
 55  
         HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU,
 56  
         HtmlMarkup.META, HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL,
 57  
         HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION, HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE,
 58  
         HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT,
 59  
         HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE,
 60  
         HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD,
 61  
         HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH, HtmlMarkup.THEAD, HtmlMarkup.TITLE,
 62  
         HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR
 63  
     };
 64  
 
 65  2
     private static final Map<String, Tag> TAG_MAP = new HashMap<String, Tag>( ALL_TAGS.length );
 66  
 
 67  
     private static final int ASCII = 0x7E;
 68  
 
 69  
     static
 70  
     {
 71  184
         for ( Tag tag : ALL_TAGS )
 72  
         {
 73  182
             TAG_MAP.put( tag.toString(), tag );
 74  
         }
 75  2
     }
 76  
 
 77  
     /**
 78  
      * Returns a tag for a defined HTML tag name. This is one of
 79  
      * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
 80  
      * If the given name does not represent one of the defined tags, then
 81  
      * <code>null</code> will be returned.
 82  
      *
 83  
      * @param tagName the <code>String</code> name requested.
 84  
      * @return a tag constant corresponding to the <code>tagName</code>,
 85  
      *    or <code>null</code> if not found.
 86  
      * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
 87  
      * @since 1.1
 88  
      */
 89  
     public static Tag getHtmlTag( String tagName )
 90  
     {
 91  12
         Object t =  TAG_MAP.get( tagName );
 92  
 
 93  12
         return (Tag) t;
 94  
     }
 95  
 
 96  
     /**
 97  
      * Escape special HTML characters in a String in <code>xml</code> mode.
 98  
      *
 99  
      * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
 100  
      *
 101  
      * @param text the String to escape, may be null.
 102  
      * @return The escaped text or the empty string if text == null.
 103  
      * @see #escapeHTML(String,boolean)
 104  
      */
 105  
     public static String escapeHTML( String text )
 106  
     {
 107  32
         return escapeHTML( text, true );
 108  
     }
 109  
 
 110  
     /**
 111  
      * Escape special HTML characters in a String.
 112  
      *
 113  
      * <pre>
 114  
      * < becomes <code>&#38;lt;</code>
 115  
      * > becomes <code>&#38;gt;</code>
 116  
      * & becomes <code>&#38;amp;</code>
 117  
      * " becomes <code>&#38;quot;</code>
 118  
      * ' becomes <code>&#38;apos;</code> if xmlMode = true
 119  
      * </pre>
 120  
      *
 121  
      * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
 122  
      * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
 123  
      *
 124  
      * <b>Note</b>: all characters are encoded, i.e.:
 125  
      * <pre>
 126  
      * \u0159       = &#38;#x159;
 127  
      * \uD835\uDFED = &#38;#x1d7ed;
 128  
      * </pre>
 129  
      *
 130  
      * @param text The String to escape, may be null.
 131  
      * @param xmlMode <code>true</code> to replace also ' to &#38;apos, <code>false</code> to replace non-ascii
 132  
      * characters by numeric characters references.
 133  
      * @return The escaped text or the empty string if text == null.
 134  
      * @since 1.1
 135  
      * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
 136  
      * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
 137  
      */
 138  
     public static String escapeHTML( final String text, final boolean xmlMode )
 139  
     {
 140  102
         if ( text == null )
 141  
         {
 142  2
             return "";
 143  
         }
 144  
 
 145  100
         int length = text.length();
 146  100
         StringBuilder buffer = new StringBuilder( length );
 147  
 
 148  872
         for ( int i = 0; i < length; ++i )
 149  
         {
 150  772
             char c = text.charAt( i );
 151  772
             switch ( c )
 152  
             {
 153  
                 case '<':
 154  2
                     buffer.append( "&lt;" );
 155  2
                     break;
 156  
                 case '>':
 157  2
                     buffer.append( "&gt;" );
 158  2
                     break;
 159  
                 case '&':
 160  14
                     buffer.append( "&amp;" );
 161  14
                     break;
 162  
                 case '\"':
 163  2
                     buffer.append( "&quot;" );
 164  2
                     break;
 165  
                 default:
 166  752
                     if ( xmlMode )
 167  
                     {
 168  70
                         if ( c == '\'' )
 169  
                         {
 170  2
                             buffer.append( "&apos;" );
 171  
                         }
 172  
                         else
 173  
                         {
 174  68
                             buffer.append( c );
 175  
                         }
 176  
                     }
 177  
                     else
 178  
                     {
 179  682
                         if ( c <= ASCII )
 180  
                         {
 181  
                             // ASCII.
 182  670
                             buffer.append( c );
 183  
                         }
 184  
                         else
 185  
                         {
 186  12
                             buffer.append( "&#x" );
 187  12
                             if ( isHighSurrogate( c ) )
 188  
                             {
 189  2
                                 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
 190  
                             }
 191  
                             else
 192  
                             {
 193  10
                                 buffer.append( Integer.toHexString( c ) );
 194  
                             }
 195  12
                             buffer.append( ';' );
 196  
                         }
 197  
                     }
 198  
             }
 199  
         }
 200  
 
 201  100
         return buffer.toString();
 202  
     }
 203  
 
 204  
     /**
 205  
      * Unescapes HTML entities in a string in non xml mode.
 206  
      *
 207  
      * @param text the <code>String</code> to unescape, may be null.
 208  
      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
 209  
      * @since 1.1.1.
 210  
      * @see #unescapeHTML(String, boolean)
 211  
      */
 212  
     public static String unescapeHTML( String text )
 213  
     {
 214  70
         return unescapeHTML( text, false );
 215  
     }
 216  
 
 217  
     /**
 218  
      * Unescapes HTML entities in a string.
 219  
      *
 220  
      * <p> Unescapes a string containing entity escapes to a string
 221  
      * containing the actual Unicode characters corresponding to the
 222  
      * escapes. Supports HTML 4.0 entities.</p>
 223  
      *
 224  
      * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
 225  
      * will become "&lt;Fran&ccedil;ais&gt;".</p>
 226  
      *
 227  
      * <b>Note</b>: all unicode entities are decoded, i.e.:
 228  
      * <pre>
 229  
      * &#38;#x159;   = \u0159
 230  
      * &#38;#x1d7ed; = \uD835\uDFED
 231  
      * </pre>
 232  
      *
 233  
      * @param text the <code>String</code> to unescape, may be null.
 234  
      * @param xmlMode set to <code>true</code> to replace &#38;apos by '.
 235  
      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
 236  
      * @since 1.1.1.
 237  
      */
 238  
     public static String unescapeHTML( String text, boolean xmlMode )
 239  
     {
 240  72
         if ( text == null )
 241  
         {
 242  2
             return null;
 243  
         }
 244  
 
 245  
         String unescaped;
 246  70
         if ( xmlMode )
 247  
         {
 248  2
             unescaped = StringEscapeUtils.unescapeXml( text );
 249  
         }
 250  
         else
 251  
         {
 252  
             // StringEscapeUtils.unescapeHtml returns entities it doesn't recognize unchanged
 253  68
             unescaped = StringEscapeUtils.unescapeHtml( text );
 254  
         }
 255  
 
 256  70
         String tmp = unescaped;
 257  70
         List<String> entities = new ArrayList<String>();
 258  
         while ( true )
 259  
         {
 260  96
             int i = tmp.indexOf( "&#x" );
 261  96
             if ( i == -1 )
 262  
             {
 263  70
                 break;
 264  
             }
 265  
 
 266  26
             tmp = tmp.substring( i + 3 );
 267  26
             if ( tmp.indexOf( ';' ) != -1 )
 268  
             {
 269  22
                 String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
 270  
                 try
 271  
                 {
 272  22
                     Integer.parseInt( entity, 16 );
 273  18
                     entities.add( entity );
 274  
                 }
 275  4
                 catch ( NumberFormatException e )
 276  
                 {
 277  
                     // nop
 278  18
                 }
 279  
             }
 280  26
         }
 281  
 
 282  70
         for ( String entity : entities )
 283  
         {
 284  18
             int codePoint = Integer.parseInt( entity, 16 );
 285  18
             unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
 286  18
         }
 287  
 
 288  70
         return unescaped;
 289  
     }
 290  
 
 291  
     /**
 292  
      * Encode an url
 293  
      *
 294  
      * @param url the String to encode, may be null
 295  
      * @return the text encoded, null if null String input
 296  
      */
 297  
     public static String encodeURL( String url )
 298  
     {
 299  10
         if ( url == null )
 300  
         {
 301  2
             return null;
 302  
         }
 303  
 
 304  8
         StringBuilder encoded = new StringBuilder();
 305  8
         int length = url.length();
 306  
 
 307  8
         char[] unicode = new char[1];
 308  
 
 309  210
         for ( int i = 0; i < length; ++i )
 310  
         {
 311  202
             char c = url.charAt( i );
 312  
 
 313  202
             switch ( c )
 314  
             {
 315  
                 case ';':
 316  
                 case '/':
 317  
                 case '?':
 318  
                 case ':':
 319  
                 case '@':
 320  
                 case '&':
 321  
                 case '=':
 322  
                 case '+':
 323  
                 case '$':
 324  
                 case ',':
 325  
                 case '[':
 326  
                 case ']': // RFC 2732 (IPV6)
 327  
                 case '-':
 328  
                 case '_':
 329  
                 case '.':
 330  
                 case '!':
 331  
                 case '~':
 332  
                 case '*':
 333  
                 case '\'':
 334  
                 case '(':
 335  
                 case ')':
 336  
                 case '#': // XLink mark
 337  34
                     encoded.append( c );
 338  34
                     break;
 339  
                 default:
 340  168
                     if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
 341  
                     {
 342  146
                         encoded.append( c );
 343  
                     }
 344  
                     else
 345  
                     {
 346  
                         byte[] bytes;
 347  
 
 348  
                         try
 349  
                         {
 350  22
                             if ( isHighSurrogate( c ) )
 351  
                             {
 352  2
                                 int codePoint = toCodePoint( c, url.charAt( ++i ) );
 353  2
                                 unicode = toChars( codePoint );
 354  2
                                 bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( "UTF8" );
 355  2
                             }
 356  
                             else
 357  
                             {
 358  20
                                 unicode[0] = c;
 359  20
                                 bytes = ( new String( unicode, 0, 1 ) ).getBytes( "UTF8" );
 360  
                             }
 361  
                         }
 362  0
                         catch ( UnsupportedEncodingException cannotHappen )
 363  
                         {
 364  0
                             bytes = new byte[0];
 365  22
                         }
 366  
 
 367  50
                         for ( int j = 0; j < bytes.length; ++j )
 368  
                         {
 369  28
                             String hex = DoxiaUtils.byteToHex( bytes[j] );
 370  
 
 371  28
                             encoded.append( '%' );
 372  28
                             if ( hex.length() == 1 )
 373  
                             {
 374  0
                                 encoded.append( '0' );
 375  
                             }
 376  28
                             encoded.append( hex );
 377  
                         }
 378  
                     }
 379  
             }
 380  
         }
 381  
 
 382  8
         return encoded.toString();
 383  
     }
 384  
 
 385  
     /**
 386  
      * Construct a valid id.
 387  
      *
 388  
      * <p>
 389  
      *   <b>Note</b>: this method is identical to
 390  
      *   {@link DoxiaUtils#encodeId(String,boolean) DoxiaUtils.encodeId( id, true)},
 391  
      *   the rules to encode an id are laid out there.
 392  
      * </p>
 393  
      *
 394  
      * @param id The id to be encoded.
 395  
      * @return The trimmed and encoded id, or null if id is null.
 396  
      * @see DoxiaUtils#encodeId(java.lang.String,boolean)
 397  
      */
 398  
     public static String encodeId( String id )
 399  
     {
 400  84
         return DoxiaUtils.encodeId( id, true );
 401  
     }
 402  
 
 403  
     /**
 404  
      * Determines if the specified text is a valid id according to the rules
 405  
      * laid out in {@link #encodeId(String)}.
 406  
      *
 407  
      * @param text The text to be tested.
 408  
      * @return <code>true</code> if the text is a valid id, otherwise <code>false</code>.
 409  
      * @see #encodeId(String).
 410  
      */
 411  
     public static boolean isId( String text )
 412  
     {
 413  30
         return DoxiaUtils.isValidId( text );
 414  
     }
 415  
 
 416  
     private HtmlTools()
 417  0
     {
 418  
         // utility class
 419  0
     }
 420  
 
 421  
 //
 422  
 // Imported code from ASF Harmony project rev 770909
 423  
 // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
 424  
 //
 425  
 
 426  
     private static final char LUNATE_SIGMA = 0x3FF;
 427  
     private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
 428  
     private static final char LOW_SURROGATE = 0xDC00;
 429  
 
 430  
     private static int toCodePoint( char high, char low )
 431  
     {
 432  
         // See RFC 2781, Section 2.2
 433  
         // http://www.faqs.org/rfcs/rfc2781.html
 434  4
         int h = ( high & LUNATE_SIGMA ) << 10;
 435  4
         int l = low & LUNATE_SIGMA;
 436  4
         return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
 437  
     }
 438  
 
 439  
     private static final char MIN_HIGH_SURROGATE = '\uD800';
 440  
     private static final char MAX_HIGH_SURROGATE = '\uDBFF';
 441  
 
 442  
     private static boolean isHighSurrogate( char ch )
 443  
     {
 444  34
         return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
 445  
     }
 446  
 
 447  
     private static final int MIN_CODE_POINT = 0x000000;
 448  
     private static final int MAX_CODE_POINT = 0x10FFFF;
 449  
     private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
 450  
 
 451  
     private static boolean isValidCodePoint( int codePoint )
 452  
     {
 453  20
         return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
 454  
     }
 455  
 
 456  
     private static boolean isSupplementaryCodePoint( int codePoint )
 457  
     {
 458  20
         return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
 459  
     }
 460  
 
 461  
     /**
 462  
      * Converts the given code point to an equivalent character array.
 463  
      *
 464  
      * @param codePoint the code point to convert.
 465  
      * @return If codePoint is a supplementary code point, returns a character array of length 2,
 466  
      * otherwise a character array of length 1 containing only the original int as a char.
 467  
      */
 468  
     public static char[] toChars( int codePoint )
 469  
     {
 470  20
         if ( !isValidCodePoint( codePoint ) )
 471  
         {
 472  0
             throw new IllegalArgumentException();
 473  
         }
 474  
 
 475  20
         if ( isSupplementaryCodePoint( codePoint ) )
 476  
         {
 477  20
             int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
 478  20
             int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
 479  20
             int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
 480  20
             return new char[] { (char) high, (char) low };
 481  
         }
 482  0
         return new char[] { (char) codePoint };
 483  
     }
 484  
 }