Coverage Report

Coverage Report - org.apache.maven.doxia.util.HtmlTools

Classes in this File

Line Coverage

Branch Coverage

Complexity

HtmlTools

93%

99/106

36%

58/160

5,357

 package org.apache.maven.doxia.util;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 
 import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
 import javax.swing.text.html.HTML.Tag;
 
 import org.apache.commons.lang.StringEscapeUtils;
 import org.apache.maven.doxia.markup.HtmlMarkup;
 import org.codehaus.plexus.util.StringUtils;
 
 /**
  * The <code>HtmlTools</code> class defines methods to HTML handling.
  *
  * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
  * @version $Id: HtmlTools.java 1185112 2011-10-17 11:33:00Z ltheussl $
  * @since 1.0
  */
 public class HtmlTools
 {
     private static final Tag[] ALL_TAGS  =
     {
         HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
         HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO,
         HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON,
         HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL,
         HtmlMarkup.COLGROUP, HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR,
         HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET,
         HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1,
         HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD,
         HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG,
         HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX, HtmlMarkup.KBD, HtmlMarkup.LABEL,
         HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU,
         HtmlMarkup.META, HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL,
         HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION, HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE,
         HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT,
         HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE,
         HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD,
         HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH, HtmlMarkup.THEAD, HtmlMarkup.TITLE,
         HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR
     };
 
     private static final Map<String, Tag> TAG_MAP = new HashMap<String, Tag>( ALL_TAGS.length );
 
     private static final int ASCII = 0x7E;
 
     static
     {
         for ( Tag tag : ALL_TAGS )
         {
             TAG_MAP.put( tag.toString(), tag );
         }
     }
 
     /**
      * Returns a tag for a defined HTML tag name. This is one of
      * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
      * If the given name does not represent one of the defined tags, then
      * <code>null</code> will be returned.
      *
      * @param tagName the <code>String</code> name requested.
      * @return a tag constant corresponding to the <code>tagName</code>,
      *    or <code>null</code> if not found.
      * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
      * @since 1.1
      */
     public static Tag getHtmlTag( String tagName )
     {
         Object t =  TAG_MAP.get( tagName );
 
         return (Tag) t;
     }
 
     /**
      * Escape special HTML characters in a String in <code>xml</code> mode.
      *
      * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
      *
      * @param text the String to escape, may be null.
      * @return The escaped text or the empty string if text == null.
      * @see #escapeHTML(String,boolean)
      */
     public static String escapeHTML( String text )
     {
         return escapeHTML( text, true );
     }
 
     /**
      * Escape special HTML characters in a String.
      *
      * <pre>
      * < becomes <code>&#38;lt;</code>
      * > becomes <code>&#38;gt;</code>
      * & becomes <code>&#38;amp;</code>
      * " becomes <code>&#38;quot;</code>
      * ' becomes <code>&#38;apos;</code> if xmlMode = true
      * </pre>
      *
      * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
      * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
      *
      * <b>Note</b>: all characters are encoded, i.e.:
      * <pre>
      * \u0159       = &#38;#x159;
      * \uD835\uDFED = &#38;#x1d7ed;
      * </pre>
      *
      * @param text The String to escape, may be null.
      * @param xmlMode <code>true</code> to replace also ' to &#38;apos, <code>false</code> to replace non-ascii
      * characters by numeric characters references.
      * @return The escaped text or the empty string if text == null.
      * @since 1.1
      * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
      * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
      */
     public static String escapeHTML( final String text, final boolean xmlMode )
     {
         if ( text == null )
         {
             return "";
         }
 
         int length = text.length();
         StringBuilder buffer = new StringBuilder( length );
 
         for ( int i = 0; i < length; ++i )
         {
             char c = text.charAt( i );
             switch ( c )
             {
                 case '<':
                     buffer.append( "&lt;" );
                     break;
                 case '>':
                     buffer.append( "&gt;" );
                     break;
                 case '&':
                     buffer.append( "&amp;" );
                     break;
                 case '\"':
                     buffer.append( "&quot;" );
                     break;
                 default:
                     if ( xmlMode )
                     {
                         if ( c == '\'' )
                         {
                             buffer.append( "&apos;" );
                         }
                         else
                         {
                             buffer.append( c );
                         }
                     }
                     else
                     {
                         if ( c <= ASCII )
                         {
                             // ASCII.
                             buffer.append( c );
                         }
                         else
                         {
                             buffer.append( "&#x" );
                             if ( isHighSurrogate( c ) )
                             {
                                 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
                             }
                             else
                             {
                                 buffer.append( Integer.toHexString( c ) );
                             }
                             buffer.append( ';' );
                         }
                     }
             }
         }
 
         return buffer.toString();
     }
 
     /**
      * Unescapes HTML entities in a string in non xml mode.
      *
      * @param text the <code>String</code> to unescape, may be null.
      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
      * @since 1.1.1.
      * @see #unescapeHTML(String, boolean)
      */
     public static String unescapeHTML( String text )
     {
         return unescapeHTML( text, false );
     }
 
     /**
      * Unescapes HTML entities in a string.
      *
      * <p> Unescapes a string containing entity escapes to a string
      * containing the actual Unicode characters corresponding to the
      * escapes. Supports HTML 4.0 entities.</p>
      *
      * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
      * will become "&lt;Fran&ccedil;ais&gt;".</p>
      *
      * <b>Note</b>: all unicode entities are decoded, i.e.:
      * <pre>
      * &#38;#x159;   = \u0159
      * &#38;#x1d7ed; = \uD835\uDFED
      * </pre>
      *
      * @param text the <code>String</code> to unescape, may be null.
      * @param xmlMode set to <code>true</code> to replace &#38;apos by '.
      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
      * @since 1.1.1.
      */
     public static String unescapeHTML( String text, boolean xmlMode )
     {
         if ( text == null )
         {
             return null;
         }
 
         String unescaped;
         if ( xmlMode )
         {
             unescaped = StringEscapeUtils.unescapeXml( text );
         }
         else
         {
             // StringEscapeUtils.unescapeHtml returns entities it doesn't recognize unchanged
             unescaped = StringEscapeUtils.unescapeHtml( text );
         }
 
         String tmp = unescaped;
         List<String> entities = new ArrayList<String>();
         while ( true )
         {
             int i = tmp.indexOf( "&#x" );
             if ( i == -1 )
             {
                 break;
             }
 
             tmp = tmp.substring( i + 3 );
             if ( tmp.indexOf( ';' ) != -1 )
             {
                 String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
                 try
                 {
                     Integer.parseInt( entity, 16 );
                     entities.add( entity );
                 }
                 catch ( NumberFormatException e )
                 {
                     // nop
                 }
             }
         }
 
         for ( String entity : entities )
         {
             int codePoint = Integer.parseInt( entity, 16 );
             unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
         }
 
         return unescaped;
     }
 
     /**
      * Encode an url
      *
      * @param url the String to encode, may be null
      * @return the text encoded, null if null String input
      */
     public static String encodeURL( String url )
     {
         if ( url == null )
         {
             return null;
         }
 
         StringBuilder encoded = new StringBuilder();
         int length = url.length();
 
         char[] unicode = new char[1];
 
         for ( int i = 0; i < length; ++i )
         {
             char c = url.charAt( i );
 
             switch ( c )
             {
                 case ';':
                 case '/':
                 case '?':
                 case ':':
                 case '@':
                 case '&':
                 case '=':
                 case '+':
                 case '$':
                 case ',':
                 case '[':
                 case ']': // RFC 2732 (IPV6)
                 case '-':
                 case '_':
                 case '.':
                 case '!':
                 case '~':
                 case '*':
                 case '\'':
                 case '(':
                 case ')':
                 case '#': // XLink mark
                     encoded.append( c );
                     break;
                 default:
                     if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
                     {
                         encoded.append( c );
                     }
                     else
                     {
                         byte[] bytes;
 
                         try
                         {
                             if ( isHighSurrogate( c ) )
                             {
                                 int codePoint = toCodePoint( c, url.charAt( ++i ) );
                                 unicode = toChars( codePoint );
                                 bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( "UTF8" );
                             }
                             else
                             {
                                 unicode[0] = c;
                                 bytes = ( new String( unicode, 0, 1 ) ).getBytes( "UTF8" );
                             }
                         }
                         catch ( UnsupportedEncodingException cannotHappen )
                         {
                             bytes = new byte[0];
                         }
 
                         for ( int j = 0; j < bytes.length; ++j )
                         {
                             String hex = DoxiaUtils.byteToHex( bytes[j] );
 
                             encoded.append( '%' );
                             if ( hex.length() == 1 )
                             {
                                 encoded.append( '0' );
                             }
                             encoded.append( hex );
                         }
                     }
             }
         }
 
         return encoded.toString();
     }
 
     /**
      * Construct a valid id.
      *
      * <p>
      *   <b>Note</b>: this method is identical to
      *   {@link DoxiaUtils#encodeId(String,boolean) DoxiaUtils.encodeId( id, true)},
      *   the rules to encode an id are laid out there.
      * </p>
      *
      * @param id The id to be encoded.
      * @return The trimmed and encoded id, or null if id is null.
      * @see DoxiaUtils#encodeId(java.lang.String,boolean)
      */
     public static String encodeId( String id )
     {
         return DoxiaUtils.encodeId( id, true );
     }
 
     /**
      * Determines if the specified text is a valid id according to the rules
      * laid out in {@link #encodeId(String)}.
      *
      * @param text The text to be tested.
      * @return <code>true</code> if the text is a valid id, otherwise <code>false</code>.
      * @see #encodeId(String).
      */
     public static boolean isId( String text )
     {
         return DoxiaUtils.isValidId( text );
     }
 
     private HtmlTools()
     {
         // utility class
     }
 
 //
 // Imported code from ASF Harmony project rev 770909
 // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
 //
 
     private static final char LUNATE_SIGMA = 0x3FF;
     private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
     private static final char LOW_SURROGATE = 0xDC00;
 
     private static int toCodePoint( char high, char low )
     {
         // See RFC 2781, Section 2.2
         // http://www.faqs.org/rfcs/rfc2781.html
         int h = ( high & LUNATE_SIGMA ) << 10;
         int l = low & LUNATE_SIGMA;
         return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
     }
 
     private static final char MIN_HIGH_SURROGATE = '\uD800';
     private static final char MAX_HIGH_SURROGATE = '\uDBFF';
 
     private static boolean isHighSurrogate( char ch )
     {
         return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
     }
 
     private static final int MIN_CODE_POINT = 0x000000;
     private static final int MAX_CODE_POINT = 0x10FFFF;
     private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
 
     private static boolean isValidCodePoint( int codePoint )
     {
         return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
     }
 
     private static boolean isSupplementaryCodePoint( int codePoint )
     {
         return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
     }
 
     /**
      * Converts the given code point to an equivalent character array.
      *
      * @param codePoint the code point to convert.
      * @return If codePoint is a supplementary code point, returns a character array of length 2,
      * otherwise a character array of length 1 containing only the original int as a char.
      */
     public static char[] toChars( int codePoint )
     {
         if ( !isValidCodePoint( codePoint ) )
         {
             throw new IllegalArgumentException();
         }
 
         if ( isSupplementaryCodePoint( codePoint ) )
         {
             int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
             int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
             int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
             return new char[] { (char) high, (char) low };
         }
         return new char[] { (char) codePoint };
     }
 }

1		package org.apache.maven.doxia.util;
2
3		/*
4		* Licensed to the Apache Software Foundation (ASF) under one
5		* or more contributor license agreements. See the NOTICE file
6		* distributed with this work for additional information
7		* regarding copyright ownership. The ASF licenses this file
8		* to you under the Apache License, Version 2.0 (the
9		* "License"); you may not use this file except in compliance
10		* with the License. You may obtain a copy of the License at
11		*
12		* http://www.apache.org/licenses/LICENSE-2.0
13		*
14		* Unless required by applicable law or agreed to in writing,
15		* software distributed under the License is distributed on an
16		* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17		* KIND, either express or implied. See the License for the
18		* specific language governing permissions and limitations
19		* under the License.
20		*/
21
22		import java.io.UnsupportedEncodingException;
23		import java.util.ArrayList;
24		import java.util.HashMap;
25		import java.util.List;
26		import java.util.Map;
27
28		import javax.swing.text.html.HTML.Tag;
29
30		import org.apache.commons.lang.StringEscapeUtils;
31		import org.apache.maven.doxia.markup.HtmlMarkup;
32		import org.codehaus.plexus.util.StringUtils;
33
34		/**
35		* The <code>HtmlTools</code> class defines methods to HTML handling.
36		*
37		* @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
38		* @version $Id: HtmlTools.java 1185112 2011-10-17 11:33:00Z ltheussl $
39		* @since 1.0
40		*/
41		public class HtmlTools
42		{
43	2	private static final Tag[] ALL_TAGS =
44		{
45		HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
46		HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO,
47		HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON,
48		HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL,
49		HtmlMarkup.COLGROUP, HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR,
50		HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET,
51		HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1,
52		HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD,
53		HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG,
54		HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX, HtmlMarkup.KBD, HtmlMarkup.LABEL,
55		HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU,
56		HtmlMarkup.META, HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL,
57		HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION, HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE,
58		HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT,
59		HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE,
60		HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD,
61		HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH, HtmlMarkup.THEAD, HtmlMarkup.TITLE,
62		HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR
63		};
64
65	2	private static final Map<String, Tag> TAG_MAP = new HashMap<String, Tag>( ALL_TAGS.length );
66
67		private static final int ASCII = 0x7E;
68
69		static
70		{
71	184	for ( Tag tag : ALL_TAGS )
72		{
73	182	TAG_MAP.put( tag.toString(), tag );
74		}
75	2	}
76
77		/**
78		* Returns a tag for a defined HTML tag name. This is one of
79		* the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
80		* If the given name does not represent one of the defined tags, then
81		* <code>null</code> will be returned.
82		*
83		* @param tagName the <code>String</code> name requested.
84		* @return a tag constant corresponding to the <code>tagName</code>,
85		* or <code>null</code> if not found.
86		* @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
87		* @since 1.1
88		*/
89		public static Tag getHtmlTag( String tagName )
90		{
91	12	Object t = TAG_MAP.get( tagName );
92
93	12	return (Tag) t;
94		}
95
96		/**
97		* Escape special HTML characters in a String in <code>xml</code> mode.
98		*
99		* <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
100		*
101		* @param text the String to escape, may be null.
102		* @return The escaped text or the empty string if text == null.
103		* @see #escapeHTML(String,boolean)
104		*/
105		public static String escapeHTML( String text )
106		{
107	32	return escapeHTML( text, true );
108		}
109
110		/**
111		* Escape special HTML characters in a String.
112		*
113		* <pre>
114		* < becomes <code>&lt;</code>
115		* > becomes <code>&gt;</code>
116		* & becomes <code>&amp;</code>
117		* " becomes <code>&quot;</code>
118		* ' becomes <code>&apos;</code> if xmlMode = true
119		* </pre>
120		*
121		* If <code>xmlMode</code> is true, every other character than the above remains unchanged,
122		* if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
123		*
124		* <b>Note</b>: all characters are encoded, i.e.:
125		* <pre>
126		* \u0159 = &#x159;
127		* \uD835\uDFED = &#x1d7ed;
128		* </pre>
129		*
130		* @param text The String to escape, may be null.
131		* @param xmlMode <code>true</code> to replace also ' to &apos, <code>false</code> to replace non-ascii
132		* characters by numeric characters references.
133		* @return The escaped text or the empty string if text == null.
134		* @since 1.1
135		* @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
136		* @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
137		*/
138		public static String escapeHTML( final String text, final boolean xmlMode )
139		{
140	102	if ( text == null )
141		{
142	2	return "";
143		}
144
145	100	int length = text.length();
146	100	StringBuilder buffer = new StringBuilder( length );
147
148	872	for ( int i = 0; i < length; ++i )
149		{
150	772	char c = text.charAt( i );
151	772	switch ( c )
152		{
153		case '<':
154	2	buffer.append( "&lt;" );
155	2	break;
156		case '>':
157	2	buffer.append( "&gt;" );
158	2	break;
159		case '&':
160	14	buffer.append( "&amp;" );
161	14	break;
162		case '\"':
163	2	buffer.append( "&quot;" );
164	2	break;
165		default:
166	752	if ( xmlMode )
167		{
168	70	if ( c == '\'' )
169		{
170	2	buffer.append( "&apos;" );
171		}
172		else
173		{
174	68	buffer.append( c );
175		}
176		}
177		else
178		{
179	682	if ( c <= ASCII )
180		{
181		// ASCII.
182	670	buffer.append( c );
183		}
184		else
185		{
186	12	buffer.append( "&#x" );
187	12	if ( isHighSurrogate( c ) )
188		{
189	2	buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
190		}
191		else
192		{
193	10	buffer.append( Integer.toHexString( c ) );
194		}
195	12	buffer.append( ';' );
196		}
197		}
198		}
199		}
200
201	100	return buffer.toString();
202		}
203
204		/**
205		* Unescapes HTML entities in a string in non xml mode.
206		*
207		* @param text the <code>String</code> to unescape, may be null.
208		* @return a new unescaped <code>String</code>, <code>null</code> if null string input.
209		* @since 1.1.1.
210		* @see #unescapeHTML(String, boolean)
211		*/
212		public static String unescapeHTML( String text )
213		{
214	70	return unescapeHTML( text, false );
215		}
216
217		/**
218		* Unescapes HTML entities in a string.
219		*
220		* <p> Unescapes a string containing entity escapes to a string
221		* containing the actual Unicode characters corresponding to the
222		* escapes. Supports HTML 4.0 entities.</p>
223		*
224		* <p>For example, the string "&lt;Fran&ccedil;ais&gt;"
225		* will become "<Français>".</p>
226		*
227		* <b>Note</b>: all unicode entities are decoded, i.e.:
228		* <pre>
229		* &#x159; = \u0159
230		* &#x1d7ed; = \uD835\uDFED
231		* </pre>
232		*
233		* @param text the <code>String</code> to unescape, may be null.
234		* @param xmlMode set to <code>true</code> to replace &apos by '.
235		* @return a new unescaped <code>String</code>, <code>null</code> if null string input.
236		* @since 1.1.1.
237		*/
238		public static String unescapeHTML( String text, boolean xmlMode )
239		{
240	72	if ( text == null )
241		{
242	2	return null;
243		}
244
245		String unescaped;
246	70	if ( xmlMode )
247		{
248	2	unescaped = StringEscapeUtils.unescapeXml( text );
249		}
250		else
251		{
252		// StringEscapeUtils.unescapeHtml returns entities it doesn't recognize unchanged
253	68	unescaped = StringEscapeUtils.unescapeHtml( text );
254		}
255
256	70	String tmp = unescaped;
257	70	List<String> entities = new ArrayList<String>();
258		while ( true )
259		{
260	96	int i = tmp.indexOf( "&#x" );
261	96	if ( i == -1 )
262		{
263	70	break;
264		}
265
266	26	tmp = tmp.substring( i + 3 );
267	26	if ( tmp.indexOf( ';' ) != -1 )
268		{
269	22	String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
270		try
271		{
272	22	Integer.parseInt( entity, 16 );
273	18	entities.add( entity );
274		}
275	4	catch ( NumberFormatException e )
276		{
277		// nop
278	18	}
279		}
280	26	}
281
282	70	for ( String entity : entities )
283		{
284	18	int codePoint = Integer.parseInt( entity, 16 );
285	18	unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
286	18	}
287
288	70	return unescaped;
289		}
290
291		/**
292		* Encode an url
293		*
294		* @param url the String to encode, may be null
295		* @return the text encoded, null if null String input
296		*/
297		public static String encodeURL( String url )
298		{
299	10	if ( url == null )
300		{
301	2	return null;
302		}
303
304	8	StringBuilder encoded = new StringBuilder();
305	8	int length = url.length();
306
307	8	char[] unicode = new char[1];
308
309	210	for ( int i = 0; i < length; ++i )
310		{
311	202	char c = url.charAt( i );
312
313	202	switch ( c )
314		{
315		case ';':
316		case '/':
317		case '?':
318		case ':':
319		case '@':
320		case '&':
321		case '=':
322		case '+':
323		case '$':
324		case ',':
325		case '[':
326		case ']': // RFC 2732 (IPV6)
327		case '-':
328		case '_':
329		case '.':
330		case '!':
331		case '~':
332		case '*':
333		case '\'':
334		case '(':
335		case ')':
336		case '#': // XLink mark
337	34	encoded.append( c );
338	34	break;
339		default:
340	168	if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
341		{
342	146	encoded.append( c );
343		}
344		else
345		{
346		byte[] bytes;
347
348		try
349		{
350	22	if ( isHighSurrogate( c ) )
351		{
352	2	int codePoint = toCodePoint( c, url.charAt( ++i ) );
353	2	unicode = toChars( codePoint );
354	2	bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( "UTF8" );
355	2	}
356		else
357		{
358	20	unicode[0] = c;
359	20	bytes = ( new String( unicode, 0, 1 ) ).getBytes( "UTF8" );
360		}
361		}
362	0	catch ( UnsupportedEncodingException cannotHappen )
363		{
364	0	bytes = new byte[0];
365	22	}
366
367	50	for ( int j = 0; j < bytes.length; ++j )
368		{
369	28	String hex = DoxiaUtils.byteToHex( bytes[j] );
370
371	28	encoded.append( '%' );
372	28	if ( hex.length() == 1 )
373		{
374	0	encoded.append( '0' );
375		}
376	28	encoded.append( hex );
377		}
378		}
379		}
380		}
381
382	8	return encoded.toString();
383		}
384
385		/**
386		* Construct a valid id.
387		*
388		* <p>
389		* <b>Note</b>: this method is identical to
390		* {@link DoxiaUtils#encodeId(String,boolean) DoxiaUtils.encodeId( id, true)},
391		* the rules to encode an id are laid out there.
392		* </p>
393		*
394		* @param id The id to be encoded.
395		* @return The trimmed and encoded id, or null if id is null.
396		* @see DoxiaUtils#encodeId(java.lang.String,boolean)
397		*/
398		public static String encodeId( String id )
399		{
400	84	return DoxiaUtils.encodeId( id, true );
401		}
402
403		/**
404		* Determines if the specified text is a valid id according to the rules
405		* laid out in {@link #encodeId(String)}.
406		*
407		* @param text The text to be tested.
408		* @return <code>true</code> if the text is a valid id, otherwise <code>false</code>.
409		* @see #encodeId(String).
410		*/
411		public static boolean isId( String text )
412		{
413	30	return DoxiaUtils.isValidId( text );
414		}
415
416		private HtmlTools()
417	0	{
418		// utility class
419	0	}
420
421		//
422		// Imported code from ASF Harmony project rev 770909
423		// http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
424		//
425
426		private static final char LUNATE_SIGMA = 0x3FF;
427		private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
428		private static final char LOW_SURROGATE = 0xDC00;
429
430		private static int toCodePoint( char high, char low )
431		{
432		// See RFC 2781, Section 2.2
433		// http://www.faqs.org/rfcs/rfc2781.html
434	4	int h = ( high & LUNATE_SIGMA ) << 10;
435	4	int l = low & LUNATE_SIGMA;
436	4	return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
437		}
438
439		private static final char MIN_HIGH_SURROGATE = '\uD800';
440		private static final char MAX_HIGH_SURROGATE = '\uDBFF';
441
442		private static boolean isHighSurrogate( char ch )
443		{
444	34	return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
445		}
446
447		private static final int MIN_CODE_POINT = 0x000000;
448		private static final int MAX_CODE_POINT = 0x10FFFF;
449		private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
450
451		private static boolean isValidCodePoint( int codePoint )
452		{
453	20	return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
454		}
455
456		private static boolean isSupplementaryCodePoint( int codePoint )
457		{
458	20	return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
459		}
460
461		/**
462		* Converts the given code point to an equivalent character array.
463		*
464		* @param codePoint the code point to convert.
465		* @return If codePoint is a supplementary code point, returns a character array of length 2,
466		* otherwise a character array of length 1 containing only the original int as a char.
467		*/
468		public static char[] toChars( int codePoint )
469		{
470	20	if ( !isValidCodePoint( codePoint ) )
471		{
472	0	throw new IllegalArgumentException();
473		}
474
475	20	if ( isSupplementaryCodePoint( codePoint ) )
476		{
477	20	int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
478	20	int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
479	20	int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
480	20	return new char[] { (char) high, (char) low };
481		}
482	0	return new char[] { (char) codePoint };
483		}
484		}