View Javadoc
1   package org.apache.maven.doxia.util;
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import javax.swing.text.html.HTML.Tag;

import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.maven.doxia.markup.HtmlMarkup;
import org.codehaus.plexus.util.StringUtils;

34  /**
35   * The <code>HtmlTools</code> class defines methods to HTML handling.
36   *
37   * @author <a href="">Vincent Siveton</a>
38   * @version $Id$
39   * @since 1.0
40   */
41  public class HtmlTools
42  {
43      private static final Tag[] ALL_TAGS  =
44      {
45          HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
46          HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO,
47          HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON,
48          HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL,
49          HtmlMarkup.COLGROUP, HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR,
50          HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET,
51          HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1,
52          HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD,
53          HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG,
54          HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX, HtmlMarkup.KBD, HtmlMarkup.LABEL,
55          HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU,
56          HtmlMarkup.META, HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL,
57          HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION, HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE,
58          HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT,
59          HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE,
60          HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD,
61          HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH, HtmlMarkup.THEAD, HtmlMarkup.TITLE,
62          HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR
63      };
65      private static final Map<String, Tag> TAG_MAP = new HashMap<String, Tag>( ALL_TAGS.length );
67      private static final int ASCII = 0x7E;
69      static
70      {
71          for ( Tag tag : ALL_TAGS )
72          {
73              TAG_MAP.put( tag.toString(), tag );
74          }
75      }
77      /**
78       * Returns a tag for a defined HTML tag name. This is one of
79       * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
80       * If the given name does not represent one of the defined tags, then
81       * <code>null</code> will be returned.
82       *
83       * @param tagName the <code>String</code> name requested.
84       * @return a tag constant corresponding to the <code>tagName</code>,
85       *    or <code>null</code> if not found.
86       * @see <a href=""></a>
87       * @since 1.1
88       */
89      public static Tag getHtmlTag( String tagName )
90      {
91          Object t =  TAG_MAP.get( tagName );
93          return (Tag) t;
94      }
96      /**
97       * Escape special HTML characters in a String in <code>xml</code> mode.
98       *
99       * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
100      *
101      * @param text the String to escape, may be null.
102      * @return The escaped text or the empty string if text == null.
103      * @see #escapeHTML(String,boolean)
104      */
105     public static String escapeHTML( String text )
106     {
107         return escapeHTML( text, true );
108     }
110     /**
111      * Escape special HTML characters in a String.
112      *
113      * <pre>
114      * < becomes <code>&#38;lt;</code>
115      * > becomes <code>&#38;gt;</code>
116      * & becomes <code>&#38;amp;</code>
117      * " becomes <code>&#38;quot;</code>
118      * ' becomes <code>&#38;apos;</code> if xmlMode = true
119      * </pre>
120      *
121      * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
122      * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
123      *
124      * <b>Note</b>: all characters are encoded, i.e.:
125      * <pre>
126      * \u0159       = &#38;#x159;
127      * \uD835\uDFED = &#38;#x1d7ed;
128      * </pre>
129      *
130      * @param text The String to escape, may be null.
131      * @param xmlMode <code>true</code> to replace also ' to &#38;apos, <code>false</code> to replace non-ascii
132      * characters by numeric characters references.
133      * @return The escaped text or the empty string if text == null.
134      * @since 1.1
135      * @see <a href=""></a>
136      * @see <a href=""></a>
137      */
138     public static String escapeHTML( final String text, final boolean xmlMode )
139     {
140         if ( text == null )
141         {
142             return "";
143         }
145         int length = text.length();
146         StringBuilder buffer = new StringBuilder( length );
148         for ( int i = 0; i < length; ++i )
149         {
150             char c = text.charAt( i );
151             switch ( c )
152             {
153                 case '<':
154                     buffer.append( "&lt;" );
155                     break;
156                 case '>':
157                     buffer.append( "&gt;" );
158                     break;
159                 case '&':
160                     buffer.append( "&amp;" );
161                     break;
162                 case '\"':
163                     buffer.append( "&quot;" );
164                     break;
165                 default:
166                     if ( xmlMode )
167                     {
168                         if ( c == '\'' )
169                         {
170                             buffer.append( "&apos;" );
171                         }
172                         else
173                         {
174                             buffer.append( c );
175                         }
176                     }
177                     else
178                     {
179                         if ( c <= ASCII )
180                         {
181                             // ASCII.
182                             buffer.append( c );
183                         }
184                         else
185                         {
186                             buffer.append( "&#x" );
187                             if ( isHighSurrogate( c ) )
188                             {
189                                 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
190                             }
191                             else
192                             {
193                                 buffer.append( Integer.toHexString( c ) );
194                             }
195                             buffer.append( ';' );
196                         }
197                     }
198             }
199         }
201         return buffer.toString();
202     }
204     /**
205      * Unescapes HTML entities in a string in non xml mode.
206      *
207      * @param text the <code>String</code> to unescape, may be null.
208      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
209      * @since 1.1.1.
210      * @see #unescapeHTML(String, boolean)
211      */
212     public static String unescapeHTML( String text )
213     {
214         return unescapeHTML( text, false );
215     }
217     /**
218      * Unescapes HTML entities in a string.
219      *
220      * <p> Unescapes a string containing entity escapes to a string
221      * containing the actual Unicode characters corresponding to the
222      * escapes. Supports HTML 4.0 entities.</p>
223      *
224      * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
225      * will become "&lt;Fran&ccedil;ais&gt;".</p>
226      *
227      * <b>Note</b>: all unicode entities are decoded, i.e.:
228      * <pre>
229      * &#38;#x159;   = \u0159
230      * &#38;#x1d7ed; = \uD835\uDFED
231      * </pre>
232      *
233      * @param text the <code>String</code> to unescape, may be null.
234      * @param xmlMode set to <code>true</code> to replace &#38;apos by '.
235      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
236      * @since 1.1.1.
237      */
238     public static String unescapeHTML( String text, boolean xmlMode )
239     {
240         if ( text == null )
241         {
242             return null;
243         }
245         String unescaped;
246         if ( xmlMode )
247         {
248             unescaped = StringEscapeUtils.unescapeXml( text );
249         }
250         else
251         {
252             // StringEscapeUtils.unescapeHtml4 returns entities it doesn't recognize unchanged
253             unescaped = StringEscapeUtils.unescapeHtml4( text );
254         }
256         String tmp = unescaped;
257         List<String> entities = new ArrayList<String>();
258         while ( true )
259         {
260             int i = tmp.indexOf( "&#x" );
261             if ( i == -1 )
262             {
263                 break;
264             }
266             tmp = tmp.substring( i + 3 );
267             if ( tmp.indexOf( ';' ) != -1 )
268             {
269                 String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
270                 try
271                 {
272                     Integer.parseInt( entity, 16 );
273                     entities.add( entity );
274                 }
275                 catch ( NumberFormatException e )
276                 {
277                     // nop
278                 }
279             }
280         }
282         for ( String entity : entities )
283         {
284             int codePoint = Integer.parseInt( entity, 16 );
285             unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
286         }
288         return unescaped;
289     }
291     /**
292      * Encode an url
293      *
294      * @param url the String to encode, may be null
295      * @return the text encoded, null if null String input
296      */
297     public static String encodeURL( String url )
298     {
299         if ( url == null )
300         {
301             return null;
302         }
304         StringBuilder encoded = new StringBuilder();
305         int length = url.length();
307         char[] unicode = new char[1];
309         for ( int i = 0; i < length; ++i )
310         {
311             char c = url.charAt( i );
313             switch ( c )
314             {
315                 case ';':
316                 case '/':
317                 case '?':
318                 case ':':
319                 case '@':
320                 case '&':
321                 case '=':
322                 case '+':
323                 case '$':
324                 case ',':
325                 case '[':
326                 case ']': // RFC 2732 (IPV6)
327                 case '-':
328                 case '_':
329                 case '.':
330                 case '!':
331                 case '~':
332                 case '*':
333                 case '\'':
334                 case '(':
335                 case ')':
336                 case '#': // XLink mark
337                     encoded.append( c );
338                     break;
339                 default:
340                     if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
341                     {
342                         encoded.append( c );
343                     }
344                     else
345                     {
346                         byte[] bytes;
348                         try
349                         {
350                             if ( isHighSurrogate( c ) )
351                             {
352                                 int codePoint = toCodePoint( c, url.charAt( ++i ) );
353                                 unicode = toChars( codePoint );
354                                 bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( "UTF8" );
355                             }
356                             else
357                             {
358                                 unicode[0] = c;
359                                 bytes = ( new String( unicode, 0, 1 ) ).getBytes( "UTF8" );
360                             }
361                         }
362                         catch ( UnsupportedEncodingException cannotHappen )
363                         {
364                             bytes = new byte[0];
365                         }
367                         for ( int j = 0; j < bytes.length; ++j )
368                         {
369                             encoded.append( '%' );
370                             encoded.append( String.format( "%02X", bytes[j] ) );
371                         }
372                     }
373             }
374         }
376         return encoded.toString();
377     }
379     /**
380      * Construct a valid id.
381      *
382      * <p>
383      *   <b>Note</b>: this method is identical to
384      *   {@link DoxiaUtils#encodeId(String,boolean) DoxiaUtils.encodeId( id, false )},
385      *   the rules to encode an id are laid out there.
386      * </p>
387      *
388      * @param id The id to be encoded.
389      * @return The trimmed and encoded id, or null if id is null.
390      * @see DoxiaUtils#encodeId(java.lang.String,boolean)
391      */
392     public static String encodeId( String id )
393     {
394         return DoxiaUtils.encodeId( id, false );
395     }
397     /**
398      * Determines if the specified text is a valid id according to the rules
399      * laid out in {@link #encodeId(String)}.
400      *
401      * @param text The text to be tested.
402      * @return <code>true</code> if the text is a valid id, otherwise <code>false</code>.
403      * @see #encodeId(String).
404      */
405     public static boolean isId( String text )
406     {
407         return DoxiaUtils.isValidId( text );
408     }
410     private HtmlTools()
411     {
412         // utility class
413     }
415 //
416 // Imported code from ASF Harmony project rev 770909
417 //
418 //
420     private static final char LUNATE_SIGMA = 0x3FF;
421     private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
422     private static final char LOW_SURROGATE = 0xDC00;
424     private static int toCodePoint( char high, char low )
425     {
426         // See RFC 2781, Section 2.2
427         //
428         int h = ( high & LUNATE_SIGMA ) << 10;
429         int l = low & LUNATE_SIGMA;
430         return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
431     }
433     private static final char MIN_HIGH_SURROGATE = '\uD800';
434     private static final char MAX_HIGH_SURROGATE = '\uDBFF';
436     private static boolean isHighSurrogate( char ch )
437     {
438         return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
439     }
441     private static final int MIN_CODE_POINT = 0x000000;
442     private static final int MAX_CODE_POINT = 0x10FFFF;
443     private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
445     private static boolean isValidCodePoint( int codePoint )
446     {
447         return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
448     }
450     private static boolean isSupplementaryCodePoint( int codePoint )
451     {
452         return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
453     }
455     /**
456      * Converts the given code point to an equivalent character array.
457      *
458      * @param codePoint the code point to convert.
459      * @return If codePoint is a supplementary code point, returns a character array of length 2,
460      * otherwise a character array of length 1 containing only the original int as a char.
461      */
462     public static char[] toChars( int codePoint )
463     {
464         if ( !isValidCodePoint( codePoint ) )
465         {
466             throw new IllegalArgumentException();
467         }
469         if ( isSupplementaryCodePoint( codePoint ) )
470         {
471             int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
472             int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
473             int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
474             return new char[] { (char) high, (char) low };
475         }
476         return new char[] { (char) codePoint };
477     }
478 }