View Javadoc
1   package org.apache.maven.doxia.util;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.nio.charset.StandardCharsets;
23  import java.util.ArrayList;
24  import java.util.HashMap;
25  import java.util.List;
26  import java.util.Map;
27  
28  import javax.swing.text.html.HTML.Tag;
29  
30  import org.apache.commons.lang3.StringEscapeUtils;
31  import org.apache.maven.doxia.markup.HtmlMarkup;
32  import org.codehaus.plexus.util.StringUtils;
33  
34  /**
35   * The <code>HtmlTools</code> class defines methods to HTML handling.
36   *
37   * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
38   * @version $Id$
39   * @since 1.0
40   */
41  public class HtmlTools
42  {
43      private static final Tag[] ALL_TAGS  =
44      {
45          HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
46          HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO,
47          HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON,
48          HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL,
49          HtmlMarkup.COLGROUP, HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR,
50          HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET,
51          HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1,
52          HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD,
53          HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG,
54          HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX, HtmlMarkup.KBD, HtmlMarkup.LABEL,
55          HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU,
56          HtmlMarkup.META, HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL,
57          HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION, HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE,
58          HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT,
59          HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE,
60          HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD,
61          HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH, HtmlMarkup.THEAD, HtmlMarkup.TITLE,
62          HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR
63      };
64  
65      private static final Map<String, Tag> TAG_MAP = new HashMap<>( ALL_TAGS.length );
66  
67      private static final int ASCII = 0x7E;
68  
69      static
70      {
71          for ( Tag tag : ALL_TAGS )
72          {
73              TAG_MAP.put( tag.toString(), tag );
74          }
75      }
76  
77      /**
78       * Returns a tag for a defined HTML tag name. This is one of
79       * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
80       * If the given name does not represent one of the defined tags, then
81       * <code>null</code> will be returned.
82       *
83       * @param tagName the <code>String</code> name requested.
84       * @return a tag constant corresponding to the <code>tagName</code>,
85       *    or <code>null</code> if not found.
86       * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
87       * @since 1.1
88       */
89      public static Tag getHtmlTag( String tagName )
90      {
91          return TAG_MAP.get( tagName );
92      }
93  
94      /**
95       * Escape special HTML characters in a String in <code>xml</code> mode.
96       *
97       * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
98       *
99       * @param text the String to escape, may be null.
100      * @return The escaped text or the empty string if text == null.
101      * @see #escapeHTML(String,boolean)
102      */
103     public static String escapeHTML( String text )
104     {
105         return escapeHTML( text, true );
106     }
107 
108     /**
109      * Escape special HTML characters in a String.
110      *
111      * <pre>
112      * &lt; becomes <code>&#38;lt;</code>
113      * &gt; becomes <code>&#38;gt;</code>
114      * &amp; becomes <code>&#38;amp;</code>
115      * " becomes <code>&#38;quot;</code>
116      * ' becomes <code>&#38;apos;</code> if xmlMode = true
117      * </pre>
118      *
119      * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
120      * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
121      *
122      * <b>Note</b>: all characters are encoded, i.e.:
123      * <pre>
124      * \u0159       = &#38;#x159;
125      * \uD835\uDFED = &#38;#x1d7ed;
126      * </pre>
127      *
128      * @param text The String to escape, may be null.
129      * @param xmlMode <code>true</code> to replace also ' to &#38;apos, <code>false</code> to replace non-ascii
130      * characters by numeric characters references.
131      * @return The escaped text or the empty string if text == null.
132      * @since 1.1
133      * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
134      * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
135      */
136     public static String escapeHTML( final String text, final boolean xmlMode )
137     {
138         if ( text == null )
139         {
140             return "";
141         }
142 
143         int length = text.length();
144         StringBuilder buffer = new StringBuilder( length );
145 
146         for ( int i = 0; i < length; ++i )
147         {
148             char c = text.charAt( i );
149             switch ( c )
150             {
151                 case '<':
152                     buffer.append( "&lt;" );
153                     break;
154                 case '>':
155                     buffer.append( "&gt;" );
156                     break;
157                 case '&':
158                     buffer.append( "&amp;" );
159                     break;
160                 case '\"':
161                     buffer.append( "&quot;" );
162                     break;
163                 default:
164                     if ( xmlMode )
165                     {
166                         if ( c == '\'' )
167                         {
168                             buffer.append( "&apos;" );
169                         }
170                         else
171                         {
172                             buffer.append( c );
173                         }
174                     }
175                     else
176                     {
177                         if ( c <= ASCII )
178                         {
179                             // ASCII.
180                             buffer.append( c );
181                         }
182                         else
183                         {
184                             buffer.append( "&#x" );
185                             if ( isHighSurrogate( c ) )
186                             {
187                                 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
188                             }
189                             else
190                             {
191                                 buffer.append( Integer.toHexString( c ) );
192                             }
193                             buffer.append( ';' );
194                         }
195                     }
196             }
197         }
198 
199         return buffer.toString();
200     }
201 
202     /**
203      * Unescapes HTML entities in a string in non xml mode.
204      *
205      * @param text the <code>String</code> to unescape, may be null.
206      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
207      * @since 1.1.1.
208      * @see #unescapeHTML(String, boolean)
209      */
210     public static String unescapeHTML( String text )
211     {
212         return unescapeHTML( text, false );
213     }
214 
215     /**
216      * Unescapes HTML entities in a string.
217      *
218      * <p> Unescapes a string containing entity escapes to a string
219      * containing the actual Unicode characters corresponding to the
220      * escapes. Supports HTML 4.0 entities.</p>
221      *
222      * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
223      * will become "&lt;Fran&ccedil;ais&gt;".</p>
224      *
225      * <b>Note</b>: all unicode entities are decoded, i.e.:
226      * <pre>
227      * &#38;#x159;   = \u0159
228      * &#38;#x1d7ed; = \uD835\uDFED
229      * </pre>
230      *
231      * @param text the <code>String</code> to unescape, may be null.
232      * @param xmlMode set to <code>true</code> to replace &#38;apos by '.
233      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
234      * @since 1.1.1.
235      */
236     public static String unescapeHTML( String text, boolean xmlMode )
237     {
238         if ( text == null )
239         {
240             return null;
241         }
242 
243         String unescaped;
244         if ( xmlMode )
245         {
246             unescaped = StringEscapeUtils.unescapeXml( text );
247         }
248         else
249         {
250             // StringEscapeUtils.unescapeHtml4 returns entities it doesn't recognize unchanged
251             unescaped = StringEscapeUtils.unescapeHtml4( text );
252         }
253 
254         String tmp = unescaped;
255         List<String> entities = new ArrayList<>();
256         while ( true )
257         {
258             int i = tmp.indexOf( "&#x" );
259             if ( i == -1 )
260             {
261                 break;
262             }
263 
264             tmp = tmp.substring( i + 3 );
265             if ( tmp.indexOf( ';' ) != -1 )
266             {
267                 String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
268                 try
269                 {
270                     Integer.parseInt( entity, 16 );
271                     entities.add( entity );
272                 }
273                 catch ( NumberFormatException e )
274                 {
275                     // nop
276                 }
277             }
278         }
279 
280         for ( String entity : entities )
281         {
282             int codePoint = Integer.parseInt( entity, 16 );
283             unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
284         }
285 
286         return unescaped;
287     }
288 
289     /**
290      * Encode an url
291      *
292      * @param url the String to encode, may be null
293      * @return the text encoded, null if null String input
294      */
295     public static String encodeURL( String url )
296     {
297         if ( url == null )
298         {
299             return null;
300         }
301 
302         StringBuilder encoded = new StringBuilder();
303         int length = url.length();
304 
305         char[] unicode = new char[1];
306 
307         for ( int i = 0; i < length; ++i )
308         {
309             char c = url.charAt( i );
310 
311             switch ( c )
312             {
313                 case ';':
314                 case '/':
315                 case '?':
316                 case ':':
317                 case '@':
318                 case '&':
319                 case '=':
320                 case '+':
321                 case '$':
322                 case ',':
323                 case '[':
324                 case ']': // RFC 2732 (IPV6)
325                 case '-':
326                 case '_':
327                 case '.':
328                 case '!':
329                 case '~':
330                 case '*':
331                 case '\'':
332                 case '(':
333                 case ')':
334                 case '#': // XLink mark
335                     encoded.append( c );
336                     break;
337                 default:
338                     if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
339                     {
340                         encoded.append( c );
341                     }
342                     else
343                     {
344                         byte[] bytes;
345 
346                         if ( isHighSurrogate( c ) )
347                         {
348                             int codePoint = toCodePoint( c, url.charAt( ++i ) );
349                             unicode = toChars( codePoint );
350                             bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( StandardCharsets.UTF_8 );
351                         }
352                         else
353                         {
354                             unicode[0] = c;
355                             bytes = ( new String( unicode, 0, 1 ) ).getBytes( StandardCharsets.UTF_8 );
356                         }
357 
358                         for ( byte aByte : bytes )
359                         {
360                             encoded.append( '%' );
361                             encoded.append( String.format( "%02X", aByte ) );
362                         }
363                     }
364             }
365         }
366 
367         return encoded.toString();
368     }
369 
370     /**
371      * Construct a valid id.
372      *
373      * <p>
374      *   <b>Note</b>: this method is identical to
375      *   {@link DoxiaUtils#encodeId(String,boolean) DoxiaUtils.encodeId( id, false )},
376      *   the rules to encode an id are laid out there.
377      * </p>
378      *
379      * @param id The id to be encoded.
380      * @return The trimmed and encoded id, or null if id is null.
381      * @see DoxiaUtils#encodeId(java.lang.String,boolean)
382      */
383     public static String encodeId( String id )
384     {
385         return DoxiaUtils.encodeId( id, false );
386     }
387 
388     /**
389      * Determines if the specified text is a valid id according to the rules
390      * laid out in {@link #encodeId(String)}.
391      *
392      * @param text The text to be tested.
393      * @return <code>true</code> if the text is a valid id, otherwise <code>false</code>.
394      * @see DoxiaUtils#isValidId(String)
395      */
396     public static boolean isId( String text )
397     {
398         return DoxiaUtils.isValidId( text );
399     }
400 
401     private HtmlTools()
402     {
403         // utility class
404     }
405 
406 //
407 // Imported code from ASF Harmony project rev 770909
408 // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
409 //
410 
411     private static final char LUNATE_SIGMA = 0x3FF;
412     private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
413     private static final char LOW_SURROGATE = 0xDC00;
414 
415     private static int toCodePoint( char high, char low )
416     {
417         // See RFC 2781, Section 2.2
418         // http://www.faqs.org/rfcs/rfc2781.html
419         int h = ( high & LUNATE_SIGMA ) << 10;
420         int l = low & LUNATE_SIGMA;
421         return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
422     }
423 
424     private static final char MIN_HIGH_SURROGATE = '\uD800';
425     private static final char MAX_HIGH_SURROGATE = '\uDBFF';
426 
427     private static boolean isHighSurrogate( char ch )
428     {
429         return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
430     }
431 
432     private static final int MIN_CODE_POINT = 0x000000;
433     private static final int MAX_CODE_POINT = 0x10FFFF;
434     private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
435 
436     private static boolean isValidCodePoint( int codePoint )
437     {
438         return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
439     }
440 
441     private static boolean isSupplementaryCodePoint( int codePoint )
442     {
443         return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
444     }
445 
446     /**
447      * Converts the given code point to an equivalent character array.
448      *
449      * @param codePoint the code point to convert.
450      * @return If codePoint is a supplementary code point, returns a character array of length 2,
451      * otherwise a character array of length 1 containing only the original int as a char.
452      */
453     public static char[] toChars( int codePoint )
454     {
455         if ( !isValidCodePoint( codePoint ) )
456         {
457             throw new IllegalArgumentException();
458         }
459 
460         if ( isSupplementaryCodePoint( codePoint ) )
461         {
462             int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
463             int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
464             int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
465             return new char[] { (char) high, (char) low };
466         }
467         return new char[] { (char) codePoint };
468     }
469 }