View Javadoc
1   package org.apache.maven.doxia.util;
2   
3   /*
4    * Licensed to the Apache Software Foundation (ASF) under one
5    * or more contributor license agreements.  See the NOTICE file
6    * distributed with this work for additional information
7    * regarding copyright ownership.  The ASF licenses this file
8    * to you under the Apache License, Version 2.0 (the
9    * "License"); you may not use this file except in compliance
10   * with the License.  You may obtain a copy of the License at
11   *
12   *   http://www.apache.org/licenses/LICENSE-2.0
13   *
14   * Unless required by applicable law or agreed to in writing,
15   * software distributed under the License is distributed on an
16   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
17   * KIND, either express or implied.  See the License for the
18   * specific language governing permissions and limitations
19   * under the License.
20   */
21  
22  import java.nio.charset.StandardCharsets;
23  import java.util.ArrayList;
24  import java.util.HashMap;
25  import java.util.List;
26  import java.util.Map;
27  
28  import javax.swing.text.html.HTML.Tag;
29  
30  import org.apache.commons.text.StringEscapeUtils;
31  import org.apache.maven.doxia.markup.HtmlMarkup;
32  import org.codehaus.plexus.util.StringUtils;
33  
34  /**
35   * The <code>HtmlTools</code> class defines methods to HTML handling.
36   *
37   * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
38   * @since 1.0
39   */
40  public class HtmlTools
41  {
42      private static final Tag[] ALL_TAGS  =
43      {
44          HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET,
45          HtmlMarkup.AREA, HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO, HtmlMarkup.BIG,
46          HtmlMarkup.BLOCKQUOTE, HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON, HtmlMarkup.CAPTION,
47          HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL, HtmlMarkup.COLGROUP,
48          HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR, HtmlMarkup.DIV, HtmlMarkup.DL,
49          HtmlMarkup.DT, HtmlMarkup.EM, HtmlMarkup.FIELDSET, HtmlMarkup.FONT, HtmlMarkup.FORM,
50          HtmlMarkup.FRAME, HtmlMarkup.FRAMESET, HtmlMarkup.H1, HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4,
51          HtmlMarkup.H5, HtmlMarkup.H6, HtmlMarkup.HEAD, HtmlMarkup.HR, HtmlMarkup.HTML, HtmlMarkup.I,
52          HtmlMarkup.IFRAME, HtmlMarkup.IMG, HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX,
53          HtmlMarkup.KBD, HtmlMarkup.KEYGEN, HtmlMarkup.LABEL, HtmlMarkup.LEGEND, HtmlMarkup.LI,
54          HtmlMarkup.LINK, HtmlMarkup.MAP, HtmlMarkup.MENU, HtmlMarkup.META, HtmlMarkup.NOFRAMES,
55          HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL, HtmlMarkup.OPTGROUP, HtmlMarkup.OPTION,
56          HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE, HtmlMarkup.Q, HtmlMarkup.S, HtmlMarkup.SAMP,
57          HtmlMarkup.SCRIPT, HtmlMarkup.SELECT, HtmlMarkup.SMALL, HtmlMarkup.SPAN, HtmlMarkup.STRIKE,
58          HtmlMarkup.STRONG, HtmlMarkup.STYLE, HtmlMarkup.SUB, HtmlMarkup.SUP, HtmlMarkup.TABLE,
59          HtmlMarkup.TBODY, HtmlMarkup.TD, HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH,
60          HtmlMarkup.THEAD, HtmlMarkup.TITLE, HtmlMarkup.TR, HtmlMarkup.TT, HtmlMarkup.U, HtmlMarkup.UL,
61          HtmlMarkup.VAR
62      };
63  
64      private static final Map<String, Tag> TAG_MAP = new HashMap<>( ALL_TAGS.length );
65  
66      private static final int ASCII = 0x7E;
67  
68      static
69      {
70          for ( Tag tag : ALL_TAGS )
71          {
72              TAG_MAP.put( tag.toString(), tag );
73          }
74      }
75  
76      /**
77       * Returns a tag for a defined HTML tag name. This is one of
78       * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
79       * If the given name does not represent one of the defined tags, then
80       * <code>null</code> will be returned.
81       *
82       * @param tagName the <code>String</code> name requested.
83       * @return a tag constant corresponding to the <code>tagName</code>,
84       *    or <code>null</code> if not found.
85       * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
86       * @since 1.1
87       */
88      public static Tag getHtmlTag( String tagName )
89      {
90          return TAG_MAP.get( tagName );
91      }
92  
93      /**
94       * Escape special HTML characters in a String in <code>xml</code> mode.
95       *
96       * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
97       *
98       * @param text the String to escape, may be null.
99       * @return The escaped text or the empty string if text == null.
100      * @see #escapeHTML(String,boolean)
101      */
102     public static String escapeHTML( String text )
103     {
104         return escapeHTML( text, true );
105     }
106 
107     /**
108      * Escape special HTML characters in a String.
109      *
110      * <pre>
111      * &lt; becomes <code>&#38;lt;</code>
112      * &gt; becomes <code>&#38;gt;</code>
113      * &amp; becomes <code>&#38;amp;</code>
114      * " becomes <code>&#38;quot;</code>
115      * ' becomes <code>&#38;apos;</code> if xmlMode = true
116      * </pre>
117      *
118      * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
119      * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
120      *
121      * <b>Note</b>: all characters are encoded, i.e.:
122      * <pre>
123      * \u0159       = &#38;#x159;
124      * \uD835\uDFED = &#38;#x1d7ed;
125      * </pre>
126      *
127      * @param text The String to escape, may be null.
128      * @param xmlMode <code>true</code> to replace also ' to &#38;apos, <code>false</code> to replace non-ascii
129      * characters by numeric characters references.
130      * @return The escaped text or the empty string if text == null.
131      * @since 1.1
132      * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
133      * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
134      */
135     public static String escapeHTML( final String text, final boolean xmlMode )
136     {
137         if ( text == null )
138         {
139             return "";
140         }
141 
142         int length = text.length();
143         StringBuilder buffer = new StringBuilder( length );
144 
145         for ( int i = 0; i < length; ++i )
146         {
147             char c = text.charAt( i );
148             switch ( c )
149             {
150                 case '<':
151                     buffer.append( "&lt;" );
152                     break;
153                 case '>':
154                     buffer.append( "&gt;" );
155                     break;
156                 case '&':
157                     buffer.append( "&amp;" );
158                     break;
159                 case '\"':
160                     buffer.append( "&quot;" );
161                     break;
162                 default:
163                     if ( xmlMode )
164                     {
165                         if ( c == '\'' )
166                         {
167                             buffer.append( "&apos;" );
168                         }
169                         else
170                         {
171                             buffer.append( c );
172                         }
173                     }
174                     else
175                     {
176                         if ( c <= ASCII )
177                         {
178                             // ASCII.
179                             buffer.append( c );
180                         }
181                         else
182                         {
183                             buffer.append( "&#x" );
184                             if ( isHighSurrogate( c ) )
185                             {
186                                 buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
187                             }
188                             else
189                             {
190                                 buffer.append( Integer.toHexString( c ) );
191                             }
192                             buffer.append( ';' );
193                         }
194                     }
195             }
196         }
197 
198         return buffer.toString();
199     }
200 
201     /**
202      * Unescapes HTML entities in a string in non xml mode.
203      *
204      * @param text the <code>String</code> to unescape, may be null.
205      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
206      * @since 1.1.1.
207      * @see #unescapeHTML(String, boolean)
208      */
209     public static String unescapeHTML( String text )
210     {
211         return unescapeHTML( text, false );
212     }
213 
214     /**
215      * Unescapes HTML entities in a string.
216      *
217      * <p> Unescapes a string containing entity escapes to a string
218      * containing the actual Unicode characters corresponding to the
219      * escapes. Supports HTML 4.0 entities.</p>
220      *
221      * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
222      * will become "&lt;Fran&ccedil;ais&gt;".</p>
223      *
224      * <b>Note</b>: all unicode entities are decoded, i.e.:
225      * <pre>
226      * &#38;#x159;   = \u0159
227      * &#38;#x1d7ed; = \uD835\uDFED
228      * </pre>
229      *
230      * @param text the <code>String</code> to unescape, may be null.
231      * @param xmlMode set to <code>true</code> to replace &#38;apos by '.
232      * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
233      * @since 1.1.1.
234      */
235     public static String unescapeHTML( String text, boolean xmlMode )
236     {
237         if ( text == null )
238         {
239             return null;
240         }
241 
242         String unescaped;
243         if ( xmlMode )
244         {
245             unescaped = StringEscapeUtils.unescapeXml( text );
246         }
247         else
248         {
249             // StringEscapeUtils.unescapeHtml4 returns entities it doesn't recognize unchanged
250             unescaped = StringEscapeUtils.unescapeHtml4( text );
251         }
252 
253         String tmp = unescaped;
254         List<String> entities = new ArrayList<>();
255         while ( true )
256         {
257             int i = tmp.indexOf( "&#x" );
258             if ( i == -1 )
259             {
260                 break;
261             }
262 
263             tmp = tmp.substring( i + 3 );
264             if ( tmp.indexOf( ';' ) != -1 )
265             {
266                 String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
267                 try
268                 {
269                     Integer.parseInt( entity, 16 );
270                     entities.add( entity );
271                 }
272                 catch ( NumberFormatException e )
273                 {
274                     // nop
275                 }
276             }
277         }
278 
279         for ( String entity : entities )
280         {
281             int codePoint = Integer.parseInt( entity, 16 );
282             unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
283         }
284 
285         return unescaped;
286     }
287 
288     /**
289      * Encode an url
290      *
291      * @param url the String to encode, may be null
292      * @return the text encoded, null if null String input
293      */
294     public static String encodeURL( String url )
295     {
296         if ( url == null )
297         {
298             return null;
299         }
300 
301         StringBuilder encoded = new StringBuilder();
302         int length = url.length();
303 
304         char[] unicode = new char[1];
305 
306         for ( int i = 0; i < length; ++i )
307         {
308             char c = url.charAt( i );
309 
310             switch ( c )
311             {
312                 case ';':
313                 case '/':
314                 case '?':
315                 case ':':
316                 case '@':
317                 case '&':
318                 case '=':
319                 case '+':
320                 case '$':
321                 case ',':
322                 case '[':
323                 case ']': // RFC 2732 (IPV6)
324                 case '-':
325                 case '_':
326                 case '.':
327                 case '!':
328                 case '~':
329                 case '*':
330                 case '\'':
331                 case '(':
332                 case ')':
333                 case '#': // XLink mark
334                     encoded.append( c );
335                     break;
336                 default:
337                     if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
338                     {
339                         encoded.append( c );
340                     }
341                     else
342                     {
343                         byte[] bytes;
344 
345                         if ( isHighSurrogate( c ) )
346                         {
347                             int codePoint = toCodePoint( c, url.charAt( ++i ) );
348                             unicode = toChars( codePoint );
349                             bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( StandardCharsets.UTF_8 );
350                         }
351                         else
352                         {
353                             unicode[0] = c;
354                             bytes = ( new String( unicode, 0, 1 ) ).getBytes( StandardCharsets.UTF_8 );
355                         }
356 
357                         for ( byte aByte : bytes )
358                         {
359                             encoded.append( '%' );
360                             encoded.append( String.format( "%02X", aByte ) );
361                         }
362                     }
363             }
364         }
365 
366         return encoded.toString();
367     }
368 
369     /**
370      * Construct a valid id.
371      *
372      * <p>
373      *   <b>Note</b>: this method is identical to
374      *   {@link DoxiaUtils#encodeId(String,boolean) DoxiaUtils.encodeId( id, false )},
375      *   the rules to encode an id are laid out there.
376      * </p>
377      *
378      * @param id The id to be encoded.
379      * @return The trimmed and encoded id, or null if id is null.
380      * @see DoxiaUtils#encodeId(java.lang.String,boolean)
381      */
382     public static String encodeId( String id )
383     {
384         return DoxiaUtils.encodeId( id, false );
385     }
386 
387     /**
388      * Determines if the specified text is a valid id according to the rules
389      * laid out in {@link #encodeId(String)}.
390      *
391      * @param text The text to be tested.
392      * @return <code>true</code> if the text is a valid id, otherwise <code>false</code>.
393      * @see DoxiaUtils#isValidId(String)
394      */
395     public static boolean isId( String text )
396     {
397         return DoxiaUtils.isValidId( text );
398     }
399 
400     private HtmlTools()
401     {
402         // utility class
403     }
404 
405 //
406 // Imported code from ASF Harmony project rev 770909
407 // http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
408 //
409 
410     private static final char LUNATE_SIGMA = 0x3FF;
411     private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
412     private static final char LOW_SURROGATE = 0xDC00;
413 
414     private static int toCodePoint( char high, char low )
415     {
416         // See RFC 2781, Section 2.2
417         // http://www.faqs.org/rfcs/rfc2781.html
418         int h = ( high & LUNATE_SIGMA ) << 10;
419         int l = low & LUNATE_SIGMA;
420         return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
421     }
422 
423     private static final char MIN_HIGH_SURROGATE = '\uD800';
424     private static final char MAX_HIGH_SURROGATE = '\uDBFF';
425 
426     private static boolean isHighSurrogate( char ch )
427     {
428         return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
429     }
430 
431     private static final int MIN_CODE_POINT = 0x000000;
432     private static final int MAX_CODE_POINT = 0x10FFFF;
433     private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
434 
435     private static boolean isValidCodePoint( int codePoint )
436     {
437         return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
438     }
439 
440     private static boolean isSupplementaryCodePoint( int codePoint )
441     {
442         return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
443     }
444 
445     /**
446      * Converts the given code point to an equivalent character array.
447      *
448      * @param codePoint the code point to convert.
449      * @return If codePoint is a supplementary code point, returns a character array of length 2,
450      * otherwise a character array of length 1 containing only the original int as a char.
451      */
452     public static char[] toChars( int codePoint )
453     {
454         if ( !isValidCodePoint( codePoint ) )
455         {
456             throw new IllegalArgumentException();
457         }
458 
459         if ( isSupplementaryCodePoint( codePoint ) )
460         {
461             int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
462             int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
463             int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
464             return new char[] { (char) high, (char) low };
465         }
466         return new char[] { (char) codePoint };
467     }
468 }