001package org.apache.maven.doxia.util;
002
003/*
004 * Licensed to the Apache Software Foundation (ASF) under one
005 * or more contributor license agreements.  See the NOTICE file
006 * distributed with this work for additional information
007 * regarding copyright ownership.  The ASF licenses this file
008 * to you under the Apache License, Version 2.0 (the
009 * "License"); you may not use this file except in compliance
010 * with the License.  You may obtain a copy of the License at
011 *
012 *   http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing,
015 * software distributed under the License is distributed on an
016 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
017 * KIND, either express or implied.  See the License for the
018 * specific language governing permissions and limitations
019 * under the License.
020 */
021
022import java.io.UnsupportedEncodingException;
023import java.util.ArrayList;
024import java.util.HashMap;
025import java.util.List;
026import java.util.Map;
027
028import javax.swing.text.html.HTML.Tag;
029
030import org.apache.commons.lang3.StringEscapeUtils;
031import org.apache.maven.doxia.markup.HtmlMarkup;
032import org.codehaus.plexus.util.StringUtils;
033
034/**
 * The <code>HtmlTools</code> class provides static helper methods for HTML handling.
036 *
037 * @author <a href="mailto:vincent.siveton@gmail.com">Vincent Siveton</a>
038 * @version $Id$
039 * @since 1.0
040 */
041public class HtmlTools
042{
/**
 * The {@link HtmlMarkup} tag constants that can be looked up by name.
 * They are registered in {@link #TAG_MAP} in the order listed here.
 */
private static final Tag[] ALL_TAGS =
{
    // A
    HtmlMarkup.A, HtmlMarkup.ABBR, HtmlMarkup.ACRONYM, HtmlMarkup.ADDRESS, HtmlMarkup.APPLET, HtmlMarkup.AREA,
    // B
    HtmlMarkup.B, HtmlMarkup.BASE, HtmlMarkup.BASEFONT, HtmlMarkup.BDO, HtmlMarkup.BIG, HtmlMarkup.BLOCKQUOTE,
    HtmlMarkup.BODY, HtmlMarkup.BR, HtmlMarkup.BUTTON,
    // C
    HtmlMarkup.CAPTION, HtmlMarkup.CENTER, HtmlMarkup.CITE, HtmlMarkup.CODE, HtmlMarkup.COL, HtmlMarkup.COLGROUP,
    // D
    HtmlMarkup.DD, HtmlMarkup.DEL, HtmlMarkup.DFN, HtmlMarkup.DIR, HtmlMarkup.DIV, HtmlMarkup.DL, HtmlMarkup.DT,
    // E - F
    HtmlMarkup.EM, HtmlMarkup.FIELDSET, HtmlMarkup.FONT, HtmlMarkup.FORM, HtmlMarkup.FRAME, HtmlMarkup.FRAMESET,
    // H
    HtmlMarkup.H1, HtmlMarkup.H2, HtmlMarkup.H3, HtmlMarkup.H4, HtmlMarkup.H5, HtmlMarkup.H6,
    HtmlMarkup.HEAD, HtmlMarkup.HR, HtmlMarkup.HTML,
    // I
    HtmlMarkup.I, HtmlMarkup.IFRAME, HtmlMarkup.IMG, HtmlMarkup.INPUT, HtmlMarkup.INS, HtmlMarkup.ISINDEX,
    // K - L
    HtmlMarkup.KBD, HtmlMarkup.LABEL, HtmlMarkup.LEGEND, HtmlMarkup.LI, HtmlMarkup.LINK,
    // M
    HtmlMarkup.MAP, HtmlMarkup.MENU, HtmlMarkup.META,
    // N - O
    HtmlMarkup.NOFRAMES, HtmlMarkup.NOSCRIPT, HtmlMarkup.OBJECT, HtmlMarkup.OL, HtmlMarkup.OPTGROUP,
    HtmlMarkup.OPTION,
    // P - Q
    HtmlMarkup.P, HtmlMarkup.PARAM, HtmlMarkup.PRE, HtmlMarkup.Q,
    // S
    HtmlMarkup.S, HtmlMarkup.SAMP, HtmlMarkup.SCRIPT, HtmlMarkup.SELECT, HtmlMarkup.SMALL, HtmlMarkup.SPAN,
    HtmlMarkup.STRIKE, HtmlMarkup.STRONG, HtmlMarkup.STYLE, HtmlMarkup.SUB, HtmlMarkup.SUP,
    // T
    HtmlMarkup.TABLE, HtmlMarkup.TBODY, HtmlMarkup.TD, HtmlMarkup.TEXTAREA, HtmlMarkup.TFOOT, HtmlMarkup.TH,
    HtmlMarkup.THEAD, HtmlMarkup.TITLE, HtmlMarkup.TR, HtmlMarkup.TT,
    // U - V
    HtmlMarkup.U, HtmlMarkup.UL, HtmlMarkup.VAR
};

/** 0x7E is the code of '~', the last printable character of the 7-bit ASCII range. */
private static final int ASCII = 0x7E;

/** Lookup table from the string form of each entry of {@link #ALL_TAGS} to the entry itself. */
private static final Map<String, Tag> TAG_MAP = new HashMap<String, Tag>( ALL_TAGS.length );

static
{
    // Register every known tag under the name reported by its toString().
    for ( int i = 0; i < ALL_TAGS.length; i++ )
    {
        final Tag known = ALL_TAGS[i];
        TAG_MAP.put( known.toString(), known );
    }
}
076
077    /**
078     * Returns a tag for a defined HTML tag name. This is one of
079     * the tags defined in {@link org.apache.maven.doxia.markup.HtmlMarkup}.
080     * If the given name does not represent one of the defined tags, then
081     * <code>null</code> will be returned.
082     *
083     * @param tagName the <code>String</code> name requested.
084     * @return a tag constant corresponding to the <code>tagName</code>,
085     *    or <code>null</code> if not found.
086     * @see <a href="http://www.w3.org/TR/html401/index/elements.html">http://www.w3.org/TR/html401/index/elements.html</a>
087     * @since 1.1
088     */
089    public static Tag getHtmlTag( String tagName )
090    {
091        Object t =  TAG_MAP.get( tagName );
092
093        return (Tag) t;
094    }
095
096    /**
097     * Escape special HTML characters in a String in <code>xml</code> mode.
098     *
099     * <b>Note</b>: this method doesn't escape non-ascii characters by numeric characters references.
100     *
101     * @param text the String to escape, may be null.
102     * @return The escaped text or the empty string if text == null.
103     * @see #escapeHTML(String,boolean)
104     */
105    public static String escapeHTML( String text )
106    {
107        return escapeHTML( text, true );
108    }
109
110    /**
111     * Escape special HTML characters in a String.
112     *
113     * <pre>
114     * < becomes <code>&#38;lt;</code>
115     * > becomes <code>&#38;gt;</code>
116     * & becomes <code>&#38;amp;</code>
117     * " becomes <code>&#38;quot;</code>
118     * ' becomes <code>&#38;apos;</code> if xmlMode = true
119     * </pre>
120     *
121     * If <code>xmlMode</code> is true, every other character than the above remains unchanged,
122     * if <code>xmlMode</code> is false, non-ascii characters get replaced by their hex code.
123     *
124     * <b>Note</b>: all characters are encoded, i.e.:
125     * <pre>
126     * \u0159       = &#38;#x159;
127     * \uD835\uDFED = &#38;#x1d7ed;
128     * </pre>
129     *
130     * @param text The String to escape, may be null.
131     * @param xmlMode <code>true</code> to replace also ' to &#38;apos, <code>false</code> to replace non-ascii
132     * characters by numeric characters references.
133     * @return The escaped text or the empty string if text == null.
134     * @since 1.1
135     * @see <a href="http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent">http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</a>
136     * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">http://www.w3.org/TR/html401/charset.html#h-5.3</a>
137     */
138    public static String escapeHTML( final String text, final boolean xmlMode )
139    {
140        if ( text == null )
141        {
142            return "";
143        }
144
145        int length = text.length();
146        StringBuilder buffer = new StringBuilder( length );
147
148        for ( int i = 0; i < length; ++i )
149        {
150            char c = text.charAt( i );
151            switch ( c )
152            {
153                case '<':
154                    buffer.append( "&lt;" );
155                    break;
156                case '>':
157                    buffer.append( "&gt;" );
158                    break;
159                case '&':
160                    buffer.append( "&amp;" );
161                    break;
162                case '\"':
163                    buffer.append( "&quot;" );
164                    break;
165                default:
166                    if ( xmlMode )
167                    {
168                        if ( c == '\'' )
169                        {
170                            buffer.append( "&apos;" );
171                        }
172                        else
173                        {
174                            buffer.append( c );
175                        }
176                    }
177                    else
178                    {
179                        if ( c <= ASCII )
180                        {
181                            // ASCII.
182                            buffer.append( c );
183                        }
184                        else
185                        {
186                            buffer.append( "&#x" );
187                            if ( isHighSurrogate( c ) )
188                            {
189                                buffer.append( Integer.toHexString( toCodePoint( c, text.charAt( ++i ) ) ) );
190                            }
191                            else
192                            {
193                                buffer.append( Integer.toHexString( c ) );
194                            }
195                            buffer.append( ';' );
196                        }
197                    }
198            }
199        }
200
201        return buffer.toString();
202    }
203
204    /**
205     * Unescapes HTML entities in a string in non xml mode.
206     *
207     * @param text the <code>String</code> to unescape, may be null.
208     * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
209     * @since 1.1.1.
210     * @see #unescapeHTML(String, boolean)
211     */
212    public static String unescapeHTML( String text )
213    {
214        return unescapeHTML( text, false );
215    }
216
217    /**
218     * Unescapes HTML entities in a string.
219     *
220     * <p> Unescapes a string containing entity escapes to a string
221     * containing the actual Unicode characters corresponding to the
222     * escapes. Supports HTML 4.0 entities.</p>
223     *
224     * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
225     * will become "&lt;Fran&ccedil;ais&gt;".</p>
226     *
227     * <b>Note</b>: all unicode entities are decoded, i.e.:
228     * <pre>
229     * &#38;#x159;   = \u0159
230     * &#38;#x1d7ed; = \uD835\uDFED
231     * </pre>
232     *
233     * @param text the <code>String</code> to unescape, may be null.
234     * @param xmlMode set to <code>true</code> to replace &#38;apos by '.
235     * @return a new unescaped <code>String</code>, <code>null</code> if null string input.
236     * @since 1.1.1.
237     */
238    public static String unescapeHTML( String text, boolean xmlMode )
239    {
240        if ( text == null )
241        {
242            return null;
243        }
244
245        String unescaped;
246        if ( xmlMode )
247        {
248            unescaped = StringEscapeUtils.unescapeXml( text );
249        }
250        else
251        {
252            // StringEscapeUtils.unescapeHtml4 returns entities it doesn't recognize unchanged
253            unescaped = StringEscapeUtils.unescapeHtml4( text );
254        }
255
256        String tmp = unescaped;
257        List<String> entities = new ArrayList<String>();
258        while ( true )
259        {
260            int i = tmp.indexOf( "&#x" );
261            if ( i == -1 )
262            {
263                break;
264            }
265
266            tmp = tmp.substring( i + 3 );
267            if ( tmp.indexOf( ';' ) != -1 )
268            {
269                String entity = tmp.substring( 0, tmp.indexOf( ';' ) );
270                try
271                {
272                    Integer.parseInt( entity, 16 );
273                    entities.add( entity );
274                }
275                catch ( NumberFormatException e )
276                {
277                    // nop
278                }
279            }
280        }
281
282        for ( String entity : entities )
283        {
284            int codePoint = Integer.parseInt( entity, 16 );
285            unescaped = StringUtils.replace( unescaped, "&#x" + entity + ";", new String( toChars( codePoint ) ) );
286        }
287
288        return unescaped;
289    }
290
291    /**
292     * Encode an url
293     *
294     * @param url the String to encode, may be null
295     * @return the text encoded, null if null String input
296     */
297    public static String encodeURL( String url )
298    {
299        if ( url == null )
300        {
301            return null;
302        }
303
304        StringBuilder encoded = new StringBuilder();
305        int length = url.length();
306
307        char[] unicode = new char[1];
308
309        for ( int i = 0; i < length; ++i )
310        {
311            char c = url.charAt( i );
312
313            switch ( c )
314            {
315                case ';':
316                case '/':
317                case '?':
318                case ':':
319                case '@':
320                case '&':
321                case '=':
322                case '+':
323                case '$':
324                case ',':
325                case '[':
326                case ']': // RFC 2732 (IPV6)
327                case '-':
328                case '_':
329                case '.':
330                case '!':
331                case '~':
332                case '*':
333                case '\'':
334                case '(':
335                case ')':
336                case '#': // XLink mark
337                    encoded.append( c );
338                    break;
339                default:
340                    if ( ( c >= 'a' && c <= 'z' ) || ( c >= 'A' && c <= 'Z' ) || ( c >= '0' && c <= '9' ) )
341                    {
342                        encoded.append( c );
343                    }
344                    else
345                    {
346                        byte[] bytes;
347
348                        try
349                        {
350                            if ( isHighSurrogate( c ) )
351                            {
352                                int codePoint = toCodePoint( c, url.charAt( ++i ) );
353                                unicode = toChars( codePoint );
354                                bytes = ( new String( unicode, 0, unicode.length ) ).getBytes( "UTF8" );
355                            }
356                            else
357                            {
358                                unicode[0] = c;
359                                bytes = ( new String( unicode, 0, 1 ) ).getBytes( "UTF8" );
360                            }
361                        }
362                        catch ( UnsupportedEncodingException cannotHappen )
363                        {
364                            bytes = new byte[0];
365                        }
366
367                        for ( int j = 0; j < bytes.length; ++j )
368                        {
369                            encoded.append( '%' );
370                            encoded.append( String.format( "%02X", bytes[j] ) );
371                        }
372                    }
373            }
374        }
375
376        return encoded.toString();
377    }
378
379    /**
380     * Construct a valid id.
381     *
382     * <p>
383     *   <b>Note</b>: this method is identical to
384     *   {@link DoxiaUtils#encodeId(String,boolean) DoxiaUtils.encodeId( id, false )},
385     *   the rules to encode an id are laid out there.
386     * </p>
387     *
388     * @param id The id to be encoded.
389     * @return The trimmed and encoded id, or null if id is null.
390     * @see DoxiaUtils#encodeId(java.lang.String,boolean)
391     */
392    public static String encodeId( String id )
393    {
394        return DoxiaUtils.encodeId( id, false );
395    }
396
397    /**
398     * Determines if the specified text is a valid id according to the rules
399     * laid out in {@link #encodeId(String)}.
400     *
401     * @param text The text to be tested.
402     * @return <code>true</code> if the text is a valid id, otherwise <code>false</code>.
403     * @see #encodeId(String).
404     */
405    public static boolean isId( String text )
406    {
407        return DoxiaUtils.isValidId( text );
408    }
409
/**
 * Hidden constructor: the class only offers static members.
 */
private HtmlTools()
{
    // static helper holder, nothing to initialise
}
414
415//
416// Imported code from ASF Harmony project rev 770909
417// http://svn.apache.org/repos/asf/harmony/enhanced/classlib/trunk/modules/luni/src/main/java/java/lang/Character.java
418//
419
420    private static final char LUNATE_SIGMA = 0x3FF;
421    private static final char NON_PRIVATE_USE_HIGH_SURROGATE = 0xD800;
422    private static final char LOW_SURROGATE = 0xDC00;
423
424    private static int toCodePoint( char high, char low )
425    {
426        // See RFC 2781, Section 2.2
427        // http://www.faqs.org/rfcs/rfc2781.html
428        int h = ( high & LUNATE_SIGMA ) << 10;
429        int l = low & LUNATE_SIGMA;
430        return ( h | l ) + MIN_SUPPLEMENTARY_CODE_POINT;
431    }
432
433    private static final char MIN_HIGH_SURROGATE = '\uD800';
434    private static final char MAX_HIGH_SURROGATE = '\uDBFF';
435
436    private static boolean isHighSurrogate( char ch )
437    {
438        return ( MIN_HIGH_SURROGATE <= ch && MAX_HIGH_SURROGATE >= ch );
439    }
440
441    private static final int MIN_CODE_POINT = 0x000000;
442    private static final int MAX_CODE_POINT = 0x10FFFF;
443    private static final int MIN_SUPPLEMENTARY_CODE_POINT = 0x10000;
444
445    private static boolean isValidCodePoint( int codePoint )
446    {
447        return ( MIN_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
448    }
449
450    private static boolean isSupplementaryCodePoint( int codePoint )
451    {
452        return ( MIN_SUPPLEMENTARY_CODE_POINT <= codePoint && MAX_CODE_POINT >= codePoint );
453    }
454
455    /**
456     * Converts the given code point to an equivalent character array.
457     *
458     * @param codePoint the code point to convert.
459     * @return If codePoint is a supplementary code point, returns a character array of length 2,
460     * otherwise a character array of length 1 containing only the original int as a char.
461     */
462    public static char[] toChars( int codePoint )
463    {
464        if ( !isValidCodePoint( codePoint ) )
465        {
466            throw new IllegalArgumentException();
467        }
468
469        if ( isSupplementaryCodePoint( codePoint ) )
470        {
471            int cpPrime = codePoint - MIN_SUPPLEMENTARY_CODE_POINT;
472            int high = NON_PRIVATE_USE_HIGH_SURROGATE | ( ( cpPrime >> 10 ) & LUNATE_SIGMA );
473            int low = LOW_SURROGATE | ( cpPrime & LUNATE_SIGMA );
474            return new char[] { (char) high, (char) low };
475        }
476        return new char[] { (char) codePoint };
477    }
478}