Coverage Report

Coverage Report - org.apache.myfaces.shared_impl.renderkit.html.util.HTMLEncoder

Classes in this File

Line Coverage

Branch Coverage

Complexity

HTMLEncoder

0/175

0/228

8.333

 /*
  *  Licensed to the Apache Software Foundation (ASF) under one
  *  or more contributor license agreements.  See the NOTICE file
  *  distributed with this work for additional information
  *  regarding copyright ownership.  The ASF licenses this file
  *  to you under the Apache License, Version 2.0 (the
  *  "License"); you may not use this file except in compliance
  *  with the License.  You may obtain a copy of the License at
  * 
  *  http://www.apache.org/licenses/LICENSE-2.0
  * 
  *  Unless required by applicable law or agreed to in writing,
  *  software distributed under the License is distributed on an
  *  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  *  KIND, either express or implied.  See the License for the
  *  specific language governing permissions and limitations
  *  under the License.
  */
 package org.apache.myfaces.shared_impl.renderkit.html.util;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
 import java.io.Writer;
 
 /**
  * Converts Strings so that they can be used within HTML-Code.
  */
 public abstract class HTMLEncoder
 {
     /**
      * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
      */
     public static String encode (String string)
     {
         return encode(string, false, true);
     }
 
     /**
      * Variant of {@link #encode} where encodeNbsp is true.
      */
     public static String encode (String string, boolean encodeNewline)
     {
         return encode(string, encodeNewline, true);
     }
 
     /**
      * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
      */
     public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
     {
         return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
     }
 
     /**
      * Encodes the given string, so that it can be used within a html page.
      * @param string the string to convert
      * @param encodeNewline if true newline characters are converted to &lt;br&gt;'s
      * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &amp;nbsp;'s
      * @param encodeNonLatin if true encode non-latin characters as numeric character references
      */
     public static String encode (String string,
                                  boolean encodeNewline,
                                  boolean encodeSubsequentBlanksToNbsp,
                                  boolean encodeNonLatin)
     {
         if (string == null)
         {
             return "";
         }
 
         StringBuilder sb = null;    //create later on demand
         String app;
         char c;
         for (int i = 0; i < string.length (); ++i)
         {
             app = null;
             c = string.charAt(i);
             
             // All characters before letters
             if ((int)c < 0x41)
             {
                 switch (c)
                 {
                     case '"': app = "&quot;"; break;    //"
                     case '&': app = "&amp;"; break;     //&
                     case '<': app = "&lt;"; break;      //<
                     case '>': app = "&gt;"; break;      //>
                     case ' ':
                         if (encodeSubsequentBlanksToNbsp &&
                                 (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' ')))
                         {
                             //Space at beginning or after another space
                             app = "&#160;";
                         }
                         break;
                     case '\n':
                         if (encodeNewline)
                         {
                             app = "<br/>";
                         }
                         break;
                 }
             } else if (encodeNonLatin && (int)c > 0x80) {
                  switch(c) {
                     //german umlauts
                     case '\u00E4' : app = "&auml;";  break;
                     case '\u00C4' : app = "&Auml;";  break;
                     case '\u00F6' : app = "&ouml;";  break;
                     case '\u00D6' : app = "&Ouml;";  break;
                     case '\u00FC' : app = "&uuml;";  break;
                     case '\u00DC' : app = "&Uuml;";  break;
                     case '\u00DF' : app = "&szlig;"; break;
 
                     //misc
                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
                     case '\u20AC': app = "&euro;";  break;
                     case '\u00AB': app = "&laquo;"; break;
                     case '\u00BB': app = "&raquo;"; break;
                     case '\u00A0': app = "&#160;"; break;
 
                     default :
                         //encode all non basic latin characters
                         app = "&#" + ((int)c) + ";";
                     break;
                 }
             }
             if (app != null)
             {
                 if (sb == null)
                 {
                     sb = new StringBuilder(string.substring(0, i));
                 }
                 sb.append(app);
             } else {
                 if (sb != null)
                 {
                     sb.append(c);
                 }
             }
         }
 
         if (sb == null)
         {
             return string;
         }
         else
         {
             return sb.toString();
         }
     }
 
     /**
      * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
      */
     public static void encode (char[] string, int offset, int length, Writer writer) throws IOException
     {
         encode(string, offset, length, false, true, writer);
     }
 
     /**
      * Variant of {@link #encode} where encodeNbsp is true.
      */
     public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer) throws IOException
     {
         encode(string, offset, length, encodeNewline, true, writer);
     }
 
     /**
      * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
      */
     public static void encode (char[] string, int offset, int length, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException
     {
         encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer);
     }
 
 
     /**
      * Encodes the given string, so that it can be used within a html page.
      * @param string the string to convert
      * @param encodeNewline if true newline characters are converted to &lt;br&gt;'s
      * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &amp;nbsp;'s
      * @param encodeNonLatin if true encode non-latin characters as numeric character references
      */
     public static void encode (char[] string, int offset, int length,
                                  boolean encodeNewline,
                                  boolean encodeSubsequentBlanksToNbsp,
                                  boolean encodeNonLatin, Writer writer) throws IOException
     {
         if (string == null || length < 0 || offset >= string.length)
         {
             return;
         }
         offset = Math.max(0, offset);
         int realLength = Math.min(length, string.length - offset);
 
         StringBuilder sb = null;    //create later on demand
         String app;
         char c;
         
         for (int i = offset; i < offset + realLength; ++i)
         {
             app = null;
             c = string[i];
 
             // All characters before letters
             if ((int)c < 0x41)
             {
                 switch (c)
                 {
                     case '"': app = "&quot;"; break;    //"
                     case '&': app = "&amp;"; break;     //&
                     case '<': app = "&lt;"; break;      //<
                     case '>': app = "&gt;"; break;      //>
                     case ' ':
                         if (encodeSubsequentBlanksToNbsp &&
                                 (i == 0 || (i - 1 >= 0 && string[i - 1] == ' ')))
                         {
                             //Space at beginning or after another space
                             app = "&#160;";
                         }
                         break;
                     case '\n':
                         if (encodeNewline)
                         {
                             app = "<br/>";
                         }
                         break;
                 }
             } else if (encodeNonLatin && (int)c > 0x80) {
                  switch(c) {
                     //german umlauts
                     case '\u00E4' : app = "&auml;";  break;
                     case '\u00C4' : app = "&Auml;";  break;
                     case '\u00F6' : app = "&ouml;";  break;
                     case '\u00D6' : app = "&Ouml;";  break;
                     case '\u00FC' : app = "&uuml;";  break;
                     case '\u00DC' : app = "&Uuml;";  break;
                     case '\u00DF' : app = "&szlig;"; break;
 
                     //misc
                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
                     case '\u20AC': app = "&euro;";  break;
                     case '\u00AB': app = "&laquo;"; break;
                     case '\u00BB': app = "&raquo;"; break;
                     case '\u00A0': app = "&#160;"; break;
 
                     default :
                         //encode all non basic latin characters
                         app = "&#" + ((int)c) + ";";
                     break;
                 }
             }
             if (app != null)
             {
                 if (sb == null)
                 {
                     sb = new StringBuilder(realLength*2);
                     sb.append(string, offset, i - offset);
                 }
                 sb.append(app);
             } else {
                 if (sb != null)
                 {
                     sb.append(c);
                 }
             }
         }
 
         if (sb == null)
         {
             writer.write(string, offset, realLength);
         }
         else
         {
             writer.write(sb.toString());
         }
     }
     
     private static final String HEX_CHARSET = "0123456789ABCDEF";
     
     private static final String UTF8 = "UTF-8";
     
     /**
      * Encode an URI, escaping or percent-encoding all required characters and
      * following the rules mentioned on RFC 3986.  
      * 
      * @param string
      * @param encodeNonLatin
      * @return
      * @throws IOException
      */
     public static String encodeURIAtributte(final String string, final String characterEncoding)
         throws IOException
     {
         StringBuilder sb = null;    //create later on demand
         String app;
         char c;
         boolean endLoop = false;
         for (int i = 0; i < string.length (); ++i)
         {
             app = null;
             c = string.charAt(i);
             
             // This are the guidelines to be taken into account by this algorithm to encode:
             
             // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
             //
             // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
             // space       = <US-ASCII coded character 20 hexadecimal>
             // delims      = "<" | ">" | "#" | "%" | <">
             //               %3C   %3E   %23   %25   %22
             // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
             //               %7D   %7B   %7C   %5C   %5E   %5B   %5D   %60
             //
             // ".... Data corresponding to excluded characters must be escaped in order to
             // be properly represented within a URI....."
             
             // RFC 3986 Section 3.  Syntax Components
             //
             // "... The generic URI syntax consists of a hierarchical sequence of
             // components referred to as the scheme, authority, path, query, and
             // fragment.
             //
             //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
             //
             //   hier-part   = "//" authority path-abempty
             //               / path-absolute
             //               / path-rootless
             //               / path-empty
             // ...."
             
             // RFC 3986 Section 2.2:
             // Reserved characters (should not be percent-encoded)
             // reserved    = gen-delims / sub-delims
             // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
             //               %3A   %2F   %3F   %23   %5B   %5D   %40
             // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
             //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
             
             // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
             // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 
             // "...those rules were redefined to directly specify the characters allowed...."
             // There is also other characters moved from excluded list to reserved:
             // "[" / "]" / "#"  
             
             // RFC 3986 Section 2.3:
             // "... for consistency, percent-encoded octets in the ranges of ALPHA
             // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
             // underscore (%5F), or tilde (%7E) should not be created by URI
             // producers...."
             
             // RFC 3986 Section  3.2.2.  Host
 
             // host = IP-literal / IPv4address / reg-name
 
             // The reg-name syntax allows percent-encoded octets in order to
             // represent non-ASCII registered names in a uniform way that is
             // independent of the underlying name resolution technology.  Non-ASCII
             // characters must first be encoded according to UTF-8 [STD63], and then
             // each octet of the corresponding UTF-8 sequence must be percent-
             // encoded to be represented as URI characters.  URI producing
             // applications must not use percent-encoding in host unless it is used
             // to represent a UTF-8 character sequence.
             
             // RFC 3986 Section 3.4 Query 
             //         query       = *( pchar / "/" / "?" )
             //
             // "...  However, as query components are often used to carry identifying information 
             // in the form of "key=value" pairs and one frequently used value is a reference to
             // another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
             //
             // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
             //
             // When a new URI scheme defines a component that represents textual
             // data consisting of characters from the Universal Character Set [UCS],
             // the data should first be encoded as octets according to the UTF-8
             // character encoding [STD63]; then only those octets that do not
             // correspond to characters in the unreserved set should be percent-
             // encoded.  For example, the character A would be represented as "A",
             // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
             // as "%C3%80", and the character KATAKANA LETTER A would be represented
             // as "%E3%82%A2".
             //
             // RFC 3986 Section 3.5 Fragment
             //         fragment    = *( pchar / "/" / "?" )
             //
             // Note that follows the same as query
             
             // Based on the extracts the strategy to apply on this method is:
             // 
             // On scheme ":" hier-part
             //
             // Escape or percent encode chars inside :
             // 
             // - From %00 to %20, 
             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
             //                     duplicate encoding, encode it when we are sure 
             //                     that there are not encoded twice)
             // - "<" %3C, ">" %3E
             // - "\" %5C, "^" %5E, "`" %60 
             // - "{" %7B, "|" %7C, "}" %7D
             // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
             //   part of an URI, but it is preferred to encode it that omit it).
             //
             // The remaining characters must not be encoded
             //
             // Characters after ? or # should be percent encoding but only the necessary ones:
             //
             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
             //                     duplicate encoding, encode it when we are sure 
             //                     that there are not encoded twice)
             // - "<" %3C, ">" %3E,
             // - "\" %5C, "^" %5E, "`" %60 
             // - "{" %7B, "|" %7C, "}" %7D
             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
             //   translating from the document character encoding to percent encoding, because this values
             //   could be retrieved from httpRequest.getParameter() and it uses the current character encoding
             //   for decode values)
             //
             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
             // put only & is invalid in this context.
 
             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
                     c == '"' || c == '<' ||
                     c == '>' || c == '\\' || c == '^' || c == '`' ||
                     c == '{' || c == '|' || c == '}')
             {
                 // The percent encoding on this part should be done using UTF-8 charset
                 // as RFC 3986 Section 3.2.2 says.
                 // Also there is a reference on 
                 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
                 // that recommend use of UTF-8 instead the document character encoding.
                 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
                 app = percentEncode(c, "UTF-8");
             }
             else if (c == '%')
             {
                 if (i + 2 < string.length())
                 {
                     char c1 = string.charAt(i+1);
                     char c2 = string.charAt(i+2);
                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
                     {
                         // do not percent encode, because it could be already encoded
                         // and we don't want encode it twice
                     }
                     else
                     {
                         app = percentEncode(c, UTF8);
                     }
                 }
                 else
                 {
                     app = percentEncode(c, UTF8);
                 }
             }
             else if (c == '?' || c == '#')
             {
                 if (i+1 < string.length())
                 {
                     // The remaining part of the URI are data that should be encoded
                     // using the document character encoding.
                     app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
                     endLoop = true;
                 }
             }
             else
             {
                 //No encoding, just do nothing, char will be added later.
             }
                         
             if (app != null)
             {
                 if (sb == null)
                 {
                     sb = new StringBuilder(string.substring(0, i));
                 }
                 sb.append(app);
             } else {
                 if (sb != null)
                 {
                     sb.append(c);
                 }
             }
             if (endLoop)
             {
                 break;
             }
         }
         if (sb == null)
         {
             return string;
         }
         else
         {
             return sb.toString();
         }
     }
     
     /**
      * Encode a unicode char value in percentEncode, decoding its bytes using a specified 
      * characterEncoding.
      * 
      * @param c
      * @param characterEncoding
      * @return
      */
     private static String percentEncode(char c, String characterEncoding)
     {
         String app = null;
         if (c > (char)((short)0x007F))
         {
             //percent encode in the proper encoding to be consistent
             app = percentEncodeNonUsAsciiCharacter(c, characterEncoding);
         }
         else
         {
             //percent encode US-ASCII char (0x00-0x7F range)
             app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
         }
         return app;
     }
     
     private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding)
     {
         ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
         StringBuffer builder = new StringBuffer();
         try
         {
             OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
             writer.write(c);
             writer.flush();
         }
         catch(IOException e)
         {
             baos.reset();
             return null;
         }
         
         byte [] byteArray =  baos.toByteArray();
         for (int i=0; i < byteArray.length; i++)
         {
             builder.append('%');
             builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
             builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
         }
         
         return builder.toString();
     }
 
     /**
      * Encode the query part using the document charset encoding provided.
      * 
      * 
      * @param string
      * @param characterEncoding
      * @return
      */
     private static String encodeURIQuery(final String string, final String characterEncoding)
     {
         StringBuilder sb = null;    //create later on demand
         String app;
         char c;
         boolean endLoop = false;
         for (int i = 0; i < string.length (); ++i)
         {
             app = null;
             c = string.charAt(i);
             
             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
             // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so we make easier and omit this one)
             // - "<" %3C, ">" %3E,
             // - "\" %5C, "^" %5E, "`" %60 
             // - "{" %7B, "|" %7C, "}" %7D
             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
             //   that a single char should contain 2,3 or more bytes!. This data should be encoded translating from the document
             //   character encoding to percent encoding)
             //
             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
             // put & is invalid in this context   
             
             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
                     c == '"' || c == '<' ||
                     c == '>' || c == '\\' || c == '^' || c == '`' ||
                     c == '{' || c == '|' || c == '}')
             {
                 // The percent encoding on this part should be done using UTF-8 charset
                 // as RFC 3986 Section 3.2.2 says
                 app = percentEncode(c, characterEncoding);
             }
             else if (c == '%')
             {
                 if (i + 2 < string.length())
                 {
                     char c1 = string.charAt(i+1);
                     char c2 = string.charAt(i+2);
                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) &&
                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z')))
                     {
                         // do not percent encode, because it could be already encoded
                     }
                     else
                     {
                         app = percentEncode(c, characterEncoding);
                     }
                 }
                 else
                 {
                     app = percentEncode(c, characterEncoding);
                 }
             }
             else if (c == '&')
             {
                 if (i+4 < string.length() )
                 {
                     if ('a' == string.charAt(i+1) &&
                         'm' == string.charAt(i+2) &&
                         'p' == string.charAt(i+3) &&
                         ';' == string.charAt(i+4))
                     {
                         //Skip
                     }
                     else
                     {
                         app = "&amp;";
                     }
                 }
                 else
                 {
                     app = "&amp;";
                 }
             }
             else
             {
                 //No encoding, just do nothing, char will be added later.
             }
                         
             if (app != null)
             {
                 if (sb == null)
                 {
                     sb = new StringBuilder(string.substring(0, i));
                 }
                 sb.append(app);
             } else {
                 if (sb != null)
                 {
                     sb.append(c);
                 }
             }
             if (endLoop)
             {
                 break;
             }
         }
         if (sb == null)
         {
             return string;
         }
         else
         {
             return sb.toString();
         }
     }
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one
3		* or more contributor license agreements. See the NOTICE file
4		* distributed with this work for additional information
5		* regarding copyright ownership. The ASF licenses this file
6		* to you under the Apache License, Version 2.0 (the
7		* "License"); you may not use this file except in compliance
8		* with the License. You may obtain a copy of the License at
9		*
10		* http://www.apache.org/licenses/LICENSE-2.0
11		*
12		* Unless required by applicable law or agreed to in writing,
13		* software distributed under the License is distributed on an
14		* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15		* KIND, either express or implied. See the License for the
16		* specific language governing permissions and limitations
17		* under the License.
18		*/
19		package org.apache.myfaces.shared_impl.renderkit.html.util;
20
21		import java.io.ByteArrayOutputStream;
22		import java.io.IOException;
23		import java.io.OutputStreamWriter;
24		import java.io.Writer;
25
26		/**
27		* Converts Strings so that they can be used within HTML-Code.
28		*/
29	0	public abstract class HTMLEncoder
30		{
31		/**
32		* Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
33		*/
34		public static String encode (String string)
35		{
36	0	return encode(string, false, true);
37		}
38
39		/**
40		* Variant of {@link #encode} where encodeNbsp is true.
41		*/
42		public static String encode (String string, boolean encodeNewline)
43		{
44	0	return encode(string, encodeNewline, true);
45		}
46
47		/**
48		* Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true
49		*/
50		public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
51		{
52	0	return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
53		}
54
55		/**
56		* Encodes the given string, so that it can be used within a html page.
57		* @param string the string to convert
58		* @param encodeNewline if true newline characters are converted to <br>'s
59		* @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s
60		* @param encodeNonLatin if true encode non-latin characters as numeric character references
61		*/
62		public static String encode (String string,
63		boolean encodeNewline,
64		boolean encodeSubsequentBlanksToNbsp,
65		boolean encodeNonLatin)
66		{
67	0	if (string == null)
68		{
69	0	return "";
70		}
71
72	0	StringBuilder sb = null; //create later on demand
73		String app;
74		char c;
75	0	for (int i = 0; i < string.length (); ++i)
76		{
77	0	app = null;
78	0	c = string.charAt(i);
79
80		// All characters before letters
81	0	if ((int)c < 0x41)
82		{
83	0	switch (c)
84		{
85	0	case '"': app = """; break; //"
86	0	case '&': app = "&"; break; //&
87	0	case '<': app = "<"; break; //<
88	0	case '>': app = ">"; break; //>
89		case ' ':
90	0	if (encodeSubsequentBlanksToNbsp &&
91		(i == 0 \|\| (i - 1 >= 0 && string.charAt(i - 1) == ' ')))
92		{
93		//Space at beginning or after another space
94	0	app = " ";
95		}
96		break;
97		case '\n':
98	0	if (encodeNewline)
99		{
100	0	app = "<br/>";
101		}
102	0	break;
103		}
104	0	} else if (encodeNonLatin && (int)c > 0x80) {
105	0	switch(c) {
106		//german umlauts
107	0	case '\u00E4' : app = "ä"; break;
108	0	case '\u00C4' : app = "Ä"; break;
109	0	case '\u00F6' : app = "ö"; break;
110	0	case '\u00D6' : app = "Ö"; break;
111	0	case '\u00FC' : app = "ü"; break;
112	0	case '\u00DC' : app = "Ü"; break;
113	0	case '\u00DF' : app = "ß"; break;
114
115		//misc
116		//case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it?
117	0	case '\u20AC': app = "€"; break;
118	0	case '\u00AB': app = "«"; break;
119	0	case '\u00BB': app = "»"; break;
120	0	case '\u00A0': app = " "; break;
121
122		default :
123		//encode all non basic latin characters
124	0	app = "&#" + ((int)c) + ";";
125		break;
126		}
127		}
128	0	if (app != null)
129		{
130	0	if (sb == null)
131		{
132	0	sb = new StringBuilder(string.substring(0, i));
133		}
134	0	sb.append(app);
135		} else {
136	0	if (sb != null)
137		{
138	0	sb.append(c);
139		}
140		}
141		}
142
143	0	if (sb == null)
144		{
145	0	return string;
146		}
147		else
148		{
149	0	return sb.toString();
150		}
151		}
152
153		/**
154		* Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
155		*/
156		public static void encode (char[] string, int offset, int length, Writer writer) throws IOException
157		{
158	0	encode(string, offset, length, false, true, writer);
159	0	}
160
161		/**
162		* Variant of {@link #encode} where encodeNbsp is true.
163		*/
164		public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer) throws IOException
165		{
166	0	encode(string, offset, length, encodeNewline, true, writer);
167	0	}
168
169		/**
170		* Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true
171		*/
172		public static void encode (char[] string, int offset, int length, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException
173		{
174	0	encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer);
175	0	}
176
177
178		/**
179		* Encodes the given string, so that it can be used within a html page.
180		* @param string the string to convert
181		* @param encodeNewline if true newline characters are converted to <br>'s
182		* @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s
183		* @param encodeNonLatin if true encode non-latin characters as numeric character references
184		*/
185		public static void encode (char[] string, int offset, int length,
186		boolean encodeNewline,
187		boolean encodeSubsequentBlanksToNbsp,
188		boolean encodeNonLatin, Writer writer) throws IOException
189		{
190	0	if (string == null \|\| length < 0 \|\| offset >= string.length)
191		{
192	0	return;
193		}
194	0	offset = Math.max(0, offset);
195	0	int realLength = Math.min(length, string.length - offset);
196
197	0	StringBuilder sb = null; //create later on demand
198		String app;
199		char c;
200
201	0	for (int i = offset; i < offset + realLength; ++i)
202		{
203	0	app = null;
204	0	c = string[i];
205
206		// All characters before letters
207	0	if ((int)c < 0x41)
208		{
209	0	switch (c)
210		{
211	0	case '"': app = """; break; //"
212	0	case '&': app = "&"; break; //&
213	0	case '<': app = "<"; break; //<
214	0	case '>': app = ">"; break; //>
215		case ' ':
216	0	if (encodeSubsequentBlanksToNbsp &&
217		(i == 0 \|\| (i - 1 >= 0 && string[i - 1] == ' ')))
218		{
219		//Space at beginning or after another space
220	0	app = " ";
221		}
222		break;
223		case '\n':
224	0	if (encodeNewline)
225		{
226	0	app = "<br/>";
227		}
228	0	break;
229		}
230	0	} else if (encodeNonLatin && (int)c > 0x80) {
231	0	switch(c) {
232		//german umlauts
233	0	case '\u00E4' : app = "ä"; break;
234	0	case '\u00C4' : app = "Ä"; break;
235	0	case '\u00F6' : app = "ö"; break;
236	0	case '\u00D6' : app = "Ö"; break;
237	0	case '\u00FC' : app = "ü"; break;
238	0	case '\u00DC' : app = "Ü"; break;
239	0	case '\u00DF' : app = "ß"; break;
240
241		//misc
242		//case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it?
243	0	case '\u20AC': app = "€"; break;
244	0	case '\u00AB': app = "«"; break;
245	0	case '\u00BB': app = "»"; break;
246	0	case '\u00A0': app = " "; break;
247
248		default :
249		//encode all non basic latin characters
250	0	app = "&#" + ((int)c) + ";";
251		break;
252		}
253		}
254	0	if (app != null)
255		{
256	0	if (sb == null)
257		{
258	0	sb = new StringBuilder(realLength*2);
259	0	sb.append(string, offset, i - offset);
260		}
261	0	sb.append(app);
262		} else {
263	0	if (sb != null)
264		{
265	0	sb.append(c);
266		}
267		}
268		}
269
270	0	if (sb == null)
271		{
272	0	writer.write(string, offset, realLength);
273		}
274		else
275		{
276	0	writer.write(sb.toString());
277		}
278	0	}
279
280		private static final String HEX_CHARSET = "0123456789ABCDEF"; // upper-case hex digits, looked up by nibble value when building %XX escapes
281
282		private static final String UTF8 = "UTF-8"; // charset name passed to percentEncode when escaping a literal '%' in encodeURIAtributte
283
284		/**
285		* Encode an URI, escaping or percent-encoding all required characters and
286		* following the rules mentioned on RFC 3986.
287		*
288		* @param string
289		* @param encodeNonLatin
290		* @return
291		* @throws IOException
292		*/
293		public static String encodeURIAtributte(final String string, final String characterEncoding)
294		throws IOException
295		{
296	0	StringBuilder sb = null; //create later on demand
297		String app;
298		char c;
299	0	boolean endLoop = false;
300	0	for (int i = 0; i < string.length (); ++i)
301		{
302	0	app = null;
303	0	c = string.charAt(i);
304
305		// This are the guidelines to be taken into account by this algorithm to encode:
306
307		// RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
308		//
309		// control = <US-ASCII coded characters 00-1F and 7F hexadecimal>
310		// space = <US-ASCII coded character 20 hexadecimal>
311		// delims = "<" \| ">" \| "#" \| "%" \| <">
312		// %3C %3E %23 %25 %22
313		// unwise = "{" \| "}" \| "\|" \| "\" \| "^" \| "[" \| "]" \| "`"
314		// %7D %7B %7C %5C %5E %5B %5D %60
315		//
316		// ".... Data corresponding to excluded characters must be escaped in order to
317		// be properly represented within a URI....."
318
319		// RFC 3986 Section 3. Syntax Components
320		//
321		// "... The generic URI syntax consists of a hierarchical sequence of
322		// components referred to as the scheme, authority, path, query, and
323		// fragment.
324		//
325		// URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
326		//
327		// hier-part = "//" authority path-abempty
328		// / path-absolute
329		// / path-rootless
330		// / path-empty
331		// ...."
332
333		// RFC 3986 Section 2.2:
334		// Reserved characters (should not be percent-encoded)
335		// reserved = gen-delims / sub-delims
336		// gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
337		// %3A %2F %3F %23 %5B %5D %40
338		// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
339		// %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D
340
341		// Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
342		// but on the part D. Changes from RFC 2396 says about this chars (used on IPv6)
343		// "...those rules were redefined to directly specify the characters allowed...."
344		// There is also other characters moved from excluded list to reserved:
345		// "[" / "]" / "#"
346
347		// RFC 3986 Section 2.3:
348		// "... for consistency, percent-encoded octets in the ranges of ALPHA
349		// (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
350		// underscore (%5F), or tilde (%7E) should not be created by URI
351		// producers...."
352
353		// RFC 3986 Section 3.2.2. Host
354
355		// host = IP-literal / IPv4address / reg-name
356
357		// The reg-name syntax allows percent-encoded octets in order to
358		// represent non-ASCII registered names in a uniform way that is
359		// independent of the underlying name resolution technology. Non-ASCII
360		// characters must first be encoded according to UTF-8 [STD63], and then
361		// each octet of the corresponding UTF-8 sequence must be percent-
362		// encoded to be represented as URI characters. URI producing
363		// applications must not use percent-encoding in host unless it is used
364		// to represent a UTF-8 character sequence.
365
366		// RFC 3986 Section 3.4 Query
367		// query = *( pchar / "/" / "?" )
368		//
369		// "... However, as query components are often used to carry identifying information
370		// in the form of "key=value" pairs and one frequently used value is a reference to
371		// another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
372		//
373		// RFC 3986 Section 2.5 Identifying Data (Apply to query section)
374		//
375		// When a new URI scheme defines a component that represents textual
376		// data consisting of characters from the Universal Character Set [UCS],
377		// the data should first be encoded as octets according to the UTF-8
378		// character encoding [STD63]; then only those octets that do not
379		// correspond to characters in the unreserved set should be percent-
380		// encoded. For example, the character A would be represented as "A",
381		// the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
382		// as "%C3%80", and the character KATAKANA LETTER A would be represented
383		// as "%E3%82%A2".
384		//
385		// RFC 3986 Section 3.5 Fragment
386		// fragment = *( pchar / "/" / "?" )
387		//
388		// Note that follows the same as query
389
390		// Based on the extracts the strategy to apply on this method is:
391		//
392		// On scheme ":" hier-part
393		//
394		// Escape or percent encode chars inside :
395		//
396		// - From %00 to %20,
397		// - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
398		// duplicate encoding, encode it when we are sure
399		// that there are not encoded twice)
400		// - "<" %3C, ">" %3E
401		// - "\" %5C, "^" %5E, "`" %60
402		// - "{" %7B, "\|" %7C, "}" %7D
403		// - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
404		// part of an URI, but it is preferred to encode it that omit it).
405		//
406		// The remaining characters must not be encoded
407		//
408		// Characters after ? or # should be percent encoding but only the necessary ones:
409		//
410		// - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
411		// - <"> %22, "%" %25 (If there is encode of "%", there is a risk of
412		// duplicate encoding, encode it when we are sure
413		// that there are not encoded twice)
414		// - "<" %3C, ">" %3E,
415		// - "\" %5C, "^" %5E, "`" %60
416		// - "{" %7B, "\|" %7C, "}" %7D
417		// - From %7F ad infinitum (each character as many bytes as necessary but take into account
418		// that a single char should contain 2,3 or more bytes!. This data should be encoded
419		// translating from the document character encoding to percent encoding, because this values
420		// could be retrieved from httpRequest.getParameter() and it uses the current character encoding
421		// for decode values)
422		//
423		// "&" should be encoded as "&" because this link is inside an html page, and
424		// put only & is invalid in this context.
425
426	0	if ( (c <= (char)0x20) \|\| (c >= (char)0x7F) \|\|
427		c == '"' \|\| c == '<' \|\|
428		c == '>' \|\| c == '\\' \|\| c == '^' \|\| c == '`' \|\|
429		c == '{' \|\| c == '\|' \|\| c == '}')
430		{
431		// The percent encoding on this part should be done using UTF-8 charset
432		// as RFC 3986 Section 3.2.2 says.
433		// Also there is a reference on
434		// http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
435		// that recommend use of UTF-8 instead the document character encoding.
436		// Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
437	0	app = percentEncode(c, "UTF-8");
438		}
439	0	else if (c == '%')
440		{
441	0	if (i + 2 < string.length())
442		{
443	0	char c1 = string.charAt(i+1);
444	0	char c2 = string.charAt(i+2);
445	0	if ((( c1 >= '0' && c1 <='9') \|\| (c1 >='A' && c1 <='Z')) &&
446		(( c2 >= '0' && c2 <='9') \|\| (c2 >='A' && c2 <='Z')))
447		{
448		// do not percent encode, because it could be already encoded
449		// and we don't want encode it twice
450		}
451		else
452		{
453	0	app = percentEncode(c, UTF8);
454		}
455	0	}
456		else
457		{
458	0	app = percentEncode(c, UTF8);
459		}
460		}
461	0	else if (c == '?' \|\| c == '#')
462		{
463	0	if (i+1 < string.length())
464		{
465		// The remaining part of the URI are data that should be encoded
466		// using the document character encoding.
467	0	app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
468	0	endLoop = true;
469		}
470		}
471		else
472		{
473		//No encoding, just do nothing, char will be added later.
474		}
475
476	0	if (app != null)
477		{
478	0	if (sb == null)
479		{
480	0	sb = new StringBuilder(string.substring(0, i));
481		}
482	0	sb.append(app);
483		} else {
484	0	if (sb != null)
485		{
486	0	sb.append(c);
487		}
488		}
489	0	if (endLoop)
490		{
491	0	break;
492		}
493		}
494	0	if (sb == null)
495		{
496	0	return string;
497		}
498		else
499		{
500	0	return sb.toString();
501		}
502		}
503
504		/**
505		* Encode a unicode char value in percentEncode, decoding its bytes using a specified
506		* characterEncoding.
507		*
508		* @param c
509		* @param characterEncoding
510		* @return
511		*/
512		private static String percentEncode(char c, String characterEncoding)
513		{
514	0	String app = null;
515	0	if (c > (char)((short)0x007F))
516		{
517		//percent encode in the proper encoding to be consistent
518	0	app = percentEncodeNonUsAsciiCharacter(c, characterEncoding);
519		}
520		else
521		{
522		//percent encode US-ASCII char (0x00-0x7F range)
523	0	app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
524		}
525	0	return app;
526		}
527
528		private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding)
529		{
530	0	ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
531	0	StringBuffer builder = new StringBuffer();
532		try
533		{
534	0	OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
535	0	writer.write(c);
536	0	writer.flush();
537		}
538	0	catch(IOException e)
539		{
540	0	baos.reset();
541	0	return null;
542	0	}
543
544	0	byte [] byteArray = baos.toByteArray();
545	0	for (int i=0; i < byteArray.length; i++)
546		{
547	0	builder.append('%');
548	0	builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
549	0	builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
550		}
551
552	0	return builder.toString();
553		}
554
555		/**
556		* Encode the query part using the document charset encoding provided.
557		*
558		*
559		* @param string
560		* @param characterEncoding
561		* @return
562		*/
563		private static String encodeURIQuery(final String string, final String characterEncoding)
564		{
565	0	StringBuilder sb = null; //create later on demand
566		String app;
567		char c;
568	0	boolean endLoop = false;
569	0	for (int i = 0; i < string.length (); ++i)
570		{
571	0	app = null;
572	0	c = string.charAt(i);
573
574		// - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
575		// - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so we make easier and omit this one)
576		// - "<" %3C, ">" %3E,
577		// - "\" %5C, "^" %5E, "`" %60
578		// - "{" %7B, "\|" %7C, "}" %7D
579		// - From %7F ad infinitum (each character as many bytes as necessary but take into account
580		// that a single char should contain 2,3 or more bytes!. This data should be encoded translating from the document
581		// character encoding to percent encoding)
582		//
583		// "&" should be encoded as "&" because this link is inside an html page, and
584		// put & is invalid in this context
585
586	0	if ( (c <= (char)0x20) \|\| (c >= (char)0x7F) \|\|
587		c == '"' \|\| c == '<' \|\|
588		c == '>' \|\| c == '\\' \|\| c == '^' \|\| c == '`' \|\|
589		c == '{' \|\| c == '\|' \|\| c == '}')
590		{
591		// The percent encoding on this part should be done using UTF-8 charset
592		// as RFC 3986 Section 3.2.2 says
593	0	app = percentEncode(c, characterEncoding);
594		}
595	0	else if (c == '%')
596		{
597	0	if (i + 2 < string.length())
598		{
599	0	char c1 = string.charAt(i+1);
600	0	char c2 = string.charAt(i+2);
601	0	if ((( c1 >= '0' && c1 <='9') \|\| (c1 >='A' && c1 <='Z')) &&
602		(( c2 >= '0' && c2 <='9') \|\| (c2 >='A' && c2 <='Z')))
603		{
604		// do not percent encode, because it could be already encoded
605		}
606		else
607		{
608	0	app = percentEncode(c, characterEncoding);
609		}
610	0	}
611		else
612		{
613	0	app = percentEncode(c, characterEncoding);
614		}
615		}
616	0	else if (c == '&')
617		{
618	0	if (i+4 < string.length() )
619		{
620	0	if ('a' == string.charAt(i+1) &&
621		'm' == string.charAt(i+2) &&
622		'p' == string.charAt(i+3) &&
623		';' == string.charAt(i+4))
624		{
625		//Skip
626		}
627		else
628		{
629	0	app = "&";
630		}
631		}
632		else
633		{
634	0	app = "&";
635		}
636		}
637		else
638		{
639		//No encoding, just do nothing, char will be added later.
640		}
641
642	0	if (app != null)
643		{
644	0	if (sb == null)
645		{
646	0	sb = new StringBuilder(string.substring(0, i));
647		}
648	0	sb.append(app);
649		} else {
650	0	if (sb != null)
651		{
652	0	sb.append(c);
653		}
654		}
655	0	if (endLoop)
656		{
657	0	break;
658		}
659		}
660	0	if (sb == null)
661		{
662	0	return string;
663		}
664		else
665		{
666	0	return sb.toString();
667		}
668		}
669		}