View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.myfaces.shared.renderkit.html.util;
20  
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.OutputStreamWriter;
24  import java.io.Writer;
25  
26  /**
27   * Converts Strings so that they can be used within HTML-Code.
28   */
29  public abstract class HTMLEncoder
30  {
31      /**
32       * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
33       */
34      public static String encode (String string)
35      {
36          return encode(string, false, true);
37      }
38  
39      /**
40       * Variant of {@link #encode} where encodeNbsp is true.
41       */
42      public static String encode (String string, boolean encodeNewline)
43      {
44          return encode(string, encodeNewline, true);
45      }
46  
47      /**
48       * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
49       */
50      public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
51      {
52          return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
53      }
54  
55      /**
56       * Encodes the given string, so that it can be used within a html page.
57       * @param string the string to convert
58       * @param encodeNewline if true newline characters are converted to <br>'s
59       * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to  's
60       * @param encodeNonLatin if true encode non-latin characters as numeric character references
61       */
62      public static String encode (String string,
63                                   boolean encodeNewline,
64                                   boolean encodeSubsequentBlanksToNbsp,
65                                   boolean encodeNonLatin)
66      {
67          if (string == null)
68          {
69              return "";
70          }
71  
72          StringBuilder sb = null;    //create later on demand
73          String app;
74          char c = ' ';
75          char prevC;
76          int length = string.length();
77          for (int i = 0; i < length; ++i)
78          {
79              app = null;
80              prevC = c;
81              c = string.charAt(i);
82              
83              // All characters before letters
84              if ((int)c < 0x41)
85              {
86                  switch (c)
87                  {
88                      case '"': app = "&quot;"; break;    //"
89                      case '&': app = "&amp;"; break;     //&
90                      case '<': app = "&lt;"; break;      //<
91                      case '>': app = "&gt;"; break;      //>
92                      case ' ':
93                          if (encodeSubsequentBlanksToNbsp &&
94                                  prevC == ' ')
95                          {
96                              //Space at beginning or after another space
97                              app = "&#160;";
98                          }
99                          break;
100                     case '\n':
101                         if (encodeNewline)
102                         {
103                             app = "<br/>";
104                         }
105                         break;
106                     default:
107                         break;
108                 }
109                 // http://www.w3.org/MarkUp/html3/specialchars.html
110                 // From C0 extension U+0000-U+001F only U+0009, U+000A and
111                 // U+000D are valid control characters
112                 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D)
113                 {
114                     // Ignore escape character
115                     app = "";
116                 }
117             }
118             else if (encodeNonLatin && (int)c > 0x80)
119             {
120                  switch(c)
121                  {
122                     //german umlauts
123                     case '\u00E4' : app = "&auml;";  break;
124                     case '\u00C4' : app = "&Auml;";  break;
125                     case '\u00F6' : app = "&ouml;";  break;
126                     case '\u00D6' : app = "&Ouml;";  break;
127                     case '\u00FC' : app = "&uuml;";  break;
128                     case '\u00DC' : app = "&Uuml;";  break;
129                     case '\u00DF' : app = "&szlig;"; break;
130 
131                     //misc
132                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
133                     case '\u20AC': app = "&euro;";  break;
134                     case '\u00AB': app = "&laquo;"; break;
135                     case '\u00BB': app = "&raquo;"; break;
136                     case '\u00A0': app = "&#160;"; break;
137 
138                     default :
139                         //encode all non basic latin characters
140                         app = "&#" + ((int)c) + ";";
141                     break;
142                 }
143             }
144             if (app != null)
145             {
146                 if (sb == null)
147                 {
148                     sb = new StringBuilder(string.substring(0, i));
149                 }
150                 sb.append(app);
151             }
152             else
153             {
154                 if (sb != null)
155                 {
156                     sb.append(c);
157                 }
158             }
159         }
160 
161         if (sb == null)
162         {
163             return string;
164         }
165         else
166         {
167             return sb.toString();
168         }
169     }
170     
171     /**
172      * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
173      */
174     public static void encode (Writer writer, String string) throws IOException
175     {
176         encode(writer, string, false, true);
177     }
178 
179     /**
180      * Variant of {@link #encode} where encodeNbsp is true.
181      */
182     public static void encode (Writer writer, String string, boolean encodeNewline) throws IOException
183     {
184         encode(writer, string, encodeNewline, true);
185     }
186 
187     /**
188      * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
189      */
190     public static void encode (Writer writer, String string, 
191             boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) throws IOException
192     {
193         encode(writer, string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
194     }
195     
196     public static void encode (Writer writer, String string,
197                                  boolean encodeNewline,
198                                  boolean encodeSubsequentBlanksToNbsp,
199                                  boolean encodeNonLatin) throws IOException
200     {
201         if (string == null)
202         {
203             return;
204         }
205 
206         int start = 0;
207         String app;
208         char c = ' ';
209         char prevC;
210         int length = string.length();
211         for (int i = 0; i < length; ++i)
212         {
213             app = null;
214             prevC = c;
215             c = string.charAt(i);
216             
217             // All characters before letters
218             if ((int)c < 0x41)
219             {
220                 switch (c)
221                 {
222                     case '"': app = "&quot;"; break;    //"
223                     case '&': app = "&amp;"; break;     //&
224                     case '<': app = "&lt;"; break;      //<
225                     case '>': app = "&gt;"; break;      //>
226                     case ' ':
227                         if (encodeSubsequentBlanksToNbsp &&
228                                 prevC == ' ')
229                         {
230                             //Space at beginning or after another space
231                             app = "&#160;";
232                         }
233                         break;
234                     case '\n':
235                         if (encodeNewline)
236                         {
237                             app = "<br/>";
238                         }
239                         break;
240                     default:
241                         break;
242                 }
243                 // http://www.w3.org/MarkUp/html3/specialchars.html
244                 // From C0 extension U+0000-U+001F only U+0009, U+000A and
245                 // U+000D are valid control characters
246                 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D)
247                 {
248                     // Ignore escape character
249                     app = "";
250                 }
251             }
252             else if (encodeNonLatin && (int)c > 0x80)
253             {
254                  switch(c)
255                  {
256                     //german umlauts
257                     case '\u00E4' : app = "&auml;";  break;
258                     case '\u00C4' : app = "&Auml;";  break;
259                     case '\u00F6' : app = "&ouml;";  break;
260                     case '\u00D6' : app = "&Ouml;";  break;
261                     case '\u00FC' : app = "&uuml;";  break;
262                     case '\u00DC' : app = "&Uuml;";  break;
263                     case '\u00DF' : app = "&szlig;"; break;
264 
265                     //misc
266                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
267                     case '\u20AC': app = "&euro;";  break;
268                     case '\u00AB': app = "&laquo;"; break;
269                     case '\u00BB': app = "&raquo;"; break;
270                     case '\u00A0': app = "&#160;"; break;
271 
272                     default :
273                         //encode all non basic latin characters
274                         app = "&#" + ((int)c) + ";";
275                     break;
276                 }
277             }
278             if (app != null)
279             {
280                 //if (sb == null)
281                 //{
282                 //    sb = new StringBuilder(string.substring(0, i));
283                 //}
284                 //sb.append(app);
285                 if (start < i)
286                 {
287                     writer.write(string, start, i-start);
288                 }
289                 start = i+1;
290                 writer.write(app);
291             }
292             //else
293             //{
294             //    if (sb != null)
295             //    {
296             //        sb.append(c);
297             //    }
298             //}
299         }
300 
301         //if (sb == null)
302         //{
303         //    return string;
304         //}
305         //else
306         //{
307         //    return sb.toString();
308         //}
309         if (start == 0)
310         {
311             writer.write(string);
312         }
313         else if (start < length)
314         {
315             writer.write(string,start,length-start);
316         }
317     }
318 
319 
320     /**
321      * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
322      */
323     public static void encode (char[] string, int offset, int length, Writer writer) throws IOException
324     {
325         encode(string, offset, length, false, true, writer);
326     }
327 
328     /**
329      * Variant of {@link #encode} where encodeNbsp is true.
330      */
331     public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer)
332         throws IOException
333     {
334         encode(string, offset, length, encodeNewline, true, writer);
335     }
336 
337     /**
338      * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
339      */
340     public static void encode (char[] string, int offset, int length, boolean encodeNewline, 
341             boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException
342     {
343         encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer);
344     }
345 
346 
347     /**
348      * Encodes the given string, so that it can be used within a html page.
349      * @param string the string to convert
350      * @param encodeNewline if true newline characters are converted to &lt;br&gt;'s
351      * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &amp;nbsp;'s
352      * @param encodeNonLatin if true encode non-latin characters as numeric character references
353      */
354     public static void encode (char[] string, int offset, int length,
355                                  boolean encodeNewline,
356                                  boolean encodeSubsequentBlanksToNbsp,
357                                  boolean encodeNonLatin, Writer writer) throws IOException
358     {
359         if (string == null || length < 0 || offset >= string.length)
360         {
361             return;
362         }
363         offset = Math.max(0, offset);
364         int realLength = Math.min(length, string.length - offset);
365 
366         //StringBuilder sb = null;    //create later on demand
367         String app;
368         char c = ' ';
369         char prevC;
370         int start = offset;
371         
372         for (int i = offset; i < offset + realLength; ++i)
373         {
374             app = null;
375             prevC = c;
376             c = string[i];
377 
378             // All characters before letters
379             if ((int)c < 0x41)
380             {
381                 switch (c)
382                 {
383                     case '"': app = "&quot;"; break;    //"
384                     case '&': app = "&amp;"; break;     //&
385                     case '<': app = "&lt;"; break;      //<
386                     case '>': app = "&gt;"; break;      //>
387                     case ' ':
388                         if (encodeSubsequentBlanksToNbsp &&
389                                 prevC == ' ')
390                         {
391                             //Space at beginning or after another space
392                             app = "&#160;";
393                         }
394                         break;
395                     case '\n':
396                         if (encodeNewline)
397                         {
398                             app = "<br/>";
399                         }
400                         break;
401                     default:
402                         break;
403                 }
404                 // http://www.w3.org/MarkUp/html3/specialchars.html
405                 // From C0 extension U+0000-U+001F only U+0009, U+000A and
406                 // U+000D are valid control characters
407                 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D)
408                 {
409                     // Ignore escape character
410                     app = "";
411                 }
412             }
413             else if (encodeNonLatin && (int)c > 0x80)
414             {
415                  switch(c)
416                  {
417                     //german umlauts
418                     case '\u00E4' : app = "&auml;";  break;
419                     case '\u00C4' : app = "&Auml;";  break;
420                     case '\u00F6' : app = "&ouml;";  break;
421                     case '\u00D6' : app = "&Ouml;";  break;
422                     case '\u00FC' : app = "&uuml;";  break;
423                     case '\u00DC' : app = "&Uuml;";  break;
424                     case '\u00DF' : app = "&szlig;"; break;
425 
426                     //misc
427                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
428                     case '\u20AC': app = "&euro;";  break;
429                     case '\u00AB': app = "&laquo;"; break;
430                     case '\u00BB': app = "&raquo;"; break;
431                     case '\u00A0': app = "&#160;"; break;
432 
433                     default :
434                         //encode all non basic latin characters
435                         app = "&#" + ((int)c) + ";";
436                     break;
437                 }
438             }
439             if (app != null)
440             {
441                 //if (sb == null)
442                 //{
443                 //    sb = new StringBuilder(realLength*2);
444                 //    sb.append(string, offset, i - offset);
445                 //}
446                 //sb.append(app);
447                 if (start < i)
448                 {
449                     writer.write(string, start, i-start);
450                 }
451                 start = i+1;
452                 writer.write(app);
453             }
454             /*
455             else
456             {
457                 if (sb != null)
458                 {
459                     sb.append(c);
460                 }
461             }*/
462         }
463 
464         //if (sb == null)
465         //{
466         //    writer.write(string, offset, realLength);
467         //}
468         //else
469         //{
470         //    writer.write(sb.toString());
471         //}
472         if (start == offset)
473         {
474             writer.write(string, offset, realLength);
475         }
476         else if (start < offset+realLength)
477         {
478             writer.write(string,start,offset+realLength-start);
479         }
480     }
481     
482     private static final String HEX_CHARSET = "0123456789ABCDEF";
483     
484     private static final String UTF8 = "UTF-8";
485     
486     /**
487      * Encode an URI, escaping or percent-encoding all required characters and
488      * following the rules mentioned on RFC 3986.  
489      * 
490      * @param string
491      * @param characterEncoding
492      * @return
493      * @throws IOException
494      */
495     public static String encodeURIAttribute(final String string, final String characterEncoding)
496         throws IOException
497     {
498         StringBuilder sb = null;    //create later on demand
499         String app;
500         char c;
501         boolean endLoop = false;
502         int length = string.length();
503         for (int i = 0; i < length; ++i)
504         {
505             app = null;
506             c = string.charAt(i);
507             
508             // This are the guidelines to be taken into account by this algorithm to encode:
509             
510             // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
511             //
512             // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
513             // space       = <US-ASCII coded character 20 hexadecimal>
514             // delims      = "<" | ">" | "#" | "%" | <">
515             //               %3C   %3E   %23   %25   %22
516             // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
517             //               %7D   %7B   %7C   %5C   %5E   %5B   %5D   %60
518             //
519             // ".... Data corresponding to excluded characters must be escaped in order to
520             // be properly represented within a URI....."
521             
522             // RFC 3986 Section 3.  Syntax Components
523             //
524             // "... The generic URI syntax consists of a hierarchical sequence of
525             // components referred to as the scheme, authority, path, query, and
526             // fragment.
527             //
528             //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
529             //
530             //   hier-part   = "//" authority path-abempty
531             //               / path-absolute
532             //               / path-rootless
533             //               / path-empty
534             // ...."
535             
536             // RFC 3986 Section 2.2:
537             // Reserved characters (should not be percent-encoded)
538             // reserved    = gen-delims / sub-delims
539             // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
540             //               %3A   %2F   %3F   %23   %5B   %5D   %40
541             // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
542             //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
543             
544             // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
545             // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 
546             // "...those rules were redefined to directly specify the characters allowed...."
547             // There is also other characters moved from excluded list to reserved:
548             // "[" / "]" / "#"  
549             
550             // RFC 3986 Section 2.3:
551             // "... for consistency, percent-encoded octets in the ranges of ALPHA
552             // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
553             // underscore (%5F), or tilde (%7E) should not be created by URI
554             // producers...."
555             
556             // RFC 3986 Section  3.2.2.  Host
557 
558             // host = IP-literal / IPv4address / reg-name
559 
560             // The reg-name syntax allows percent-encoded octets in order to
561             // represent non-ASCII registered names in a uniform way that is
562             // independent of the underlying name resolution technology.  Non-ASCII
563             // characters must first be encoded according to UTF-8 [STD63], and then
564             // each octet of the corresponding UTF-8 sequence must be percent-
565             // encoded to be represented as URI characters.  URI producing
566             // applications must not use percent-encoding in host unless it is used
567             // to represent a UTF-8 character sequence.
568             
569             // RFC 3986 Section 3.4 Query 
570             //         query       = *( pchar / "/" / "?" )
571             //
572             // "...  However, as query components are often used to carry identifying information 
573             // in the form of "key=value" pairs and one frequently used value is a reference to
574             // another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
575             //
576             // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
577             //
578             // When a new URI scheme defines a component that represents textual
579             // data consisting of characters from the Universal Character Set [UCS],
580             // the data should first be encoded as octets according to the UTF-8
581             // character encoding [STD63]; then only those octets that do not
582             // correspond to characters in the unreserved set should be percent-
583             // encoded.  For example, the character A would be represented as "A",
584             // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
585             // as "%C3%80", and the character KATAKANA LETTER A would be represented
586             // as "%E3%82%A2".
587             //
588             // RFC 3986 Section 3.5 Fragment
589             //         fragment    = *( pchar / "/" / "?" )
590             //
591             // Note that follows the same as query
592             
593             // Based on the extracts the strategy to apply on this method is:
594             // 
595             // On scheme ":" hier-part
596             //
597             // Escape or percent encode chars inside :
598             // 
599             // - From %00 to %20, 
600             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
601             //                     duplicate encoding, encode it when we are sure 
602             //                     that there are not encoded twice)
603             // - "<" %3C, ">" %3E
604             // - "\" %5C, "^" %5E, "`" %60 
605             // - "{" %7B, "|" %7C, "}" %7D
606             // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
607             //   part of an URI, but it is preferred to encode it that omit it).
608             //
609             // The remaining characters must not be encoded
610             //
611             // Characters after ? or # should be percent encoding but only the necessary ones:
612             //
613             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
614             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
615             //                     duplicate encoding, encode it when we are sure 
616             //                     that there are not encoded twice)
617             // - "<" %3C, ">" %3E,
618             // - "\" %5C, "^" %5E, "`" %60 
619             // - "{" %7B, "|" %7C, "}" %7D
620             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
621             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
622             //   translating from the document character encoding to percent encoding, because this values
623             //   could be retrieved from httpRequest.getParameter() and it uses the current character encoding
624             //   for decode values)
625             //
626             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
627             // put only & is invalid in this context.
628 
629             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
630                     c == '"' || c == '<' ||
631                     c == '>' || c == '\\' || c == '^' || c == '`' ||
632                     c == '{' || c == '|' || c == '}')
633             {
634                 // The percent encoding on this part should be done using UTF-8 charset
635                 // as RFC 3986 Section 3.2.2 says.
636                 // Also there is a reference on 
637                 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
638                 // that recommend use of UTF-8 instead the document character encoding.
639                 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
640                 app = percentEncode(c, "UTF-8");
641             }
642             else if (c == '%')
643             {
644                 if (i + 2 < length)
645                 {
646                     char c1 = string.charAt(i+1);
647                     char c2 = string.charAt(i+2);
648                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
649                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
650                     {
651                         // do not percent encode, because it could be already encoded
652                         // and we don't want encode it twice
653                     }
654                     else
655                     {
656                         app = percentEncode(c, UTF8);
657                     }
658                 }
659                 else
660                 {
661                     app = percentEncode(c, UTF8);
662                 }
663             }
664             else if (c == '?' || c == '#')
665             {
666                 if (i+1 < length)
667                 {
668                     // The remaining part of the URI are data that should be encoded
669                     // using the document character encoding.
670                     app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
671                     endLoop = true;
672                 }
673             }
674             else
675             {
676                 //No encoding, just do nothing, char will be added later.
677             }
678                         
679             if (app != null)
680             {
681                 if (sb == null)
682                 {
683                     sb = new StringBuilder(string.substring(0, i));
684                 }
685                 sb.append(app);
686             }
687             else
688             {
689                 if (sb != null)
690                 {
691                     sb.append(c);
692                 }
693             }
694             if (endLoop)
695             {
696                 break;
697             }
698         }
699         if (sb == null)
700         {
701             return string;
702         }
703         else
704         {
705             return sb.toString();
706         }
707     }
708     
709     /**
710      * Encode a unicode char value in percentEncode, decoding its bytes using a specified 
711      * characterEncoding.
712      * 
713      * @param c
714      * @param characterEncoding
715      * @return
716      */
717     private static String percentEncode(char c, String characterEncoding)
718     {
719         String app = null;
720         if (c > (char)((short)0x007F))
721         {
722             //percent encode in the proper encoding to be consistent
723             app = percentEncodeNonUsAsciiCharacter(c, characterEncoding);
724         }
725         else
726         {
727             //percent encode US-ASCII char (0x00-0x7F range)
728             app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
729         }
730         return app;
731     }
732     
733     private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding)
734     {
735         ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
736         StringBuilder builder = new StringBuilder();
737         try
738         {
739             OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
740             writer.write(c);
741             writer.flush();
742         }
743         catch(IOException e)
744         {
745             baos.reset();
746             return null;
747         }
748         
749         byte [] byteArray =  baos.toByteArray();
750         for (int i=0; i < byteArray.length; i++)
751         {
752             builder.append('%');
753             builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
754             builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
755         }
756         
757         return builder.toString();
758     }
759 
760     /**
761      * Encode the query part using the document charset encoding provided.
762      * 
763      * 
764      * @param string
765      * @param characterEncoding
766      * @return
767      */
768     private static String encodeURIQuery(final String string, final String characterEncoding)
769     {
770         StringBuilder sb = null;    //create later on demand
771         String app;
772         char c;
773         boolean endLoop = false;
774         int length = string.length();
775         for (int i = 0; i < length; ++i)
776         {
777             app = null;
778             c = string.charAt(i);
779             
780             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
781             // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so 
782             //            we make easier and omit this one)
783             // - "<" %3C, ">" %3E,
784             // - "\" %5C, "^" %5E, "`" %60 
785             // - "{" %7B, "|" %7C, "}" %7D
786             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
787             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
788             //   translating from the document character encoding to percent encoding)
789             //
790             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
791             // put & is invalid in this context   
792             
793             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
794                     c == '"' || c == '<' ||
795                     c == '>' || c == '\\' || c == '^' || c == '`' ||
796                     c == '{' || c == '|' || c == '}')
797             {
798                 // The percent encoding on this part should be done using UTF-8 charset
799                 // as RFC 3986 Section 3.2.2 says
800                 app = percentEncode(c, characterEncoding);
801             }
802             else if (c == '%')
803             {
804                 if (i + 2 < length)
805                 {
806                     char c1 = string.charAt(i+1);
807                     char c2 = string.charAt(i+2);
808                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
809                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
810                     {
811                         // do not percent encode, because it could be already encoded
812                     }
813                     else
814                     {
815                         app = percentEncode(c, characterEncoding);
816                     }
817                 }
818                 else
819                 {
820                     app = percentEncode(c, characterEncoding);
821                 }
822             }
823             else if (c == '&')
824             {
825                 if (i+4 < length )
826                 {
827                     if ('a' == string.charAt(i+1) &&
828                         'm' == string.charAt(i+2) &&
829                         'p' == string.charAt(i+3) &&
830                         ';' == string.charAt(i+4))
831                     {
832                         //Skip
833                     }
834                     else
835                     {
836                         app = "&amp;";
837                     }
838                 }
839                 else
840                 {
841                     app = "&amp;";
842                 }
843             }
844             else
845             {
846                 //No encoding, just do nothing, char will be added later.
847             }
848                         
849             if (app != null)
850             {
851                 if (sb == null)
852                 {
853                     sb = new StringBuilder(string.substring(0, i));
854                 }
855                 sb.append(app);
856             }
857             else
858             {
859                 if (sb != null)
860                 {
861                     sb.append(c);
862                 }
863             }
864             if (endLoop)
865             {
866                 break;
867             }
868         }
869         if (sb == null)
870         {
871             return string;
872         }
873         else
874         {
875             return sb.toString();
876         }
877     }
878 
879     /**
880      * Encode an URI, escaping or percent-encoding all required characters and
881      * following the rules mentioned on RFC 3986.  
882      * 
883      * @param writer
884      * @param string
885      * @param characterEncoding
886      * @throws IOException
887      */
888     public static void encodeURIAttribute(Writer writer, final String string, final String characterEncoding)
889         throws IOException
890     {
891         //StringBuilder sb = null;    //create later on demand
892         int start = 0;
893         String app;
894         char c;
895         boolean endLoop = false;
896         int length = string.length();
897         for (int i = 0; i < length; ++i)
898         {
899             app = null;
900             c = string.charAt(i);
901             
902             // This are the guidelines to be taken into account by this algorithm to encode:
903             
904             // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
905             //
906             // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
907             // space       = <US-ASCII coded character 20 hexadecimal>
908             // delims      = "<" | ">" | "#" | "%" | <">
909             //               %3C   %3E   %23   %25   %22
910             // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
911             //               %7D   %7B   %7C   %5C   %5E   %5B   %5D   %60
912             //
913             // ".... Data corresponding to excluded characters must be escaped in order to
914             // be properly represented within a URI....."
915             
916             // RFC 3986 Section 3.  Syntax Components
917             //
918             // "... The generic URI syntax consists of a hierarchical sequence of
919             // components referred to as the scheme, authority, path, query, and
920             // fragment.
921             //
922             //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
923             //
924             //   hier-part   = "//" authority path-abempty
925             //               / path-absolute
926             //               / path-rootless
927             //               / path-empty
928             // ...."
929             
930             // RFC 3986 Section 2.2:
931             // Reserved characters (should not be percent-encoded)
932             // reserved    = gen-delims / sub-delims
933             // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
934             //               %3A   %2F   %3F   %23   %5B   %5D   %40
935             // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
936             //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
937             
938             // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
939             // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 
940             // "...those rules were redefined to directly specify the characters allowed...."
941             // There is also other characters moved from excluded list to reserved:
942             // "[" / "]" / "#"  
943             
944             // RFC 3986 Section 2.3:
945             // "... for consistency, percent-encoded octets in the ranges of ALPHA
946             // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
947             // underscore (%5F), or tilde (%7E) should not be created by URI
948             // producers...."
949             
950             // RFC 3986 Section  3.2.2.  Host
951 
952             // host = IP-literal / IPv4address / reg-name
953 
954             // The reg-name syntax allows percent-encoded octets in order to
955             // represent non-ASCII registered names in a uniform way that is
956             // independent of the underlying name resolution technology.  Non-ASCII
957             // characters must first be encoded according to UTF-8 [STD63], and then
958             // each octet of the corresponding UTF-8 sequence must be percent-
959             // encoded to be represented as URI characters.  URI producing
960             // applications must not use percent-encoding in host unless it is used
961             // to represent a UTF-8 character sequence.
962             
963             // RFC 3986 Section 3.4 Query 
964             //         query       = *( pchar / "/" / "?" )
965             //
966             // "...  However, as query components are often used to carry identifying information 
967             // in the form of "key=value" pairs and one frequently used value is a reference to
968             // another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
969             //
970             // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
971             //
972             // When a new URI scheme defines a component that represents textual
973             // data consisting of characters from the Universal Character Set [UCS],
974             // the data should first be encoded as octets according to the UTF-8
975             // character encoding [STD63]; then only those octets that do not
976             // correspond to characters in the unreserved set should be percent-
977             // encoded.  For example, the character A would be represented as "A",
978             // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
979             // as "%C3%80", and the character KATAKANA LETTER A would be represented
980             // as "%E3%82%A2".
981             //
982             // RFC 3986 Section 3.5 Fragment
983             //         fragment    = *( pchar / "/" / "?" )
984             //
985             // Note that follows the same as query
986             
987             // Based on the extracts the strategy to apply on this method is:
988             // 
989             // On scheme ":" hier-part
990             //
991             // Escape or percent encode chars inside :
992             // 
993             // - From %00 to %20, 
994             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
995             //                     duplicate encoding, encode it when we are sure 
996             //                     that there are not encoded twice)
997             // - "<" %3C, ">" %3E
998             // - "\" %5C, "^" %5E, "`" %60 
999             // - "{" %7B, "|" %7C, "}" %7D
1000             // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
1001             //   part of an URI, but it is preferred to encode it that omit it).
1002             //
1003             // The remaining characters must not be encoded
1004             //
1005             // Characters after ? or # should be percent encoding but only the necessary ones:
1006             //
1007             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
1008             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
1009             //                     duplicate encoding, encode it when we are sure 
1010             //                     that there are not encoded twice)
1011             // - "<" %3C, ">" %3E,
1012             // - "\" %5C, "^" %5E, "`" %60 
1013             // - "{" %7B, "|" %7C, "}" %7D
1014             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
1015             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
1016             //   translating from the document character encoding to percent encoding, because this values
1017             //   could be retrieved from httpRequest.getParameter() and it uses the current character encoding
1018             //   for decode values)
1019             //
1020             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
1021             // put only & is invalid in this context.
1022 
1023             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
1024                     c == '"' || c == '<' ||
1025                     c == '>' || c == '\\' || c == '^' || c == '`' ||
1026                     c == '{' || c == '|' || c == '}')
1027             {
1028                 // The percent encoding on this part should be done using UTF-8 charset
1029                 // as RFC 3986 Section 3.2.2 says.
1030                 // Also there is a reference on 
1031                 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
1032                 // that recommend use of UTF-8 instead the document character encoding.
1033                 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
1034                 //app = percentEncode(c, "UTF-8");
1035                 if (start < i)
1036                 {
1037                     writer.write(string, start, i-start);
1038                 }
1039                 start = i+1;
1040                 percentEncode(writer, c, "UTF-8");
1041             }
1042             else if (c == '%')
1043             {
1044                 if (i + 2 < length)
1045                 {
1046                     char c1 = string.charAt(i+1);
1047                     char c2 = string.charAt(i+2);
1048                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
1049                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
1050                     {
1051                         // do not percent encode, because it could be already encoded
1052                         // and we don't want encode it twice
1053                     }
1054                     else
1055                     {
1056                         //app = percentEncode(c, UTF8);
1057                         if (start < i)
1058                         {
1059                             writer.write(string, start, i-start);
1060                         }
1061                         start = i+1;
1062                         percentEncode(writer, c, UTF8);
1063                     }
1064                 }
1065                 else
1066                 {
1067                     //app = percentEncode(c, UTF8);
1068                     if (start < i)
1069                     {
1070                         writer.write(string, start, i-start);
1071                     }
1072                     start = i+1;
1073                     percentEncode(writer, c, UTF8);
1074                 }
1075             }
1076             else if (c == '?' || c == '#')
1077             {
1078                 if (i+1 < length)
1079                 {
1080                     // The remaining part of the URI are data that should be encoded
1081                     // using the document character encoding.
1082                     //app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
1083                     if (start < i)
1084                     {
1085                         writer.write(string, start, i-start);
1086                     }
1087                     start = i+1;
1088                     writer.write(c);
1089                     //encodeURIQuery(writer, string.substring(i+1), characterEncoding);
1090                     encodeURIQuery(writer, string, i+1, characterEncoding);
1091                     endLoop = true;
1092                 }
1093             }
1094             else
1095             {
1096                 //No encoding, just do nothing, char will be added later.
1097             }
1098                         
1099             if (app != null)
1100             {
1101                 //if (sb == null)
1102                 //{
1103                 //    sb = new StringBuilder(string.substring(0, i));
1104                 //}
1105                 //sb.append(app);
1106                 if (start < i)
1107                 {
1108                     writer.write(string, start, i-start);
1109                 }
1110                 start = i+1;
1111                 writer.write(app);
1112             }
1113             //else
1114             //{
1115             //    if (sb != null)
1116             //    {
1117             //        sb.append(c);
1118             //    }
1119             //}
1120             if (endLoop)
1121             {
1122                 start = length;
1123                 break;
1124             }
1125         }
1126         //if (sb == null)
1127         //{
1128         //    return string;
1129         //}
1130         //else
1131         //{
1132         //    return sb.toString();
1133         //}
1134         if (start == 0)
1135         {
1136             writer.write(string);
1137         }
1138         else if (start < length)
1139         {
1140             writer.write(string,start,length-start);
1141         }
1142     }
1143 
1144     /**
1145      * Encode a unicode char value in percentEncode, decoding its bytes using a specified 
1146      * characterEncoding.
1147      * 
1148      * @param c
1149      * @param characterEncoding
1150      * @return
1151      */
1152     private static void percentEncode(Writer writer, char c, String characterEncoding) throws IOException
1153     {
1154         if (c > (char)((short)0x007F))
1155         {
1156             //percent encode in the proper encoding to be consistent
1157             percentEncodeNonUsAsciiCharacter(writer, c, characterEncoding);
1158         }
1159         else
1160         {
1161             //percent encode US-ASCII char (0x00-0x7F range)
1162             writer.write('%');
1163             writer.write(HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)));
1164             writer.write(HEX_CHARSET.charAt(c % 0x10));
1165         }
1166     }
1167     
1168     private static void percentEncodeNonUsAsciiCharacter(Writer currentWriter, char c, String characterEncoding) 
1169         throws IOException
1170     {
1171         ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
1172 
1173         try
1174         {
1175             OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
1176             writer.write(c);
1177             writer.flush();
1178         }
1179         catch(IOException e)
1180         {
1181             baos.reset();
1182             return;
1183         }
1184         
1185         byte [] byteArray =  baos.toByteArray();
1186         for (int i=0; i < byteArray.length; i++)
1187         {
1188             //builder.append('%');
1189             //builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
1190             //builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
1191             currentWriter.write('%');
1192             currentWriter.write(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
1193             currentWriter.write(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
1194         }
1195         
1196         //return builder.toString();
1197     }
1198     
1199     /**
1200      * Encode the query part using the document charset encoding provided.
1201      * 
1202      * 
1203      * @param string
1204      * @param characterEncoding
1205      * @return
1206      */
1207     private static void encodeURIQuery(Writer writer, final String string, int offset, final String characterEncoding)
1208             throws IOException
1209     {
1210         //StringBuilder sb = null;    //create later on demand
1211         int start = offset;
1212         int length = string.length();
1213         int realLength = length-offset;
1214         String app;
1215         char c;
1216         //boolean endLoop = false;
1217         for (int i = offset; i < length; ++i)
1218         {
1219             app = null;
1220             c = string.charAt(i);
1221             
1222             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
1223             // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so 
1224             //            we make easier and omit this one)
1225             // - "<" %3C, ">" %3E,
1226             // - "\" %5C, "^" %5E, "`" %60 
1227             // - "{" %7B, "|" %7C, "}" %7D
1228             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
1229             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
1230             //   translating from the document character encoding to percent encoding)
1231             //
1232             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
1233             // put & is invalid in this context   
1234             
1235             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
1236                     c == '"' || c == '<' ||
1237                     c == '>' || c == '\\' || c == '^' || c == '`' ||
1238                     c == '{' || c == '|' || c == '}')
1239             {
1240                 // The percent encoding on this part should be done using UTF-8 charset
1241                 // as RFC 3986 Section 3.2.2 says
1242                 //app = percentEncode(c, characterEncoding);
1243                 if (start < i)
1244                 {
1245                     writer.write(string, start, i-start);
1246                 }
1247                 start = i+1;
1248                 percentEncode(writer, c, characterEncoding);
1249             }
1250             else if (c == '%')
1251             {
1252                 if (i + 2 < length)
1253                 {
1254                     char c1 = string.charAt(i+1);
1255                     char c2 = string.charAt(i+2);
1256                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
1257                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
1258                     {
1259                         // do not percent encode, because it could be already encoded
1260                     }
1261                     else
1262                     {
1263                         //app = percentEncode(c, characterEncoding);
1264                         if (start < i)
1265                         {
1266                             writer.write(string, start, i-start);
1267                         }
1268                         start = i+1;
1269                         percentEncode(writer, c, characterEncoding);
1270                     }
1271                 }
1272                 else
1273                 {
1274                     //app = percentEncode(c, characterEncoding);
1275                     if (start < i)
1276                     {
1277                         writer.write(string, start, i-start);
1278                     }
1279                     start = i+1;
1280                     percentEncode(writer, c, characterEncoding);
1281                 }
1282             }
1283             else if (c == '&')
1284             {
1285                 if (i+4 < length )
1286                 {
1287                     if ('a' == string.charAt(i+1) &&
1288                         'm' == string.charAt(i+2) &&
1289                         'p' == string.charAt(i+3) &&
1290                         ';' == string.charAt(i+4))
1291                     {
1292                         //Skip
1293                     }
1294                     else
1295                     {
1296                         app = "&amp;";
1297                     }
1298                 }
1299                 else
1300                 {
1301                     app = "&amp;";
1302                 }
1303             }
1304             else
1305             {
1306                 //No encoding, just do nothing, char will be added later.
1307             }
1308                         
1309             if (app != null)
1310             {
1311                 //if (sb == null)
1312                 //{
1313                 //    sb = new StringBuilder(string.substring(0, i));
1314                 //}
1315                 //sb.append(app);
1316                 if (start < i)
1317                 {
1318                     writer.write(string, start, i-start);
1319                 }
1320                 start = i+1;
1321                 writer.write(app);
1322             }
1323             //else
1324             //{
1325             //    if (sb != null)
1326             //    {
1327             //        sb.append(c);
1328             //    }
1329             //}
1330             //if (endLoop)
1331             //{
1332             //    break;
1333             //}
1334         }
1335         
1336         //if (sb == null)
1337         //{
1338         //    return string;
1339         //}
1340         //else
1341         //{
1342         //    return sb.toString();
1343         //}
1344         if (start == offset)
1345         {
1346             writer.write(string, offset, realLength);
1347         }
1348         else if (start < length)
1349         {
1350             writer.write(string,start,length-start);
1351         }
1352     }
1353 }