View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *   http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing,
13   * software distributed under the License is distributed on an
14   * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15   * KIND, either express or implied.  See the License for the
16   * specific language governing permissions and limitations
17   * under the License.
18   */
19  package org.apache.myfaces.shared.renderkit.html.util;
20  
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.OutputStreamWriter;
24  import java.io.Writer;
25  
26  /**
27   * Converts Strings so that they can be used within HTML-Code.
28   */
29  public abstract class HTMLEncoder
30  {
31      /**
32       * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
33       */
34      public static String encode (String string)
35      {
36          return encode(string, false, true);
37      }
38  
39      /**
40       * Variant of {@link #encode} where encodeNbsp is true.
41       */
42      public static String encode (String string, boolean encodeNewline)
43      {
44          return encode(string, encodeNewline, true);
45      }
46  
47      /**
48       * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
49       */
50      public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp)
51      {
52          return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
53      }
54  
55      /**
56       * Encodes the given string, so that it can be used within a html page.
57       * @param string the string to convert
58       * @param encodeNewline if true newline characters are converted to <br>'s
59       * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to  's
60       * @param encodeNonLatin if true encode non-latin characters as numeric character references
61       */
62      public static String encode (String string,
63                                   boolean encodeNewline,
64                                   boolean encodeSubsequentBlanksToNbsp,
65                                   boolean encodeNonLatin)
66      {
67          if (string == null)
68          {
69              return "";
70          }
71  
72          StringBuilder sb = null;    //create later on demand
73          String app;
74          char c;
75          for (int i = 0; i < string.length (); ++i)
76          {
77              app = null;
78              c = string.charAt(i);
79              
80              // All characters before letters
81              if ((int)c < 0x41)
82              {
83                  switch (c)
84                  {
85                      case '"': app = "&quot;"; break;    //"
86                      case '&': app = "&amp;"; break;     //&
87                      case '<': app = "&lt;"; break;      //<
88                      case '>': app = "&gt;"; break;      //>
89                      case ' ':
90                          if (encodeSubsequentBlanksToNbsp &&
91                                  (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' ')))
92                          {
93                              //Space at beginning or after another space
94                              app = "&#160;";
95                          }
96                          break;
97                      case '\n':
98                          if (encodeNewline)
99                          {
100                             app = "<br/>";
101                         }
102                         break;
103                     default:
104                         break;
105                 }
106                 // http://www.w3.org/MarkUp/html3/specialchars.html
107                 // From C0 extension U+0000-U+001F only U+0009, U+000A and
108                 // U+000D are valid control characters
109                 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D)
110                 {
111                     // Ignore escape character
112                     app = "";
113                 }
114             }
115             else if (encodeNonLatin && (int)c > 0x80)
116             {
117                  switch(c)
118                  {
119                     //german umlauts
120                     case '\u00E4' : app = "&auml;";  break;
121                     case '\u00C4' : app = "&Auml;";  break;
122                     case '\u00F6' : app = "&ouml;";  break;
123                     case '\u00D6' : app = "&Ouml;";  break;
124                     case '\u00FC' : app = "&uuml;";  break;
125                     case '\u00DC' : app = "&Uuml;";  break;
126                     case '\u00DF' : app = "&szlig;"; break;
127 
128                     //misc
129                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
130                     case '\u20AC': app = "&euro;";  break;
131                     case '\u00AB': app = "&laquo;"; break;
132                     case '\u00BB': app = "&raquo;"; break;
133                     case '\u00A0': app = "&#160;"; break;
134 
135                     default :
136                         //encode all non basic latin characters
137                         app = "&#" + ((int)c) + ";";
138                     break;
139                 }
140             }
141             if (app != null)
142             {
143                 if (sb == null)
144                 {
145                     sb = new StringBuilder(string.substring(0, i));
146                 }
147                 sb.append(app);
148             }
149             else
150             {
151                 if (sb != null)
152                 {
153                     sb.append(c);
154                 }
155             }
156         }
157 
158         if (sb == null)
159         {
160             return string;
161         }
162         else
163         {
164             return sb.toString();
165         }
166     }
167     
168     /**
169      * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
170      */
171     public static void encode (Writer writer, String string) throws IOException
172     {
173         encode(writer, string, false, true);
174     }
175 
176     /**
177      * Variant of {@link #encode} where encodeNbsp is true.
178      */
179     public static void encode (Writer writer, String string, boolean encodeNewline) throws IOException
180     {
181         encode(writer, string, encodeNewline, true);
182     }
183 
184     /**
185      * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
186      */
187     public static void encode (Writer writer, String string, 
188             boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) throws IOException
189     {
190         encode(writer, string, encodeNewline, encodeSubsequentBlanksToNbsp, true);
191     }
192     
193     public static void encode (Writer writer, String string,
194                                  boolean encodeNewline,
195                                  boolean encodeSubsequentBlanksToNbsp,
196                                  boolean encodeNonLatin) throws IOException
197     {
198         if (string == null)
199         {
200             return;
201         }
202 
203         int start = 0;
204         String app;
205         char c;
206         for (int i = 0; i < string.length (); ++i)
207         {
208             app = null;
209             c = string.charAt(i);
210             
211             // All characters before letters
212             if ((int)c < 0x41)
213             {
214                 switch (c)
215                 {
216                     case '"': app = "&quot;"; break;    //"
217                     case '&': app = "&amp;"; break;     //&
218                     case '<': app = "&lt;"; break;      //<
219                     case '>': app = "&gt;"; break;      //>
220                     case ' ':
221                         if (encodeSubsequentBlanksToNbsp &&
222                                 (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' ')))
223                         {
224                             //Space at beginning or after another space
225                             app = "&#160;";
226                         }
227                         break;
228                     case '\n':
229                         if (encodeNewline)
230                         {
231                             app = "<br/>";
232                         }
233                         break;
234                     default:
235                         break;
236                 }
237                 // http://www.w3.org/MarkUp/html3/specialchars.html
238                 // From C0 extension U+0000-U+001F only U+0009, U+000A and
239                 // U+000D are valid control characters
240                 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D)
241                 {
242                     // Ignore escape character
243                     app = "";
244                 }
245             }
246             else if (encodeNonLatin && (int)c > 0x80)
247             {
248                  switch(c)
249                  {
250                     //german umlauts
251                     case '\u00E4' : app = "&auml;";  break;
252                     case '\u00C4' : app = "&Auml;";  break;
253                     case '\u00F6' : app = "&ouml;";  break;
254                     case '\u00D6' : app = "&Ouml;";  break;
255                     case '\u00FC' : app = "&uuml;";  break;
256                     case '\u00DC' : app = "&Uuml;";  break;
257                     case '\u00DF' : app = "&szlig;"; break;
258 
259                     //misc
260                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
261                     case '\u20AC': app = "&euro;";  break;
262                     case '\u00AB': app = "&laquo;"; break;
263                     case '\u00BB': app = "&raquo;"; break;
264                     case '\u00A0': app = "&#160;"; break;
265 
266                     default :
267                         //encode all non basic latin characters
268                         app = "&#" + ((int)c) + ";";
269                     break;
270                 }
271             }
272             if (app != null)
273             {
274                 //if (sb == null)
275                 //{
276                 //    sb = new StringBuilder(string.substring(0, i));
277                 //}
278                 //sb.append(app);
279                 if (start < i)
280                 {
281                     writer.write(string, start, i-start);
282                 }
283                 start = i+1;
284                 writer.write(app);
285             }
286             //else
287             //{
288             //    if (sb != null)
289             //    {
290             //        sb.append(c);
291             //    }
292             //}
293         }
294 
295         //if (sb == null)
296         //{
297         //    return string;
298         //}
299         //else
300         //{
301         //    return sb.toString();
302         //}
303         if (start == 0)
304         {
305             writer.write(string);
306         }
307         else if (start < string.length())
308         {
309             writer.write(string,start,string.length()-start);
310         }
311     }
312 
313 
314     /**
315      * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true.
316      */
317     public static void encode (char[] string, int offset, int length, Writer writer) throws IOException
318     {
319         encode(string, offset, length, false, true, writer);
320     }
321 
322     /**
323      * Variant of {@link #encode} where encodeNbsp is true.
324      */
325     public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer)
326         throws IOException
327     {
328         encode(string, offset, length, encodeNewline, true, writer);
329     }
330 
331     /**
332      * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 
333      */
334     public static void encode (char[] string, int offset, int length, boolean encodeNewline, 
335             boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException
336     {
337         encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer);
338     }
339 
340 
341     /**
342      * Encodes the given string, so that it can be used within a html page.
343      * @param string the string to convert
344      * @param encodeNewline if true newline characters are converted to &lt;br&gt;'s
345      * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &amp;nbsp;'s
346      * @param encodeNonLatin if true encode non-latin characters as numeric character references
347      */
348     public static void encode (char[] string, int offset, int length,
349                                  boolean encodeNewline,
350                                  boolean encodeSubsequentBlanksToNbsp,
351                                  boolean encodeNonLatin, Writer writer) throws IOException
352     {
353         if (string == null || length < 0 || offset >= string.length)
354         {
355             return;
356         }
357         offset = Math.max(0, offset);
358         int realLength = Math.min(length, string.length - offset);
359 
360         //StringBuilder sb = null;    //create later on demand
361         String app;
362         char c;
363         int start = offset;
364         
365         for (int i = offset; i < offset + realLength; ++i)
366         {
367             app = null;
368             c = string[i];
369 
370             // All characters before letters
371             if ((int)c < 0x41)
372             {
373                 switch (c)
374                 {
375                     case '"': app = "&quot;"; break;    //"
376                     case '&': app = "&amp;"; break;     //&
377                     case '<': app = "&lt;"; break;      //<
378                     case '>': app = "&gt;"; break;      //>
379                     case ' ':
380                         if (encodeSubsequentBlanksToNbsp &&
381                                 (i == 0 || (i - 1 >= 0 && string[i - 1] == ' ')))
382                         {
383                             //Space at beginning or after another space
384                             app = "&#160;";
385                         }
386                         break;
387                     case '\n':
388                         if (encodeNewline)
389                         {
390                             app = "<br/>";
391                         }
392                         break;
393                     default:
394                         break;
395                 }
396                 // http://www.w3.org/MarkUp/html3/specialchars.html
397                 // From C0 extension U+0000-U+001F only U+0009, U+000A and
398                 // U+000D are valid control characters
399                 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D)
400                 {
401                     // Ignore escape character
402                     app = "";
403                 }
404             }
405             else if (encodeNonLatin && (int)c > 0x80)
406             {
407                  switch(c)
408                  {
409                     //german umlauts
410                     case '\u00E4' : app = "&auml;";  break;
411                     case '\u00C4' : app = "&Auml;";  break;
412                     case '\u00F6' : app = "&ouml;";  break;
413                     case '\u00D6' : app = "&Ouml;";  break;
414                     case '\u00FC' : app = "&uuml;";  break;
415                     case '\u00DC' : app = "&Uuml;";  break;
416                     case '\u00DF' : app = "&szlig;"; break;
417 
418                     //misc
419                     //case 0x80: app = "&euro;"; break;  sometimes euro symbol is ascii 128, should we suport it?
420                     case '\u20AC': app = "&euro;";  break;
421                     case '\u00AB': app = "&laquo;"; break;
422                     case '\u00BB': app = "&raquo;"; break;
423                     case '\u00A0': app = "&#160;"; break;
424 
425                     default :
426                         //encode all non basic latin characters
427                         app = "&#" + ((int)c) + ";";
428                     break;
429                 }
430             }
431             if (app != null)
432             {
433                 //if (sb == null)
434                 //{
435                 //    sb = new StringBuilder(realLength*2);
436                 //    sb.append(string, offset, i - offset);
437                 //}
438                 //sb.append(app);
439                 if (start < i)
440                 {
441                     writer.write(string, start, i-start);
442                 }
443                 start = i+1;
444                 writer.write(app);
445             }
446             /*
447             else
448             {
449                 if (sb != null)
450                 {
451                     sb.append(c);
452                 }
453             }*/
454         }
455 
456         //if (sb == null)
457         //{
458         //    writer.write(string, offset, realLength);
459         //}
460         //else
461         //{
462         //    writer.write(sb.toString());
463         //}
464         if (start == offset)
465         {
466             writer.write(string, offset, realLength);
467         }
468         else if (start < offset+realLength)
469         {
470             writer.write(string,start,offset+realLength-start);
471         }
472     }
473     
474     private static final String HEX_CHARSET = "0123456789ABCDEF";
475     
476     private static final String UTF8 = "UTF-8";
477     
478     /**
479      * Encode an URI, escaping or percent-encoding all required characters and
480      * following the rules mentioned on RFC 3986.  
481      * 
482      * @param string
483      * @param encodeNonLatin
484      * @return
485      * @throws IOException
486      */
487     public static String encodeURIAtributte(final String string, final String characterEncoding)
488         throws IOException
489     {
490         StringBuilder sb = null;    //create later on demand
491         String app;
492         char c;
493         boolean endLoop = false;
494         for (int i = 0; i < string.length (); ++i)
495         {
496             app = null;
497             c = string.charAt(i);
498             
499             // This are the guidelines to be taken into account by this algorithm to encode:
500             
501             // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
502             //
503             // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
504             // space       = <US-ASCII coded character 20 hexadecimal>
505             // delims      = "<" | ">" | "#" | "%" | <">
506             //               %3C   %3E   %23   %25   %22
507             // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
508             //               %7D   %7B   %7C   %5C   %5E   %5B   %5D   %60
509             //
510             // ".... Data corresponding to excluded characters must be escaped in order to
511             // be properly represented within a URI....."
512             
513             // RFC 3986 Section 3.  Syntax Components
514             //
515             // "... The generic URI syntax consists of a hierarchical sequence of
516             // components referred to as the scheme, authority, path, query, and
517             // fragment.
518             //
519             //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
520             //
521             //   hier-part   = "//" authority path-abempty
522             //               / path-absolute
523             //               / path-rootless
524             //               / path-empty
525             // ...."
526             
527             // RFC 3986 Section 2.2:
528             // Reserved characters (should not be percent-encoded)
529             // reserved    = gen-delims / sub-delims
530             // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
531             //               %3A   %2F   %3F   %23   %5B   %5D   %40
532             // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
533             //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
534             
535             // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
536             // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 
537             // "...those rules were redefined to directly specify the characters allowed...."
538             // There is also other characters moved from excluded list to reserved:
539             // "[" / "]" / "#"  
540             
541             // RFC 3986 Section 2.3:
542             // "... for consistency, percent-encoded octets in the ranges of ALPHA
543             // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
544             // underscore (%5F), or tilde (%7E) should not be created by URI
545             // producers...."
546             
547             // RFC 3986 Section  3.2.2.  Host
548 
549             // host = IP-literal / IPv4address / reg-name
550 
551             // The reg-name syntax allows percent-encoded octets in order to
552             // represent non-ASCII registered names in a uniform way that is
553             // independent of the underlying name resolution technology.  Non-ASCII
554             // characters must first be encoded according to UTF-8 [STD63], and then
555             // each octet of the corresponding UTF-8 sequence must be percent-
556             // encoded to be represented as URI characters.  URI producing
557             // applications must not use percent-encoding in host unless it is used
558             // to represent a UTF-8 character sequence.
559             
560             // RFC 3986 Section 3.4 Query 
561             //         query       = *( pchar / "/" / "?" )
562             //
563             // "...  However, as query components are often used to carry identifying information 
564             // in the form of "key=value" pairs and one frequently used value is a reference to
565             // another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
566             //
567             // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
568             //
569             // When a new URI scheme defines a component that represents textual
570             // data consisting of characters from the Universal Character Set [UCS],
571             // the data should first be encoded as octets according to the UTF-8
572             // character encoding [STD63]; then only those octets that do not
573             // correspond to characters in the unreserved set should be percent-
574             // encoded.  For example, the character A would be represented as "A",
575             // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
576             // as "%C3%80", and the character KATAKANA LETTER A would be represented
577             // as "%E3%82%A2".
578             //
579             // RFC 3986 Section 3.5 Fragment
580             //         fragment    = *( pchar / "/" / "?" )
581             //
582             // Note that follows the same as query
583             
584             // Based on the extracts the strategy to apply on this method is:
585             // 
586             // On scheme ":" hier-part
587             //
588             // Escape or percent encode chars inside :
589             // 
590             // - From %00 to %20, 
591             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
592             //                     duplicate encoding, encode it when we are sure 
593             //                     that there are not encoded twice)
594             // - "<" %3C, ">" %3E
595             // - "\" %5C, "^" %5E, "`" %60 
596             // - "{" %7B, "|" %7C, "}" %7D
597             // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
598             //   part of an URI, but it is preferred to encode it that omit it).
599             //
600             // The remaining characters must not be encoded
601             //
602             // Characters after ? or # should be percent encoding but only the necessary ones:
603             //
604             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
605             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
606             //                     duplicate encoding, encode it when we are sure 
607             //                     that there are not encoded twice)
608             // - "<" %3C, ">" %3E,
609             // - "\" %5C, "^" %5E, "`" %60 
610             // - "{" %7B, "|" %7C, "}" %7D
611             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
612             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
613             //   translating from the document character encoding to percent encoding, because this values
614             //   could be retrieved from httpRequest.getParameter() and it uses the current character encoding
615             //   for decode values)
616             //
617             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
618             // put only & is invalid in this context.
619 
620             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
621                     c == '"' || c == '<' ||
622                     c == '>' || c == '\\' || c == '^' || c == '`' ||
623                     c == '{' || c == '|' || c == '}')
624             {
625                 // The percent encoding on this part should be done using UTF-8 charset
626                 // as RFC 3986 Section 3.2.2 says.
627                 // Also there is a reference on 
628                 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
629                 // that recommend use of UTF-8 instead the document character encoding.
630                 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
631                 app = percentEncode(c, "UTF-8");
632             }
633             else if (c == '%')
634             {
635                 if (i + 2 < string.length())
636                 {
637                     char c1 = string.charAt(i+1);
638                     char c2 = string.charAt(i+2);
639                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
640                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
641                     {
642                         // do not percent encode, because it could be already encoded
643                         // and we don't want encode it twice
644                     }
645                     else
646                     {
647                         app = percentEncode(c, UTF8);
648                     }
649                 }
650                 else
651                 {
652                     app = percentEncode(c, UTF8);
653                 }
654             }
655             else if (c == '?' || c == '#')
656             {
657                 if (i+1 < string.length())
658                 {
659                     // The remaining part of the URI are data that should be encoded
660                     // using the document character encoding.
661                     app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
662                     endLoop = true;
663                 }
664             }
665             else
666             {
667                 //No encoding, just do nothing, char will be added later.
668             }
669                         
670             if (app != null)
671             {
672                 if (sb == null)
673                 {
674                     sb = new StringBuilder(string.substring(0, i));
675                 }
676                 sb.append(app);
677             }
678             else
679             {
680                 if (sb != null)
681                 {
682                     sb.append(c);
683                 }
684             }
685             if (endLoop)
686             {
687                 break;
688             }
689         }
690         if (sb == null)
691         {
692             return string;
693         }
694         else
695         {
696             return sb.toString();
697         }
698     }
699     
700     /**
701      * Encode a unicode char value in percentEncode, decoding its bytes using a specified 
702      * characterEncoding.
703      * 
704      * @param c
705      * @param characterEncoding
706      * @return
707      */
708     private static String percentEncode(char c, String characterEncoding)
709     {
710         String app = null;
711         if (c > (char)((short)0x007F))
712         {
713             //percent encode in the proper encoding to be consistent
714             app = percentEncodeNonUsAsciiCharacter(c, characterEncoding);
715         }
716         else
717         {
718             //percent encode US-ASCII char (0x00-0x7F range)
719             app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
720         }
721         return app;
722     }
723     
724     private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding)
725     {
726         ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
727         StringBuilder builder = new StringBuilder();
728         try
729         {
730             OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
731             writer.write(c);
732             writer.flush();
733         }
734         catch(IOException e)
735         {
736             baos.reset();
737             return null;
738         }
739         
740         byte [] byteArray =  baos.toByteArray();
741         for (int i=0; i < byteArray.length; i++)
742         {
743             builder.append('%');
744             builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
745             builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
746         }
747         
748         return builder.toString();
749     }
750 
751     /**
752      * Encode the query part using the document charset encoding provided.
753      * 
754      * 
755      * @param string
756      * @param characterEncoding
757      * @return
758      */
759     private static String encodeURIQuery(final String string, final String characterEncoding)
760     {
761         StringBuilder sb = null;    //create later on demand
762         String app;
763         char c;
764         boolean endLoop = false;
765         for (int i = 0; i < string.length (); ++i)
766         {
767             app = null;
768             c = string.charAt(i);
769             
770             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
771             // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so 
772             //            we make easier and omit this one)
773             // - "<" %3C, ">" %3E,
774             // - "\" %5C, "^" %5E, "`" %60 
775             // - "{" %7B, "|" %7C, "}" %7D
776             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
777             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
778             //   translating from the document character encoding to percent encoding)
779             //
780             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
781             // put & is invalid in this context   
782             
783             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
784                     c == '"' || c == '<' ||
785                     c == '>' || c == '\\' || c == '^' || c == '`' ||
786                     c == '{' || c == '|' || c == '}')
787             {
788                 // The percent encoding on this part should be done using UTF-8 charset
789                 // as RFC 3986 Section 3.2.2 says
790                 app = percentEncode(c, characterEncoding);
791             }
792             else if (c == '%')
793             {
794                 if (i + 2 < string.length())
795                 {
796                     char c1 = string.charAt(i+1);
797                     char c2 = string.charAt(i+2);
798                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
799                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
800                     {
801                         // do not percent encode, because it could be already encoded
802                     }
803                     else
804                     {
805                         app = percentEncode(c, characterEncoding);
806                     }
807                 }
808                 else
809                 {
810                     app = percentEncode(c, characterEncoding);
811                 }
812             }
813             else if (c == '&')
814             {
815                 if (i+4 < string.length() )
816                 {
817                     if ('a' == string.charAt(i+1) &&
818                         'm' == string.charAt(i+2) &&
819                         'p' == string.charAt(i+3) &&
820                         ';' == string.charAt(i+4))
821                     {
822                         //Skip
823                     }
824                     else
825                     {
826                         app = "&amp;";
827                     }
828                 }
829                 else
830                 {
831                     app = "&amp;";
832                 }
833             }
834             else
835             {
836                 //No encoding, just do nothing, char will be added later.
837             }
838                         
839             if (app != null)
840             {
841                 if (sb == null)
842                 {
843                     sb = new StringBuilder(string.substring(0, i));
844                 }
845                 sb.append(app);
846             }
847             else
848             {
849                 if (sb != null)
850                 {
851                     sb.append(c);
852                 }
853             }
854             if (endLoop)
855             {
856                 break;
857             }
858         }
859         if (sb == null)
860         {
861             return string;
862         }
863         else
864         {
865             return sb.toString();
866         }
867     }
868 
869     /**
870      * Encode an URI, escaping or percent-encoding all required characters and
871      * following the rules mentioned on RFC 3986.  
872      * 
873      * @param string
874      * @param encodeNonLatin
875      * @return
876      * @throws IOException
877      */
878     public static void encodeURIAtributte(Writer writer, final String string, final String characterEncoding)
879         throws IOException
880     {
881         //StringBuilder sb = null;    //create later on demand
882         int start = 0;
883         String app;
884         char c;
885         boolean endLoop = false;
886         for (int i = 0; i < string.length (); ++i)
887         {
888             app = null;
889             c = string.charAt(i);
890             
891             // This are the guidelines to be taken into account by this algorithm to encode:
892             
893             // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters
894             //
895             // control     = <US-ASCII coded characters 00-1F and 7F hexadecimal>
896             // space       = <US-ASCII coded character 20 hexadecimal>
897             // delims      = "<" | ">" | "#" | "%" | <">
898             //               %3C   %3E   %23   %25   %22
899             // unwise      = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
900             //               %7D   %7B   %7C   %5C   %5E   %5B   %5D   %60
901             //
902             // ".... Data corresponding to excluded characters must be escaped in order to
903             // be properly represented within a URI....."
904             
905             // RFC 3986 Section 3.  Syntax Components
906             //
907             // "... The generic URI syntax consists of a hierarchical sequence of
908             // components referred to as the scheme, authority, path, query, and
909             // fragment.
910             //
911             //   URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
912             //
913             //   hier-part   = "//" authority path-abempty
914             //               / path-absolute
915             //               / path-rootless
916             //               / path-empty
917             // ...."
918             
919             // RFC 3986 Section 2.2:
920             // Reserved characters (should not be percent-encoded)
921             // reserved    = gen-delims / sub-delims
922             // gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
923             //               %3A   %2F   %3F   %23   %5B   %5D   %40
924             // sub-delims  = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
925             //               %21   %24   %26   %27   %28   %29   %2A   %2B   %2C   %3B   %3D
926             
927             // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396,
928             // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 
929             // "...those rules were redefined to directly specify the characters allowed...."
930             // There is also other characters moved from excluded list to reserved:
931             // "[" / "]" / "#"  
932             
933             // RFC 3986 Section 2.3:
934             // "... for consistency, percent-encoded octets in the ranges of ALPHA
935             // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E),
936             // underscore (%5F), or tilde (%7E) should not be created by URI
937             // producers...."
938             
939             // RFC 3986 Section  3.2.2.  Host
940 
941             // host = IP-literal / IPv4address / reg-name
942 
943             // The reg-name syntax allows percent-encoded octets in order to
944             // represent non-ASCII registered names in a uniform way that is
945             // independent of the underlying name resolution technology.  Non-ASCII
946             // characters must first be encoded according to UTF-8 [STD63], and then
947             // each octet of the corresponding UTF-8 sequence must be percent-
948             // encoded to be represented as URI characters.  URI producing
949             // applications must not use percent-encoding in host unless it is used
950             // to represent a UTF-8 character sequence.
951             
952             // RFC 3986 Section 3.4 Query 
953             //         query       = *( pchar / "/" / "?" )
954             //
955             // "...  However, as query components are often used to carry identifying information 
956             // in the form of "key=value" pairs and one frequently used value is a reference to
957             // another URI, it is sometimes better for usability to avoid percent-encoding those characters....."
958             //
959             // RFC 3986 Section 2.5 Identifying Data (Apply to query section)
960             //
961             // When a new URI scheme defines a component that represents textual
962             // data consisting of characters from the Universal Character Set [UCS],
963             // the data should first be encoded as octets according to the UTF-8
964             // character encoding [STD63]; then only those octets that do not
965             // correspond to characters in the unreserved set should be percent-
966             // encoded.  For example, the character A would be represented as "A",
967             // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented
968             // as "%C3%80", and the character KATAKANA LETTER A would be represented
969             // as "%E3%82%A2".
970             //
971             // RFC 3986 Section 3.5 Fragment
972             //         fragment    = *( pchar / "/" / "?" )
973             //
974             // Note that follows the same as query
975             
976             // Based on the extracts the strategy to apply on this method is:
977             // 
978             // On scheme ":" hier-part
979             //
980             // Escape or percent encode chars inside :
981             // 
982             // - From %00 to %20, 
983             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
984             //                     duplicate encoding, encode it when we are sure 
985             //                     that there are not encoded twice)
986             // - "<" %3C, ">" %3E
987             // - "\" %5C, "^" %5E, "`" %60 
988             // - "{" %7B, "|" %7C, "}" %7D
989             // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this
990             //   part of an URI, but it is preferred to encode it that omit it).
991             //
992             // The remaining characters must not be encoded
993             //
994             // Characters after ? or # should be percent encoding but only the necessary ones:
995             //
996             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
997             // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 
998             //                     duplicate encoding, encode it when we are sure 
999             //                     that there are not encoded twice)
1000             // - "<" %3C, ">" %3E,
1001             // - "\" %5C, "^" %5E, "`" %60 
1002             // - "{" %7B, "|" %7C, "}" %7D
1003             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
1004             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
1005             //   translating from the document character encoding to percent encoding, because this values
1006             //   could be retrieved from httpRequest.getParameter() and it uses the current character encoding
1007             //   for decode values)
1008             //
1009             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
1010             // put only & is invalid in this context.
1011 
1012             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
1013                     c == '"' || c == '<' ||
1014                     c == '>' || c == '\\' || c == '^' || c == '`' ||
1015                     c == '{' || c == '|' || c == '}')
1016             {
1017                 // The percent encoding on this part should be done using UTF-8 charset
1018                 // as RFC 3986 Section 3.2.2 says.
1019                 // Also there is a reference on 
1020                 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars
1021                 // that recommend use of UTF-8 instead the document character encoding.
1022                 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113)
1023                 //app = percentEncode(c, "UTF-8");
1024                 if (start < i)
1025                 {
1026                     writer.write(string, start, i-start);
1027                 }
1028                 start = i+1;
1029                 percentEncode(writer, c, "UTF-8");
1030             }
1031             else if (c == '%')
1032             {
1033                 if (i + 2 < string.length())
1034                 {
1035                     char c1 = string.charAt(i+1);
1036                     char c2 = string.charAt(i+2);
1037                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
1038                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
1039                     {
1040                         // do not percent encode, because it could be already encoded
1041                         // and we don't want encode it twice
1042                     }
1043                     else
1044                     {
1045                         //app = percentEncode(c, UTF8);
1046                         if (start < i)
1047                         {
1048                             writer.write(string, start, i-start);
1049                         }
1050                         start = i+1;
1051                         percentEncode(writer, c, UTF8);
1052                     }
1053                 }
1054                 else
1055                 {
1056                     //app = percentEncode(c, UTF8);
1057                     if (start < i)
1058                     {
1059                         writer.write(string, start, i-start);
1060                     }
1061                     start = i+1;
1062                     percentEncode(writer, c, UTF8);
1063                 }
1064             }
1065             else if (c == '?' || c == '#')
1066             {
1067                 if (i+1 < string.length())
1068                 {
1069                     // The remaining part of the URI are data that should be encoded
1070                     // using the document character encoding.
1071                     //app = c + encodeURIQuery(string.substring(i+1), characterEncoding);
1072                     if (start < i)
1073                     {
1074                         writer.write(string, start, i-start);
1075                     }
1076                     start = i+1;
1077                     writer.write(c);
1078                     //encodeURIQuery(writer, string.substring(i+1), characterEncoding);
1079                     encodeURIQuery(writer, string, i+1, characterEncoding);
1080                     endLoop = true;
1081                 }
1082             }
1083             else
1084             {
1085                 //No encoding, just do nothing, char will be added later.
1086             }
1087                         
1088             if (app != null)
1089             {
1090                 //if (sb == null)
1091                 //{
1092                 //    sb = new StringBuilder(string.substring(0, i));
1093                 //}
1094                 //sb.append(app);
1095                 if (start < i)
1096                 {
1097                     writer.write(string, start, i-start);
1098                 }
1099                 start = i+1;
1100                 writer.write(app);
1101             }
1102             //else
1103             //{
1104             //    if (sb != null)
1105             //    {
1106             //        sb.append(c);
1107             //    }
1108             //}
1109             if (endLoop)
1110             {
1111                 start = string.length();
1112                 break;
1113             }
1114         }
1115         //if (sb == null)
1116         //{
1117         //    return string;
1118         //}
1119         //else
1120         //{
1121         //    return sb.toString();
1122         //}
1123         if (start == 0)
1124         {
1125             writer.write(string);
1126         }
1127         else if (start < string.length())
1128         {
1129             writer.write(string,start,string.length()-start);
1130         }
1131     }
1132 
1133     /**
1134      * Encode a unicode char value in percentEncode, decoding its bytes using a specified 
1135      * characterEncoding.
1136      * 
1137      * @param c
1138      * @param characterEncoding
1139      * @return
1140      */
1141     private static void percentEncode(Writer writer, char c, String characterEncoding) throws IOException
1142     {
1143         String app = null;
1144         if (c > (char)((short)0x007F))
1145         {
1146             //percent encode in the proper encoding to be consistent
1147             //app = percentEncodeNonUsAsciiCharacter(writer c, characterEncoding);
1148             percentEncodeNonUsAsciiCharacter(writer, c, characterEncoding);
1149         }
1150         else
1151         {
1152             //percent encode US-ASCII char (0x00-0x7F range)
1153             //app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10);
1154             writer.write('%');
1155             writer.write(HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)));
1156             writer.write(HEX_CHARSET.charAt(c % 0x10));
1157         }
1158         //return app;
1159     }
1160     
1161     private static void percentEncodeNonUsAsciiCharacter(Writer currentWriter, char c, String characterEncoding) 
1162         throws IOException
1163     {
1164         ByteArrayOutputStream baos = new ByteArrayOutputStream(10);
1165         StringBuilder builder = new StringBuilder();
1166         try
1167         {
1168             OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding);
1169             writer.write(c);
1170             writer.flush();
1171         }
1172         catch(IOException e)
1173         {
1174             baos.reset();
1175             return;
1176         }
1177         
1178         byte [] byteArray =  baos.toByteArray();
1179         for (int i=0; i < byteArray.length; i++)
1180         {
1181             //builder.append('%');
1182             //builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
1183             //builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
1184             currentWriter.write('%');
1185             currentWriter.write(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) );
1186             currentWriter.write(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10));
1187         }
1188         
1189         //return builder.toString();
1190     }
1191     
1192     /**
1193      * Encode the query part using the document charset encoding provided.
1194      * 
1195      * 
1196      * @param string
1197      * @param characterEncoding
1198      * @return
1199      */
1200     private static void encodeURIQuery(Writer writer, final String string, int offset, final String characterEncoding)
1201             throws IOException
1202     {
1203         //StringBuilder sb = null;    //create later on demand
1204         int start = offset;
1205         int realLength = string.length()-offset;
1206         String app;
1207         char c;
1208         //boolean endLoop = false;
1209         for (int i = offset; i < offset+realLength; ++i)
1210         {
1211             app = null;
1212             c = string.charAt(i);
1213             
1214             // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20)
1215             // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so 
1216             //            we make easier and omit this one)
1217             // - "<" %3C, ">" %3E,
1218             // - "\" %5C, "^" %5E, "`" %60 
1219             // - "{" %7B, "|" %7C, "}" %7D
1220             // - From %7F ad infinitum (each character as many bytes as necessary but take into account
1221             //   that a single char should contain 2,3 or more bytes!. This data should be encoded 
1222             //   translating from the document character encoding to percent encoding)
1223             //
1224             // "&" should be encoded as "&amp;" because this link is inside an html page, and 
1225             // put & is invalid in this context   
1226             
1227             if (   (c <= (char)0x20) || (c >= (char)0x7F) || 
1228                     c == '"' || c == '<' ||
1229                     c == '>' || c == '\\' || c == '^' || c == '`' ||
1230                     c == '{' || c == '|' || c == '}')
1231             {
1232                 // The percent encoding on this part should be done using UTF-8 charset
1233                 // as RFC 3986 Section 3.2.2 says
1234                 //app = percentEncode(c, characterEncoding);
1235                 if (start < i)
1236                 {
1237                     writer.write(string, start, i-start);
1238                 }
1239                 start = i+1;
1240                 percentEncode(writer, c, characterEncoding);
1241             }
1242             else if (c == '%')
1243             {
1244                 if (i + 2 < string.length())
1245                 {
1246                     char c1 = string.charAt(i+1);
1247                     char c2 = string.charAt(i+2);
1248                     if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) &&
1249                         (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z')))
1250                     {
1251                         // do not percent encode, because it could be already encoded
1252                     }
1253                     else
1254                     {
1255                         //app = percentEncode(c, characterEncoding);
1256                         if (start < i)
1257                         {
1258                             writer.write(string, start, i-start);
1259                         }
1260                         start = i+1;
1261                         percentEncode(writer, c, characterEncoding);
1262                     }
1263                 }
1264                 else
1265                 {
1266                     //app = percentEncode(c, characterEncoding);
1267                     if (start < i)
1268                     {
1269                         writer.write(string, start, i-start);
1270                     }
1271                     start = i+1;
1272                     percentEncode(writer, c, characterEncoding);
1273                 }
1274             }
1275             else if (c == '&')
1276             {
1277                 if (i+4 < string.length() )
1278                 {
1279                     if ('a' == string.charAt(i+1) &&
1280                         'm' == string.charAt(i+2) &&
1281                         'p' == string.charAt(i+3) &&
1282                         ';' == string.charAt(i+4))
1283                     {
1284                         //Skip
1285                     }
1286                     else
1287                     {
1288                         app = "&amp;";
1289                     }
1290                 }
1291                 else
1292                 {
1293                     app = "&amp;";
1294                 }
1295             }
1296             else
1297             {
1298                 //No encoding, just do nothing, char will be added later.
1299             }
1300                         
1301             if (app != null)
1302             {
1303                 //if (sb == null)
1304                 //{
1305                 //    sb = new StringBuilder(string.substring(0, i));
1306                 //}
1307                 //sb.append(app);
1308                 if (start < i)
1309                 {
1310                     writer.write(string, start, i-start);
1311                 }
1312                 start = i+1;
1313                 writer.write(app);
1314             }
1315             //else
1316             //{
1317             //    if (sb != null)
1318             //    {
1319             //        sb.append(c);
1320             //    }
1321             //}
1322             //if (endLoop)
1323             //{
1324             //    break;
1325             //}
1326         }
1327         
1328         //if (sb == null)
1329         //{
1330         //    return string;
1331         //}
1332         //else
1333         //{
1334         //    return sb.toString();
1335         //}
1336         if (start == offset)
1337         {
1338             writer.write(string, offset, realLength);
1339         }
1340         else if (start < offset+realLength)
1341         {
1342             writer.write(string,start,offset+realLength-start);
1343         }
1344     }
1345 }