1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, 13 * software distributed under the License is distributed on an 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 * KIND, either express or implied. See the License for the 16 * specific language governing permissions and limitations 17 * under the License. 18 */ 19 package org.apache.myfaces.shared.renderkit.html.util; 20 21 import java.io.ByteArrayOutputStream; 22 import java.io.IOException; 23 import java.io.OutputStreamWriter; 24 import java.io.Writer; 25 26 /** 27 * Converts Strings so that they can be used within HTML-Code. 28 */ 29 public abstract class HTMLEncoder 30 { 31 /** 32 * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. 33 */ 34 public static String encode (String string) 35 { 36 return encode(string, false, true); 37 } 38 39 /** 40 * Variant of {@link #encode} where encodeNbsp is true. 41 */ 42 public static String encode (String string, boolean encodeNewline) 43 { 44 return encode(string, encodeNewline, true); 45 } 46 47 /** 48 * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 49 */ 50 public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) 51 { 52 return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true); 53 } 54 55 /** 56 * Encodes the given string, so that it can be used within a html page. 57 * @param string the string to convert 58 * @param encodeNewline if true newline characters are converted to <br>'s 59 * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s 60 * @param encodeNonLatin if true encode non-latin characters as numeric character references 61 */ 62 public static String encode (String string, 63 boolean encodeNewline, 64 boolean encodeSubsequentBlanksToNbsp, 65 boolean encodeNonLatin) 66 { 67 if (string == null) 68 { 69 return ""; 70 } 71 72 StringBuilder sb = null; //create later on demand 73 String app; 74 char c; 75 for (int i = 0; i < string.length (); ++i) 76 { 77 app = null; 78 c = string.charAt(i); 79 80 // All characters before letters 81 if ((int)c < 0x41) 82 { 83 switch (c) 84 { 85 case '"': app = """; break; //" 86 case '&': app = "&"; break; //& 87 case '<': app = "<"; break; //< 88 case '>': app = ">"; break; //> 89 case ' ': 90 if (encodeSubsequentBlanksToNbsp && 91 (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' '))) 92 { 93 //Space at beginning or after another space 94 app = " "; 95 } 96 break; 97 case '\n': 98 if (encodeNewline) 99 { 100 app = "<br/>"; 101 } 102 break; 103 default: 104 break; 105 } 106 // http://www.w3.org/MarkUp/html3/specialchars.html 107 // From C0 extension U+0000-U+001F only U+0009, U+000A and 108 // U+000D are valid control characters 109 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D) 110 { 111 // Ignore escape character 112 app = ""; 113 } 114 } 115 else if (encodeNonLatin && (int)c > 0x80) 116 { 117 switch(c) 118 { 119 //german umlauts 120 case '\u00E4' : app = "ä"; break; 121 case '\u00C4' : app = "Ä"; break; 122 case '\u00F6' : app = "ö"; break; 123 case '\u00D6' : app = "Ö"; break; 124 case '\u00FC' : app = "ü"; break; 125 case '\u00DC' : app = "Ü"; break; 126 case '\u00DF' : app = "ß"; break; 127 128 //misc 129 //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? 130 case '\u20AC': app = "€"; break; 131 case '\u00AB': app = "«"; break; 132 case '\u00BB': app = "»"; break; 133 case '\u00A0': app = " "; break; 134 135 default : 136 //encode all non basic latin characters 137 app = "&#" + ((int)c) + ";"; 138 break; 139 } 140 } 141 if (app != null) 142 { 143 if (sb == null) 144 { 145 sb = new StringBuilder(string.substring(0, i)); 146 } 147 sb.append(app); 148 } 149 else 150 { 151 if (sb != null) 152 { 153 sb.append(c); 154 } 155 } 156 } 157 158 if (sb == null) 159 { 160 return string; 161 } 162 else 163 { 164 return sb.toString(); 165 } 166 } 167 168 /** 169 * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. 170 */ 171 public static void encode (Writer writer, String string) throws IOException 172 { 173 encode(writer, string, false, true); 174 } 175 176 /** 177 * Variant of {@link #encode} where encodeNbsp is true. 178 */ 179 public static void encode (Writer writer, String string, boolean encodeNewline) throws IOException 180 { 181 encode(writer, string, encodeNewline, true); 182 } 183 184 /** 185 * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 186 */ 187 public static void encode (Writer writer, String string, 188 boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) throws IOException 189 { 190 encode(writer, string, encodeNewline, encodeSubsequentBlanksToNbsp, true); 191 } 192 193 public static void encode (Writer writer, String string, 194 boolean encodeNewline, 195 boolean encodeSubsequentBlanksToNbsp, 196 boolean encodeNonLatin) throws IOException 197 { 198 if (string == null) 199 { 200 return; 201 } 202 203 int start = 0; 204 String app; 205 char c; 206 for (int i = 0; i < string.length (); ++i) 207 { 208 app = null; 209 c = string.charAt(i); 210 211 // All characters before letters 212 if ((int)c < 0x41) 213 { 214 switch (c) 215 { 216 case '"': app = """; break; //" 217 case '&': app = "&"; break; //& 218 case '<': app = "<"; break; //< 219 case '>': app = ">"; break; //> 220 case ' ': 221 if (encodeSubsequentBlanksToNbsp && 222 (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' '))) 223 { 224 //Space at beginning or after another space 225 app = " "; 226 } 227 break; 228 case '\n': 229 if (encodeNewline) 230 { 231 app = "<br/>"; 232 } 233 break; 234 default: 235 break; 236 } 237 // http://www.w3.org/MarkUp/html3/specialchars.html 238 // From C0 extension U+0000-U+001F only U+0009, U+000A and 239 // U+000D are valid control characters 240 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D) 241 { 242 // Ignore escape character 243 app = ""; 244 } 245 } 246 else if (encodeNonLatin && (int)c > 0x80) 247 { 248 switch(c) 249 { 250 //german umlauts 251 case '\u00E4' : app = "ä"; break; 252 case '\u00C4' : app = "Ä"; break; 253 case '\u00F6' : app = "ö"; break; 254 case '\u00D6' : app = "Ö"; break; 255 case '\u00FC' : app = "ü"; break; 256 case '\u00DC' : app = "Ü"; break; 257 case '\u00DF' : app = "ß"; break; 258 259 //misc 260 //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? 261 case '\u20AC': app = "€"; break; 262 case '\u00AB': app = "«"; break; 263 case '\u00BB': app = "»"; break; 264 case '\u00A0': app = " "; break; 265 266 default : 267 //encode all non basic latin characters 268 app = "&#" + ((int)c) + ";"; 269 break; 270 } 271 } 272 if (app != null) 273 { 274 //if (sb == null) 275 //{ 276 // sb = new StringBuilder(string.substring(0, i)); 277 //} 278 //sb.append(app); 279 if (start < i) 280 { 281 writer.write(string, start, i-start); 282 } 283 start = i+1; 284 writer.write(app); 285 } 286 //else 287 //{ 288 // if (sb != null) 289 // { 290 // sb.append(c); 291 // } 292 //} 293 } 294 295 //if (sb == null) 296 //{ 297 // return string; 298 //} 299 //else 300 //{ 301 // return sb.toString(); 302 //} 303 if (start == 0) 304 { 305 writer.write(string); 306 } 307 else if (start < string.length()) 308 { 309 writer.write(string,start,string.length()-start); 310 } 311 } 312 313 314 /** 315 * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. 316 */ 317 public static void encode (char[] string, int offset, int length, Writer writer) throws IOException 318 { 319 encode(string, offset, length, false, true, writer); 320 } 321 322 /** 323 * Variant of {@link #encode} where encodeNbsp is true. 324 */ 325 public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer) 326 throws IOException 327 { 328 encode(string, offset, length, encodeNewline, true, writer); 329 } 330 331 /** 332 * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 333 */ 334 public static void encode (char[] string, int offset, int length, boolean encodeNewline, 335 boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException 336 { 337 encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer); 338 } 339 340 341 /** 342 * Encodes the given string, so that it can be used within a html page. 343 * @param string the string to convert 344 * @param encodeNewline if true newline characters are converted to <br>'s 345 * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s 346 * @param encodeNonLatin if true encode non-latin characters as numeric character references 347 */ 348 public static void encode (char[] string, int offset, int length, 349 boolean encodeNewline, 350 boolean encodeSubsequentBlanksToNbsp, 351 boolean encodeNonLatin, Writer writer) throws IOException 352 { 353 if (string == null || length < 0 || offset >= string.length) 354 { 355 return; 356 } 357 offset = Math.max(0, offset); 358 int realLength = Math.min(length, string.length - offset); 359 360 //StringBuilder sb = null; //create later on demand 361 String app; 362 char c; 363 int start = offset; 364 365 for (int i = offset; i < offset + realLength; ++i) 366 { 367 app = null; 368 c = string[i]; 369 370 // All characters before letters 371 if ((int)c < 0x41) 372 { 373 switch (c) 374 { 375 case '"': app = """; break; //" 376 case '&': app = "&"; break; //& 377 case '<': app = "<"; break; //< 378 case '>': app = ">"; break; //> 379 case ' ': 380 if (encodeSubsequentBlanksToNbsp && 381 (i == 0 || (i - 1 >= 0 && string[i - 1] == ' '))) 382 { 383 //Space at beginning or after another space 384 app = " "; 385 } 386 break; 387 case '\n': 388 if (encodeNewline) 389 { 390 app = "<br/>"; 391 } 392 break; 393 default: 394 break; 395 } 396 // http://www.w3.org/MarkUp/html3/specialchars.html 397 // From C0 extension U+0000-U+001F only U+0009, U+000A and 398 // U+000D are valid control characters 399 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D) 400 { 401 // Ignore escape character 402 app = ""; 403 } 404 } 405 else if (encodeNonLatin && (int)c > 0x80) 406 { 407 switch(c) 408 { 409 //german umlauts 410 case '\u00E4' : app = "ä"; break; 411 case '\u00C4' : app = "Ä"; break; 412 case '\u00F6' : app = "ö"; break; 413 case '\u00D6' : app = "Ö"; break; 414 case '\u00FC' : app = "ü"; break; 415 case '\u00DC' : app = "Ü"; break; 416 case '\u00DF' : app = "ß"; break; 417 418 //misc 419 //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? 420 case '\u20AC': app = "€"; break; 421 case '\u00AB': app = "«"; break; 422 case '\u00BB': app = "»"; break; 423 case '\u00A0': app = " "; break; 424 425 default : 426 //encode all non basic latin characters 427 app = "&#" + ((int)c) + ";"; 428 break; 429 } 430 } 431 if (app != null) 432 { 433 //if (sb == null) 434 //{ 435 // sb = new StringBuilder(realLength*2); 436 // sb.append(string, offset, i - offset); 437 //} 438 //sb.append(app); 439 if (start < i) 440 { 441 writer.write(string, start, i-start); 442 } 443 start = i+1; 444 writer.write(app); 445 } 446 /* 447 else 448 { 449 if (sb != null) 450 { 451 sb.append(c); 452 } 453 }*/ 454 } 455 456 //if (sb == null) 457 //{ 458 // writer.write(string, offset, realLength); 459 //} 460 //else 461 //{ 462 // writer.write(sb.toString()); 463 //} 464 if (start == offset) 465 { 466 writer.write(string, offset, realLength); 467 } 468 else if (start < offset+realLength) 469 { 470 writer.write(string,start,offset+realLength-start); 471 } 472 } 473 474 private static final String HEX_CHARSET = "0123456789ABCDEF"; 475 476 private static final String UTF8 = "UTF-8"; 477 478 /** 479 * Encode an URI, escaping or percent-encoding all required characters and 480 * following the rules mentioned on RFC 3986. 481 * 482 * @param string 483 * @param encodeNonLatin 484 * @return 485 * @throws IOException 486 */ 487 public static String encodeURIAtributte(final String string, final String characterEncoding) 488 throws IOException 489 { 490 StringBuilder sb = null; //create later on demand 491 String app; 492 char c; 493 boolean endLoop = false; 494 for (int i = 0; i < string.length (); ++i) 495 { 496 app = null; 497 c = string.charAt(i); 498 499 // This are the guidelines to be taken into account by this algorithm to encode: 500 501 // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters 502 // 503 // control = <US-ASCII coded characters 00-1F and 7F hexadecimal> 504 // space = <US-ASCII coded character 20 hexadecimal> 505 // delims = "<" | ">" | "#" | "%" | <"> 506 // %3C %3E %23 %25 %22 507 // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`" 508 // %7D %7B %7C %5C %5E %5B %5D %60 509 // 510 // ".... Data corresponding to excluded characters must be escaped in order to 511 // be properly represented within a URI....." 512 513 // RFC 3986 Section 3. Syntax Components 514 // 515 // "... The generic URI syntax consists of a hierarchical sequence of 516 // components referred to as the scheme, authority, path, query, and 517 // fragment. 518 // 519 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 520 // 521 // hier-part = "//" authority path-abempty 522 // / path-absolute 523 // / path-rootless 524 // / path-empty 525 // ...." 526 527 // RFC 3986 Section 2.2: 528 // Reserved characters (should not be percent-encoded) 529 // reserved = gen-delims / sub-delims 530 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" 531 // %3A %2F %3F %23 %5B %5D %40 532 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 533 // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D 534 535 // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396, 536 // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 537 // "...those rules were redefined to directly specify the characters allowed...." 538 // There is also other characters moved from excluded list to reserved: 539 // "[" / "]" / "#" 540 541 // RFC 3986 Section 2.3: 542 // "... for consistency, percent-encoded octets in the ranges of ALPHA 543 // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), 544 // underscore (%5F), or tilde (%7E) should not be created by URI 545 // producers...." 546 547 // RFC 3986 Section 3.2.2. Host 548 549 // host = IP-literal / IPv4address / reg-name 550 551 // The reg-name syntax allows percent-encoded octets in order to 552 // represent non-ASCII registered names in a uniform way that is 553 // independent of the underlying name resolution technology. Non-ASCII 554 // characters must first be encoded according to UTF-8 [STD63], and then 555 // each octet of the corresponding UTF-8 sequence must be percent- 556 // encoded to be represented as URI characters. URI producing 557 // applications must not use percent-encoding in host unless it is used 558 // to represent a UTF-8 character sequence. 559 560 // RFC 3986 Section 3.4 Query 561 // query = *( pchar / "/" / "?" ) 562 // 563 // "... However, as query components are often used to carry identifying information 564 // in the form of "key=value" pairs and one frequently used value is a reference to 565 // another URI, it is sometimes better for usability to avoid percent-encoding those characters....." 566 // 567 // RFC 3986 Section 2.5 Identifying Data (Apply to query section) 568 // 569 // When a new URI scheme defines a component that represents textual 570 // data consisting of characters from the Universal Character Set [UCS], 571 // the data should first be encoded as octets according to the UTF-8 572 // character encoding [STD63]; then only those octets that do not 573 // correspond to characters in the unreserved set should be percent- 574 // encoded. For example, the character A would be represented as "A", 575 // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented 576 // as "%C3%80", and the character KATAKANA LETTER A would be represented 577 // as "%E3%82%A2". 578 // 579 // RFC 3986 Section 3.5 Fragment 580 // fragment = *( pchar / "/" / "?" ) 581 // 582 // Note that follows the same as query 583 584 // Based on the extracts the strategy to apply on this method is: 585 // 586 // On scheme ":" hier-part 587 // 588 // Escape or percent encode chars inside : 589 // 590 // - From %00 to %20, 591 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 592 // duplicate encoding, encode it when we are sure 593 // that there are not encoded twice) 594 // - "<" %3C, ">" %3E 595 // - "\" %5C, "^" %5E, "`" %60 596 // - "{" %7B, "|" %7C, "}" %7D 597 // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this 598 // part of an URI, but it is preferred to encode it that omit it). 599 // 600 // The remaining characters must not be encoded 601 // 602 // Characters after ? or # should be percent encoding but only the necessary ones: 603 // 604 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) 605 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 606 // duplicate encoding, encode it when we are sure 607 // that there are not encoded twice) 608 // - "<" %3C, ">" %3E, 609 // - "\" %5C, "^" %5E, "`" %60 610 // - "{" %7B, "|" %7C, "}" %7D 611 // - From %7F ad infinitum (each character as many bytes as necessary but take into account 612 // that a single char should contain 2,3 or more bytes!. This data should be encoded 613 // translating from the document character encoding to percent encoding, because this values 614 // could be retrieved from httpRequest.getParameter() and it uses the current character encoding 615 // for decode values) 616 // 617 // "&" should be encoded as "&" because this link is inside an html page, and 618 // put only & is invalid in this context. 619 620 if ( (c <= (char)0x20) || (c >= (char)0x7F) || 621 c == '"' || c == '<' || 622 c == '>' || c == '\\' || c == '^' || c == '`' || 623 c == '{' || c == '|' || c == '}') 624 { 625 // The percent encoding on this part should be done using UTF-8 charset 626 // as RFC 3986 Section 3.2.2 says. 627 // Also there is a reference on 628 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars 629 // that recommend use of UTF-8 instead the document character encoding. 630 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113) 631 app = percentEncode(c, "UTF-8"); 632 } 633 else if (c == '%') 634 { 635 if (i + 2 < string.length()) 636 { 637 char c1 = string.charAt(i+1); 638 char c2 = string.charAt(i+2); 639 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && 640 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) 641 { 642 // do not percent encode, because it could be already encoded 643 // and we don't want encode it twice 644 } 645 else 646 { 647 app = percentEncode(c, UTF8); 648 } 649 } 650 else 651 { 652 app = percentEncode(c, UTF8); 653 } 654 } 655 else if (c == '?' || c == '#') 656 { 657 if (i+1 < string.length()) 658 { 659 // The remaining part of the URI are data that should be encoded 660 // using the document character encoding. 661 app = c + encodeURIQuery(string.substring(i+1), characterEncoding); 662 endLoop = true; 663 } 664 } 665 else 666 { 667 //No encoding, just do nothing, char will be added later. 668 } 669 670 if (app != null) 671 { 672 if (sb == null) 673 { 674 sb = new StringBuilder(string.substring(0, i)); 675 } 676 sb.append(app); 677 } 678 else 679 { 680 if (sb != null) 681 { 682 sb.append(c); 683 } 684 } 685 if (endLoop) 686 { 687 break; 688 } 689 } 690 if (sb == null) 691 { 692 return string; 693 } 694 else 695 { 696 return sb.toString(); 697 } 698 } 699 700 /** 701 * Encode a unicode char value in percentEncode, decoding its bytes using a specified 702 * characterEncoding. 703 * 704 * @param c 705 * @param characterEncoding 706 * @return 707 */ 708 private static String percentEncode(char c, String characterEncoding) 709 { 710 String app = null; 711 if (c > (char)((short)0x007F)) 712 { 713 //percent encode in the proper encoding to be consistent 714 app = percentEncodeNonUsAsciiCharacter(c, characterEncoding); 715 } 716 else 717 { 718 //percent encode US-ASCII char (0x00-0x7F range) 719 app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10); 720 } 721 return app; 722 } 723 724 private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding) 725 { 726 ByteArrayOutputStream baos = new ByteArrayOutputStream(10); 727 StringBuilder builder = new StringBuilder(); 728 try 729 { 730 OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding); 731 writer.write(c); 732 writer.flush(); 733 } 734 catch(IOException e) 735 { 736 baos.reset(); 737 return null; 738 } 739 740 byte [] byteArray = baos.toByteArray(); 741 for (int i=0; i < byteArray.length; i++) 742 { 743 builder.append('%'); 744 builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); 745 builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); 746 } 747 748 return builder.toString(); 749 } 750 751 /** 752 * Encode the query part using the document charset encoding provided. 753 * 754 * 755 * @param string 756 * @param characterEncoding 757 * @return 758 */ 759 private static String encodeURIQuery(final String string, final String characterEncoding) 760 { 761 StringBuilder sb = null; //create later on demand 762 String app; 763 char c; 764 boolean endLoop = false; 765 for (int i = 0; i < string.length (); ++i) 766 { 767 app = null; 768 c = string.charAt(i); 769 770 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) 771 // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so 772 // we make easier and omit this one) 773 // - "<" %3C, ">" %3E, 774 // - "\" %5C, "^" %5E, "`" %60 775 // - "{" %7B, "|" %7C, "}" %7D 776 // - From %7F ad infinitum (each character as many bytes as necessary but take into account 777 // that a single char should contain 2,3 or more bytes!. This data should be encoded 778 // translating from the document character encoding to percent encoding) 779 // 780 // "&" should be encoded as "&" because this link is inside an html page, and 781 // put & is invalid in this context 782 783 if ( (c <= (char)0x20) || (c >= (char)0x7F) || 784 c == '"' || c == '<' || 785 c == '>' || c == '\\' || c == '^' || c == '`' || 786 c == '{' || c == '|' || c == '}') 787 { 788 // The percent encoding on this part should be done using UTF-8 charset 789 // as RFC 3986 Section 3.2.2 says 790 app = percentEncode(c, characterEncoding); 791 } 792 else if (c == '%') 793 { 794 if (i + 2 < string.length()) 795 { 796 char c1 = string.charAt(i+1); 797 char c2 = string.charAt(i+2); 798 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && 799 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) 800 { 801 // do not percent encode, because it could be already encoded 802 } 803 else 804 { 805 app = percentEncode(c, characterEncoding); 806 } 807 } 808 else 809 { 810 app = percentEncode(c, characterEncoding); 811 } 812 } 813 else if (c == '&') 814 { 815 if (i+4 < string.length() ) 816 { 817 if ('a' == string.charAt(i+1) && 818 'm' == string.charAt(i+2) && 819 'p' == string.charAt(i+3) && 820 ';' == string.charAt(i+4)) 821 { 822 //Skip 823 } 824 else 825 { 826 app = "&"; 827 } 828 } 829 else 830 { 831 app = "&"; 832 } 833 } 834 else 835 { 836 //No encoding, just do nothing, char will be added later. 837 } 838 839 if (app != null) 840 { 841 if (sb == null) 842 { 843 sb = new StringBuilder(string.substring(0, i)); 844 } 845 sb.append(app); 846 } 847 else 848 { 849 if (sb != null) 850 { 851 sb.append(c); 852 } 853 } 854 if (endLoop) 855 { 856 break; 857 } 858 } 859 if (sb == null) 860 { 861 return string; 862 } 863 else 864 { 865 return sb.toString(); 866 } 867 } 868 869 /** 870 * Encode an URI, escaping or percent-encoding all required characters and 871 * following the rules mentioned on RFC 3986. 872 * 873 * @param string 874 * @param encodeNonLatin 875 * @return 876 * @throws IOException 877 */ 878 public static void encodeURIAtributte(Writer writer, final String string, final String characterEncoding) 879 throws IOException 880 { 881 //StringBuilder sb = null; //create later on demand 882 int start = 0; 883 String app; 884 char c; 885 boolean endLoop = false; 886 for (int i = 0; i < string.length (); ++i) 887 { 888 app = null; 889 c = string.charAt(i); 890 891 // This are the guidelines to be taken into account by this algorithm to encode: 892 893 // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters 894 // 895 // control = <US-ASCII coded characters 00-1F and 7F hexadecimal> 896 // space = <US-ASCII coded character 20 hexadecimal> 897 // delims = "<" | ">" | "#" | "%" | <"> 898 // %3C %3E %23 %25 %22 899 // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`" 900 // %7D %7B %7C %5C %5E %5B %5D %60 901 // 902 // ".... Data corresponding to excluded characters must be escaped in order to 903 // be properly represented within a URI....." 904 905 // RFC 3986 Section 3. Syntax Components 906 // 907 // "... The generic URI syntax consists of a hierarchical sequence of 908 // components referred to as the scheme, authority, path, query, and 909 // fragment. 910 // 911 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 912 // 913 // hier-part = "//" authority path-abempty 914 // / path-absolute 915 // / path-rootless 916 // / path-empty 917 // ...." 918 919 // RFC 3986 Section 2.2: 920 // Reserved characters (should not be percent-encoded) 921 // reserved = gen-delims / sub-delims 922 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" 923 // %3A %2F %3F %23 %5B %5D %40 924 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 925 // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D 926 927 // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396, 928 // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 929 // "...those rules were redefined to directly specify the characters allowed...." 930 // There is also other characters moved from excluded list to reserved: 931 // "[" / "]" / "#" 932 933 // RFC 3986 Section 2.3: 934 // "... for consistency, percent-encoded octets in the ranges of ALPHA 935 // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), 936 // underscore (%5F), or tilde (%7E) should not be created by URI 937 // producers...." 938 939 // RFC 3986 Section 3.2.2. Host 940 941 // host = IP-literal / IPv4address / reg-name 942 943 // The reg-name syntax allows percent-encoded octets in order to 944 // represent non-ASCII registered names in a uniform way that is 945 // independent of the underlying name resolution technology. Non-ASCII 946 // characters must first be encoded according to UTF-8 [STD63], and then 947 // each octet of the corresponding UTF-8 sequence must be percent- 948 // encoded to be represented as URI characters. URI producing 949 // applications must not use percent-encoding in host unless it is used 950 // to represent a UTF-8 character sequence. 951 952 // RFC 3986 Section 3.4 Query 953 // query = *( pchar / "/" / "?" ) 954 // 955 // "... However, as query components are often used to carry identifying information 956 // in the form of "key=value" pairs and one frequently used value is a reference to 957 // another URI, it is sometimes better for usability to avoid percent-encoding those characters....." 958 // 959 // RFC 3986 Section 2.5 Identifying Data (Apply to query section) 960 // 961 // When a new URI scheme defines a component that represents textual 962 // data consisting of characters from the Universal Character Set [UCS], 963 // the data should first be encoded as octets according to the UTF-8 964 // character encoding [STD63]; then only those octets that do not 965 // correspond to characters in the unreserved set should be percent- 966 // encoded. For example, the character A would be represented as "A", 967 // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented 968 // as "%C3%80", and the character KATAKANA LETTER A would be represented 969 // as "%E3%82%A2". 970 // 971 // RFC 3986 Section 3.5 Fragment 972 // fragment = *( pchar / "/" / "?" ) 973 // 974 // Note that follows the same as query 975 976 // Based on the extracts the strategy to apply on this method is: 977 // 978 // On scheme ":" hier-part 979 // 980 // Escape or percent encode chars inside : 981 // 982 // - From %00 to %20, 983 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 984 // duplicate encoding, encode it when we are sure 985 // that there are not encoded twice) 986 // - "<" %3C, ">" %3E 987 // - "\" %5C, "^" %5E, "`" %60 988 // - "{" %7B, "|" %7C, "}" %7D 989 // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this 990 // part of an URI, but it is preferred to encode it that omit it). 991 // 992 // The remaining characters must not be encoded 993 // 994 // Characters after ? or # should be percent encoding but only the necessary ones: 995 // 996 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) 997 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 998 // duplicate encoding, encode it when we are sure 999 // that there are not encoded twice) 1000 // - "<" %3C, ">" %3E, 1001 // - "\" %5C, "^" %5E, "`" %60 1002 // - "{" %7B, "|" %7C, "}" %7D 1003 // - From %7F ad infinitum (each character as many bytes as necessary but take into account 1004 // that a single char should contain 2,3 or more bytes!. This data should be encoded 1005 // translating from the document character encoding to percent encoding, because this values 1006 // could be retrieved from httpRequest.getParameter() and it uses the current character encoding 1007 // for decode values) 1008 // 1009 // "&" should be encoded as "&" because this link is inside an html page, and 1010 // put only & is invalid in this context. 1011 1012 if ( (c <= (char)0x20) || (c >= (char)0x7F) || 1013 c == '"' || c == '<' || 1014 c == '>' || c == '\\' || c == '^' || c == '`' || 1015 c == '{' || c == '|' || c == '}') 1016 { 1017 // The percent encoding on this part should be done using UTF-8 charset 1018 // as RFC 3986 Section 3.2.2 says. 1019 // Also there is a reference on 1020 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars 1021 // that recommend use of UTF-8 instead the document character encoding. 1022 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113) 1023 //app = percentEncode(c, "UTF-8"); 1024 if (start < i) 1025 { 1026 writer.write(string, start, i-start); 1027 } 1028 start = i+1; 1029 percentEncode(writer, c, "UTF-8"); 1030 } 1031 else if (c == '%') 1032 { 1033 if (i + 2 < string.length()) 1034 { 1035 char c1 = string.charAt(i+1); 1036 char c2 = string.charAt(i+2); 1037 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && 1038 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) 1039 { 1040 // do not percent encode, because it could be already encoded 1041 // and we don't want encode it twice 1042 } 1043 else 1044 { 1045 //app = percentEncode(c, UTF8); 1046 if (start < i) 1047 { 1048 writer.write(string, start, i-start); 1049 } 1050 start = i+1; 1051 percentEncode(writer, c, UTF8); 1052 } 1053 } 1054 else 1055 { 1056 //app = percentEncode(c, UTF8); 1057 if (start < i) 1058 { 1059 writer.write(string, start, i-start); 1060 } 1061 start = i+1; 1062 percentEncode(writer, c, UTF8); 1063 } 1064 } 1065 else if (c == '?' || c == '#') 1066 { 1067 if (i+1 < string.length()) 1068 { 1069 // The remaining part of the URI are data that should be encoded 1070 // using the document character encoding. 1071 //app = c + encodeURIQuery(string.substring(i+1), characterEncoding); 1072 if (start < i) 1073 { 1074 writer.write(string, start, i-start); 1075 } 1076 start = i+1; 1077 writer.write(c); 1078 //encodeURIQuery(writer, string.substring(i+1), characterEncoding); 1079 encodeURIQuery(writer, string, i+1, characterEncoding); 1080 endLoop = true; 1081 } 1082 } 1083 else 1084 { 1085 //No encoding, just do nothing, char will be added later. 1086 } 1087 1088 if (app != null) 1089 { 1090 //if (sb == null) 1091 //{ 1092 // sb = new StringBuilder(string.substring(0, i)); 1093 //} 1094 //sb.append(app); 1095 if (start < i) 1096 { 1097 writer.write(string, start, i-start); 1098 } 1099 start = i+1; 1100 writer.write(app); 1101 } 1102 //else 1103 //{ 1104 // if (sb != null) 1105 // { 1106 // sb.append(c); 1107 // } 1108 //} 1109 if (endLoop) 1110 { 1111 start = string.length(); 1112 break; 1113 } 1114 } 1115 //if (sb == null) 1116 //{ 1117 // return string; 1118 //} 1119 //else 1120 //{ 1121 // return sb.toString(); 1122 //} 1123 if (start == 0) 1124 { 1125 writer.write(string); 1126 } 1127 else if (start < string.length()) 1128 { 1129 writer.write(string,start,string.length()-start); 1130 } 1131 } 1132 1133 /** 1134 * Encode a unicode char value in percentEncode, decoding its bytes using a specified 1135 * characterEncoding. 1136 * 1137 * @param c 1138 * @param characterEncoding 1139 * @return 1140 */ 1141 private static void percentEncode(Writer writer, char c, String characterEncoding) throws IOException 1142 { 1143 String app = null; 1144 if (c > (char)((short)0x007F)) 1145 { 1146 //percent encode in the proper encoding to be consistent 1147 //app = percentEncodeNonUsAsciiCharacter(writer c, characterEncoding); 1148 percentEncodeNonUsAsciiCharacter(writer, c, characterEncoding); 1149 } 1150 else 1151 { 1152 //percent encode US-ASCII char (0x00-0x7F range) 1153 //app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10); 1154 writer.write('%'); 1155 writer.write(HEX_CHARSET.charAt( ((c >> 0x4) % 0x10))); 1156 writer.write(HEX_CHARSET.charAt(c % 0x10)); 1157 } 1158 //return app; 1159 } 1160 1161 private static void percentEncodeNonUsAsciiCharacter(Writer currentWriter, char c, String characterEncoding) 1162 throws IOException 1163 { 1164 ByteArrayOutputStream baos = new ByteArrayOutputStream(10); 1165 StringBuilder builder = new StringBuilder(); 1166 try 1167 { 1168 OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding); 1169 writer.write(c); 1170 writer.flush(); 1171 } 1172 catch(IOException e) 1173 { 1174 baos.reset(); 1175 return; 1176 } 1177 1178 byte [] byteArray = baos.toByteArray(); 1179 for (int i=0; i < byteArray.length; i++) 1180 { 1181 //builder.append('%'); 1182 //builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); 1183 //builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); 1184 currentWriter.write('%'); 1185 currentWriter.write(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); 1186 currentWriter.write(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); 1187 } 1188 1189 //return builder.toString(); 1190 } 1191 1192 /** 1193 * Encode the query part using the document charset encoding provided. 1194 * 1195 * 1196 * @param string 1197 * @param characterEncoding 1198 * @return 1199 */ 1200 private static void encodeURIQuery(Writer writer, final String string, int offset, final String characterEncoding) 1201 throws IOException 1202 { 1203 //StringBuilder sb = null; //create later on demand 1204 int start = offset; 1205 int realLength = string.length()-offset; 1206 String app; 1207 char c; 1208 //boolean endLoop = false; 1209 for (int i = offset; i < offset+realLength; ++i) 1210 { 1211 app = null; 1212 c = string.charAt(i); 1213 1214 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) 1215 // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so 1216 // we make easier and omit this one) 1217 // - "<" %3C, ">" %3E, 1218 // - "\" %5C, "^" %5E, "`" %60 1219 // - "{" %7B, "|" %7C, "}" %7D 1220 // - From %7F ad infinitum (each character as many bytes as necessary but take into account 1221 // that a single char should contain 2,3 or more bytes!. This data should be encoded 1222 // translating from the document character encoding to percent encoding) 1223 // 1224 // "&" should be encoded as "&" because this link is inside an html page, and 1225 // put & is invalid in this context 1226 1227 if ( (c <= (char)0x20) || (c >= (char)0x7F) || 1228 c == '"' || c == '<' || 1229 c == '>' || c == '\\' || c == '^' || c == '`' || 1230 c == '{' || c == '|' || c == '}') 1231 { 1232 // The percent encoding on this part should be done using UTF-8 charset 1233 // as RFC 3986 Section 3.2.2 says 1234 //app = percentEncode(c, characterEncoding); 1235 if (start < i) 1236 { 1237 writer.write(string, start, i-start); 1238 } 1239 start = i+1; 1240 percentEncode(writer, c, characterEncoding); 1241 } 1242 else if (c == '%') 1243 { 1244 if (i + 2 < string.length()) 1245 { 1246 char c1 = string.charAt(i+1); 1247 char c2 = string.charAt(i+2); 1248 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && 1249 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) 1250 { 1251 // do not percent encode, because it could be already encoded 1252 } 1253 else 1254 { 1255 //app = percentEncode(c, characterEncoding); 1256 if (start < i) 1257 { 1258 writer.write(string, start, i-start); 1259 } 1260 start = i+1; 1261 percentEncode(writer, c, characterEncoding); 1262 } 1263 } 1264 else 1265 { 1266 //app = percentEncode(c, characterEncoding); 1267 if (start < i) 1268 { 1269 writer.write(string, start, i-start); 1270 } 1271 start = i+1; 1272 percentEncode(writer, c, characterEncoding); 1273 } 1274 } 1275 else if (c == '&') 1276 { 1277 if (i+4 < string.length() ) 1278 { 1279 if ('a' == string.charAt(i+1) && 1280 'm' == string.charAt(i+2) && 1281 'p' == string.charAt(i+3) && 1282 ';' == string.charAt(i+4)) 1283 { 1284 //Skip 1285 } 1286 else 1287 { 1288 app = "&"; 1289 } 1290 } 1291 else 1292 { 1293 app = "&"; 1294 } 1295 } 1296 else 1297 { 1298 //No encoding, just do nothing, char will be added later. 1299 } 1300 1301 if (app != null) 1302 { 1303 //if (sb == null) 1304 //{ 1305 // sb = new StringBuilder(string.substring(0, i)); 1306 //} 1307 //sb.append(app); 1308 if (start < i) 1309 { 1310 writer.write(string, start, i-start); 1311 } 1312 start = i+1; 1313 writer.write(app); 1314 } 1315 //else 1316 //{ 1317 // if (sb != null) 1318 // { 1319 // sb.append(c); 1320 // } 1321 //} 1322 //if (endLoop) 1323 //{ 1324 // break; 1325 //} 1326 } 1327 1328 //if (sb == null) 1329 //{ 1330 // return string; 1331 //} 1332 //else 1333 //{ 1334 // return sb.toString(); 1335 //} 1336 if (start == offset) 1337 { 1338 writer.write(string, offset, realLength); 1339 } 1340 else if (start < offset+realLength) 1341 { 1342 writer.write(string,start,offset+realLength-start); 1343 } 1344 } 1345 }