1 /* 2 * Licensed to the Apache Software Foundation (ASF) under one 3 * or more contributor license agreements. See the NOTICE file 4 * distributed with this work for additional information 5 * regarding copyright ownership. The ASF licenses this file 6 * to you under the Apache License, Version 2.0 (the 7 * "License"); you may not use this file except in compliance 8 * with the License. You may obtain a copy of the License at 9 * 10 * http://www.apache.org/licenses/LICENSE-2.0 11 * 12 * Unless required by applicable law or agreed to in writing, 13 * software distributed under the License is distributed on an 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 15 * KIND, either express or implied. See the License for the 16 * specific language governing permissions and limitations 17 * under the License. 18 */ 19 package org.apache.myfaces.shared.renderkit.html.util; 20 21 import java.io.ByteArrayOutputStream; 22 import java.io.IOException; 23 import java.io.OutputStreamWriter; 24 import java.io.Writer; 25 26 /** 27 * Converts Strings so that they can be used within HTML-Code. 28 */ 29 public abstract class HTMLEncoder 30 { 31 /** 32 * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. 33 */ 34 public static String encode (String string) 35 { 36 return encode(string, false, true); 37 } 38 39 /** 40 * Variant of {@link #encode} where encodeNbsp is true. 41 */ 42 public static String encode (String string, boolean encodeNewline) 43 { 44 return encode(string, encodeNewline, true); 45 } 46 47 /** 48 * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 49 */ 50 public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) 51 { 52 return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true); 53 } 54 55 /** 56 * Encodes the given string, so that it can be used within a html page. 57 * @param string the string to convert 58 * @param encodeNewline if true newline characters are converted to <br>'s 59 * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s 60 * @param encodeNonLatin if true encode non-latin characters as numeric character references 61 */ 62 public static String encode (String string, 63 boolean encodeNewline, 64 boolean encodeSubsequentBlanksToNbsp, 65 boolean encodeNonLatin) 66 { 67 if (string == null) 68 { 69 return ""; 70 } 71 72 StringBuilder sb = null; //create later on demand 73 String app; 74 char c = ' '; 75 char prevC; 76 int length = string.length(); 77 for (int i = 0; i < length; ++i) 78 { 79 app = null; 80 prevC = c; 81 c = string.charAt(i); 82 83 // All characters before letters 84 if ((int)c < 0x41) 85 { 86 switch (c) 87 { 88 case '"': app = """; break; //" 89 case '&': app = "&"; break; //& 90 case '<': app = "<"; break; //< 91 case '>': app = ">"; break; //> 92 case ' ': 93 if (encodeSubsequentBlanksToNbsp && 94 prevC == ' ') 95 { 96 //Space at beginning or after another space 97 app = " "; 98 } 99 break; 100 case '\n': 101 if (encodeNewline) 102 { 103 app = "<br/>"; 104 } 105 break; 106 default: 107 break; 108 } 109 // http://www.w3.org/MarkUp/html3/specialchars.html 110 // From C0 extension U+0000-U+001F only U+0009, U+000A and 111 // U+000D are valid control characters 112 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D) 113 { 114 // Ignore escape character 115 app = ""; 116 } 117 } 118 else if (encodeNonLatin && (int)c > 0x80) 119 { 120 switch(c) 121 { 122 //german umlauts 123 case '\u00E4' : app = "ä"; break; 124 case '\u00C4' : app = "Ä"; break; 125 case '\u00F6' : app = "ö"; break; 126 case '\u00D6' : app = "Ö"; break; 127 case '\u00FC' : app = "ü"; break; 128 case '\u00DC' : app = "Ü"; break; 129 case '\u00DF' : app = "ß"; break; 130 131 //misc 132 //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? 133 case '\u20AC': app = "€"; break; 134 case '\u00AB': app = "«"; break; 135 case '\u00BB': app = "»"; break; 136 case '\u00A0': app = " "; break; 137 138 default : 139 //encode all non basic latin characters 140 app = "&#" + ((int)c) + ";"; 141 break; 142 } 143 } 144 if (app != null) 145 { 146 if (sb == null) 147 { 148 sb = new StringBuilder(string.substring(0, i)); 149 } 150 sb.append(app); 151 } 152 else 153 { 154 if (sb != null) 155 { 156 sb.append(c); 157 } 158 } 159 } 160 161 if (sb == null) 162 { 163 return string; 164 } 165 else 166 { 167 return sb.toString(); 168 } 169 } 170 171 /** 172 * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. 173 */ 174 public static void encode (Writer writer, String string) throws IOException 175 { 176 encode(writer, string, false, true); 177 } 178 179 /** 180 * Variant of {@link #encode} where encodeNbsp is true. 181 */ 182 public static void encode (Writer writer, String string, boolean encodeNewline) throws IOException 183 { 184 encode(writer, string, encodeNewline, true); 185 } 186 187 /** 188 * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 189 */ 190 public static void encode (Writer writer, String string, 191 boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) throws IOException 192 { 193 encode(writer, string, encodeNewline, encodeSubsequentBlanksToNbsp, true); 194 } 195 196 public static void encode (Writer writer, String string, 197 boolean encodeNewline, 198 boolean encodeSubsequentBlanksToNbsp, 199 boolean encodeNonLatin) throws IOException 200 { 201 if (string == null) 202 { 203 return; 204 } 205 206 int start = 0; 207 String app; 208 char c = ' '; 209 char prevC; 210 int length = string.length(); 211 for (int i = 0; i < length; ++i) 212 { 213 app = null; 214 prevC = c; 215 c = string.charAt(i); 216 217 // All characters before letters 218 if ((int)c < 0x41) 219 { 220 switch (c) 221 { 222 case '"': app = """; break; //" 223 case '&': app = "&"; break; //& 224 case '<': app = "<"; break; //< 225 case '>': app = ">"; break; //> 226 case ' ': 227 if (encodeSubsequentBlanksToNbsp && 228 prevC == ' ') 229 { 230 //Space at beginning or after another space 231 app = " "; 232 } 233 break; 234 case '\n': 235 if (encodeNewline) 236 { 237 app = "<br/>"; 238 } 239 break; 240 default: 241 break; 242 } 243 // http://www.w3.org/MarkUp/html3/specialchars.html 244 // From C0 extension U+0000-U+001F only U+0009, U+000A and 245 // U+000D are valid control characters 246 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D) 247 { 248 // Ignore escape character 249 app = ""; 250 } 251 } 252 else if (encodeNonLatin && (int)c > 0x80) 253 { 254 switch(c) 255 { 256 //german umlauts 257 case '\u00E4' : app = "ä"; break; 258 case '\u00C4' : app = "Ä"; break; 259 case '\u00F6' : app = "ö"; break; 260 case '\u00D6' : app = "Ö"; break; 261 case '\u00FC' : app = "ü"; break; 262 case '\u00DC' : app = "Ü"; break; 263 case '\u00DF' : app = "ß"; break; 264 265 //misc 266 //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? 267 case '\u20AC': app = "€"; break; 268 case '\u00AB': app = "«"; break; 269 case '\u00BB': app = "»"; break; 270 case '\u00A0': app = " "; break; 271 272 default : 273 //encode all non basic latin characters 274 app = "&#" + ((int)c) + ";"; 275 break; 276 } 277 } 278 if (app != null) 279 { 280 //if (sb == null) 281 //{ 282 // sb = new StringBuilder(string.substring(0, i)); 283 //} 284 //sb.append(app); 285 if (start < i) 286 { 287 writer.write(string, start, i-start); 288 } 289 start = i+1; 290 writer.write(app); 291 } 292 //else 293 //{ 294 // if (sb != null) 295 // { 296 // sb.append(c); 297 // } 298 //} 299 } 300 301 //if (sb == null) 302 //{ 303 // return string; 304 //} 305 //else 306 //{ 307 // return sb.toString(); 308 //} 309 if (start == 0) 310 { 311 writer.write(string); 312 } 313 else if (start < length) 314 { 315 writer.write(string,start,length-start); 316 } 317 } 318 319 320 /** 321 * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. 322 */ 323 public static void encode (char[] string, int offset, int length, Writer writer) throws IOException 324 { 325 encode(string, offset, length, false, true, writer); 326 } 327 328 /** 329 * Variant of {@link #encode} where encodeNbsp is true. 330 */ 331 public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer) 332 throws IOException 333 { 334 encode(string, offset, length, encodeNewline, true, writer); 335 } 336 337 /** 338 * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true 339 */ 340 public static void encode (char[] string, int offset, int length, boolean encodeNewline, 341 boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException 342 { 343 encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer); 344 } 345 346 347 /** 348 * Encodes the given string, so that it can be used within a html page. 349 * @param string the string to convert 350 * @param encodeNewline if true newline characters are converted to <br>'s 351 * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s 352 * @param encodeNonLatin if true encode non-latin characters as numeric character references 353 */ 354 public static void encode (char[] string, int offset, int length, 355 boolean encodeNewline, 356 boolean encodeSubsequentBlanksToNbsp, 357 boolean encodeNonLatin, Writer writer) throws IOException 358 { 359 if (string == null || length < 0 || offset >= string.length) 360 { 361 return; 362 } 363 offset = Math.max(0, offset); 364 int realLength = Math.min(length, string.length - offset); 365 366 //StringBuilder sb = null; //create later on demand 367 String app; 368 char c = ' '; 369 char prevC; 370 int start = offset; 371 372 for (int i = offset; i < offset + realLength; ++i) 373 { 374 app = null; 375 prevC = c; 376 c = string[i]; 377 378 // All characters before letters 379 if ((int)c < 0x41) 380 { 381 switch (c) 382 { 383 case '"': app = """; break; //" 384 case '&': app = "&"; break; //& 385 case '<': app = "<"; break; //< 386 case '>': app = ">"; break; //> 387 case ' ': 388 if (encodeSubsequentBlanksToNbsp && 389 prevC == ' ') 390 { 391 //Space at beginning or after another space 392 app = " "; 393 } 394 break; 395 case '\n': 396 if (encodeNewline) 397 { 398 app = "<br/>"; 399 } 400 break; 401 default: 402 break; 403 } 404 // http://www.w3.org/MarkUp/html3/specialchars.html 405 // From C0 extension U+0000-U+001F only U+0009, U+000A and 406 // U+000D are valid control characters 407 if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D) 408 { 409 // Ignore escape character 410 app = ""; 411 } 412 } 413 else if (encodeNonLatin && (int)c > 0x80) 414 { 415 switch(c) 416 { 417 //german umlauts 418 case '\u00E4' : app = "ä"; break; 419 case '\u00C4' : app = "Ä"; break; 420 case '\u00F6' : app = "ö"; break; 421 case '\u00D6' : app = "Ö"; break; 422 case '\u00FC' : app = "ü"; break; 423 case '\u00DC' : app = "Ü"; break; 424 case '\u00DF' : app = "ß"; break; 425 426 //misc 427 //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? 428 case '\u20AC': app = "€"; break; 429 case '\u00AB': app = "«"; break; 430 case '\u00BB': app = "»"; break; 431 case '\u00A0': app = " "; break; 432 433 default : 434 //encode all non basic latin characters 435 app = "&#" + ((int)c) + ";"; 436 break; 437 } 438 } 439 if (app != null) 440 { 441 //if (sb == null) 442 //{ 443 // sb = new StringBuilder(realLength*2); 444 // sb.append(string, offset, i - offset); 445 //} 446 //sb.append(app); 447 if (start < i) 448 { 449 writer.write(string, start, i-start); 450 } 451 start = i+1; 452 writer.write(app); 453 } 454 /* 455 else 456 { 457 if (sb != null) 458 { 459 sb.append(c); 460 } 461 }*/ 462 } 463 464 //if (sb == null) 465 //{ 466 // writer.write(string, offset, realLength); 467 //} 468 //else 469 //{ 470 // writer.write(sb.toString()); 471 //} 472 if (start == offset) 473 { 474 writer.write(string, offset, realLength); 475 } 476 else if (start < offset+realLength) 477 { 478 writer.write(string,start,offset+realLength-start); 479 } 480 } 481 482 private static final String HEX_CHARSET = "0123456789ABCDEF"; 483 484 private static final String UTF8 = "UTF-8"; 485 486 /** 487 * Encode an URI, escaping or percent-encoding all required characters and 488 * following the rules mentioned on RFC 3986. 489 * 490 * @param string 491 * @param characterEncoding 492 * @return 493 * @throws IOException 494 */ 495 public static String encodeURIAttribute(final String string, final String characterEncoding) 496 throws IOException 497 { 498 StringBuilder sb = null; //create later on demand 499 String app; 500 char c; 501 boolean endLoop = false; 502 int length = string.length(); 503 for (int i = 0; i < length; ++i) 504 { 505 app = null; 506 c = string.charAt(i); 507 508 // This are the guidelines to be taken into account by this algorithm to encode: 509 510 // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters 511 // 512 // control = <US-ASCII coded characters 00-1F and 7F hexadecimal> 513 // space = <US-ASCII coded character 20 hexadecimal> 514 // delims = "<" | ">" | "#" | "%" | <"> 515 // %3C %3E %23 %25 %22 516 // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`" 517 // %7D %7B %7C %5C %5E %5B %5D %60 518 // 519 // ".... Data corresponding to excluded characters must be escaped in order to 520 // be properly represented within a URI....." 521 522 // RFC 3986 Section 3. Syntax Components 523 // 524 // "... The generic URI syntax consists of a hierarchical sequence of 525 // components referred to as the scheme, authority, path, query, and 526 // fragment. 527 // 528 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 529 // 530 // hier-part = "//" authority path-abempty 531 // / path-absolute 532 // / path-rootless 533 // / path-empty 534 // ...." 535 536 // RFC 3986 Section 2.2: 537 // Reserved characters (should not be percent-encoded) 538 // reserved = gen-delims / sub-delims 539 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" 540 // %3A %2F %3F %23 %5B %5D %40 541 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 542 // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D 543 544 // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396, 545 // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 546 // "...those rules were redefined to directly specify the characters allowed...." 547 // There is also other characters moved from excluded list to reserved: 548 // "[" / "]" / "#" 549 550 // RFC 3986 Section 2.3: 551 // "... for consistency, percent-encoded octets in the ranges of ALPHA 552 // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), 553 // underscore (%5F), or tilde (%7E) should not be created by URI 554 // producers...." 555 556 // RFC 3986 Section 3.2.2. Host 557 558 // host = IP-literal / IPv4address / reg-name 559 560 // The reg-name syntax allows percent-encoded octets in order to 561 // represent non-ASCII registered names in a uniform way that is 562 // independent of the underlying name resolution technology. Non-ASCII 563 // characters must first be encoded according to UTF-8 [STD63], and then 564 // each octet of the corresponding UTF-8 sequence must be percent- 565 // encoded to be represented as URI characters. URI producing 566 // applications must not use percent-encoding in host unless it is used 567 // to represent a UTF-8 character sequence. 568 569 // RFC 3986 Section 3.4 Query 570 // query = *( pchar / "/" / "?" ) 571 // 572 // "... However, as query components are often used to carry identifying information 573 // in the form of "key=value" pairs and one frequently used value is a reference to 574 // another URI, it is sometimes better for usability to avoid percent-encoding those characters....." 575 // 576 // RFC 3986 Section 2.5 Identifying Data (Apply to query section) 577 // 578 // When a new URI scheme defines a component that represents textual 579 // data consisting of characters from the Universal Character Set [UCS], 580 // the data should first be encoded as octets according to the UTF-8 581 // character encoding [STD63]; then only those octets that do not 582 // correspond to characters in the unreserved set should be percent- 583 // encoded. For example, the character A would be represented as "A", 584 // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented 585 // as "%C3%80", and the character KATAKANA LETTER A would be represented 586 // as "%E3%82%A2". 587 // 588 // RFC 3986 Section 3.5 Fragment 589 // fragment = *( pchar / "/" / "?" ) 590 // 591 // Note that follows the same as query 592 593 // Based on the extracts the strategy to apply on this method is: 594 // 595 // On scheme ":" hier-part 596 // 597 // Escape or percent encode chars inside : 598 // 599 // - From %00 to %20, 600 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 601 // duplicate encoding, encode it when we are sure 602 // that there are not encoded twice) 603 // - "<" %3C, ">" %3E 604 // - "\" %5C, "^" %5E, "`" %60 605 // - "{" %7B, "|" %7C, "}" %7D 606 // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this 607 // part of an URI, but it is preferred to encode it that omit it). 608 // 609 // The remaining characters must not be encoded 610 // 611 // Characters after ? or # should be percent encoding but only the necessary ones: 612 // 613 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) 614 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 615 // duplicate encoding, encode it when we are sure 616 // that there are not encoded twice) 617 // - "<" %3C, ">" %3E, 618 // - "\" %5C, "^" %5E, "`" %60 619 // - "{" %7B, "|" %7C, "}" %7D 620 // - From %7F ad infinitum (each character as many bytes as necessary but take into account 621 // that a single char should contain 2,3 or more bytes!. This data should be encoded 622 // translating from the document character encoding to percent encoding, because this values 623 // could be retrieved from httpRequest.getParameter() and it uses the current character encoding 624 // for decode values) 625 // 626 // "&" should be encoded as "&" because this link is inside an html page, and 627 // put only & is invalid in this context. 628 629 if ( (c <= (char)0x20) || (c >= (char)0x7F) || 630 c == '"' || c == '<' || 631 c == '>' || c == '\\' || c == '^' || c == '`' || 632 c == '{' || c == '|' || c == '}') 633 { 634 // The percent encoding on this part should be done using UTF-8 charset 635 // as RFC 3986 Section 3.2.2 says. 636 // Also there is a reference on 637 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars 638 // that recommend use of UTF-8 instead the document character encoding. 639 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113) 640 app = percentEncode(c, "UTF-8"); 641 } 642 else if (c == '%') 643 { 644 if (i + 2 < length) 645 { 646 char c1 = string.charAt(i+1); 647 char c2 = string.charAt(i+2); 648 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && 649 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) 650 { 651 // do not percent encode, because it could be already encoded 652 // and we don't want encode it twice 653 } 654 else 655 { 656 app = percentEncode(c, UTF8); 657 } 658 } 659 else 660 { 661 app = percentEncode(c, UTF8); 662 } 663 } 664 else if (c == '?' || c == '#') 665 { 666 if (i+1 < length) 667 { 668 // The remaining part of the URI are data that should be encoded 669 // using the document character encoding. 670 app = c + encodeURIQuery(string.substring(i+1), characterEncoding); 671 endLoop = true; 672 } 673 } 674 else 675 { 676 //No encoding, just do nothing, char will be added later. 677 } 678 679 if (app != null) 680 { 681 if (sb == null) 682 { 683 sb = new StringBuilder(string.substring(0, i)); 684 } 685 sb.append(app); 686 } 687 else 688 { 689 if (sb != null) 690 { 691 sb.append(c); 692 } 693 } 694 if (endLoop) 695 { 696 break; 697 } 698 } 699 if (sb == null) 700 { 701 return string; 702 } 703 else 704 { 705 return sb.toString(); 706 } 707 } 708 709 /** 710 * Encode a unicode char value in percentEncode, decoding its bytes using a specified 711 * characterEncoding. 712 * 713 * @param c 714 * @param characterEncoding 715 * @return 716 */ 717 private static String percentEncode(char c, String characterEncoding) 718 { 719 String app = null; 720 if (c > (char)((short)0x007F)) 721 { 722 //percent encode in the proper encoding to be consistent 723 app = percentEncodeNonUsAsciiCharacter(c, characterEncoding); 724 } 725 else 726 { 727 //percent encode US-ASCII char (0x00-0x7F range) 728 app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10); 729 } 730 return app; 731 } 732 733 private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding) 734 { 735 ByteArrayOutputStream baos = new ByteArrayOutputStream(10); 736 StringBuilder builder = new StringBuilder(); 737 try 738 { 739 OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding); 740 writer.write(c); 741 writer.flush(); 742 } 743 catch(IOException e) 744 { 745 baos.reset(); 746 return null; 747 } 748 749 byte [] byteArray = baos.toByteArray(); 750 for (int i=0; i < byteArray.length; i++) 751 { 752 builder.append('%'); 753 builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); 754 builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); 755 } 756 757 return builder.toString(); 758 } 759 760 /** 761 * Encode the query part using the document charset encoding provided. 762 * 763 * 764 * @param string 765 * @param characterEncoding 766 * @return 767 */ 768 private static String encodeURIQuery(final String string, final String characterEncoding) 769 { 770 StringBuilder sb = null; //create later on demand 771 String app; 772 char c; 773 boolean endLoop = false; 774 int length = string.length(); 775 for (int i = 0; i < length; ++i) 776 { 777 app = null; 778 c = string.charAt(i); 779 780 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) 781 // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so 782 // we make easier and omit this one) 783 // - "<" %3C, ">" %3E, 784 // - "\" %5C, "^" %5E, "`" %60 785 // - "{" %7B, "|" %7C, "}" %7D 786 // - From %7F ad infinitum (each character as many bytes as necessary but take into account 787 // that a single char should contain 2,3 or more bytes!. This data should be encoded 788 // translating from the document character encoding to percent encoding) 789 // 790 // "&" should be encoded as "&" because this link is inside an html page, and 791 // put & is invalid in this context 792 793 if ( (c <= (char)0x20) || (c >= (char)0x7F) || 794 c == '"' || c == '<' || 795 c == '>' || c == '\\' || c == '^' || c == '`' || 796 c == '{' || c == '|' || c == '}') 797 { 798 // The percent encoding on this part should be done using UTF-8 charset 799 // as RFC 3986 Section 3.2.2 says 800 app = percentEncode(c, characterEncoding); 801 } 802 else if (c == '%') 803 { 804 if (i + 2 < length) 805 { 806 char c1 = string.charAt(i+1); 807 char c2 = string.charAt(i+2); 808 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && 809 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) 810 { 811 // do not percent encode, because it could be already encoded 812 } 813 else 814 { 815 app = percentEncode(c, characterEncoding); 816 } 817 } 818 else 819 { 820 app = percentEncode(c, characterEncoding); 821 } 822 } 823 else if (c == '&') 824 { 825 if (i+4 < length ) 826 { 827 if ('a' == string.charAt(i+1) && 828 'm' == string.charAt(i+2) && 829 'p' == string.charAt(i+3) && 830 ';' == string.charAt(i+4)) 831 { 832 //Skip 833 } 834 else 835 { 836 app = "&"; 837 } 838 } 839 else 840 { 841 app = "&"; 842 } 843 } 844 else 845 { 846 //No encoding, just do nothing, char will be added later. 847 } 848 849 if (app != null) 850 { 851 if (sb == null) 852 { 853 sb = new StringBuilder(string.substring(0, i)); 854 } 855 sb.append(app); 856 } 857 else 858 { 859 if (sb != null) 860 { 861 sb.append(c); 862 } 863 } 864 if (endLoop) 865 { 866 break; 867 } 868 } 869 if (sb == null) 870 { 871 return string; 872 } 873 else 874 { 875 return sb.toString(); 876 } 877 } 878 879 /** 880 * Encode an URI, escaping or percent-encoding all required characters and 881 * following the rules mentioned on RFC 3986. 882 * 883 * @param writer 884 * @param string 885 * @param characterEncoding 886 * @throws IOException 887 */ 888 public static void encodeURIAttribute(Writer writer, final String string, final String characterEncoding) 889 throws IOException 890 { 891 //StringBuilder sb = null; //create later on demand 892 int start = 0; 893 String app; 894 char c; 895 boolean endLoop = false; 896 int length = string.length(); 897 for (int i = 0; i < length; ++i) 898 { 899 app = null; 900 c = string.charAt(i); 901 902 // This are the guidelines to be taken into account by this algorithm to encode: 903 904 // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters 905 // 906 // control = <US-ASCII coded characters 00-1F and 7F hexadecimal> 907 // space = <US-ASCII coded character 20 hexadecimal> 908 // delims = "<" | ">" | "#" | "%" | <"> 909 // %3C %3E %23 %25 %22 910 // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`" 911 // %7D %7B %7C %5C %5E %5B %5D %60 912 // 913 // ".... Data corresponding to excluded characters must be escaped in order to 914 // be properly represented within a URI....." 915 916 // RFC 3986 Section 3. Syntax Components 917 // 918 // "... The generic URI syntax consists of a hierarchical sequence of 919 // components referred to as the scheme, authority, path, query, and 920 // fragment. 921 // 922 // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] 923 // 924 // hier-part = "//" authority path-abempty 925 // / path-absolute 926 // / path-rootless 927 // / path-empty 928 // ...." 929 930 // RFC 3986 Section 2.2: 931 // Reserved characters (should not be percent-encoded) 932 // reserved = gen-delims / sub-delims 933 // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" 934 // %3A %2F %3F %23 %5B %5D %40 935 // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 936 // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D 937 938 // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396, 939 // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) 940 // "...those rules were redefined to directly specify the characters allowed...." 941 // There is also other characters moved from excluded list to reserved: 942 // "[" / "]" / "#" 943 944 // RFC 3986 Section 2.3: 945 // "... for consistency, percent-encoded octets in the ranges of ALPHA 946 // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), 947 // underscore (%5F), or tilde (%7E) should not be created by URI 948 // producers...." 949 950 // RFC 3986 Section 3.2.2. Host 951 952 // host = IP-literal / IPv4address / reg-name 953 954 // The reg-name syntax allows percent-encoded octets in order to 955 // represent non-ASCII registered names in a uniform way that is 956 // independent of the underlying name resolution technology. Non-ASCII 957 // characters must first be encoded according to UTF-8 [STD63], and then 958 // each octet of the corresponding UTF-8 sequence must be percent- 959 // encoded to be represented as URI characters. URI producing 960 // applications must not use percent-encoding in host unless it is used 961 // to represent a UTF-8 character sequence. 962 963 // RFC 3986 Section 3.4 Query 964 // query = *( pchar / "/" / "?" ) 965 // 966 // "... However, as query components are often used to carry identifying information 967 // in the form of "key=value" pairs and one frequently used value is a reference to 968 // another URI, it is sometimes better for usability to avoid percent-encoding those characters....." 969 // 970 // RFC 3986 Section 2.5 Identifying Data (Apply to query section) 971 // 972 // When a new URI scheme defines a component that represents textual 973 // data consisting of characters from the Universal Character Set [UCS], 974 // the data should first be encoded as octets according to the UTF-8 975 // character encoding [STD63]; then only those octets that do not 976 // correspond to characters in the unreserved set should be percent- 977 // encoded. For example, the character A would be represented as "A", 978 // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented 979 // as "%C3%80", and the character KATAKANA LETTER A would be represented 980 // as "%E3%82%A2". 981 // 982 // RFC 3986 Section 3.5 Fragment 983 // fragment = *( pchar / "/" / "?" ) 984 // 985 // Note that follows the same as query 986 987 // Based on the extracts the strategy to apply on this method is: 988 // 989 // On scheme ":" hier-part 990 // 991 // Escape or percent encode chars inside : 992 // 993 // - From %00 to %20, 994 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 995 // duplicate encoding, encode it when we are sure 996 // that there are not encoded twice) 997 // - "<" %3C, ">" %3E 998 // - "\" %5C, "^" %5E, "`" %60 999 // - "{" %7B, "|" %7C, "}" %7D 1000 // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this 1001 // part of an URI, but it is preferred to encode it that omit it). 1002 // 1003 // The remaining characters must not be encoded 1004 // 1005 // Characters after ? or # should be percent encoding but only the necessary ones: 1006 // 1007 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) 1008 // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of 1009 // duplicate encoding, encode it when we are sure 1010 // that there are not encoded twice) 1011 // - "<" %3C, ">" %3E, 1012 // - "\" %5C, "^" %5E, "`" %60 1013 // - "{" %7B, "|" %7C, "}" %7D 1014 // - From %7F ad infinitum (each character as many bytes as necessary but take into account 1015 // that a single char should contain 2,3 or more bytes!. This data should be encoded 1016 // translating from the document character encoding to percent encoding, because this values 1017 // could be retrieved from httpRequest.getParameter() and it uses the current character encoding 1018 // for decode values) 1019 // 1020 // "&" should be encoded as "&" because this link is inside an html page, and 1021 // put only & is invalid in this context. 1022 1023 if ( (c <= (char)0x20) || (c >= (char)0x7F) || 1024 c == '"' || c == '<' || 1025 c == '>' || c == '\\' || c == '^' || c == '`' || 1026 c == '{' || c == '|' || c == '}') 1027 { 1028 // The percent encoding on this part should be done using UTF-8 charset 1029 // as RFC 3986 Section 3.2.2 says. 1030 // Also there is a reference on 1031 // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars 1032 // that recommend use of UTF-8 instead the document character encoding. 1033 // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113) 1034 //app = percentEncode(c, "UTF-8"); 1035 if (start < i) 1036 { 1037 writer.write(string, start, i-start); 1038 } 1039 start = i+1; 1040 percentEncode(writer, c, "UTF-8"); 1041 } 1042 else if (c == '%') 1043 { 1044 if (i + 2 < length) 1045 { 1046 char c1 = string.charAt(i+1); 1047 char c2 = string.charAt(i+2); 1048 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && 1049 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) 1050 { 1051 // do not percent encode, because it could be already encoded 1052 // and we don't want encode it twice 1053 } 1054 else 1055 { 1056 //app = percentEncode(c, UTF8); 1057 if (start < i) 1058 { 1059 writer.write(string, start, i-start); 1060 } 1061 start = i+1; 1062 percentEncode(writer, c, UTF8); 1063 } 1064 } 1065 else 1066 { 1067 //app = percentEncode(c, UTF8); 1068 if (start < i) 1069 { 1070 writer.write(string, start, i-start); 1071 } 1072 start = i+1; 1073 percentEncode(writer, c, UTF8); 1074 } 1075 } 1076 else if (c == '?' || c == '#') 1077 { 1078 if (i+1 < length) 1079 { 1080 // The remaining part of the URI are data that should be encoded 1081 // using the document character encoding. 1082 //app = c + encodeURIQuery(string.substring(i+1), characterEncoding); 1083 if (start < i) 1084 { 1085 writer.write(string, start, i-start); 1086 } 1087 start = i+1; 1088 writer.write(c); 1089 //encodeURIQuery(writer, string.substring(i+1), characterEncoding); 1090 encodeURIQuery(writer, string, i+1, characterEncoding); 1091 endLoop = true; 1092 } 1093 } 1094 else 1095 { 1096 //No encoding, just do nothing, char will be added later. 1097 } 1098 1099 if (app != null) 1100 { 1101 //if (sb == null) 1102 //{ 1103 // sb = new StringBuilder(string.substring(0, i)); 1104 //} 1105 //sb.append(app); 1106 if (start < i) 1107 { 1108 writer.write(string, start, i-start); 1109 } 1110 start = i+1; 1111 writer.write(app); 1112 } 1113 //else 1114 //{ 1115 // if (sb != null) 1116 // { 1117 // sb.append(c); 1118 // } 1119 //} 1120 if (endLoop) 1121 { 1122 start = length; 1123 break; 1124 } 1125 } 1126 //if (sb == null) 1127 //{ 1128 // return string; 1129 //} 1130 //else 1131 //{ 1132 // return sb.toString(); 1133 //} 1134 if (start == 0) 1135 { 1136 writer.write(string); 1137 } 1138 else if (start < length) 1139 { 1140 writer.write(string,start,length-start); 1141 } 1142 } 1143 1144 /** 1145 * Encode a unicode char value in percentEncode, decoding its bytes using a specified 1146 * characterEncoding. 1147 * 1148 * @param c 1149 * @param characterEncoding 1150 * @return 1151 */ 1152 private static void percentEncode(Writer writer, char c, String characterEncoding) throws IOException 1153 { 1154 if (c > (char)((short)0x007F)) 1155 { 1156 //percent encode in the proper encoding to be consistent 1157 percentEncodeNonUsAsciiCharacter(writer, c, characterEncoding); 1158 } 1159 else 1160 { 1161 //percent encode US-ASCII char (0x00-0x7F range) 1162 writer.write('%'); 1163 writer.write(HEX_CHARSET.charAt( ((c >> 0x4) % 0x10))); 1164 writer.write(HEX_CHARSET.charAt(c % 0x10)); 1165 } 1166 } 1167 1168 private static void percentEncodeNonUsAsciiCharacter(Writer currentWriter, char c, String characterEncoding) 1169 throws IOException 1170 { 1171 ByteArrayOutputStream baos = new ByteArrayOutputStream(10); 1172 1173 try 1174 { 1175 OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding); 1176 writer.write(c); 1177 writer.flush(); 1178 } 1179 catch(IOException e) 1180 { 1181 baos.reset(); 1182 return; 1183 } 1184 1185 byte [] byteArray = baos.toByteArray(); 1186 for (int i=0; i < byteArray.length; i++) 1187 { 1188 //builder.append('%'); 1189 //builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); 1190 //builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); 1191 currentWriter.write('%'); 1192 currentWriter.write(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); 1193 currentWriter.write(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); 1194 } 1195 1196 //return builder.toString(); 1197 } 1198 1199 /** 1200 * Encode the query part using the document charset encoding provided. 1201 * 1202 * 1203 * @param string 1204 * @param characterEncoding 1205 * @return 1206 */ 1207 private static void encodeURIQuery(Writer writer, final String string, int offset, final String characterEncoding) 1208 throws IOException 1209 { 1210 //StringBuilder sb = null; //create later on demand 1211 int start = offset; 1212 int length = string.length(); 1213 int realLength = length-offset; 1214 String app; 1215 char c; 1216 //boolean endLoop = false; 1217 for (int i = offset; i < length; ++i) 1218 { 1219 app = null; 1220 c = string.charAt(i); 1221 1222 // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) 1223 // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so 1224 // we make easier and omit this one) 1225 // - "<" %3C, ">" %3E, 1226 // - "\" %5C, "^" %5E, "`" %60 1227 // - "{" %7B, "|" %7C, "}" %7D 1228 // - From %7F ad infinitum (each character as many bytes as necessary but take into account 1229 // that a single char should contain 2,3 or more bytes!. This data should be encoded 1230 // translating from the document character encoding to percent encoding) 1231 // 1232 // "&" should be encoded as "&" because this link is inside an html page, and 1233 // put & is invalid in this context 1234 1235 if ( (c <= (char)0x20) || (c >= (char)0x7F) || 1236 c == '"' || c == '<' || 1237 c == '>' || c == '\\' || c == '^' || c == '`' || 1238 c == '{' || c == '|' || c == '}') 1239 { 1240 // The percent encoding on this part should be done using UTF-8 charset 1241 // as RFC 3986 Section 3.2.2 says 1242 //app = percentEncode(c, characterEncoding); 1243 if (start < i) 1244 { 1245 writer.write(string, start, i-start); 1246 } 1247 start = i+1; 1248 percentEncode(writer, c, characterEncoding); 1249 } 1250 else if (c == '%') 1251 { 1252 if (i + 2 < length) 1253 { 1254 char c1 = string.charAt(i+1); 1255 char c2 = string.charAt(i+2); 1256 if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && 1257 (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) 1258 { 1259 // do not percent encode, because it could be already encoded 1260 } 1261 else 1262 { 1263 //app = percentEncode(c, characterEncoding); 1264 if (start < i) 1265 { 1266 writer.write(string, start, i-start); 1267 } 1268 start = i+1; 1269 percentEncode(writer, c, characterEncoding); 1270 } 1271 } 1272 else 1273 { 1274 //app = percentEncode(c, characterEncoding); 1275 if (start < i) 1276 { 1277 writer.write(string, start, i-start); 1278 } 1279 start = i+1; 1280 percentEncode(writer, c, characterEncoding); 1281 } 1282 } 1283 else if (c == '&') 1284 { 1285 if (i+4 < length ) 1286 { 1287 if ('a' == string.charAt(i+1) && 1288 'm' == string.charAt(i+2) && 1289 'p' == string.charAt(i+3) && 1290 ';' == string.charAt(i+4)) 1291 { 1292 //Skip 1293 } 1294 else 1295 { 1296 app = "&"; 1297 } 1298 } 1299 else 1300 { 1301 app = "&"; 1302 } 1303 } 1304 else 1305 { 1306 //No encoding, just do nothing, char will be added later. 1307 } 1308 1309 if (app != null) 1310 { 1311 //if (sb == null) 1312 //{ 1313 // sb = new StringBuilder(string.substring(0, i)); 1314 //} 1315 //sb.append(app); 1316 if (start < i) 1317 { 1318 writer.write(string, start, i-start); 1319 } 1320 start = i+1; 1321 writer.write(app); 1322 } 1323 //else 1324 //{ 1325 // if (sb != null) 1326 // { 1327 // sb.append(c); 1328 // } 1329 //} 1330 //if (endLoop) 1331 //{ 1332 // break; 1333 //} 1334 } 1335 1336 //if (sb == null) 1337 //{ 1338 // return string; 1339 //} 1340 //else 1341 //{ 1342 // return sb.toString(); 1343 //} 1344 if (start == offset) 1345 { 1346 writer.write(string, offset, realLength); 1347 } 1348 else if (start < length) 1349 { 1350 writer.write(string,start,length-start); 1351 } 1352 } 1353 }