Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
HTMLEncoder |
|
| 14.15;14.15 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one | |
3 | * or more contributor license agreements. See the NOTICE file | |
4 | * distributed with this work for additional information | |
5 | * regarding copyright ownership. The ASF licenses this file | |
6 | * to you under the Apache License, Version 2.0 (the | |
7 | * "License"); you may not use this file except in compliance | |
8 | * with the License. You may obtain a copy of the License at | |
9 | * | |
10 | * http://www.apache.org/licenses/LICENSE-2.0 | |
11 | * | |
12 | * Unless required by applicable law or agreed to in writing, | |
13 | * software distributed under the License is distributed on an | |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
15 | * KIND, either express or implied. See the License for the | |
16 | * specific language governing permissions and limitations | |
17 | * under the License. | |
18 | */ | |
19 | package org.apache.myfaces.shared.renderkit.html.util; | |
20 | ||
21 | import java.io.ByteArrayOutputStream; | |
22 | import java.io.IOException; | |
23 | import java.io.OutputStreamWriter; | |
24 | import java.io.Writer; | |
25 | ||
26 | /** | |
27 | * Converts Strings so that they can be used within HTML-Code. | |
28 | */ | |
29 | 0 | public abstract class HTMLEncoder |
30 | { | |
31 | /** | |
32 | * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. | |
33 | */ | |
34 | public static String encode (String string) | |
35 | { | |
36 | 5 | return encode(string, false, true); |
37 | } | |
38 | ||
39 | /** | |
40 | * Variant of {@link #encode} where encodeNbsp is true. | |
41 | */ | |
42 | public static String encode (String string, boolean encodeNewline) | |
43 | { | |
44 | 2 | return encode(string, encodeNewline, true); |
45 | } | |
46 | ||
47 | /** | |
48 | * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true | |
49 | */ | |
50 | public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) | |
51 | { | |
52 | 7 | return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true); |
53 | } | |
54 | ||
55 | /** | |
56 | * Encodes the given string, so that it can be used within a html page. | |
57 | * @param string the string to convert | |
58 | * @param encodeNewline if true newline characters are converted to <br>'s | |
59 | * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s | |
60 | * @param encodeNonLatin if true encode non-latin characters as numeric character references | |
61 | */ | |
62 | public static String encode (String string, | |
63 | boolean encodeNewline, | |
64 | boolean encodeSubsequentBlanksToNbsp, | |
65 | boolean encodeNonLatin) | |
66 | { | |
67 | 7 | if (string == null) |
68 | { | |
69 | 1 | return ""; |
70 | } | |
71 | ||
72 | 6 | StringBuilder sb = null; //create later on demand |
73 | String app; | |
74 | 6 | char c = ' '; |
75 | char prevC; | |
76 | 6 | int length = string.length(); |
77 | 172 | for (int i = 0; i < length; ++i) |
78 | { | |
79 | 166 | app = null; |
80 | 166 | prevC = c; |
81 | 166 | c = string.charAt(i); |
82 | ||
83 | // All characters before letters | |
84 | 166 | if ((int)c < 0x41) |
85 | { | |
86 | 36 | switch (c) |
87 | { | |
88 | 2 | case '"': app = """; break; //" |
89 | 1 | case '&': app = "&"; break; //& |
90 | 1 | case '<': app = "<"; break; //< |
91 | 4 | case '>': app = ">"; break; //> |
92 | case ' ': | |
93 | 19 | if (encodeSubsequentBlanksToNbsp && |
94 | prevC == ' ') | |
95 | { | |
96 | //Space at beginning or after another space | |
97 | 0 | app = " "; |
98 | } | |
99 | break; | |
100 | case '\n': | |
101 | 2 | if (encodeNewline) |
102 | { | |
103 | 1 | app = "<br/>"; |
104 | } | |
105 | break; | |
106 | default: | |
107 | break; | |
108 | } | |
109 | // http://www.w3.org/MarkUp/html3/specialchars.html | |
110 | // From C0 extension U+0000-U+001F only U+0009, U+000A and | |
111 | // U+000D are valid control characters | |
112 | 36 | if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D) |
113 | { | |
114 | // Ignore escape character | |
115 | 0 | app = ""; |
116 | } | |
117 | } | |
118 | 130 | else if (encodeNonLatin && (int)c > 0x80) |
119 | { | |
120 | 2 | switch(c) |
121 | { | |
122 | //german umlauts | |
123 | 0 | case '\u00E4' : app = "ä"; break; |
124 | 0 | case '\u00C4' : app = "Ä"; break; |
125 | 2 | case '\u00F6' : app = "ö"; break; |
126 | 0 | case '\u00D6' : app = "Ö"; break; |
127 | 0 | case '\u00FC' : app = "ü"; break; |
128 | 0 | case '\u00DC' : app = "Ü"; break; |
129 | 0 | case '\u00DF' : app = "ß"; break; |
130 | ||
131 | //misc | |
132 | //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? | |
133 | 0 | case '\u20AC': app = "€"; break; |
134 | 0 | case '\u00AB': app = "«"; break; |
135 | 0 | case '\u00BB': app = "»"; break; |
136 | 0 | case '\u00A0': app = " "; break; |
137 | ||
138 | default : | |
139 | //encode all non basic latin characters | |
140 | 0 | app = "&#" + ((int)c) + ";"; |
141 | break; | |
142 | } | |
143 | } | |
144 | 166 | if (app != null) |
145 | { | |
146 | 11 | if (sb == null) |
147 | { | |
148 | 4 | sb = new StringBuilder(string.substring(0, i)); |
149 | } | |
150 | 11 | sb.append(app); |
151 | } | |
152 | else | |
153 | { | |
154 | 155 | if (sb != null) |
155 | { | |
156 | 109 | sb.append(c); |
157 | } | |
158 | } | |
159 | } | |
160 | ||
161 | 6 | if (sb == null) |
162 | { | |
163 | 2 | return string; |
164 | } | |
165 | else | |
166 | { | |
167 | 4 | return sb.toString(); |
168 | } | |
169 | } | |
170 | ||
171 | /** | |
172 | * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. | |
173 | */ | |
174 | public static void encode (Writer writer, String string) throws IOException | |
175 | { | |
176 | 5 | encode(writer, string, false, true); |
177 | 5 | } |
178 | ||
179 | /** | |
180 | * Variant of {@link #encode} where encodeNbsp is true. | |
181 | */ | |
182 | public static void encode (Writer writer, String string, boolean encodeNewline) throws IOException | |
183 | { | |
184 | 2 | encode(writer, string, encodeNewline, true); |
185 | 2 | } |
186 | ||
187 | /** | |
188 | * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true | |
189 | */ | |
190 | public static void encode (Writer writer, String string, | |
191 | boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) throws IOException | |
192 | { | |
193 | 7 | encode(writer, string, encodeNewline, encodeSubsequentBlanksToNbsp, true); |
194 | 7 | } |
195 | ||
196 | public static void encode (Writer writer, String string, | |
197 | boolean encodeNewline, | |
198 | boolean encodeSubsequentBlanksToNbsp, | |
199 | boolean encodeNonLatin) throws IOException | |
200 | { | |
201 | 11 | if (string == null) |
202 | { | |
203 | 1 | return; |
204 | } | |
205 | ||
206 | 10 | int start = 0; |
207 | String app; | |
208 | 10 | char c = ' '; |
209 | char prevC; | |
210 | 10 | int length = string.length(); |
211 | 196 | for (int i = 0; i < length; ++i) |
212 | { | |
213 | 186 | app = null; |
214 | 186 | prevC = c; |
215 | 186 | c = string.charAt(i); |
216 | ||
217 | // All characters before letters | |
218 | 186 | if ((int)c < 0x41) |
219 | { | |
220 | 36 | switch (c) |
221 | { | |
222 | 2 | case '"': app = """; break; //" |
223 | 1 | case '&': app = "&"; break; //& |
224 | 1 | case '<': app = "<"; break; //< |
225 | 4 | case '>': app = ">"; break; //> |
226 | case ' ': | |
227 | 19 | if (encodeSubsequentBlanksToNbsp && |
228 | prevC == ' ') | |
229 | { | |
230 | //Space at beginning or after another space | |
231 | 0 | app = " "; |
232 | } | |
233 | break; | |
234 | case '\n': | |
235 | 2 | if (encodeNewline) |
236 | { | |
237 | 1 | app = "<br/>"; |
238 | } | |
239 | break; | |
240 | default: | |
241 | break; | |
242 | } | |
243 | // http://www.w3.org/MarkUp/html3/specialchars.html | |
244 | // From C0 extension U+0000-U+001F only U+0009, U+000A and | |
245 | // U+000D are valid control characters | |
246 | 36 | if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D) |
247 | { | |
248 | // Ignore escape character | |
249 | 0 | app = ""; |
250 | } | |
251 | } | |
252 | 150 | else if (encodeNonLatin && (int)c > 0x80) |
253 | { | |
254 | 2 | switch(c) |
255 | { | |
256 | //german umlauts | |
257 | 0 | case '\u00E4' : app = "ä"; break; |
258 | 0 | case '\u00C4' : app = "Ä"; break; |
259 | 2 | case '\u00F6' : app = "ö"; break; |
260 | 0 | case '\u00D6' : app = "Ö"; break; |
261 | 0 | case '\u00FC' : app = "ü"; break; |
262 | 0 | case '\u00DC' : app = "Ü"; break; |
263 | 0 | case '\u00DF' : app = "ß"; break; |
264 | ||
265 | //misc | |
266 | //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? | |
267 | 0 | case '\u20AC': app = "€"; break; |
268 | 0 | case '\u00AB': app = "«"; break; |
269 | 0 | case '\u00BB': app = "»"; break; |
270 | 0 | case '\u00A0': app = " "; break; |
271 | ||
272 | default : | |
273 | //encode all non basic latin characters | |
274 | 0 | app = "&#" + ((int)c) + ";"; |
275 | break; | |
276 | } | |
277 | } | |
278 | 186 | if (app != null) |
279 | { | |
280 | //if (sb == null) | |
281 | //{ | |
282 | // sb = new StringBuilder(string.substring(0, i)); | |
283 | //} | |
284 | //sb.append(app); | |
285 | 11 | if (start < i) |
286 | { | |
287 | 8 | writer.write(string, start, i-start); |
288 | } | |
289 | 11 | start = i+1; |
290 | 11 | writer.write(app); |
291 | } | |
292 | //else | |
293 | //{ | |
294 | // if (sb != null) | |
295 | // { | |
296 | // sb.append(c); | |
297 | // } | |
298 | //} | |
299 | } | |
300 | ||
301 | //if (sb == null) | |
302 | //{ | |
303 | // return string; | |
304 | //} | |
305 | //else | |
306 | //{ | |
307 | // return sb.toString(); | |
308 | //} | |
309 | 10 | if (start == 0) |
310 | { | |
311 | 6 | writer.write(string); |
312 | } | |
313 | 4 | else if (start < length) |
314 | { | |
315 | 0 | writer.write(string,start,length-start); |
316 | } | |
317 | 10 | } |
318 | ||
319 | ||
320 | /** | |
321 | * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. | |
322 | */ | |
323 | public static void encode (char[] string, int offset, int length, Writer writer) throws IOException | |
324 | { | |
325 | 20 | encode(string, offset, length, false, true, writer); |
326 | 20 | } |
327 | ||
328 | /** | |
329 | * Variant of {@link #encode} where encodeNbsp is true. | |
330 | */ | |
331 | public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer) | |
332 | throws IOException | |
333 | { | |
334 | 6 | encode(string, offset, length, encodeNewline, true, writer); |
335 | 6 | } |
336 | ||
337 | /** | |
338 | * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true | |
339 | */ | |
340 | public static void encode (char[] string, int offset, int length, boolean encodeNewline, | |
341 | boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException | |
342 | { | |
343 | 26 | encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer); |
344 | 26 | } |
345 | ||
346 | ||
347 | /** | |
348 | * Encodes the given string, so that it can be used within a html page. | |
349 | * @param string the string to convert | |
350 | * @param encodeNewline if true newline characters are converted to <br>'s | |
351 | * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s | |
352 | * @param encodeNonLatin if true encode non-latin characters as numeric character references | |
353 | */ | |
354 | public static void encode (char[] string, int offset, int length, | |
355 | boolean encodeNewline, | |
356 | boolean encodeSubsequentBlanksToNbsp, | |
357 | boolean encodeNonLatin, Writer writer) throws IOException | |
358 | { | |
359 | 26 | if (string == null || length < 0 || offset >= string.length) |
360 | { | |
361 | 8 | return; |
362 | } | |
363 | 18 | offset = Math.max(0, offset); |
364 | 18 | int realLength = Math.min(length, string.length - offset); |
365 | ||
366 | //StringBuilder sb = null; //create later on demand | |
367 | String app; | |
368 | 18 | char c = ' '; |
369 | char prevC; | |
370 | 18 | int start = offset; |
371 | ||
372 | 590 | for (int i = offset; i < offset + realLength; ++i) |
373 | { | |
374 | 572 | app = null; |
375 | 572 | prevC = c; |
376 | 572 | c = string[i]; |
377 | ||
378 | // All characters before letters | |
379 | 572 | if ((int)c < 0x41) |
380 | { | |
381 | 120 | switch (c) |
382 | { | |
383 | 4 | case '"': app = """; break; //" |
384 | 4 | case '&': app = "&"; break; //& |
385 | 2 | case '<': app = "<"; break; //< |
386 | 12 | case '>': app = ">"; break; //> |
387 | case ' ': | |
388 | 68 | if (encodeSubsequentBlanksToNbsp && |
389 | prevC == ' ') | |
390 | { | |
391 | //Space at beginning or after another space | |
392 | 0 | app = " "; |
393 | } | |
394 | break; | |
395 | case '\n': | |
396 | 8 | if (encodeNewline) |
397 | { | |
398 | 2 | app = "<br/>"; |
399 | } | |
400 | break; | |
401 | default: | |
402 | break; | |
403 | } | |
404 | // http://www.w3.org/MarkUp/html3/specialchars.html | |
405 | // From C0 extension U+0000-U+001F only U+0009, U+000A and | |
406 | // U+000D are valid control characters | |
407 | 120 | if (c <= 0x1F && c != 0x09 && c != 0x0A && c != 0x0D) |
408 | { | |
409 | // Ignore escape character | |
410 | 0 | app = ""; |
411 | } | |
412 | } | |
413 | 452 | else if (encodeNonLatin && (int)c > 0x80) |
414 | { | |
415 | 8 | switch(c) |
416 | { | |
417 | //german umlauts | |
418 | 0 | case '\u00E4' : app = "ä"; break; |
419 | 0 | case '\u00C4' : app = "Ä"; break; |
420 | 8 | case '\u00F6' : app = "ö"; break; |
421 | 0 | case '\u00D6' : app = "Ö"; break; |
422 | 0 | case '\u00FC' : app = "ü"; break; |
423 | 0 | case '\u00DC' : app = "Ü"; break; |
424 | 0 | case '\u00DF' : app = "ß"; break; |
425 | ||
426 | //misc | |
427 | //case 0x80: app = "€"; break; sometimes euro symbol is ascii 128, should we suport it? | |
428 | 0 | case '\u20AC': app = "€"; break; |
429 | 0 | case '\u00AB': app = "«"; break; |
430 | 0 | case '\u00BB': app = "»"; break; |
431 | 0 | case '\u00A0': app = " "; break; |
432 | ||
433 | default : | |
434 | //encode all non basic latin characters | |
435 | 0 | app = "&#" + ((int)c) + ";"; |
436 | break; | |
437 | } | |
438 | } | |
439 | 572 | if (app != null) |
440 | { | |
441 | //if (sb == null) | |
442 | //{ | |
443 | // sb = new StringBuilder(realLength*2); | |
444 | // sb.append(string, offset, i - offset); | |
445 | //} | |
446 | //sb.append(app); | |
447 | 32 | if (start < i) |
448 | { | |
449 | 24 | writer.write(string, start, i-start); |
450 | } | |
451 | 32 | start = i+1; |
452 | 32 | writer.write(app); |
453 | } | |
454 | /* | |
455 | else | |
456 | { | |
457 | if (sb != null) | |
458 | { | |
459 | sb.append(c); | |
460 | } | |
461 | }*/ | |
462 | } | |
463 | ||
464 | //if (sb == null) | |
465 | //{ | |
466 | // writer.write(string, offset, realLength); | |
467 | //} | |
468 | //else | |
469 | //{ | |
470 | // writer.write(sb.toString()); | |
471 | //} | |
472 | 18 | if (start == offset) |
473 | { | |
474 | 4 | writer.write(string, offset, realLength); |
475 | } | |
476 | 14 | else if (start < offset+realLength) |
477 | { | |
478 | 2 | writer.write(string,start,offset+realLength-start); |
479 | } | |
480 | 18 | } |
481 | ||
482 | private static final String HEX_CHARSET = "0123456789ABCDEF"; | |
483 | ||
484 | private static final String UTF8 = "UTF-8"; | |
485 | ||
486 | /** | |
487 | * Encode an URI, escaping or percent-encoding all required characters and | |
488 | * following the rules mentioned on RFC 3986. | |
489 | * | |
490 | * @param string | |
491 | * @param encodeNonLatin | |
492 | * @return | |
493 | * @throws IOException | |
494 | */ | |
495 | public static String encodeURIAttribute(final String string, final String characterEncoding) | |
496 | throws IOException | |
497 | { | |
498 | 13 | StringBuilder sb = null; //create later on demand |
499 | String app; | |
500 | char c; | |
501 | 13 | boolean endLoop = false; |
502 | 13 | int length = string.length(); |
503 | 174 | for (int i = 0; i < length; ++i) |
504 | { | |
505 | 168 | app = null; |
506 | 168 | c = string.charAt(i); |
507 | ||
508 | // This are the guidelines to be taken into account by this algorithm to encode: | |
509 | ||
510 | // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters | |
511 | // | |
512 | // control = <US-ASCII coded characters 00-1F and 7F hexadecimal> | |
513 | // space = <US-ASCII coded character 20 hexadecimal> | |
514 | // delims = "<" | ">" | "#" | "%" | <"> | |
515 | // %3C %3E %23 %25 %22 | |
516 | // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`" | |
517 | // %7D %7B %7C %5C %5E %5B %5D %60 | |
518 | // | |
519 | // ".... Data corresponding to excluded characters must be escaped in order to | |
520 | // be properly represented within a URI....." | |
521 | ||
522 | // RFC 3986 Section 3. Syntax Components | |
523 | // | |
524 | // "... The generic URI syntax consists of a hierarchical sequence of | |
525 | // components referred to as the scheme, authority, path, query, and | |
526 | // fragment. | |
527 | // | |
528 | // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] | |
529 | // | |
530 | // hier-part = "//" authority path-abempty | |
531 | // / path-absolute | |
532 | // / path-rootless | |
533 | // / path-empty | |
534 | // ...." | |
535 | ||
536 | // RFC 3986 Section 2.2: | |
537 | // Reserved characters (should not be percent-encoded) | |
538 | // reserved = gen-delims / sub-delims | |
539 | // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" | |
540 | // %3A %2F %3F %23 %5B %5D %40 | |
541 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" | |
542 | // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D | |
543 | ||
544 | // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396, | |
545 | // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) | |
546 | // "...those rules were redefined to directly specify the characters allowed...." | |
547 | // There is also other characters moved from excluded list to reserved: | |
548 | // "[" / "]" / "#" | |
549 | ||
550 | // RFC 3986 Section 2.3: | |
551 | // "... for consistency, percent-encoded octets in the ranges of ALPHA | |
552 | // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), | |
553 | // underscore (%5F), or tilde (%7E) should not be created by URI | |
554 | // producers...." | |
555 | ||
556 | // RFC 3986 Section 3.2.2. Host | |
557 | ||
558 | // host = IP-literal / IPv4address / reg-name | |
559 | ||
560 | // The reg-name syntax allows percent-encoded octets in order to | |
561 | // represent non-ASCII registered names in a uniform way that is | |
562 | // independent of the underlying name resolution technology. Non-ASCII | |
563 | // characters must first be encoded according to UTF-8 [STD63], and then | |
564 | // each octet of the corresponding UTF-8 sequence must be percent- | |
565 | // encoded to be represented as URI characters. URI producing | |
566 | // applications must not use percent-encoding in host unless it is used | |
567 | // to represent a UTF-8 character sequence. | |
568 | ||
569 | // RFC 3986 Section 3.4 Query | |
570 | // query = *( pchar / "/" / "?" ) | |
571 | // | |
572 | // "... However, as query components are often used to carry identifying information | |
573 | // in the form of "key=value" pairs and one frequently used value is a reference to | |
574 | // another URI, it is sometimes better for usability to avoid percent-encoding those characters....." | |
575 | // | |
576 | // RFC 3986 Section 2.5 Identifying Data (Apply to query section) | |
577 | // | |
578 | // When a new URI scheme defines a component that represents textual | |
579 | // data consisting of characters from the Universal Character Set [UCS], | |
580 | // the data should first be encoded as octets according to the UTF-8 | |
581 | // character encoding [STD63]; then only those octets that do not | |
582 | // correspond to characters in the unreserved set should be percent- | |
583 | // encoded. For example, the character A would be represented as "A", | |
584 | // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented | |
585 | // as "%C3%80", and the character KATAKANA LETTER A would be represented | |
586 | // as "%E3%82%A2". | |
587 | // | |
588 | // RFC 3986 Section 3.5 Fragment | |
589 | // fragment = *( pchar / "/" / "?" ) | |
590 | // | |
591 | // Note that follows the same as query | |
592 | ||
593 | // Based on the extracts the strategy to apply on this method is: | |
594 | // | |
595 | // On scheme ":" hier-part | |
596 | // | |
597 | // Escape or percent encode chars inside : | |
598 | // | |
599 | // - From %00 to %20, | |
600 | // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of | |
601 | // duplicate encoding, encode it when we are sure | |
602 | // that there are not encoded twice) | |
603 | // - "<" %3C, ">" %3E | |
604 | // - "\" %5C, "^" %5E, "`" %60 | |
605 | // - "{" %7B, "|" %7C, "}" %7D | |
606 | // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this | |
607 | // part of an URI, but it is preferred to encode it that omit it). | |
608 | // | |
609 | // The remaining characters must not be encoded | |
610 | // | |
611 | // Characters after ? or # should be percent encoding but only the necessary ones: | |
612 | // | |
613 | // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) | |
614 | // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of | |
615 | // duplicate encoding, encode it when we are sure | |
616 | // that there are not encoded twice) | |
617 | // - "<" %3C, ">" %3E, | |
618 | // - "\" %5C, "^" %5E, "`" %60 | |
619 | // - "{" %7B, "|" %7C, "}" %7D | |
620 | // - From %7F ad infinitum (each character as many bytes as necessary but take into account | |
621 | // that a single char should contain 2,3 or more bytes!. This data should be encoded | |
622 | // translating from the document character encoding to percent encoding, because this values | |
623 | // could be retrieved from httpRequest.getParameter() and it uses the current character encoding | |
624 | // for decode values) | |
625 | // | |
626 | // "&" should be encoded as "&" because this link is inside an html page, and | |
627 | // put only & is invalid in this context. | |
628 | ||
629 | 168 | if ( (c <= (char)0x20) || (c >= (char)0x7F) || |
630 | c == '"' || c == '<' || | |
631 | c == '>' || c == '\\' || c == '^' || c == '`' || | |
632 | c == '{' || c == '|' || c == '}') | |
633 | { | |
634 | // The percent encoding on this part should be done using UTF-8 charset | |
635 | // as RFC 3986 Section 3.2.2 says. | |
636 | // Also there is a reference on | |
637 | // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars | |
638 | // that recommend use of UTF-8 instead the document character encoding. | |
639 | // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113) | |
640 | 42 | app = percentEncode(c, "UTF-8"); |
641 | } | |
642 | 126 | else if (c == '%') |
643 | { | |
644 | 2 | if (i + 2 < length) |
645 | { | |
646 | 2 | char c1 = string.charAt(i+1); |
647 | 2 | char c2 = string.charAt(i+2); |
648 | 2 | if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && |
649 | (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) | |
650 | { | |
651 | // do not percent encode, because it could be already encoded | |
652 | // and we don't want encode it twice | |
653 | } | |
654 | else | |
655 | { | |
656 | 2 | app = percentEncode(c, UTF8); |
657 | } | |
658 | 2 | } |
659 | else | |
660 | { | |
661 | 0 | app = percentEncode(c, UTF8); |
662 | } | |
663 | } | |
664 | 124 | else if (c == '?' || c == '#') |
665 | { | |
666 | 7 | if (i+1 < length) |
667 | { | |
668 | // The remaining part of the URI are data that should be encoded | |
669 | // using the document character encoding. | |
670 | 7 | app = c + encodeURIQuery(string.substring(i+1), characterEncoding); |
671 | 7 | endLoop = true; |
672 | } | |
673 | } | |
674 | else | |
675 | { | |
676 | //No encoding, just do nothing, char will be added later. | |
677 | } | |
678 | ||
679 | 168 | if (app != null) |
680 | { | |
681 | 51 | if (sb == null) |
682 | { | |
683 | 11 | sb = new StringBuilder(string.substring(0, i)); |
684 | } | |
685 | 51 | sb.append(app); |
686 | } | |
687 | else | |
688 | { | |
689 | 117 | if (sb != null) |
690 | { | |
691 | 0 | sb.append(c); |
692 | } | |
693 | } | |
694 | 168 | if (endLoop) |
695 | { | |
696 | 7 | break; |
697 | } | |
698 | } | |
699 | 13 | if (sb == null) |
700 | { | |
701 | 2 | return string; |
702 | } | |
703 | else | |
704 | { | |
705 | 11 | return sb.toString(); |
706 | } | |
707 | } | |
708 | ||
709 | /** | |
710 | * Encode a unicode char value in percentEncode, decoding its bytes using a specified | |
711 | * characterEncoding. | |
712 | * | |
713 | * @param c | |
714 | * @param characterEncoding | |
715 | * @return | |
716 | */ | |
717 | private static String percentEncode(char c, String characterEncoding) | |
718 | { | |
719 | 87 | String app = null; |
720 | 87 | if (c > (char)((short)0x007F)) |
721 | { | |
722 | //percent encode in the proper encoding to be consistent | |
723 | 39 | app = percentEncodeNonUsAsciiCharacter(c, characterEncoding); |
724 | } | |
725 | else | |
726 | { | |
727 | //percent encode US-ASCII char (0x00-0x7F range) | |
728 | 48 | app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10); |
729 | } | |
730 | 87 | return app; |
731 | } | |
732 | ||
733 | private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding) | |
734 | { | |
735 | 39 | ByteArrayOutputStream baos = new ByteArrayOutputStream(10); |
736 | 39 | StringBuilder builder = new StringBuilder(); |
737 | try | |
738 | { | |
739 | 39 | OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding); |
740 | 39 | writer.write(c); |
741 | 39 | writer.flush(); |
742 | } | |
743 | 0 | catch(IOException e) |
744 | { | |
745 | 0 | baos.reset(); |
746 | 0 | return null; |
747 | 39 | } |
748 | ||
749 | 39 | byte [] byteArray = baos.toByteArray(); |
750 | 117 | for (int i=0; i < byteArray.length; i++) |
751 | { | |
752 | 78 | builder.append('%'); |
753 | 78 | builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); |
754 | 78 | builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); |
755 | } | |
756 | ||
757 | 39 | return builder.toString(); |
758 | } | |
759 | ||
760 | /** | |
761 | * Encode the query part using the document charset encoding provided. | |
762 | * | |
763 | * | |
764 | * @param string | |
765 | * @param characterEncoding | |
766 | * @return | |
767 | */ | |
768 | private static String encodeURIQuery(final String string, final String characterEncoding) | |
769 | { | |
770 | 7 | StringBuilder sb = null; //create later on demand |
771 | String app; | |
772 | char c; | |
773 | 7 | boolean endLoop = false; |
774 | 7 | int length = string.length(); |
775 | 251 | for (int i = 0; i < length; ++i) |
776 | { | |
777 | 244 | app = null; |
778 | 244 | c = string.charAt(i); |
779 | ||
780 | // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) | |
781 | // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so | |
782 | // we make easier and omit this one) | |
783 | // - "<" %3C, ">" %3E, | |
784 | // - "\" %5C, "^" %5E, "`" %60 | |
785 | // - "{" %7B, "|" %7C, "}" %7D | |
786 | // - From %7F ad infinitum (each character as many bytes as necessary but take into account | |
787 | // that a single char should contain 2,3 or more bytes!. This data should be encoded | |
788 | // translating from the document character encoding to percent encoding) | |
789 | // | |
790 | // "&" should be encoded as "&" because this link is inside an html page, and | |
791 | // put & is invalid in this context | |
792 | ||
793 | 244 | if ( (c <= (char)0x20) || (c >= (char)0x7F) || |
794 | c == '"' || c == '<' || | |
795 | c == '>' || c == '\\' || c == '^' || c == '`' || | |
796 | c == '{' || c == '|' || c == '}') | |
797 | { | |
798 | // The percent encoding on this part should be done using UTF-8 charset | |
799 | // as RFC 3986 Section 3.2.2 says | |
800 | 41 | app = percentEncode(c, characterEncoding); |
801 | } | |
802 | 203 | else if (c == '%') |
803 | { | |
804 | 2 | if (i + 2 < length) |
805 | { | |
806 | 2 | char c1 = string.charAt(i+1); |
807 | 2 | char c2 = string.charAt(i+2); |
808 | 2 | if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && |
809 | (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) | |
810 | { | |
811 | // do not percent encode, because it could be already encoded | |
812 | } | |
813 | else | |
814 | { | |
815 | 2 | app = percentEncode(c, characterEncoding); |
816 | } | |
817 | 2 | } |
818 | else | |
819 | { | |
820 | 0 | app = percentEncode(c, characterEncoding); |
821 | } | |
822 | } | |
823 | 201 | else if (c == '&') |
824 | { | |
825 | 1 | if (i+4 < length ) |
826 | { | |
827 | 1 | if ('a' == string.charAt(i+1) && |
828 | 'm' == string.charAt(i+2) && | |
829 | 'p' == string.charAt(i+3) && | |
830 | ';' == string.charAt(i+4)) | |
831 | { | |
832 | //Skip | |
833 | } | |
834 | else | |
835 | { | |
836 | 1 | app = "&"; |
837 | } | |
838 | } | |
839 | else | |
840 | { | |
841 | 0 | app = "&"; |
842 | } | |
843 | } | |
844 | else | |
845 | { | |
846 | //No encoding, just do nothing, char will be added later. | |
847 | } | |
848 | ||
849 | 244 | if (app != null) |
850 | { | |
851 | 44 | if (sb == null) |
852 | { | |
853 | 4 | sb = new StringBuilder(string.substring(0, i)); |
854 | } | |
855 | 44 | sb.append(app); |
856 | } | |
857 | else | |
858 | { | |
859 | 200 | if (sb != null) |
860 | { | |
861 | 12 | sb.append(c); |
862 | } | |
863 | } | |
864 | 244 | if (endLoop) |
865 | { | |
866 | 0 | break; |
867 | } | |
868 | } | |
869 | 7 | if (sb == null) |
870 | { | |
871 | 3 | return string; |
872 | } | |
873 | else | |
874 | { | |
875 | 4 | return sb.toString(); |
876 | } | |
877 | } | |
878 | ||
879 | /** | |
880 | * Encode an URI, escaping or percent-encoding all required characters and | |
881 | * following the rules mentioned on RFC 3986. | |
882 | * | |
883 | * @param string | |
884 | * @param encodeNonLatin | |
885 | * @return | |
886 | * @throws IOException | |
887 | */ | |
888 | public static void encodeURIAttribute(Writer writer, final String string, final String characterEncoding) | |
889 | throws IOException | |
890 | { | |
891 | //StringBuilder sb = null; //create later on demand | |
892 | 11 | int start = 0; |
893 | String app; | |
894 | char c; | |
895 | 11 | boolean endLoop = false; |
896 | 11 | int length = string.length(); |
897 | 160 | for (int i = 0; i < length; ++i) |
898 | { | |
899 | 155 | app = null; |
900 | 155 | c = string.charAt(i); |
901 | ||
902 | // This are the guidelines to be taken into account by this algorithm to encode: | |
903 | ||
904 | // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters | |
905 | // | |
906 | // control = <US-ASCII coded characters 00-1F and 7F hexadecimal> | |
907 | // space = <US-ASCII coded character 20 hexadecimal> | |
908 | // delims = "<" | ">" | "#" | "%" | <"> | |
909 | // %3C %3E %23 %25 %22 | |
910 | // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`" | |
911 | // %7D %7B %7C %5C %5E %5B %5D %60 | |
912 | // | |
913 | // ".... Data corresponding to excluded characters must be escaped in order to | |
914 | // be properly represented within a URI....." | |
915 | ||
916 | // RFC 3986 Section 3. Syntax Components | |
917 | // | |
918 | // "... The generic URI syntax consists of a hierarchical sequence of | |
919 | // components referred to as the scheme, authority, path, query, and | |
920 | // fragment. | |
921 | // | |
922 | // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] | |
923 | // | |
924 | // hier-part = "//" authority path-abempty | |
925 | // / path-absolute | |
926 | // / path-rootless | |
927 | // / path-empty | |
928 | // ...." | |
929 | ||
930 | // RFC 3986 Section 2.2: | |
931 | // Reserved characters (should not be percent-encoded) | |
932 | // reserved = gen-delims / sub-delims | |
933 | // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" | |
934 | // %3A %2F %3F %23 %5B %5D %40 | |
935 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" | |
936 | // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D | |
937 | ||
938 | // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396, | |
939 | // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) | |
940 | // "...those rules were redefined to directly specify the characters allowed...." | |
941 | // There is also other characters moved from excluded list to reserved: | |
942 | // "[" / "]" / "#" | |
943 | ||
944 | // RFC 3986 Section 2.3: | |
945 | // "... for consistency, percent-encoded octets in the ranges of ALPHA | |
946 | // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), | |
947 | // underscore (%5F), or tilde (%7E) should not be created by URI | |
948 | // producers...." | |
949 | ||
950 | // RFC 3986 Section 3.2.2. Host | |
951 | ||
952 | // host = IP-literal / IPv4address / reg-name | |
953 | ||
954 | // The reg-name syntax allows percent-encoded octets in order to | |
955 | // represent non-ASCII registered names in a uniform way that is | |
956 | // independent of the underlying name resolution technology. Non-ASCII | |
957 | // characters must first be encoded according to UTF-8 [STD63], and then | |
958 | // each octet of the corresponding UTF-8 sequence must be percent- | |
959 | // encoded to be represented as URI characters. URI producing | |
960 | // applications must not use percent-encoding in host unless it is used | |
961 | // to represent a UTF-8 character sequence. | |
962 | ||
963 | // RFC 3986 Section 3.4 Query | |
964 | // query = *( pchar / "/" / "?" ) | |
965 | // | |
966 | // "... However, as query components are often used to carry identifying information | |
967 | // in the form of "key=value" pairs and one frequently used value is a reference to | |
968 | // another URI, it is sometimes better for usability to avoid percent-encoding those characters....." | |
969 | // | |
970 | // RFC 3986 Section 2.5 Identifying Data (Apply to query section) | |
971 | // | |
972 | // When a new URI scheme defines a component that represents textual | |
973 | // data consisting of characters from the Universal Character Set [UCS], | |
974 | // the data should first be encoded as octets according to the UTF-8 | |
975 | // character encoding [STD63]; then only those octets that do not | |
976 | // correspond to characters in the unreserved set should be percent- | |
977 | // encoded. For example, the character A would be represented as "A", | |
978 | // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented | |
979 | // as "%C3%80", and the character KATAKANA LETTER A would be represented | |
980 | // as "%E3%82%A2". | |
981 | // | |
982 | // RFC 3986 Section 3.5 Fragment | |
983 | // fragment = *( pchar / "/" / "?" ) | |
984 | // | |
985 | // Note that follows the same as query | |
986 | ||
987 | // Based on the extracts the strategy to apply on this method is: | |
988 | // | |
989 | // On scheme ":" hier-part | |
990 | // | |
991 | // Escape or percent encode chars inside : | |
992 | // | |
993 | // - From %00 to %20, | |
994 | // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of | |
995 | // duplicate encoding, encode it when we are sure | |
996 | // that there are not encoded twice) | |
997 | // - "<" %3C, ">" %3E | |
998 | // - "\" %5C, "^" %5E, "`" %60 | |
999 | // - "{" %7B, "|" %7C, "}" %7D | |
1000 | // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this | |
1001 | // part of an URI, but it is preferred to encode it that omit it). | |
1002 | // | |
1003 | // The remaining characters must not be encoded | |
1004 | // | |
1005 | // Characters after ? or # should be percent encoding but only the necessary ones: | |
1006 | // | |
1007 | // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) | |
1008 | // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of | |
1009 | // duplicate encoding, encode it when we are sure | |
1010 | // that there are not encoded twice) | |
1011 | // - "<" %3C, ">" %3E, | |
1012 | // - "\" %5C, "^" %5E, "`" %60 | |
1013 | // - "{" %7B, "|" %7C, "}" %7D | |
1014 | // - From %7F ad infinitum (each character as many bytes as necessary but take into account | |
1015 | // that a single char should contain 2,3 or more bytes!. This data should be encoded | |
1016 | // translating from the document character encoding to percent encoding, because this values | |
1017 | // could be retrieved from httpRequest.getParameter() and it uses the current character encoding | |
1018 | // for decode values) | |
1019 | // | |
1020 | // "&" should be encoded as "&" because this link is inside an html page, and | |
1021 | // put only & is invalid in this context. | |
1022 | ||
1023 | 155 | if ( (c <= (char)0x20) || (c >= (char)0x7F) || |
1024 | c == '"' || c == '<' || | |
1025 | c == '>' || c == '\\' || c == '^' || c == '`' || | |
1026 | c == '{' || c == '|' || c == '}') | |
1027 | { | |
1028 | // The percent encoding on this part should be done using UTF-8 charset | |
1029 | // as RFC 3986 Section 3.2.2 says. | |
1030 | // Also there is a reference on | |
1031 | // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars | |
1032 | // that recommend use of UTF-8 instead the document character encoding. | |
1033 | // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113) | |
1034 | //app = percentEncode(c, "UTF-8"); | |
1035 | 31 | if (start < i) |
1036 | { | |
1037 | 0 | writer.write(string, start, i-start); |
1038 | } | |
1039 | 31 | start = i+1; |
1040 | 31 | percentEncode(writer, c, "UTF-8"); |
1041 | } | |
1042 | 124 | else if (c == '%') |
1043 | { | |
1044 | 1 | if (i + 2 < length) |
1045 | { | |
1046 | 1 | char c1 = string.charAt(i+1); |
1047 | 1 | char c2 = string.charAt(i+2); |
1048 | 1 | if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && |
1049 | (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) | |
1050 | { | |
1051 | // do not percent encode, because it could be already encoded | |
1052 | // and we don't want encode it twice | |
1053 | } | |
1054 | else | |
1055 | { | |
1056 | //app = percentEncode(c, UTF8); | |
1057 | 1 | if (start < i) |
1058 | { | |
1059 | 0 | writer.write(string, start, i-start); |
1060 | } | |
1061 | 1 | start = i+1; |
1062 | 1 | percentEncode(writer, c, UTF8); |
1063 | } | |
1064 | 1 | } |
1065 | else | |
1066 | { | |
1067 | //app = percentEncode(c, UTF8); | |
1068 | 0 | if (start < i) |
1069 | { | |
1070 | 0 | writer.write(string, start, i-start); |
1071 | } | |
1072 | 0 | start = i+1; |
1073 | 0 | percentEncode(writer, c, UTF8); |
1074 | } | |
1075 | } | |
1076 | 123 | else if (c == '?' || c == '#') |
1077 | { | |
1078 | 6 | if (i+1 < length) |
1079 | { | |
1080 | // The remaining part of the URI are data that should be encoded | |
1081 | // using the document character encoding. | |
1082 | //app = c + encodeURIQuery(string.substring(i+1), characterEncoding); | |
1083 | 6 | if (start < i) |
1084 | { | |
1085 | 1 | writer.write(string, start, i-start); |
1086 | } | |
1087 | 6 | start = i+1; |
1088 | 6 | writer.write(c); |
1089 | //encodeURIQuery(writer, string.substring(i+1), characterEncoding); | |
1090 | 6 | encodeURIQuery(writer, string, i+1, characterEncoding); |
1091 | 6 | endLoop = true; |
1092 | } | |
1093 | } | |
1094 | else | |
1095 | { | |
1096 | //No encoding, just do nothing, char will be added later. | |
1097 | } | |
1098 | ||
1099 | 155 | if (app != null) |
1100 | { | |
1101 | //if (sb == null) | |
1102 | //{ | |
1103 | // sb = new StringBuilder(string.substring(0, i)); | |
1104 | //} | |
1105 | //sb.append(app); | |
1106 | 0 | if (start < i) |
1107 | { | |
1108 | 0 | writer.write(string, start, i-start); |
1109 | } | |
1110 | 0 | start = i+1; |
1111 | 0 | writer.write(app); |
1112 | } | |
1113 | //else | |
1114 | //{ | |
1115 | // if (sb != null) | |
1116 | // { | |
1117 | // sb.append(c); | |
1118 | // } | |
1119 | //} | |
1120 | 155 | if (endLoop) |
1121 | { | |
1122 | 6 | start = length; |
1123 | 6 | break; |
1124 | } | |
1125 | } | |
1126 | //if (sb == null) | |
1127 | //{ | |
1128 | // return string; | |
1129 | //} | |
1130 | //else | |
1131 | //{ | |
1132 | // return sb.toString(); | |
1133 | //} | |
1134 | 11 | if (start == 0) |
1135 | { | |
1136 | 2 | writer.write(string); |
1137 | } | |
1138 | 9 | else if (start < length) |
1139 | { | |
1140 | 0 | writer.write(string,start,length-start); |
1141 | } | |
1142 | 11 | } |
1143 | ||
1144 | /** | |
1145 | * Encode a unicode char value in percentEncode, decoding its bytes using a specified | |
1146 | * characterEncoding. | |
1147 | * | |
1148 | * @param c | |
1149 | * @param characterEncoding | |
1150 | * @return | |
1151 | */ | |
1152 | private static void percentEncode(Writer writer, char c, String characterEncoding) throws IOException | |
1153 | { | |
1154 | 63 | String app = null; |
1155 | 63 | if (c > (char)((short)0x007F)) |
1156 | { | |
1157 | //percent encode in the proper encoding to be consistent | |
1158 | //app = percentEncodeNonUsAsciiCharacter(writer c, characterEncoding); | |
1159 | 39 | percentEncodeNonUsAsciiCharacter(writer, c, characterEncoding); |
1160 | } | |
1161 | else | |
1162 | { | |
1163 | //percent encode US-ASCII char (0x00-0x7F range) | |
1164 | //app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10); | |
1165 | 24 | writer.write('%'); |
1166 | 24 | writer.write(HEX_CHARSET.charAt( ((c >> 0x4) % 0x10))); |
1167 | 24 | writer.write(HEX_CHARSET.charAt(c % 0x10)); |
1168 | } | |
1169 | //return app; | |
1170 | 63 | } |
1171 | ||
1172 | private static void percentEncodeNonUsAsciiCharacter(Writer currentWriter, char c, String characterEncoding) | |
1173 | throws IOException | |
1174 | { | |
1175 | 39 | ByteArrayOutputStream baos = new ByteArrayOutputStream(10); |
1176 | ||
1177 | try | |
1178 | { | |
1179 | 39 | OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding); |
1180 | 39 | writer.write(c); |
1181 | 39 | writer.flush(); |
1182 | } | |
1183 | 0 | catch(IOException e) |
1184 | { | |
1185 | 0 | baos.reset(); |
1186 | 0 | return; |
1187 | 39 | } |
1188 | ||
1189 | 39 | byte [] byteArray = baos.toByteArray(); |
1190 | 117 | for (int i=0; i < byteArray.length; i++) |
1191 | { | |
1192 | //builder.append('%'); | |
1193 | //builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); | |
1194 | //builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); | |
1195 | 78 | currentWriter.write('%'); |
1196 | 78 | currentWriter.write(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); |
1197 | 78 | currentWriter.write(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); |
1198 | } | |
1199 | ||
1200 | //return builder.toString(); | |
1201 | 39 | } |
1202 | ||
1203 | /** | |
1204 | * Encode the query part using the document charset encoding provided. | |
1205 | * | |
1206 | * | |
1207 | * @param string | |
1208 | * @param characterEncoding | |
1209 | * @return | |
1210 | */ | |
1211 | private static void encodeURIQuery(Writer writer, final String string, int offset, final String characterEncoding) | |
1212 | throws IOException | |
1213 | { | |
1214 | //StringBuilder sb = null; //create later on demand | |
1215 | 6 | int start = offset; |
1216 | 6 | int length = string.length(); |
1217 | 6 | int realLength = length-offset; |
1218 | String app; | |
1219 | char c; | |
1220 | //boolean endLoop = false; | |
1221 | 234 | for (int i = offset; i < length; ++i) |
1222 | { | |
1223 | 228 | app = null; |
1224 | 228 | c = string.charAt(i); |
1225 | ||
1226 | // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) | |
1227 | // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so | |
1228 | // we make easier and omit this one) | |
1229 | // - "<" %3C, ">" %3E, | |
1230 | // - "\" %5C, "^" %5E, "`" %60 | |
1231 | // - "{" %7B, "|" %7C, "}" %7D | |
1232 | // - From %7F ad infinitum (each character as many bytes as necessary but take into account | |
1233 | // that a single char should contain 2,3 or more bytes!. This data should be encoded | |
1234 | // translating from the document character encoding to percent encoding) | |
1235 | // | |
1236 | // "&" should be encoded as "&" because this link is inside an html page, and | |
1237 | // put & is invalid in this context | |
1238 | ||
1239 | 228 | if ( (c <= (char)0x20) || (c >= (char)0x7F) || |
1240 | c == '"' || c == '<' || | |
1241 | c == '>' || c == '\\' || c == '^' || c == '`' || | |
1242 | c == '{' || c == '|' || c == '}') | |
1243 | { | |
1244 | // The percent encoding on this part should be done using UTF-8 charset | |
1245 | // as RFC 3986 Section 3.2.2 says | |
1246 | //app = percentEncode(c, characterEncoding); | |
1247 | 30 | if (start < i) |
1248 | { | |
1249 | 2 | writer.write(string, start, i-start); |
1250 | } | |
1251 | 30 | start = i+1; |
1252 | 30 | percentEncode(writer, c, characterEncoding); |
1253 | } | |
1254 | 198 | else if (c == '%') |
1255 | { | |
1256 | 1 | if (i + 2 < length) |
1257 | { | |
1258 | 1 | char c1 = string.charAt(i+1); |
1259 | 1 | char c2 = string.charAt(i+2); |
1260 | 1 | if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z') || (c1 >='a' && c1 <='z')) && |
1261 | (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z') || (c2 >='a' && c2 <='z'))) | |
1262 | { | |
1263 | // do not percent encode, because it could be already encoded | |
1264 | } | |
1265 | else | |
1266 | { | |
1267 | //app = percentEncode(c, characterEncoding); | |
1268 | 1 | if (start < i) |
1269 | { | |
1270 | 0 | writer.write(string, start, i-start); |
1271 | } | |
1272 | 1 | start = i+1; |
1273 | 1 | percentEncode(writer, c, characterEncoding); |
1274 | } | |
1275 | 1 | } |
1276 | else | |
1277 | { | |
1278 | //app = percentEncode(c, characterEncoding); | |
1279 | 0 | if (start < i) |
1280 | { | |
1281 | 0 | writer.write(string, start, i-start); |
1282 | } | |
1283 | 0 | start = i+1; |
1284 | 0 | percentEncode(writer, c, characterEncoding); |
1285 | } | |
1286 | } | |
1287 | 197 | else if (c == '&') |
1288 | { | |
1289 | 1 | if (i+4 < length ) |
1290 | { | |
1291 | 1 | if ('a' == string.charAt(i+1) && |
1292 | 'm' == string.charAt(i+2) && | |
1293 | 'p' == string.charAt(i+3) && | |
1294 | ';' == string.charAt(i+4)) | |
1295 | { | |
1296 | //Skip | |
1297 | } | |
1298 | else | |
1299 | { | |
1300 | 1 | app = "&"; |
1301 | } | |
1302 | } | |
1303 | else | |
1304 | { | |
1305 | 0 | app = "&"; |
1306 | } | |
1307 | } | |
1308 | else | |
1309 | { | |
1310 | //No encoding, just do nothing, char will be added later. | |
1311 | } | |
1312 | ||
1313 | 228 | if (app != null) |
1314 | { | |
1315 | //if (sb == null) | |
1316 | //{ | |
1317 | // sb = new StringBuilder(string.substring(0, i)); | |
1318 | //} | |
1319 | //sb.append(app); | |
1320 | 1 | if (start < i) |
1321 | { | |
1322 | 1 | writer.write(string, start, i-start); |
1323 | } | |
1324 | 1 | start = i+1; |
1325 | 1 | writer.write(app); |
1326 | } | |
1327 | //else | |
1328 | //{ | |
1329 | // if (sb != null) | |
1330 | // { | |
1331 | // sb.append(c); | |
1332 | // } | |
1333 | //} | |
1334 | //if (endLoop) | |
1335 | //{ | |
1336 | // break; | |
1337 | //} | |
1338 | } | |
1339 | ||
1340 | //if (sb == null) | |
1341 | //{ | |
1342 | // return string; | |
1343 | //} | |
1344 | //else | |
1345 | //{ | |
1346 | // return sb.toString(); | |
1347 | //} | |
1348 | 6 | if (start == offset) |
1349 | { | |
1350 | 3 | writer.write(string, offset, realLength); |
1351 | } | |
1352 | 3 | else if (start < length) |
1353 | { | |
1354 | 1 | writer.write(string,start,length-start); |
1355 | } | |
1356 | 6 | } |
1357 | } |