Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
HTMLEncoder |
|
| 8.333333333333334;8.333 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one | |
3 | * or more contributor license agreements. See the NOTICE file | |
4 | * distributed with this work for additional information | |
5 | * regarding copyright ownership. The ASF licenses this file | |
6 | * to you under the Apache License, Version 2.0 (the | |
7 | * "License"); you may not use this file except in compliance | |
8 | * with the License. You may obtain a copy of the License at | |
9 | * | |
10 | * http://www.apache.org/licenses/LICENSE-2.0 | |
11 | * | |
12 | * Unless required by applicable law or agreed to in writing, | |
13 | * software distributed under the License is distributed on an | |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
15 | * KIND, either express or implied. See the License for the | |
16 | * specific language governing permissions and limitations | |
17 | * under the License. | |
18 | */ | |
19 | package org.apache.myfaces.shared_impl.renderkit.html.util; | |
20 | ||
21 | import java.io.ByteArrayOutputStream; | |
22 | import java.io.IOException; | |
23 | import java.io.OutputStreamWriter; | |
24 | import java.io.Writer; | |
25 | ||
26 | /** | |
27 | * Converts Strings so that they can be used within HTML-Code. | |
28 | */ | |
29 | 0 | public abstract class HTMLEncoder |
30 | { | |
31 | /** | |
32 | * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. | |
33 | */ | |
34 | public static String encode (String string) | |
35 | { | |
36 | 0 | return encode(string, false, true); |
37 | } | |
38 | ||
39 | /** | |
40 | * Variant of {@link #encode} where encodeNbsp is true. | |
41 | */ | |
42 | public static String encode (String string, boolean encodeNewline) | |
43 | { | |
44 | 0 | return encode(string, encodeNewline, true); |
45 | } | |
46 | ||
47 | /** | |
48 | * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true | |
49 | */ | |
50 | public static String encode (String string, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp) | |
51 | { | |
52 | 0 | return encode(string, encodeNewline, encodeSubsequentBlanksToNbsp, true); |
53 | } | |
54 | ||
55 | /** | |
56 | * Encodes the given string, so that it can be used within a html page. | |
57 | * @param string the string to convert | |
58 | * @param encodeNewline if true newline characters are converted to <br>'s | |
59 | * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s | |
60 | * @param encodeNonLatin if true encode non-latin characters as numeric character references | |
61 | */ | |
62 | public static String encode (String string, | |
63 | boolean encodeNewline, | |
64 | boolean encodeSubsequentBlanksToNbsp, | |
65 | boolean encodeNonLatin) | |
66 | { | |
67 | 0 | if (string == null) |
68 | { | |
69 | 0 | return ""; |
70 | } | |
71 | ||
72 | 0 | StringBuilder sb = null; //create later on demand |
73 | String app; | |
74 | char c; | |
75 | 0 | for (int i = 0; i < string.length (); ++i) |
76 | { | |
77 | 0 | app = null; |
78 | 0 | c = string.charAt(i); |
79 | ||
80 | // All characters before letters | |
81 | 0 | if ((int)c < 0x41) |
82 | { | |
83 | 0 | switch (c) |
84 | { | |
85 | 0 | case '"': app = "&quot;"; break; //" |
86 | 0 | case '&': app = "&amp;"; break; //& |
87 | 0 | case '<': app = "&lt;"; break; //< |
88 | 0 | case '>': app = "&gt;"; break; //> |
89 | case ' ': | |
90 | 0 | if (encodeSubsequentBlanksToNbsp && |
91 | (i == 0 || (i - 1 >= 0 && string.charAt(i - 1) == ' '))) | |
92 | { | |
93 | //Space at beginning or after another space | |
94 | 0 | app = "&#160;"; |
95 | } | |
96 | break; | |
97 | case '\n': | |
98 | 0 | if (encodeNewline) |
99 | { | |
100 | 0 | app = "<br/>"; |
101 | } | |
102 | 0 | break; |
103 | } | |
104 | 0 | } else if (encodeNonLatin && (int)c > 0x80) { |
105 | 0 | switch(c) { |
106 | //german umlauts | |
107 | 0 | case '\u00E4' : app = "&auml;"; break; |
108 | 0 | case '\u00C4' : app = "&Auml;"; break; |
109 | 0 | case '\u00F6' : app = "&ouml;"; break; |
110 | 0 | case '\u00D6' : app = "&Ouml;"; break; |
111 | 0 | case '\u00FC' : app = "&uuml;"; break; |
112 | 0 | case '\u00DC' : app = "&Uuml;"; break; |
113 | 0 | case '\u00DF' : app = "&szlig;"; break; |
114 | ||
115 | //misc | |
116 | //case 0x80: app = "&euro;"; break; sometimes euro symbol is ascii 128, should we suport it? | |
117 | 0 | case '\u20AC': app = "&euro;"; break; |
118 | 0 | case '\u00AB': app = "&laquo;"; break; |
119 | 0 | case '\u00BB': app = "&raquo;"; break; |
120 | 0 | case '\u00A0': app = "&#160;"; break; |
121 | ||
122 | default : | |
123 | //encode all non basic latin characters | |
124 | 0 | app = "&#" + ((int)c) + ";"; |
125 | break; | |
126 | } | |
127 | } | |
128 | 0 | if (app != null) |
129 | { | |
130 | 0 | if (sb == null) |
131 | { | |
132 | 0 | sb = new StringBuilder(string.substring(0, i)); |
133 | } | |
134 | 0 | sb.append(app); |
135 | } else { | |
136 | 0 | if (sb != null) |
137 | { | |
138 | 0 | sb.append(c); |
139 | } | |
140 | } | |
141 | } | |
142 | ||
143 | 0 | if (sb == null) |
144 | { | |
145 | 0 | return string; |
146 | } | |
147 | else | |
148 | { | |
149 | 0 | return sb.toString(); |
150 | } | |
151 | } | |
152 | ||
153 | /** | |
154 | * Variant of {@link #encode} where encodeNewline is false and encodeNbsp is true. | |
155 | */ | |
156 | public static void encode (char[] string, int offset, int length, Writer writer) throws IOException | |
157 | { | |
158 | 0 | encode(string, offset, length, false, true, writer); |
159 | 0 | } |
160 | ||
161 | /** | |
162 | * Variant of {@link #encode} where encodeNbsp is true. | |
163 | */ | |
164 | public static void encode (char[] string, int offset, int length, boolean encodeNewline, Writer writer) throws IOException | |
165 | { | |
166 | 0 | encode(string, offset, length, encodeNewline, true, writer); |
167 | 0 | } |
168 | ||
169 | /** | |
170 | * Variant of {@link #encode} where encodeNbsp and encodeNonLatin are true | |
171 | */ | |
172 | public static void encode (char[] string, int offset, int length, boolean encodeNewline, boolean encodeSubsequentBlanksToNbsp, Writer writer) throws IOException | |
173 | { | |
174 | 0 | encode(string, offset, length, encodeNewline, encodeSubsequentBlanksToNbsp, true, writer); |
175 | 0 | } |
176 | ||
177 | ||
178 | /** | |
179 | * Encodes the given string, so that it can be used within a html page. | |
180 | * @param string the string to convert | |
181 | * @param encodeNewline if true newline characters are converted to <br>'s | |
182 | * @param encodeSubsequentBlanksToNbsp if true subsequent blanks are converted to &nbsp;'s | |
183 | * @param encodeNonLatin if true encode non-latin characters as numeric character references | |
184 | */ | |
185 | public static void encode (char[] string, int offset, int length, | |
186 | boolean encodeNewline, | |
187 | boolean encodeSubsequentBlanksToNbsp, | |
188 | boolean encodeNonLatin, Writer writer) throws IOException | |
189 | { | |
190 | 0 | if (string == null || length < 0 || offset >= string.length) |
191 | { | |
192 | 0 | return; |
193 | } | |
194 | 0 | offset = Math.max(0, offset); |
195 | 0 | int realLength = Math.min(length, string.length - offset); |
196 | ||
197 | 0 | StringBuilder sb = null; //create later on demand |
198 | String app; | |
199 | char c; | |
200 | ||
201 | 0 | for (int i = offset; i < offset + realLength; ++i) |
202 | { | |
203 | 0 | app = null; |
204 | 0 | c = string[i]; |
205 | ||
206 | // All characters before letters | |
207 | 0 | if ((int)c < 0x41) |
208 | { | |
209 | 0 | switch (c) |
210 | { | |
211 | 0 | case '"': app = "&quot;"; break; //" |
212 | 0 | case '&': app = "&amp;"; break; //& |
213 | 0 | case '<': app = "&lt;"; break; //< |
214 | 0 | case '>': app = "&gt;"; break; //> |
215 | case ' ': | |
216 | 0 | if (encodeSubsequentBlanksToNbsp && |
217 | (i == 0 || (i - 1 >= 0 && string[i - 1] == ' '))) | |
218 | { | |
219 | //Space at beginning or after another space | |
220 | 0 | app = "&#160;"; |
221 | } | |
222 | break; | |
223 | case '\n': | |
224 | 0 | if (encodeNewline) |
225 | { | |
226 | 0 | app = "<br/>"; |
227 | } | |
228 | 0 | break; |
229 | } | |
230 | 0 | } else if (encodeNonLatin && (int)c > 0x80) { |
231 | 0 | switch(c) { |
232 | //german umlauts | |
233 | 0 | case '\u00E4' : app = "&auml;"; break; |
234 | 0 | case '\u00C4' : app = "&Auml;"; break; |
235 | 0 | case '\u00F6' : app = "&ouml;"; break; |
236 | 0 | case '\u00D6' : app = "&Ouml;"; break; |
237 | 0 | case '\u00FC' : app = "&uuml;"; break; |
238 | 0 | case '\u00DC' : app = "&Uuml;"; break; |
239 | 0 | case '\u00DF' : app = "&szlig;"; break; |
240 | ||
241 | //misc | |
242 | //case 0x80: app = "&euro;"; break; sometimes euro symbol is ascii 128, should we suport it? | |
243 | 0 | case '\u20AC': app = "&euro;"; break; |
244 | 0 | case '\u00AB': app = "&laquo;"; break; |
245 | 0 | case '\u00BB': app = "&raquo;"; break; |
246 | 0 | case '\u00A0': app = "&#160;"; break; |
247 | ||
248 | default : | |
249 | //encode all non basic latin characters | |
250 | 0 | app = "&#" + ((int)c) + ";"; |
251 | break; | |
252 | } | |
253 | } | |
254 | 0 | if (app != null) |
255 | { | |
256 | 0 | if (sb == null) |
257 | { | |
258 | 0 | sb = new StringBuilder(realLength*2); |
259 | 0 | sb.append(string, offset, i - offset); |
260 | } | |
261 | 0 | sb.append(app); |
262 | } else { | |
263 | 0 | if (sb != null) |
264 | { | |
265 | 0 | sb.append(c); |
266 | } | |
267 | } | |
268 | } | |
269 | ||
270 | 0 | if (sb == null) |
271 | { | |
272 | 0 | writer.write(string, offset, realLength); |
273 | } | |
274 | else | |
275 | { | |
276 | 0 | writer.write(sb.toString()); |
277 | } | |
278 | 0 | } |
279 | ||
280 | private static final String HEX_CHARSET = "0123456789ABCDEF"; | |
281 | ||
282 | private static final String UTF8 = "UTF-8"; | |
283 | ||
284 | /** | |
285 | * Encode an URI, escaping or percent-encoding all required characters and | |
286 | * following the rules mentioned on RFC 3986. | |
287 | * | |
288 | * @param string | |
289 | * @param encodeNonLatin | |
290 | * @return | |
291 | * @throws IOException | |
292 | */ | |
293 | public static String encodeURIAtributte(final String string, final String characterEncoding) | |
294 | throws IOException | |
295 | { | |
296 | 0 | StringBuilder sb = null; //create later on demand |
297 | String app; | |
298 | char c; | |
299 | 0 | boolean endLoop = false; |
300 | 0 | for (int i = 0; i < string.length (); ++i) |
301 | { | |
302 | 0 | app = null; |
303 | 0 | c = string.charAt(i); |
304 | ||
305 | // This are the guidelines to be taken into account by this algorithm to encode: | |
306 | ||
307 | // RFC 2396 Section 2.4.3 Excluded US-ASCII Characters | |
308 | // | |
309 | // control = <US-ASCII coded characters 00-1F and 7F hexadecimal> | |
310 | // space = <US-ASCII coded character 20 hexadecimal> | |
311 | // delims = "<" | ">" | "#" | "%" | <"> | |
312 | // %3C %3E %23 %25 %22 | |
313 | // unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`" | |
314 | // %7D %7B %7C %5C %5E %5B %5D %60 | |
315 | // | |
316 | // ".... Data corresponding to excluded characters must be escaped in order to | |
317 | // be properly represented within a URI....." | |
318 | ||
319 | // RFC 3986 Section 3. Syntax Components | |
320 | // | |
321 | // "... The generic URI syntax consists of a hierarchical sequence of | |
322 | // components referred to as the scheme, authority, path, query, and | |
323 | // fragment. | |
324 | // | |
325 | // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] | |
326 | // | |
327 | // hier-part = "//" authority path-abempty | |
328 | // / path-absolute | |
329 | // / path-rootless | |
330 | // / path-empty | |
331 | // ...." | |
332 | ||
333 | // RFC 3986 Section 2.2: | |
334 | // Reserved characters (should not be percent-encoded) | |
335 | // reserved = gen-delims / sub-delims | |
336 | // gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" | |
337 | // %3A %2F %3F %23 %5B %5D %40 | |
338 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" | |
339 | // %21 %24 %26 %27 %28 %29 %2A %2B %2C %3B %3D | |
340 | ||
341 | // Note than chars "[" and "]" are mentioned as they should be escaped on RFC 2396, | |
342 | // but on the part D. Changes from RFC 2396 says about this chars (used on IPv6) | |
343 | // "...those rules were redefined to directly specify the characters allowed...." | |
344 | // There is also other characters moved from excluded list to reserved: | |
345 | // "[" / "]" / "#" | |
346 | ||
347 | // RFC 3986 Section 2.3: | |
348 | // "... for consistency, percent-encoded octets in the ranges of ALPHA | |
349 | // (%41-%5A and %61-%7A), DIGIT (%30-%39), hyphen (%2D), period (%2E), | |
350 | // underscore (%5F), or tilde (%7E) should not be created by URI | |
351 | // producers...." | |
352 | ||
353 | // RFC 3986 Section 3.2.2. Host | |
354 | ||
355 | // host = IP-literal / IPv4address / reg-name | |
356 | ||
357 | // The reg-name syntax allows percent-encoded octets in order to | |
358 | // represent non-ASCII registered names in a uniform way that is | |
359 | // independent of the underlying name resolution technology. Non-ASCII | |
360 | // characters must first be encoded according to UTF-8 [STD63], and then | |
361 | // each octet of the corresponding UTF-8 sequence must be percent- | |
362 | // encoded to be represented as URI characters. URI producing | |
363 | // applications must not use percent-encoding in host unless it is used | |
364 | // to represent a UTF-8 character sequence. | |
365 | ||
366 | // RFC 3986 Section 3.4 Query | |
367 | // query = *( pchar / "/" / "?" ) | |
368 | // | |
369 | // "... However, as query components are often used to carry identifying information | |
370 | // in the form of "key=value" pairs and one frequently used value is a reference to | |
371 | // another URI, it is sometimes better for usability to avoid percent-encoding those characters....." | |
372 | // | |
373 | // RFC 3986 Section 2.5 Identifying Data (Apply to query section) | |
374 | // | |
375 | // When a new URI scheme defines a component that represents textual | |
376 | // data consisting of characters from the Universal Character Set [UCS], | |
377 | // the data should first be encoded as octets according to the UTF-8 | |
378 | // character encoding [STD63]; then only those octets that do not | |
379 | // correspond to characters in the unreserved set should be percent- | |
380 | // encoded. For example, the character A would be represented as "A", | |
381 | // the character LATIN CAPITAL LETTER A WITH GRAVE would be represented | |
382 | // as "%C3%80", and the character KATAKANA LETTER A would be represented | |
383 | // as "%E3%82%A2". | |
384 | // | |
385 | // RFC 3986 Section 3.5 Fragment | |
386 | // fragment = *( pchar / "/" / "?" ) | |
387 | // | |
388 | // Note that follows the same as query | |
389 | ||
390 | // Based on the extracts the strategy to apply on this method is: | |
391 | // | |
392 | // On scheme ":" hier-part | |
393 | // | |
394 | // Escape or percent encode chars inside : | |
395 | // | |
396 | // - From %00 to %20, | |
397 | // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of | |
398 | // duplicate encoding, encode it when we are sure | |
399 | // that there are not encoded twice) | |
400 | // - "<" %3C, ">" %3E | |
401 | // - "\" %5C, "^" %5E, "`" %60 | |
402 | // - "{" %7B, "|" %7C, "}" %7D | |
403 | // - From %7F ad infinitum (characters from %100 to infinitum should not be used in this | |
404 | // part of an URI, but it is preferred to encode it that omit it). | |
405 | // | |
406 | // The remaining characters must not be encoded | |
407 | // | |
408 | // Characters after ? or # should be percent encoding but only the necessary ones: | |
409 | // | |
410 | // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) | |
411 | // - <"> %22, "%" %25 (If there is encode of "%", there is a risk of | |
412 | // duplicate encoding, encode it when we are sure | |
413 | // that there are not encoded twice) | |
414 | // - "<" %3C, ">" %3E, | |
415 | // - "\" %5C, "^" %5E, "`" %60 | |
416 | // - "{" %7B, "|" %7C, "}" %7D | |
417 | // - From %7F ad infinitum (each character as many bytes as necessary but take into account | |
418 | // that a single char should contain 2,3 or more bytes!. This data should be encoded | |
419 | // translating from the document character encoding to percent encoding, because this values | |
420 | // could be retrieved from httpRequest.getParameter() and it uses the current character encoding | |
421 | // for decode values) | |
422 | // | |
423 | // "&" should be encoded as "&amp;" because this link is inside an html page, and | |
424 | // put only & is invalid in this context. | |
425 | ||
426 | 0 | if ( (c <= (char)0x20) || (c >= (char)0x7F) || |
427 | c == '"' || c == '<' || | |
428 | c == '>' || c == '\\' || c == '^' || c == '`' || | |
429 | c == '{' || c == '|' || c == '}') | |
430 | { | |
431 | // The percent encoding on this part should be done using UTF-8 charset | |
432 | // as RFC 3986 Section 3.2.2 says. | |
433 | // Also there is a reference on | |
434 | // http://www.w3.org/TR/html40/appendix/notes.html#non-ascii-chars | |
435 | // that recommend use of UTF-8 instead the document character encoding. | |
436 | // Jetty set by default UTF-8 (see http://jira.codehaus.org/browse/JETTY-113) | |
437 | 0 | app = percentEncode(c, "UTF-8"); |
438 | } | |
439 | 0 | else if (c == '%') |
440 | { | |
441 | 0 | if (i + 2 < string.length()) |
442 | { | |
443 | 0 | char c1 = string.charAt(i+1); |
444 | 0 | char c2 = string.charAt(i+2); |
445 | 0 | if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) && |
446 | (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z'))) | |
447 | { | |
448 | // do not percent encode, because it could be already encoded | |
449 | // and we don't want encode it twice | |
450 | } | |
451 | else | |
452 | { | |
453 | 0 | app = percentEncode(c, UTF8); |
454 | } | |
455 | 0 | } |
456 | else | |
457 | { | |
458 | 0 | app = percentEncode(c, UTF8); |
459 | } | |
460 | } | |
461 | 0 | else if (c == '?' || c == '#') |
462 | { | |
463 | 0 | if (i+1 < string.length()) |
464 | { | |
465 | // The remaining part of the URI are data that should be encoded | |
466 | // using the document character encoding. | |
467 | 0 | app = c + encodeURIQuery(string.substring(i+1), characterEncoding); |
468 | 0 | endLoop = true; |
469 | } | |
470 | } | |
471 | else | |
472 | { | |
473 | //No encoding, just do nothing, char will be added later. | |
474 | } | |
475 | ||
476 | 0 | if (app != null) |
477 | { | |
478 | 0 | if (sb == null) |
479 | { | |
480 | 0 | sb = new StringBuilder(string.substring(0, i)); |
481 | } | |
482 | 0 | sb.append(app); |
483 | } else { | |
484 | 0 | if (sb != null) |
485 | { | |
486 | 0 | sb.append(c); |
487 | } | |
488 | } | |
489 | 0 | if (endLoop) |
490 | { | |
491 | 0 | break; |
492 | } | |
493 | } | |
494 | 0 | if (sb == null) |
495 | { | |
496 | 0 | return string; |
497 | } | |
498 | else | |
499 | { | |
500 | 0 | return sb.toString(); |
501 | } | |
502 | } | |
503 | ||
504 | /** | |
505 | * Encode a unicode char value in percentEncode, decoding its bytes using a specified | |
506 | * characterEncoding. | |
507 | * | |
508 | * @param c | |
509 | * @param characterEncoding | |
510 | * @return | |
511 | */ | |
512 | private static String percentEncode(char c, String characterEncoding) | |
513 | { | |
514 | 0 | String app = null; |
515 | 0 | if (c > (char)((short)0x007F)) |
516 | { | |
517 | //percent encode in the proper encoding to be consistent | |
518 | 0 | app = percentEncodeNonUsAsciiCharacter(c, characterEncoding); |
519 | } | |
520 | else | |
521 | { | |
522 | //percent encode US-ASCII char (0x00-0x7F range) | |
523 | 0 | app = "%" + HEX_CHARSET.charAt( ((c >> 0x4) % 0x10)) +HEX_CHARSET.charAt(c % 0x10); |
524 | } | |
525 | 0 | return app; |
526 | } | |
527 | ||
528 | private static String percentEncodeNonUsAsciiCharacter(char c, String characterEncoding) | |
529 | { | |
530 | 0 | ByteArrayOutputStream baos = new ByteArrayOutputStream(10); |
531 | 0 | StringBuffer builder = new StringBuffer(); |
532 | try | |
533 | { | |
534 | 0 | OutputStreamWriter writer = new OutputStreamWriter(baos,characterEncoding); |
535 | 0 | writer.write(c); |
536 | 0 | writer.flush(); |
537 | } | |
538 | 0 | catch(IOException e) |
539 | { | |
540 | 0 | baos.reset(); |
541 | 0 | return null; |
542 | 0 | } |
543 | ||
544 | 0 | byte [] byteArray = baos.toByteArray(); |
545 | 0 | for (int i=0; i < byteArray.length; i++) |
546 | { | |
547 | 0 | builder.append('%'); |
548 | 0 | builder.append(HEX_CHARSET.charAt( (( ((short) byteArray[i] & 0xFF ) >> 0x4) % 0x10)) ); |
549 | 0 | builder.append(HEX_CHARSET.charAt( ((short) byteArray[i] & 0xFF ) % 0x10)); |
550 | } | |
551 | ||
552 | 0 | return builder.toString(); |
553 | } | |
554 | ||
555 | /** | |
556 | * Encode the query part using the document charset encoding provided. | |
557 | * | |
558 | * | |
559 | * @param string | |
560 | * @param characterEncoding | |
561 | * @return | |
562 | */ | |
563 | private static String encodeURIQuery(final String string, final String characterEncoding) | |
564 | { | |
565 | 0 | StringBuilder sb = null; //create later on demand |
566 | String app; | |
567 | char c; | |
568 | 0 | boolean endLoop = false; |
569 | 0 | for (int i = 0; i < string.length (); ++i) |
570 | { | |
571 | 0 | app = null; |
572 | 0 | c = string.charAt(i); |
573 | ||
574 | // - From %00 to %20 (' ' %20 could encode as +, but %20 also works, so we keep %20) | |
575 | // - <"> %22 (If there is encode of "%", there is a risk of duplicate encoding, so we make easier and omit this one) | |
576 | // - "<" %3C, ">" %3E, | |
577 | // - "\" %5C, "^" %5E, "`" %60 | |
578 | // - "{" %7B, "|" %7C, "}" %7D | |
579 | // - From %7F ad infinitum (each character as many bytes as necessary but take into account | |
580 | // that a single char should contain 2,3 or more bytes!. This data should be encoded translating from the document | |
581 | // character encoding to percent encoding) | |
582 | // | |
583 | // "&" should be encoded as "&amp;" because this link is inside an html page, and | |
584 | // put & is invalid in this context | |
585 | ||
586 | 0 | if ( (c <= (char)0x20) || (c >= (char)0x7F) || |
587 | c == '"' || c == '<' || | |
588 | c == '>' || c == '\\' || c == '^' || c == '`' || | |
589 | c == '{' || c == '|' || c == '}') | |
590 | { | |
591 | // The percent encoding on this part should be done using UTF-8 charset | |
592 | // as RFC 3986 Section 3.2.2 says | |
593 | 0 | app = percentEncode(c, characterEncoding); |
594 | } | |
595 | 0 | else if (c == '%') |
596 | { | |
597 | 0 | if (i + 2 < string.length()) |
598 | { | |
599 | 0 | char c1 = string.charAt(i+1); |
600 | 0 | char c2 = string.charAt(i+2); |
601 | 0 | if ((( c1 >= '0' && c1 <='9') || (c1 >='A' && c1 <='Z')) && |
602 | (( c2 >= '0' && c2 <='9') || (c2 >='A' && c2 <='Z'))) | |
603 | { | |
604 | // do not percent encode, because it could be already encoded | |
605 | } | |
606 | else | |
607 | { | |
608 | 0 | app = percentEncode(c, characterEncoding); |
609 | } | |
610 | 0 | } |
611 | else | |
612 | { | |
613 | 0 | app = percentEncode(c, characterEncoding); |
614 | } | |
615 | } | |
616 | 0 | else if (c == '&') |
617 | { | |
618 | 0 | if (i+4 < string.length() ) |
619 | { | |
620 | 0 | if ('a' == string.charAt(i+1) && |
621 | 'm' == string.charAt(i+2) && | |
622 | 'p' == string.charAt(i+3) && | |
623 | ';' == string.charAt(i+4)) | |
624 | { | |
625 | //Skip | |
626 | } | |
627 | else | |
628 | { | |
629 | 0 | app = "&amp;"; |
630 | } | |
631 | } | |
632 | else | |
633 | { | |
634 | 0 | app = "&amp;"; |
635 | } | |
636 | } | |
637 | else | |
638 | { | |
639 | //No encoding, just do nothing, char will be added later. | |
640 | } | |
641 | ||
642 | 0 | if (app != null) |
643 | { | |
644 | 0 | if (sb == null) |
645 | { | |
646 | 0 | sb = new StringBuilder(string.substring(0, i)); |
647 | } | |
648 | 0 | sb.append(app); |
649 | } else { | |
650 | 0 | if (sb != null) |
651 | { | |
652 | 0 | sb.append(c); |
653 | } | |
654 | } | |
655 | 0 | if (endLoop) |
656 | { | |
657 | 0 | break; |
658 | } | |
659 | } | |
660 | 0 | if (sb == null) |
661 | { | |
662 | 0 | return string; |
663 | } | |
664 | else | |
665 | { | |
666 | 0 | return sb.toString(); |
667 | } | |
668 | } | |
669 | } |