/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.lang3;

import java.io.IOException;
import java.io.Writer;

import org.apache.commons.lang3.text.translate.AggregateTranslator;
import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
import org.apache.commons.lang3.text.translate.EntityArrays;
import org.apache.commons.lang3.text.translate.LookupTranslator;
import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
import org.apache.commons.lang3.text.translate.UnicodeEscaper;
import org.apache.commons.lang3.text.translate.UnicodeUnescaper;

/**
 * <p>Escapes and unescapes <code>String</code>s for
 * Java, Java Script, HTML and XML.</p>
 *
 * <p>Every public method simply delegates to one of the public
 * {@link CharSequenceTranslator} constants declared on this class.</p>
 *
 * <p>#ThreadSafe#</p>
 * @author Apache Software Foundation
 * @author Apache Jakarta Turbine
 * @author Purple Technology
 * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
 * @author Antony Riley
 * @author Helge Tesgaard
 * @author <a href="sean@boohai.com">Sean Brown</a>
 * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
 * @author Phil Steitz
 * @author Pete Gieser
 * @since 2.0
 * @version $Id: StringEscapeUtils.java 918868 2010-03-04 06:22:16Z bayard $
 */
public class StringEscapeUtils {

    /* ESCAPE TRANSLATORS */

    /**
     * Translator used by {@link #escapeJava(String)}: a lookup for the double quote and
     * the backslash, combined with the {@link EntityArrays#JAVA_CTRL_CHARS_ESCAPE()} lookup
     * and with {@code UnicodeEscaper.outsideOf(32, 0x7f)}.
     */
    public static final CharSequenceTranslator ESCAPE_JAVA =
          new LookupTranslator(
            new String[][] {
              {"\"", "\\\""},
              {"\\", "\\\\"},
          }).with(
            new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE())
          ).with(
            UnicodeEscaper.outsideOf(32, 0x7f)
        );

    /**
     * Translator used by {@link #escapeEcmaScript(String)}. It is built like
     * {@link #ESCAPE_JAVA}, but its first lookup additionally maps the single quote
     * and the forward slash to their backslash-prefixed forms.
     */
    public static final CharSequenceTranslator ESCAPE_ECMASCRIPT =
        new AggregateTranslator(
            new LookupTranslator(
                      new String[][] {
                            {"'", "\\'"},
                            {"\"", "\\\""},
                            {"\\", "\\\\"},
                            {"/", "\\/"}
                      }),
            new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()),
            UnicodeEscaper.outsideOf(32, 0x7f)
        );

    /**
     * Translator used by {@link #escapeXml(String)}: the
     * {@link EntityArrays#BASIC_ESCAPE()} and {@link EntityArrays#APOS_ESCAPE()} lookups.
     */
    public static final CharSequenceTranslator ESCAPE_XML =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
            new LookupTranslator(EntityArrays.APOS_ESCAPE())
        );

    /**
     * Translator used by {@link #escapeHtml3(String)}: the
     * {@link EntityArrays#BASIC_ESCAPE()} and {@link EntityArrays#ISO8859_1_ESCAPE()} lookups.
     */
    public static final CharSequenceTranslator ESCAPE_HTML3 =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE())
        );

    /**
     * Translator used by {@link #escapeHtml4(String)}: the lookups of
     * {@link #ESCAPE_HTML3} plus {@link EntityArrays#HTML40_EXTENDED_ESCAPE()}.
     */
    public static final CharSequenceTranslator ESCAPE_HTML4 =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()),
            new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE())
        );

    /** Translator used by {@link #escapeCsv(String)}. */
    public static final CharSequenceTranslator ESCAPE_CSV = new CsvEscaper();

    // TODO: Create a parent class - 'SinglePassTranslator' ?
    // TODO: It would handle the index checking, and length returning, and
    // TODO: could also have an optimization check method.
    /**
     * Translator that processes a whole CSV column value in a single call: the value is
     * wrapped in double quotes (with embedded double quotes doubled) when it contains a
     * comma, a double quote, CR or LF, and is written unchanged otherwise.
     */
    static class CsvEscaper extends CharSequenceTranslator {

        private static final char CSV_DELIMITER = ',';
        private static final char CSV_QUOTE = '"';
        private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
        private static final char[] CSV_SEARCH_CHARS =
            new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};

        // TODO: Replace with a RegexTranslator. That should consume the number of characters the regex uses up?
        /**
         * Escapes the complete input as one CSV column value.
         *
         * @param input the column value to escape
         * @param index the position to translate from; must be 0 because the whole input
         *              is consumed by a single call
         * @param out the writer receiving the escaped value
         * @return the length of the input, i.e. everything is reported as consumed
         * @throws IOException if writing to {@code out} fails
         * @throws IllegalStateException if {@code index} is not 0
         */
        @Override
        public int translate(CharSequence input, int index, Writer out) throws IOException {

            if (index != 0) {
                throw new IllegalStateException("CsvEscaper should never reach the [1] index");
            }

            if (StringUtils.containsNone(input.toString(), CSV_SEARCH_CHARS)) {
                // Nothing that needs protecting: emit the value as it is.
                out.write(input.toString());
            } else {
                // Quote the value and double every embedded quote.
                out.write(CSV_QUOTE);
                out.write(StringUtils.replace(input.toString(), CSV_QUOTE_STR, CSV_QUOTE_STR + CSV_QUOTE_STR));
                out.write(CSV_QUOTE);
            }
            return input.length();
        }
    }

    /* UNESCAPE TRANSLATORS */

    // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)?
    /**
     * Translator used by {@link #unescapeJava(String)}: a {@link UnicodeUnescaper}, the
     * {@link EntityArrays#JAVA_CTRL_CHARS_UNESCAPE()} lookup, and a lookup that maps
     * backslash-backslash, backslash-double-quote and backslash-single-quote to the
     * second character and a remaining lone backslash to the empty string.
     */
    public static final CharSequenceTranslator UNESCAPE_JAVA =
        new AggregateTranslator(
            new UnicodeUnescaper(),
            new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()),
            new LookupTranslator(
                      new String[][] {
                            {"\\\\", "\\"},
                            {"\\\"", "\""},
                            {"\\'", "'"},
                            {"\\", ""}
                      })
        );

    /** Translator used by {@link #unescapeEcmaScript(String)}; the same instance as {@link #UNESCAPE_JAVA}. */
    public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA;

    /**
     * Translator used by {@link #unescapeHtml3(String)}: the
     * {@link EntityArrays#BASIC_UNESCAPE()} and {@link EntityArrays#ISO8859_1_UNESCAPE()}
     * lookups plus a {@link NumericEntityUnescaper}.
     */
    public static final CharSequenceTranslator UNESCAPE_HTML3 =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
            new NumericEntityUnescaper()
        );

    /**
     * Translator used by {@link #unescapeHtml4(String)}: the translators of
     * {@link #UNESCAPE_HTML3} plus the {@link EntityArrays#HTML40_EXTENDED_UNESCAPE()} lookup.
     */
    public static final CharSequenceTranslator UNESCAPE_HTML4 =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
            new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()),
            new NumericEntityUnescaper()
        );

    /**
     * Translator used by {@link #unescapeXml(String)}: the
     * {@link EntityArrays#BASIC_UNESCAPE()} and {@link EntityArrays#APOS_UNESCAPE()}
     * lookups plus a {@link NumericEntityUnescaper}.
     */
    public static final CharSequenceTranslator UNESCAPE_XML =
        new AggregateTranslator(
            new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
            new LookupTranslator(EntityArrays.APOS_UNESCAPE()),
            new NumericEntityUnescaper()
        );

    /** Translator used by {@link #unescapeCsv(String)}. */
    public static final CharSequenceTranslator UNESCAPE_CSV = new CsvUnescaper();

    /**
     * Translator that processes a whole CSV column value in a single call: when the value
     * is enclosed in double quotes and its content contains a comma, a double quote, CR or
     * LF, the enclosing quotes are removed and doubled quotes are collapsed; in every other
     * case the value is written unchanged.
     */
    static class CsvUnescaper extends CharSequenceTranslator {

        private static final char CSV_DELIMITER = ',';
        private static final char CSV_QUOTE = '"';
        private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
        private static final char[] CSV_SEARCH_CHARS =
            new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};

        // TODO: Replace with a RegexTranslator. That should consume the number of characters the regex uses up?
        /**
         * Unescapes the complete input as one CSV column value.
         *
         * @param input the column value to unescape
         * @param index the position to translate from; must be 0 because the whole input
         *              is consumed by a single call
         * @param out the writer receiving the unescaped value
         * @return the length of the input, i.e. everything is reported as consumed
         * @throws IOException if writing to {@code out} fails
         * @throws IllegalStateException if {@code index} is not 0
         */
        @Override
        public int translate(CharSequence input, int index, Writer out) throws IOException {

            if (index != 0) {
                throw new IllegalStateException("CsvUnescaper should never reach the [1] index");
            }

            final int length = input.length();

            // A quoted value needs both an opening and a closing quote, so anything shorter
            // than two characters cannot be quoted. Testing the length first keeps an empty
            // input from failing in charAt(0) and a lone '"' from failing in subSequence(1, 0).
            if (length < 2 || input.charAt(0) != CSV_QUOTE || input.charAt(length - 1) != CSV_QUOTE) {
                out.write(input.toString());
                return length;
            }

            // strip quotes
            String quoteless = input.subSequence(1, length - 1).toString();

            if (StringUtils.containsAny(quoteless, CSV_SEARCH_CHARS)) {
                // deal with escaped quotes; ie) ""
                out.write(StringUtils.replace(quoteless, CSV_QUOTE_STR + CSV_QUOTE_STR, CSV_QUOTE_STR));
            } else {
                // The quotes were not required by the content, so the value is passed
                // through untouched (quotes included), as documented on unescapeCsv.
                out.write(input.toString());
            }
            return length;
        }
    }

    /* Helper functions */

    /**
     * <p><code>StringEscapeUtils</code> instances should NOT be constructed in
     * standard programming.</p>
     *
     * <p>Instead, the class should be used as:</p>
     * <pre>StringEscapeUtils.escapeJava("foo");</pre>
     *
     * <p>This constructor is public to permit tools that require a JavaBean
     * instance to operate.</p>
     */
    public StringEscapeUtils() {
      super();
    }

    // Java and JavaScript
    //--------------------------------------------------------------------------
    /**
     * <p>Escapes the characters in a <code>String</code> using Java String rules.</p>
     *
     * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
     *
     * <p>So a tab becomes the characters <code>'\\'</code> and
     * <code>'t'</code>.</p>
     *
     * <p>The only difference between Java strings and JavaScript strings
     * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p>
     *
     * <p>Example:</p>
     * <pre>
     * input string: He didn't say, "Stop!"
     * output string: He didn't say, \"Stop!\"
     * </pre>
     *
     * @param input  String to escape values in, may be null
     * @return String with escaped values, <code>null</code> if null string input
     */
    public static final String escapeJava(String input) {
        return ESCAPE_JAVA.translate(input);
    }

    /**
     * <p>Escapes the characters in a <code>String</code> using EcmaScript String rules.</p>
     * <p>Escapes any values it finds into their EcmaScript String form.
     * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
     *
     * <p>So a tab becomes the characters <code>'\\'</code> and
     * <code>'t'</code>.</p>
     *
     * <p>The only difference between Java strings and EcmaScript strings
     * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p>
     *
     * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects. </p>
     *
     * <p>Example:</p>
     * <pre>
     * input string: He didn't say, "Stop!"
     * output string: He didn\'t say, \"Stop!\"
     * </pre>
     *
     * @param input  String to escape values in, may be null
     * @return String with escaped values, <code>null</code> if null string input
     */
    public static final String escapeEcmaScript(String input) {
        return ESCAPE_ECMASCRIPT.translate(input);
    }

    /**
     * <p>Unescapes any Java literals found in the <code>String</code>.
     * For example, it will turn a sequence of <code>'\'</code> and
     * <code>'n'</code> into a newline character, unless the <code>'\'</code>
     * is preceded by another <code>'\'</code>.</p>
     *
     * @param input  the <code>String</code> to unescape, may be null
     * @return a new unescaped <code>String</code>, <code>null</code> if null string input
     */
    public static final String unescapeJava(String input) {
        return UNESCAPE_JAVA.translate(input);
    }

    /**
     * <p>Unescapes any EcmaScript literals found in the <code>String</code>.</p>
     *
     * <p>For example, it will turn a sequence of <code>'\'</code> and <code>'n'</code>
     * into a newline character, unless the <code>'\'</code> is preceded by another
     * <code>'\'</code>.</p>
     *
     * @see #unescapeJava(String)
     * @param input  the <code>String</code> to unescape, may be null
     * @return A new unescaped <code>String</code>, <code>null</code> if null string input
     */
    public static final String unescapeEcmaScript(String input) {
        return UNESCAPE_ECMASCRIPT.translate(input);
    }

    // HTML and XML
    //--------------------------------------------------------------------------
    /**
     * <p>Escapes the characters in a <code>String</code> using HTML entities.</p>
     *
     * <p>
     * For example:
     * </p>
     * <p>{@code "bread" & "butter"}</p>
     * becomes:
     * <p>
     * {@code &quot;bread&quot; &amp; &quot;butter&quot;}.
     * </p>
     *
     * <p>Supports all known HTML 4.0 entities, including funky accents.
     * Note that the commonly used apostrophe escape character ({@code &apos;})
     * is not a legal entity and so is not supported.</p>
     *
     * @param input  the <code>String</code> to escape, may be null
     * @return a new escaped <code>String</code>, <code>null</code> if null string input
     *
     * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
     * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
     * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
     * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
     * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
     */
    public static final String escapeHtml4(String input) {
        return ESCAPE_HTML4.translate(input);
    }

    /**
     * <p>Escapes the characters in a <code>String</code> using {@link #ESCAPE_HTML3},
     * i.e. the basic and ISO-8859-1 entity tables but not the HTML 4.0 extended table
     * that {@link #escapeHtml4(String)} adds.</p>
     *
     * @param input  the <code>String</code> to escape, may be null
     * @return a new escaped <code>String</code>, <code>null</code> if null string input
     * @see #escapeHtml4(String)
     */
    public static final String escapeHtml3(String input) {
        return ESCAPE_HTML3.translate(input);
    }

    //-----------------------------------------------------------------------
    /**
     * <p>Unescapes a string containing entity escapes to a string
     * containing the actual Unicode characters corresponding to the
     * escapes. Supports HTML 4.0 entities.</p>
     *
     * <p>For example, the string {@code "&lt;Fran&ccedil;ais&gt;"}
     * will become {@code "<Français>"}</p>
     *
     * <p>If an entity is unrecognized, it is left alone, and inserted
     * verbatim into the result string. e.g. {@code "&gt;&zzzz;x"} will
     * become {@code ">&zzzz;x"}.</p>
     *
     * @param input  the <code>String</code> to unescape, may be null
     * @return a new unescaped <code>String</code>, <code>null</code> if null string input
     */
    public static final String unescapeHtml4(String input) {
        return UNESCAPE_HTML4.translate(input);
    }

    /**
     * <p>Unescapes a string containing entity escapes using {@link #UNESCAPE_HTML3},
     * i.e. the basic and ISO-8859-1 entity tables plus numeric entities, but not the
     * HTML 4.0 extended table that {@link #unescapeHtml4(String)} adds.</p>
     *
     * @param input  the <code>String</code> to unescape, may be null
     * @return a new unescaped <code>String</code>, <code>null</code> if null string input
     * @see #unescapeHtml4(String)
     */
    public static final String unescapeHtml3(String input) {
        return UNESCAPE_HTML3.translate(input);
    }

    //-----------------------------------------------------------------------
    /**
     * <p>Escapes the characters in a <code>String</code> using XML entities.</p>
     *
     * <p>For example: {@code "bread" & "butter"} =&gt;
     * {@code &quot;bread&quot; &amp; &quot;butter&quot;}.
     * </p>
     *
     * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
     * Does not support DTDs or external entities.</p>
     *
     * <p>Note that unicode characters greater than 0x7f are as of 3.0, no longer
     *    escaped. </p>
     *
     * @param input  the <code>String</code> to escape, may be null
     * @return a new escaped <code>String</code>, <code>null</code> if null string input
     * @see #unescapeXml(java.lang.String)
     */
    public static final String escapeXml(String input) {
        return ESCAPE_XML.translate(input);
    }


    //-----------------------------------------------------------------------
    /**
     * <p>Unescapes a string containing XML entity escapes to a string
     * containing the actual Unicode characters corresponding to the
     * escapes.</p>
     *
     * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
     * Does not support DTDs or external entities.</p>
     *
     * <p>Note that numerical \\u unicode codes are unescaped to their respective
     *    unicode characters. This may change in future releases. </p>
     *
     * @param input  the <code>String</code> to unescape, may be null
     * @return a new unescaped <code>String</code>, <code>null</code> if null string input
     * @see #escapeXml(String)
     */
    public static final String unescapeXml(String input) {
        return UNESCAPE_XML.translate(input);
    }


    //-----------------------------------------------------------------------

    /**
     * <p>Returns a <code>String</code> value for a CSV column enclosed in double quotes,
     * if required.</p>
     *
     * <p>If the value contains a comma, newline or double quote, then the
     *    String value is returned enclosed in double quotes.</p>
     *
     * <p>Any double quote characters in the value are escaped with another double quote.</p>
     *
     * <p>If the value does not contain a comma, newline or double quote, then the
     *    String value is returned unchanged.</p>
     *
     * @param input the input CSV column String, may be null
     * @return the input String, enclosed in double quotes if the value contains a comma,
     * newline or double quote, <code>null</code> if null string input
     * @see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a>
     * @see <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>
     * @since 2.4
     */
    public static final String escapeCsv(String input) {
        return ESCAPE_CSV.translate(input);
    }

    /**
     * <p>Returns a <code>String</code> value for an unescaped CSV column. </p>
     *
     * <p>If the value is enclosed in double quotes, and contains a comma, newline
     *    or double quote, then quotes are removed.
     * </p>
     *
     * <p>Any double quote escaped characters (a pair of double quotes) are unescaped
     *    to just one double quote. </p>
     *
     * <p>If the value is not enclosed in double quotes, or is and does not contain a
     *    comma, newline or double quote, then the String value is returned unchanged.
     *    A value consisting of a single double quote is not considered enclosed and is
     *    likewise returned unchanged.</p>
     *
     * @param input the input CSV column String, may be null
     * @return the input String, with enclosing double quotes removed and embedded double
     * quotes unescaped, <code>null</code> if null string input
     * @see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a>
     * @see <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>
     * @since 2.4
     */
    public static final String unescapeCsv(String input) {
        return UNESCAPE_CSV.translate(input);
    }

}