001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.lang3;
018    
019    import java.io.IOException;
020    import java.io.Writer;
021    
022    import org.apache.commons.lang3.text.translate.AggregateTranslator;
023    import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
024    import org.apache.commons.lang3.text.translate.EntityArrays;
025    import org.apache.commons.lang3.text.translate.LookupTranslator;
026    import org.apache.commons.lang3.text.translate.NumericEntityUnescaper;
027    import org.apache.commons.lang3.text.translate.UnicodeEscaper;
028    import org.apache.commons.lang3.text.translate.UnicodeUnescaper;
029    
030    /**
031     * <p>Escapes and unescapes <code>String</code>s for
032     * Java, Java Script, HTML and XML.</p>
033     *
034     * <p>#ThreadSafe#</p>
035     * @author Apache Software Foundation
036     * @author Apache Jakarta Turbine
037     * @author Purple Technology
038     * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
039     * @author Antony Riley
040     * @author Helge Tesgaard
041     * @author <a href="sean@boohai.com">Sean Brown</a>
042     * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
043     * @author Phil Steitz
044     * @author Pete Gieser
045     * @since 2.0
046     * @version $Id: StringEscapeUtils.java 918868 2010-03-04 06:22:16Z bayard $
047     */
048    public class StringEscapeUtils {
049    
050        /* ESCAPE TRANSLATORS */
051    
052        public static final CharSequenceTranslator ESCAPE_JAVA = 
053              new LookupTranslator(
054                new String[][] { 
055                  {"\"", "\\\""},
056                  {"\\", "\\\\"},
057              }).with(
058                new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE())
059              ).with(
060                UnicodeEscaper.outsideOf(32, 0x7f) 
061            );
062    
063        public static final CharSequenceTranslator ESCAPE_ECMASCRIPT = 
064            new AggregateTranslator(
065                new LookupTranslator(
066                          new String[][] { 
067                                {"'", "\\'"},
068                                {"\"", "\\\""},
069                                {"\\", "\\\\"},
070                                {"/", "\\/"}
071                          }),
072                new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_ESCAPE()),
073                UnicodeEscaper.outsideOf(32, 0x7f) 
074            );
075                
076        public static final CharSequenceTranslator ESCAPE_XML = 
077            new AggregateTranslator(
078                new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
079                new LookupTranslator(EntityArrays.APOS_ESCAPE())
080            );
081    
082        public static final CharSequenceTranslator ESCAPE_HTML3 = 
083            new AggregateTranslator(
084                new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
085                new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE())
086            );
087    
088        public static final CharSequenceTranslator ESCAPE_HTML4 = 
089            new AggregateTranslator(
090                new LookupTranslator(EntityArrays.BASIC_ESCAPE()),
091                new LookupTranslator(EntityArrays.ISO8859_1_ESCAPE()),
092                new LookupTranslator(EntityArrays.HTML40_EXTENDED_ESCAPE())
093            );
094    
095        public static final CharSequenceTranslator ESCAPE_CSV = new CsvEscaper();
096    
097        // TODO: Create a parent class - 'SinglePassTranslator' ?
098        // TODO: It would handle the index checking, and length returning, and 
099        // TODO: could also have an optimization check method.
100        static class CsvEscaper extends CharSequenceTranslator {
101    
102            private static final char CSV_DELIMITER = ',';
103            private static final char CSV_QUOTE = '"';
104            private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
105            private static final char[] CSV_SEARCH_CHARS = new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};
106    
107            // TODO: Replace with a RegexTranslator. That should consume the number of characters the regex uses up?
108            @Override
109            public int translate(CharSequence input, int index, Writer out) throws IOException {
110    
111                if(index != 0) {
112                    throw new IllegalStateException("CsvEscaper should never reach the [1] index");
113                }
114    
115                if (StringUtils.containsNone(input.toString(), CSV_SEARCH_CHARS)) {
116                    out.write(input.toString());
117                } else {
118                    out.write(CSV_QUOTE);
119                    out.write(StringUtils.replace(input.toString(), CSV_QUOTE_STR, CSV_QUOTE_STR + CSV_QUOTE_STR));
120                    out.write(CSV_QUOTE);
121                }
122                return input.length();
123            }
124        }
125    
126        /* UNESCAPE TRANSLATORS */
127    
128        // TODO: throw "illegal character: \92" as an Exception if a \ on the end of the Java (as per the compiler)?
129        public static final CharSequenceTranslator UNESCAPE_JAVA = 
130            new AggregateTranslator(
131                new UnicodeUnescaper(),
132                new LookupTranslator(EntityArrays.JAVA_CTRL_CHARS_UNESCAPE()),
133                new LookupTranslator(
134                          new String[][] { 
135                                {"\\\\", "\\"},
136                                {"\\\"", "\""},
137                                {"\\'", "'"},
138                                {"\\", ""}
139                          })
140            );
141    
142        public static final CharSequenceTranslator UNESCAPE_ECMASCRIPT = UNESCAPE_JAVA;
143    
144        public static final CharSequenceTranslator UNESCAPE_HTML3 = 
145            new AggregateTranslator(
146                new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
147                new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
148                new NumericEntityUnescaper()
149            );
150    
151        public static final CharSequenceTranslator UNESCAPE_HTML4 = 
152            new AggregateTranslator(
153                new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
154                new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
155                new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()),
156                new NumericEntityUnescaper()
157            );
158    
159        public static final CharSequenceTranslator UNESCAPE_XML = 
160            new AggregateTranslator(
161                new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
162                new LookupTranslator(EntityArrays.APOS_UNESCAPE()),
163                new NumericEntityUnescaper()
164            );
165    
166        public static final CharSequenceTranslator UNESCAPE_CSV = new CsvUnescaper();
167    
168        static class CsvUnescaper extends CharSequenceTranslator {
169    
170            private static final char CSV_DELIMITER = ',';
171            private static final char CSV_QUOTE = '"';
172            private static final String CSV_QUOTE_STR = String.valueOf(CSV_QUOTE);
173            private static final char[] CSV_SEARCH_CHARS = new char[] {CSV_DELIMITER, CSV_QUOTE, CharUtils.CR, CharUtils.LF};
174    
175            // TODO: Replace with a RegexTranslator. That should consume the number of characters the regex uses up?
176            @Override
177            public int translate(CharSequence input, int index, Writer out) throws IOException {
178    
179                if(index != 0) {
180                    throw new IllegalStateException("CsvUnescaper should never reach the [1] index");
181                }
182    
183                if ( input.charAt(0) != CSV_QUOTE || input.charAt(input.length() - 1) != CSV_QUOTE ) {
184                    out.write(input.toString());
185                    return input.length();
186                }
187    
188                // strip quotes
189                String quoteless = input.subSequence(1, input.length() - 1).toString();
190    
191                if ( StringUtils.containsAny(quoteless, CSV_SEARCH_CHARS) ) {
192                    // deal with escaped quotes; ie) ""
193                    out.write(StringUtils.replace(quoteless, CSV_QUOTE_STR + CSV_QUOTE_STR, CSV_QUOTE_STR));
194                } else {
195                    out.write(input.toString());
196                }
197                return input.length();
198            }
199        }
200    
201        /* Helper functions */
202    
203        /**
204         * <p><code>StringEscapeUtils</code> instances should NOT be constructed in
205         * standard programming.</p>
206         *
207         * <p>Instead, the class should be used as:
208         * <pre>StringEscapeUtils.escapeJava("foo");</pre></p>
209         *
210         * <p>This constructor is public to permit tools that require a JavaBean
211         * instance to operate.</p>
212         */
213        public StringEscapeUtils() {
214          super();
215        }
216    
217        // Java and JavaScript
218        //--------------------------------------------------------------------------
219        /**
220         * <p>Escapes the characters in a <code>String</code> using Java String rules.</p>
221         *
222         * <p>Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
223         *
224         * <p>So a tab becomes the characters <code>'\\'</code> and
225         * <code>'t'</code>.</p>
226         *
227         * <p>The only difference between Java strings and JavaScript strings
228         * is that in JavaScript, a single quote and forward-slash (/) are escaped.</p>
229         *
230         * <p>Example:
231         * <pre>
232         * input string: He didn't say, "Stop!"
233         * output string: He didn't say, \"Stop!\"
234         * </pre>
235         * </p>
236         *
237         * @param input  String to escape values in, may be null
238         * @return String with escaped values, <code>null</code> if null string input
239         */
240        public static final String escapeJava(String input) {
241            return ESCAPE_JAVA.translate(input);
242        }
243    
244        /**
245         * <p>Escapes the characters in a <code>String</code> using EcmaScript String rules.</p>
246         * <p>Escapes any values it finds into their EcmaScript String form.
247         * Deals correctly with quotes and control-chars (tab, backslash, cr, ff, etc.) </p>
248         *
249         * <p>So a tab becomes the characters <code>'\\'</code> and
250         * <code>'t'</code>.</p>
251         *
252         * <p>The only difference between Java strings and EcmaScript strings
253         * is that in EcmaScript, a single quote and forward-slash (/) are escaped.</p>
254         *
255         * <p>Note that EcmaScript is best known by the JavaScript and ActionScript dialects. </p>
256         *
257         * <p>Example:
258         * <pre>
259         * input string: He didn't say, "Stop!"
260         * output string: He didn\'t say, \"Stop!\"
261         * </pre>
262         * </p>
263         *
264         * @param input  String to escape values in, may be null
265         * @return String with escaped values, <code>null</code> if null string input
266         */
267        public static final String escapeEcmaScript(String input) {
268            return ESCAPE_ECMASCRIPT.translate(input);
269        }
270    
271        /**
272         * <p>Unescapes any Java literals found in the <code>String</code>.
273         * For example, it will turn a sequence of <code>'\'</code> and
274         * <code>'n'</code> into a newline character, unless the <code>'\'</code>
275         * is preceded by another <code>'\'</code>.</p>
276         * 
277         * @param input  the <code>String</code> to unescape, may be null
278         * @return a new unescaped <code>String</code>, <code>null</code> if null string input
279         */
280        public static final String unescapeJava(String input) {
281            return UNESCAPE_JAVA.translate(input);
282        }
283    
284        /**
285         * <p>Unescapes any EcmaScript literals found in the <code>String</code>.</p>
286         *
287         * <p>For example, it will turn a sequence of <code>'\'</code> and <code>'n'</code>
288         * into a newline character, unless the <code>'\'</code> is preceded by another
289         * <code>'\'</code>.</p>
290         *
291         * @see #unescapeJava(String)
292         * @param input  the <code>String</code> to unescape, may be null
293         * @return A new unescaped <code>String</code>, <code>null</code> if null string input
294         */
295        public static final String unescapeEcmaScript(String input) {
296            return UNESCAPE_ECMASCRIPT.translate(input);
297        }
298    
299        // HTML and XML
300        //--------------------------------------------------------------------------
301        /**
302         * <p>Escapes the characters in a <code>String</code> using HTML entities.</p>
303         *
304         * <p>
305         * For example:
306         * </p> 
307         * <p><code>"bread" & "butter"</code></p>
308         * becomes:
309         * <p>
310         * <code>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</code>.
311         * </p>
312         *
313         * <p>Supports all known HTML 4.0 entities, including funky accents.
314         * Note that the commonly used apostrophe escape character (&amp;apos;)
315         * is not a legal entity and so is not supported). </p>
316         *
317         * @param input  the <code>String</code> to escape, may be null
318         * @return a new escaped <code>String</code>, <code>null</code> if null string input
319         * 
320         * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
321         * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
322         * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
323         * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
324         * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
325         */
326        public static final String escapeHtml4(String input) {
327            return ESCAPE_HTML4.translate(input);
328        }
329    
330        public static final String escapeHtml3(String input) {
331            return ESCAPE_HTML3.translate(input);
332        }
333                    
334        //-----------------------------------------------------------------------
335        /**
336         * <p>Unescapes a string containing entity escapes to a string
337         * containing the actual Unicode characters corresponding to the
338         * escapes. Supports HTML 4.0 entities.</p>
339         *
340         * <p>For example, the string "&amp;lt;Fran&amp;ccedil;ais&amp;gt;"
341         * will become "&lt;Fran&ccedil;ais&gt;"</p>
342         *
343         * <p>If an entity is unrecognized, it is left alone, and inserted
344         * verbatim into the result string. e.g. "&amp;gt;&amp;zzzz;x" will
345         * become "&gt;&amp;zzzz;x".</p>
346         *
347         * @param input  the <code>String</code> to unescape, may be null
348         * @return a new unescaped <code>String</code>, <code>null</code> if null string input
349         */
350        public static final String unescapeHtml4(String input) {
351            return UNESCAPE_HTML4.translate(input);
352        }
353    
354        public static final String unescapeHtml3(String input) {
355            return UNESCAPE_HTML3.translate(input);
356        }
357    
358        //-----------------------------------------------------------------------
359        /**
360         * <p>Escapes the characters in a <code>String</code> using XML entities.</p>
361         *
362         * <p>For example: <tt>"bread" & "butter"</tt> =>
363         * <tt>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</tt>.
364         * </p>
365         *
366         * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
367         * Does not support DTDs or external entities.</p>
368         *
369         * <p>Note that unicode characters greater than 0x7f are as of 3.0, no longer 
370         *    escaped. </p>
371         *
372         * @param input  the <code>String</code> to escape, may be null
373         * @return a new escaped <code>String</code>, <code>null</code> if null string input
374         * @see #unescapeXml(java.lang.String)
375         */
376        public static final String escapeXml(String input) {
377            return ESCAPE_XML.translate(input);
378        }
379                    
380    
381        //-----------------------------------------------------------------------
382        /**
383         * <p>Unescapes a string containing XML entity escapes to a string
384         * containing the actual Unicode characters corresponding to the
385         * escapes.</p>
386         *
387         * <p>Supports only the five basic XML entities (gt, lt, quot, amp, apos).
388         * Does not support DTDs or external entities.</p>
389         *
390         * <p>Note that numerical \\u unicode codes are unescaped to their respective 
391         *    unicode characters. This may change in future releases. </p>
392         *
393         * @param input  the <code>String</code> to unescape, may be null
394         * @return a new unescaped <code>String</code>, <code>null</code> if null string input
395         * @see #escapeXml(String)
396         */
397        public static final String unescapeXml(String input) {
398            return UNESCAPE_XML.translate(input);
399        }
400                    
401    
402        //-----------------------------------------------------------------------
403    
404        /**
405         * <p>Returns a <code>String</code> value for a CSV column enclosed in double quotes,
406         * if required.</p>
407         *
408         * <p>If the value contains a comma, newline or double quote, then the
409         *    String value is returned enclosed in double quotes.</p>
410         * </p>
411         *
412         * <p>Any double quote characters in the value are escaped with another double quote.</p>
413         *
414         * <p>If the value does not contain a comma, newline or double quote, then the
415         *    String value is returned unchanged.</p>
416         * </p>
417         *
418         * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
419         * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.
420         *
421         * @param input the input CSV column String, may be null
422         * @return the input String, enclosed in double quotes if the value contains a comma,
423         * newline or double quote, <code>null</code> if null string input
424         * @since 2.4
425         */
426        public static final String escapeCsv(String input) {
427            return ESCAPE_CSV.translate(input);
428        }
429    
430        /**
431         * <p>Returns a <code>String</code> value for an unescaped CSV column. </p>
432         *
433         * <p>If the value is enclosed in double quotes, and contains a comma, newline 
434         *    or double quote, then quotes are removed. 
435         * </p>
436         *
437         * <p>Any double quote escaped characters (a pair of double quotes) are unescaped 
438         *    to just one double quote. </p>
439         *
440         * <p>If the value is not enclosed in double quotes, or is and does not contain a 
441         *    comma, newline or double quote, then the String value is returned unchanged.</p>
442         * </p>
443         *
444         * see <a href="http://en.wikipedia.org/wiki/Comma-separated_values">Wikipedia</a> and
445         * <a href="http://tools.ietf.org/html/rfc4180">RFC 4180</a>.
446         *
447         * @param input the input CSV column String, may be null
448         * @return the input String, with enclosing double quotes removed and embedded double 
449         * quotes unescaped, <code>null</code> if null string input
450         * @since 2.4
451         */
452        public static final String unescapeCsv(String input) {
453            return UNESCAPE_CSV.translate(input);
454        }
455    
456    }