View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.commons.io.input;
18  
19  import java.io.BufferedInputStream;
20  import java.io.BufferedReader;
21  import java.io.File;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.InputStreamReader;
25  import java.io.Reader;
26  import java.io.StringReader;
27  import java.net.HttpURLConnection;
28  import java.net.URL;
29  import java.net.URLConnection;
30  import java.nio.charset.Charset;
31  import java.nio.charset.StandardCharsets;
32  import java.nio.file.Files;
33  import java.nio.file.Path;
34  import java.text.MessageFormat;
35  import java.util.Locale;
36  import java.util.Objects;
37  import java.util.regex.Matcher;
38  import java.util.regex.Pattern;
39  
40  import org.apache.commons.io.ByteOrderMark;
41  import org.apache.commons.io.Charsets;
42  import org.apache.commons.io.IOUtils;
43  import org.apache.commons.io.build.AbstractStreamBuilder;
44  import org.apache.commons.io.function.IOConsumer;
45  import org.apache.commons.io.output.XmlStreamWriter;
46  
47  /**
48   * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream.
49   * <p>
50   * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
51   * </p>
52   * <p>
53   * All this has to be done without consuming characters from the stream; otherwise the XML parser will not recognize the document as valid XML. This is not 100%
54   * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers).
55   * </p>
56   * <p>
57   * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
58   * </p>
59   * <p>
60   * By default the charset encoding detection is lenient; the constructor with the lenient flag can be used for strict detection (following HTTP MIME and XML
61   * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="https://diveintomark.org/archives/2004/02/13/xml-media-types">
62   * Determining the character encoding of a feed</a>.
63   * </p>
64   * <p>
65   * To build an instance, use {@link Builder}.
66   * </p>
67   * <p>
68   * Originally developed for <a href="https://rome.dev.java.net">ROME</a> under Apache License 2.0.
69   * </p>
70   *
71   * @see Builder
72   * @see org.apache.commons.io.output.XmlStreamWriter
73   * @since 2.0
74   */
75  public class XmlStreamReader extends Reader {
76  
77      // @formatter:off
78      /**
79       * Builds a new {@link XmlStreamWriter}.
80       *
81       * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
82       * <p>
83       * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
84       * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
85       * </p>
86       * <p>
87       * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
88       * </p>
89       * <p>
90       * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
91       * </p>
92       * <p>
93       * Else if the XML prolog had a charset encoding that encoding is used.
94       * </p>
95       * <p>
96       * Else if the content type had a charset encoding that encoding is used.
97       * </p>
98       * <p>
99       * Else 'UTF-8' is used.
100      * </p>
101      * <p>
102      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
103      * </p>
104      * <p>
105      * For example:
106      * </p>
107      *
108      * <pre>{@code
109      * XmlStreamReader r = XmlStreamReader.builder()
110      *   .setPath(path)
111      *   .setCharset(StandardCharsets.UTF_8)
112      *   .get();
113      * }
114      * </pre>
115      *
116      * @see #get()
117      * @since 2.12.0
118      */
119     // @formatter:on
120     public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
121 
122         private boolean nullCharset = true;
123         private boolean lenient = true;
124         private String httpContentType;
125 
126         /**
127          * Builds a new {@link XmlStreamWriter}.
128          * <p>
129          * You must set input that supports {@link #getInputStream()}, otherwise, this method throws an exception.
130          * </p>
131          * <p>
132          * This builder use the following aspects:
133          * </p>
134          * <ul>
135          * <li>{@link #getInputStream()}</li>
136          * <li>{@link #getCharset()}</li>
137          * <li>lenient</li>
138          * <li>httpContentType</li>
139          * </ul>
140          *
141          * @return a new instance.
142          * @throws IllegalStateException         if the {@code origin} is {@code null}.
143          * @throws UnsupportedOperationException if the origin cannot be converted to an {@link InputStream}.
144          * @throws IOException                   if an I/O error occurs.
145          * @throws XmlStreamReaderException thrown if the Charset encoding could not be determined according to the specification.
146          * @see #getInputStream()
147          */
148         @SuppressWarnings("resource")
149         @Override
150         public XmlStreamReader get() throws IOException {
151             final String defaultEncoding = nullCharset ? null : getCharset().name();
152             // @formatter:off
153             return httpContentType == null
154                     ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
155                     : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
156             // @formatter:on
157         }
158 
159         @Override
160         public Builder setCharset(final Charset charset) {
161             nullCharset = charset == null;
162             return super.setCharset(charset);
163         }
164 
165         @Override
166         public Builder setCharset(final String charset) {
167             nullCharset = charset == null;
168             return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
169         }
170 
171         /**
172          * Sets the HTTP content type.
173          *
174          * @param httpContentType the HTTP content type.
175          * @return this.
176          */
177         public Builder setHttpContentType(final String httpContentType) {
178             this.httpContentType = httpContentType;
179             return this;
180         }
181 
182         /**
183          * Sets the lenient toggle.
184          *
185          * @param lenient the lenient toggle.
186          * @return this.
187          */
188         public Builder setLenient(final boolean lenient) {
189             this.lenient = lenient;
190             return this;
191         }
192 
193     }
194 
    // Charset names used as the "encoding" label of the byte signatures below.
    private static final String UTF_8 = StandardCharsets.UTF_8.name();

    private static final String US_ASCII = StandardCharsets.US_ASCII.name();

    private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();

    private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();

    // UTF-32 has no StandardCharsets constant, so these names are spelled out.
    private static final String UTF_32BE = "UTF-32BE";

    private static final String UTF_32LE = "UTF-32LE";

    private static final String UTF_16 = StandardCharsets.UTF_16.name();

    private static final String UTF_32 = "UTF-32";

    private static final String EBCDIC = "CP1047";

    /** Byte order marks looked for at the very start of the stream. */
    private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
            ByteOrderMark.UTF_32LE };

    /**
     * Byte signatures of the start of an XML declaration in each candidate encoding, used to guess the encoding when sniffing the prolog. The ASCII-based
     * entries are the bytes of {@code <?} or {@code <?xm}; the EBCDIC entry is presumably the same four characters in CP1047 (to be confirmed).
     * <p>
     * UTF_16LE and UTF_32LE have the same two starting bytes.
     * </p>
     */
    private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
            new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
            new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
            new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
            new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };

    /** Captures (group 1) the value of a {@code charset=} parameter, optionally quoted, up to the next {@code ;}, space or quote. The match is case-sensitive. */
    private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
224 
225     /**
226      * Pattern capturing the encoding of the <a href="https://www.w3.org/TR/REC-xml/#sec-pi">{@code 'xml'} processing instruction</a>.
227      * <p>
228      * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">NT-EncName</a> XML specification.
229      * </p>
230      * <p>
231      * Note the documented pattern is:
232      * </p>
233      * <pre>
234      * EncName   ::=   [A-Za-z] ([A-Za-z0-9._] | '-')*
235      * </pre>
236      * <p>
237      * However this does not match all the aliases that are supported by Java. For example, {@code '437'}, {@code 'ISO_8859-1:1987'} and
238      * {@code 'ebcdic-de-273+euro'}.
239      * </p>
240      */
241     public static final Pattern ENCODING_PATTERN = Pattern.compile(
242     // @formatter:off
243             "^<\\?xml\\s+"
244             + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
245             + "encoding\\s*=\\s*"
246             + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"  // double-quoted
247             +  "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted
248             Pattern.MULTILINE);
249     // @formatter:on
250 
    // Message templates with {n} placeholders for encoding-detection failures. They are presumably formatted with
    // java.text.MessageFormat where the exceptions are raised; that code is not visible in this part of the file.

    /** Raw stream: the BOM, guessed and prolog encodings disagree. */
    private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

    /** Raw stream: the BOM is not one that is handled. */
    private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

    /** HTTP stream: a BOM is present where none is allowed. */
    private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";

    /** HTTP stream: the content-type encoding and the detected encodings disagree. */
    private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

    /** HTTP stream: the MIME type is not acceptable. */
    private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
260 
    /**
     * Constructs a new {@link Builder}.
     * <p>
     * A fresh builder is lenient, has no HTTP content type and passes no default encoding until configured otherwise.
     * </p>
     *
     * @return a new {@link Builder}.
     * @since 2.12.0
     */
    public static Builder builder() {
        return new Builder();
    }
270 
271     /**
272      * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}.
273      *
274      * @param httpContentType the HTTP content type
275      * @return The content type encoding (upcased)
276      */
277     static String getContentTypeEncoding(final String httpContentType) {
278         String encoding = null;
279         if (httpContentType != null) {
280             final int i = httpContentType.indexOf(";");
281             if (i > -1) {
282                 final String postMime = httpContentType.substring(i + 1);
283                 final Matcher m = CHARSET_PATTERN.matcher(postMime);
284                 encoding = m.find() ? m.group(1) : null;
285                 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
286             }
287         }
288         return encoding;
289     }
290 
291     /**
292      * Gets the MIME type or {@code null} if httpContentType is {@code null}.
293      *
294      * @param httpContentType the HTTP content type
295      * @return The mime content type
296      */
297     static String getContentTypeMime(final String httpContentType) {
298         String mime = null;
299         if (httpContentType != null) {
300             final int i = httpContentType.indexOf(";");
301             mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
302             mime = mime.trim();
303         }
304         return mime;
305     }
306 
    /**
     * Gets the encoding declared in the {@code <?xml encoding=...?>} prolog, {@code null} if none.
     * <p>
     * When {@code guessedEnc} is {@code null} the stream is not touched and {@code null} is returned. Otherwise the stream is marked, up to
     * {@link IOUtils#DEFAULT_BUFFER_SIZE} bytes are read and decoded with {@code guessedEnc} until the first {@code '>'} is seen, the stream is reset, and
     * the text up to that {@code '>'} is matched against {@link #ENCODING_PATTERN}.
     * </p>
     *
     * @param inputStream InputStream to create the reader from; it is marked and reset here, so it presumably must support mark/reset.
     * @param guessedEnc  guessed encoding
     * @return the encoding declared in the {@code <?xml encoding=...?>}, upper-cased and without its quotes, or {@code null}.
     * @throws IOException thrown if there is a problem reading the stream, if the stream ends before a {@code '>'} is found, or if no {@code '>'} is found
     *                     in the bytes read.
     */
    private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
        String encoding = null;
        if (guessedEnc != null) {
            // Presumably a buffer of IOUtils.DEFAULT_BUFFER_SIZE bytes, since up to that many bytes are read into it - confirm.
            final byte[] bytes = IOUtils.byteArray();
            // Remember the current position so the sniffed bytes can be handed back to the caller.
            inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
            int offset = 0;
            int max = IOUtils.DEFAULT_BUFFER_SIZE;
            int c = inputStream.read(bytes, offset, max);
            int firstGT = -1;
            String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning)
            // Accumulate bytes until a '>' is decoded, the stream ends, or the buffer is full.
            while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
                offset += c;
                max -= c;
                // NOTE(review): the next chunk is requested before the bytes already read are inspected for '>'.
                c = inputStream.read(bytes, offset, max);
                // The whole accumulated prefix is decoded again on every pass.
                xmlProlog = new String(bytes, 0, offset, guessedEnc);
                firstGT = xmlProlog.indexOf('>');
            }
            if (firstGT == -1) {
                if (c == -1) {
                    throw new IOException("Unexpected end of XML stream");
                }
                throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
            }
            final int bytesRead = offset;
            if (bytesRead > 0) {
                // Rewind to the mark so the bytes consumed while sniffing are read again by the caller.
                inputStream.reset();
                final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
                final StringBuilder prolog = new StringBuilder();
                // Join the lines with single spaces so a declaration spanning several lines can match the pattern.
                IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
                final Matcher m = ENCODING_PATTERN.matcher(prolog);
                if (m.find()) {
                    encoding = m.group(1).toUpperCase(Locale.ROOT);
                    // Group 1 still carries the surrounding quotes; drop them.
                    encoding = encoding.substring(1, encoding.length() - 1);
                }
            }
        }
        return encoding;
    }
353 
354     /**
355      * Tests if the MIME type belongs to the APPLICATION XML family.
356      *
357      * @param mime The mime type
358      * @return true if the mime type belongs to the APPLICATION XML family, otherwise false
359      */
360     static boolean isAppXml(final String mime) {
361         return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
362                 || mime.startsWith("application/") && mime.endsWith("+xml"));
363     }
364 
365     /**
366      * Tests if the MIME type belongs to the TEXT XML family.
367      *
368      * @param mime The mime type
369      * @return true if the mime type belongs to the TEXT XML family, otherwise false
370      */
371     static boolean isTextXml(final String mime) {
372         return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
373     }
374 
    /** Character reader over the wrapped stream, created with the detected {@link #encoding}. */
    private final Reader reader;

    /** Name of the charset encoding determined by the constructor. */
    private final String encoding;

    /** Default encoding supplied by the caller, may be {@code null}. */
    private final String defaultEncoding;
380 
    /**
     * Constructs a Reader for a File.
     * <p>
     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     * <p>
     * This constructor converts the file to a {@link Path} and delegates to {@link #XmlStreamReader(Path)}.
     * </p>
     *
     * @param file File to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the file.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final File file) throws IOException {
        this(Objects.requireNonNull(file, "file").toPath());
    }
399 
    /**
     * Constructs a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     * <p>
     * Equivalent to {@link #XmlStreamReader(InputStream, boolean)} with {@code lenient} set to {@code true}.
     * </p>
     *
     * @param inputStream InputStream to create a Reader from.
     * @throws NullPointerException if the input stream is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream) throws IOException {
        this(inputStream, true);
    }
418 
    /**
     * Constructs a Reader for a raw InputStream.
     * <p>
     * It follows the same logic used for files.
     * </p>
     * <p>
     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
     * </p>
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
     * </p>
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * </p>
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * </p>
     * <p>
     * Else 'UTF-8' is used.
     * </p>
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
     * </p>
     * <p>
     * Equivalent to {@link #XmlStreamReader(InputStream, boolean, String)} with a {@code null} default encoding.
     * </p>
     *
     * @param inputStream InputStream to create a Reader from.
     * @param lenient     indicates if the charset encoding detection should be relaxed.
     * @throws NullPointerException     if the input stream is {@code null}.
     * @throws IOException              thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
        this(inputStream, lenient, null);
    }
454 
455     /**
456      * Constructs a Reader for a raw InputStream.
457      * <p>
458      * It follows the same logic used for files.
459      * </p>
460      * <p>
461      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
462      * </p>
463      * <p>
464      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
465      * </p>
466      * <p>
467      * Else if the XML prolog had a charset encoding that encoding is used.
468      * </p>
469      * <p>
470      * Else if the content type had a charset encoding that encoding is used.
471      * </p>
472      * <p>
473      * Else 'UTF-8' is used.
474      * </p>
475      * <p>
476      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
477      * </p>
478      *
479      * @param inputStream     InputStream to create a Reader from.
480      * @param lenient         indicates if the charset encoding detection should be relaxed.
481      * @param defaultEncoding The default encoding
482      * @throws NullPointerException     if the input stream is {@code null}.
483      * @throws IOException              thrown if there is a problem reading the stream.
484      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
485      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
486      */
487     @Deprecated
488     @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
489     public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
490         this.defaultEncoding = defaultEncoding;
491         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
492                 false, BOMS);
493         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
494         this.encoding = processHttpStream(bom, pis, lenient);
495         this.reader = new InputStreamReader(pis, encoding);
496     }
497 
    /**
     * Constructs a Reader using an InputStream and the associated content-type header.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     * <p>
     * Equivalent to {@link #XmlStreamReader(InputStream, String, boolean)} with {@code lenient} set to {@code true}.
     * </p>
     *
     * @param inputStream     InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @throws NullPointerException if the input stream is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
        this(inputStream, httpContentType, true);
    }
518 
    /**
     * Constructs a Reader using an InputStream and the associated content-type header. Whether the encoding detection is lenient is decided by the
     * {@code lenient} argument.
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
     * </p>
     * <p>
     * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
     * </p>
     * <p>
     * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
     * </p>
     * <p>
     * Else if the XML prolog had a charset encoding that encoding is used.
     * </p>
     * <p>
     * Else if the content type had a charset encoding that encoding is used.
     * </p>
     * <p>
     * Else 'UTF-8' is used.
     * </p>
     * <p>
     * If lenient detection is indicated an XmlStreamReaderException is never thrown.
     * </p>
     * <p>
     * Equivalent to {@link #XmlStreamReader(InputStream, String, boolean, String)} with a {@code null} default encoding.
     * </p>
     *
     * @param inputStream     InputStream to create the reader from.
     * @param httpContentType content-type header to use for the resolution of the charset encoding.
     * @param lenient         indicates if the charset encoding detection should be relaxed.
     * @throws NullPointerException     if the input stream is {@code null}.
     * @throws IOException              thrown if there is a problem reading the stream.
     * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
        this(inputStream, httpContentType, lenient, null);
    }
556 
557     /**
558      * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
559      * <p>
560      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
561      * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
562      * </p>
563      * <p>
564      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
565      * </p>
566      * <p>
567      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
568      * </p>
569      * <p>
570      * Else if the XML prolog had a charset encoding that encoding is used.
571      * </p>
572      * <p>
573      * Else if the content type had a charset encoding that encoding is used.
574      * </p>
575      * <p>
576      * Else 'UTF-8' is used.
577      * </p>
578      * <p>
579      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
580      * </p>
581      *
582      * @param inputStream     InputStream to create the reader from.
583      * @param httpContentType content-type header to use for the resolution of the charset encoding.
584      * @param lenient         indicates if the charset encoding detection should be relaxed.
585      * @param defaultEncoding The default encoding
586      * @throws NullPointerException     if the input stream is {@code null}.
587      * @throws IOException              thrown if there is a problem reading the file.
588      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
589      * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
590      */
591     @Deprecated
592     @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance.
593     public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
594             throws IOException {
595         this.defaultEncoding = defaultEncoding;
596         final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE),
597                 false, BOMS);
598         final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES);
599         this.encoding = processHttpStream(bom, pis, lenient, httpContentType);
600         this.reader = new InputStreamReader(pis, encoding);
601     }
602 
    /**
     * Constructs a Reader for a File.
     * <p>
     * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     * <p>
     * This constructor opens the file with {@link Files#newInputStream(Path, java.nio.file.OpenOption...)} and delegates to
     * {@link #XmlStreamReader(InputStream)}. NOTE(review): nothing here closes the opened stream if the delegated constructor throws - confirm whether
     * that is handled elsewhere.
     * </p>
     *
     * @param file File to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the file.
     * @since 2.11.0
     * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
     */
    @Deprecated
    @SuppressWarnings("resource") // InputStream is managed through another reader in this instance.
    public XmlStreamReader(final Path file) throws IOException {
        this(Files.newInputStream(Objects.requireNonNull(file, "file")));
    }
623 
    /**
     * Constructs a Reader using the InputStream of a URL.
     * <p>
     * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files.
     * </p>
     * <p>
     * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type.
     * </p>
     * <p>
     * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
     * </p>
     * <p>
     * This constructor opens a connection to the URL and delegates to {@link #XmlStreamReader(URLConnection, String)} with a {@code null} default
     * encoding.
     * </p>
     *
     * @param url URL to create a Reader from.
     * @throws NullPointerException if the input is {@code null}.
     * @throws IOException          thrown if there is a problem reading the stream of the URL.
     */
    public XmlStreamReader(final URL url) throws IOException {
        this(Objects.requireNonNull(url, "url").openConnection(), null);
    }
643 
644     /**
645      * Constructs a Reader using the InputStream of a URLConnection.
646      * <p>
647      * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data it uses the same logic used for files.
648      * </p>
649      * <p>
650      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with
651      * content-type.
652      * </p>
653      * <p>
654      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
655      * </p>
656      *
657      * @param urlConnection   URLConnection to create a Reader from.
658      * @param defaultEncoding The default encoding
659      * @throws NullPointerException if the input is {@code null}.
660      * @throws IOException          thrown if there is a problem reading the stream of the URLConnection.
661      */
662     public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
663         Objects.requireNonNull(urlConnection, "urlConnection");
664         this.defaultEncoding = defaultEncoding;
665         final boolean lenient = true;
666         final String contentType = urlConnection.getContentType();
667         final InputStream inputStream = urlConnection.getInputStream();
668         @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance
669         // @formatter:off
670         final BOMInputStream bomInput = BOMInputStream.builder()
671             .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE))
672             .setInclude(false)
673             .setByteOrderMarks(BOMS)
674             .get();
675         @SuppressWarnings("resource")
676         final BOMInputStream piInput = BOMInputStream.builder()
677             .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE))
678             .setInclude(true)
679             .setByteOrderMarks(XML_GUESS_BYTES)
680             .get();
681         // @formatter:on
682         if (urlConnection instanceof HttpURLConnection || contentType != null) {
683             this.encoding = processHttpStream(bomInput, piInput, lenient, contentType);
684         } else {
685             this.encoding = processHttpStream(bomInput, piInput, lenient);
686         }
687         this.reader = new InputStreamReader(piInput, encoding);
688     }
689 
690     /**
691      * Calculates the HTTP encoding.
692      * @param bomEnc          BOM encoding
693      * @param xmlGuessEnc     XML Guess encoding
694      * @param xmlEnc          XML encoding
695      * @param lenient         indicates if the charset encoding detection should be relaxed.
696      * @param httpContentType The HTTP content type
697      *
698      * @return the HTTP encoding
699      * @throws IOException thrown if there is a problem reading the stream.
700      */
701     String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
702             throws IOException {
703 
704         // Lenient and has XML encoding
705         if (lenient && xmlEnc != null) {
706             return xmlEnc;
707         }
708 
709         // Determine mime/encoding content types from HTTP Content Type
710         final String cTMime = getContentTypeMime(httpContentType);
711         final String cTEnc = getContentTypeEncoding(httpContentType);
712         final boolean appXml = isAppXml(cTMime);
713         final boolean textXml = isTextXml(cTMime);
714 
715         // Mime type NOT "application/xml" or "text/xml"
716         if (!appXml && !textXml) {
717             final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
718             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
719         }
720 
721         // No content type encoding
722         if (cTEnc == null) {
723             if (appXml) {
724                 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
725             }
726             return defaultEncoding == null ? US_ASCII : defaultEncoding;
727         }
728 
729         // UTF-16BE or UTF-16LE content type encoding
730         if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
731             if (bomEnc != null) {
732                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
733                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
734             }
735             return cTEnc;
736         }
737 
738         // UTF-16 content type encoding
739         if (cTEnc.equals(UTF_16)) {
740             if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
741                 return bomEnc;
742             }
743             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
744             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
745         }
746 
747         // UTF-32BE or UTF-132E content type encoding
748         if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
749             if (bomEnc != null) {
750                 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
751                 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
752             }
753             return cTEnc;
754         }
755 
756         // UTF-32 content type encoding
757         if (cTEnc.equals(UTF_32)) {
758             if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
759                 return bomEnc;
760             }
761             final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
762             throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
763         }
764 
765         return cTEnc;
766     }
767 
768     /**
769      * Calculate the raw encoding.
770      *
771      * @param bomEnc      BOM encoding
772      * @param xmlGuessEnc XML Guess encoding
773      * @param xmlEnc      XML encoding
774      * @return the raw encoding
775      * @throws IOException thrown if there is a problem reading the stream.
776      */
777     String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
778 
779         // BOM is Null
780         if (bomEnc == null) {
781             if (xmlGuessEnc == null || xmlEnc == null) {
782                 return defaultEncoding == null ? UTF_8 : defaultEncoding;
783             }
784             if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
785                 return xmlGuessEnc;
786             }
787             return xmlEnc;
788         }
789 
790         // BOM is UTF-8
791         if (bomEnc.equals(UTF_8)) {
792             if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
793                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
794                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
795             }
796             if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
797                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
798                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
799             }
800             return bomEnc;
801         }
802 
803         // BOM is UTF-16BE or UTF-16LE
804         if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
805             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
806                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
807                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
808             }
809             if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
810                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
811                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
812             }
813             return bomEnc;
814         }
815 
816         // BOM is UTF-32BE or UTF-32LE
817         if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
818             if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
819                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
820                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
821             }
822             if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
823                 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
824                 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
825             }
826             return bomEnc;
827         }
828 
829         // BOM is something else
830         final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
831         throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
832     }
833 
834     /**
835      * Closes the XmlStreamReader stream.
836      *
837      * @throws IOException thrown if there was a problem closing the stream.
838      */
839     @Override
840     public void close() throws IOException {
841         reader.close();
842     }
843 
844     /**
845      * Does lenient detection.
846      *
847      * @param httpContentType content-type header to use for the resolution of the charset encoding.
848      * @param ex              The thrown exception
849      * @return the encoding
850      * @throws IOException thrown if there is a problem reading the stream.
851      */
852     private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
853         if (httpContentType != null && httpContentType.startsWith("text/html")) {
854             httpContentType = httpContentType.substring("text/html".length());
855             httpContentType = "text/xml" + httpContentType;
856             try {
857                 return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
858             } catch (final XmlStreamReaderException ex2) {
859                 ex = ex2;
860             }
861         }
862         String encoding = ex.getXmlEncoding();
863         if (encoding == null) {
864             encoding = ex.getContentTypeEncoding();
865         }
866         if (encoding == null) {
867             encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
868         }
869         return encoding;
870     }
871 
872     /**
873      * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate.
874      * <p>
875      * If it is {@code null} the content-type based rules are used.
876      * </p>
877      *
878      * @return the default encoding to use.
879      */
880     public String getDefaultEncoding() {
881         return defaultEncoding;
882     }
883 
884     /**
885      * Gets the charset encoding of the XmlStreamReader.
886      *
887      * @return charset encoding.
888      */
889     public String getEncoding() {
890         return encoding;
891     }
892 
893     /**
894      * Process the raw stream.
895      *
896      * @param bomInput     BOMInputStream to detect byte order marks
897      * @param piInput     BOMInputStream to guess XML encoding
898      * @param lenient indicates if the charset encoding detection should be relaxed.
899      * @return the encoding to be used
900      * @throws IOException thrown if there is a problem reading the stream.
901      */
902     private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
903         final String bomEnc = bomInput.getBOMCharsetName();
904         final String xmlGuessEnc = piInput.getBOMCharsetName();
905         final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
906         try {
907             return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
908         } catch (final XmlStreamReaderException ex) {
909             if (lenient) {
910                 return doLenientDetection(null, ex);
911             }
912             throw ex;
913         }
914     }
915 
916     /**
917      * Processes an HTTP stream.
918      *
919      * @param bomInput        BOMInputStream to detect byte order marks
920      * @param piInput         BOMInputStream to guess XML encoding
921      * @param lenient         indicates if the charset encoding detection should be relaxed.
922      * @param httpContentType The HTTP content type
923      * @return the encoding to be used
924      * @throws IOException thrown if there is a problem reading the stream.
925      */
926     private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
927             throws IOException {
928         final String bomEnc = bomInput.getBOMCharsetName();
929         final String xmlGuessEnc = piInput.getBOMCharsetName();
930         final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
931         try {
932             return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
933         } catch (final XmlStreamReaderException ex) {
934             if (lenient) {
935                 return doLenientDetection(httpContentType, ex);
936             }
937             throw ex;
938         }
939     }
940 
941     /**
942      * Reads the underlying reader's {@code read(char[], int, int)} method.
943      *
944      * @param buf    the buffer to read the characters into
945      * @param offset The start offset
946      * @param len    The number of bytes to read
947      * @return the number of characters read or -1 if the end of stream
948      * @throws IOException if an I/O error occurs.
949      */
950     @Override
951     public int read(final char[] buf, final int offset, final int len) throws IOException {
952         return reader.read(buf, offset, len);
953     }
954 
955 }