View Javadoc
1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package org.codehaus.plexus.util.xml;
18  
19  import java.io.File;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.net.URL;
23  import java.net.URLConnection;
24  
25  /**
26   * Character stream that handles (or at least attempts to) all the necessary Voodo to figure out the charset encoding of
27   * the XML document within the stream.
28   * <p>
29   * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
30   * <p>
31   * All this has to be done without consuming characters from the stream, if not the XML parser will not recognized the
32   * document as a valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right
33   * now, XmlReader handles it and things work in all parsers).
34   * <p>
35   * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering
36   * a wide set of constructors.
37   * <P>
38   * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for an script
39   * (following HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog,
40   * <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a
41   * feed</a>.
42   * <p>
43   * 
44   * @author Alejandro Abdelnur
45   * @version revision 1.17 taken on 26/06/2007 from Rome (see
46   *          https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
47   * @since 1.4.4
48   */
49  public class XmlStreamReader
50      extends XmlReader
51  {
52      /**
53       * Creates a Reader for a File.
54       * <p>
55       * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
56       * UTF-8.
57       * <p>
58       * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
59       * <p>
60       * 
61       * @param file File to create a Reader from.
62       * @throws IOException thrown if there is a problem reading the file.
63       */
64      public XmlStreamReader( File file )
65          throws IOException
66      {
67          super( file );
68      }
69  
70      /**
71       * Creates a Reader for a raw InputStream.
72       * <p>
73       * It follows the same logic used for files.
74       * <p>
75       * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
76       * <p>
77       * 
78       * @param is InputStream to create a Reader from.
79       * @throws IOException thrown if there is a problem reading the stream.
80       */
81      public XmlStreamReader( InputStream is )
82          throws IOException
83      {
84          super( is );
85      }
86  
87      /**
88       * Creates a Reader for a raw InputStream.
89       * <p>
90       * It follows the same logic used for files.
91       * <p>
92       * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
93       * following:
94       * <p>
95       * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
96       * <p>
97       * Else if the XML prolog had a charset encoding that encoding is used.
98       * <p>
99       * Else if the content type had a charset encoding that encoding is used.
100      * <p>
101      * Else 'UTF-8' is used.
102      * <p>
103      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
104      * <p>
105      * 
106      * @param is InputStream to create a Reader from.
107      * @param lenient indicates if the charset encoding detection should be relaxed.
108      * @throws IOException thrown if there is a problem reading the stream.
109      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
110      */
111     public XmlStreamReader( InputStream is, boolean lenient )
112         throws IOException, XmlStreamReaderException
113     {
114         super( is, lenient );
115     }
116 
117     /**
118      * Creates a Reader using the InputStream of a URL.
119      * <p>
120      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic
121      * used for Files.
122      * <p>
123      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for
124      * an InputStream with content-type.
125      * <p>
126      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
127      * <p>
128      * 
129      * @param url URL to create a Reader from.
130      * @throws IOException thrown if there is a problem reading the stream of the URL.
131      */
132     public XmlStreamReader( URL url )
133         throws IOException
134     {
135         super( url );
136     }
137 
138     /**
139      * Creates a Reader using the InputStream of a URLConnection.
140      * <p>
141      * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data
142      * it uses the same logic used for files.
143      * <p>
144      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic
145      * used for an InputStream with content-type.
146      * <p>
147      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
148      * <p>
149      * 
150      * @param conn URLConnection to create a Reader from.
151      * @throws IOException thrown if there is a problem reading the stream of the URLConnection.
152      */
153     public XmlStreamReader( URLConnection conn )
154         throws IOException
155     {
156         super( conn );
157     }
158 
159     /**
160      * Creates a Reader using an InputStream an the associated content-type header.
161      * <p>
162      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
163      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
164      * encoding mandated by the content-type MIME type.
165      * <p>
166      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
167      * <p>
168      * 
169      * @param is InputStream to create the reader from.
170      * @param httpContentType content-type header to use for the resolution of the charset encoding.
171      * @throws IOException thrown if there is a problem reading the file.
172      */
173     public XmlStreamReader( InputStream is, String httpContentType )
174         throws IOException
175     {
176         super( is, httpContentType );
177     }
178 
179     /**
180      * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
181      * regarding the encoding detection.
182      * <p>
183      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
184      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
185      * encoding mandated by the content-type MIME type.
186      * <p>
187      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
188      * following:
189      * <p>
190      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
191      * <p>
192      * Else if the XML prolog had a charset encoding that encoding is used.
193      * <p>
194      * Else if the content type had a charset encoding that encoding is used.
195      * <p>
196      * Else 'UTF-8' is used.
197      * <p>
198      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
199      * <p>
200      * 
201      * @param is InputStream to create the reader from.
202      * @param httpContentType content-type header to use for the resolution of the charset encoding.
203      * @param lenient indicates if the charset encoding detection should be relaxed.
204      * @param defaultEncoding encoding to use
205      * @throws IOException thrown if there is a problem reading the file.
206      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
207      */
208     public XmlStreamReader( InputStream is, String httpContentType, boolean lenient, String defaultEncoding )
209         throws IOException, XmlStreamReaderException
210     {
211         super( is, httpContentType, lenient, defaultEncoding );
212     }
213 
214     /**
215      * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
216      * regarding the encoding detection.
217      * <p>
218      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
219      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
220      * encoding mandated by the content-type MIME type.
221      * <p>
222      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
223      * following:
224      * <p>
225      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
226      * <p>
227      * Else if the XML prolog had a charset encoding that encoding is used.
228      * <p>
229      * Else if the content type had a charset encoding that encoding is used.
230      * <p>
231      * Else 'UTF-8' is used.
232      * <p>
233      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
234      * <p>
235      * 
236      * @param is InputStream to create the reader from.
237      * @param httpContentType content-type header to use for the resolution of the charset encoding.
238      * @param lenient indicates if the charset encoding detection should be relaxed.
239      * @throws IOException thrown if there is a problem reading the file.
240      * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specs.
241      */
242     public XmlStreamReader( InputStream is, String httpContentType, boolean lenient )
243         throws IOException, XmlStreamReaderException
244     {
245         super( is, httpContentType, lenient );
246     }
247 }