View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package org.codehaus.plexus.util.xml;
18  
19  import java.io.File;
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.net.URL;
23  import java.net.URLConnection;
24  
25  /**
26   * Character stream that handles (or at least attempts to) all the necessary voodoo to figure out the charset encoding of
27   * the XML document within the stream.
28   * <p>
29   * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
30   * <p>
31   * All this has to be done without consuming characters from the stream, otherwise the XML parser will not recognize the
32   * document as valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right
33   * now, XmlReader handles it and things work in all parsers).
34   * <p>
35   * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering
36   * a wide set of constructors.
37   * <p>
38   * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for a strict
39   * detection (following HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
40   * href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a feed</a>.
41   * <p>This class adds no logic of its own: each constructor delegates to the XmlReader constructor with the same arguments.
42   * 
43   * @author Alejandro Abdelnur
44   * @version revision 1.17 taken on 26/06/2007 from Rome (see https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
45   * @since 1.4.4
46   * @deprecated TO BE REMOVED from here when plexus-utils is upgraded to 1.4.5+ (and prerequisite upgraded to Maven 2.0.6)
47   */
48  public class XmlStreamReader
49  extends XmlReader
50  {
51      /**
52       * Creates a Reader for a File.
53       * <p>
54       * It looks for the UTF-8 BOM first; if there is none it sniffs the XML prolog charset; if this is also missing it
55       * defaults to UTF-8.
56       * <p>
57       * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
58       * <p>
59       * 
60       * @param file
61       *            File to create a Reader from.
62       * @throws IOException
63       *             thrown if there is a problem reading the file.
64       * 
65       */
66      public XmlStreamReader( File file ) throws IOException
67      {
68          super( file );
69      }
70  
71      /**
72       * Creates a Reader for a raw InputStream.
73       * <p>
74       * It follows the same logic used for files.
75       * <p>
76       * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
77       * <p>
78       * 
79       * @param is
80       *            InputStream to create a Reader from.
81       * @throws IOException
82       *             thrown if there is a problem reading the stream.
83       * 
84       */
85      public XmlStreamReader( InputStream is ) throws IOException
86      {
87          super( is );
88      }
89  
90      /**
91       * Creates a Reader for a raw InputStream, with selectable strictness of the encoding detection.
92       * <p>
93       * It follows the same logic used for files.
94       * <p>
95       * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
96       * following:
97       * <p>
98       * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
99       * <p>
100      * Else if the XML prolog had a charset encoding that encoding is used.
101      * <p>
102      * Else if the content type had a charset encoding that encoding is used.
103      * <p>
104      * Else 'UTF-8' is used.
105      * <p>
106      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
107      * <p>
108      * 
109      * @param is
110      *            InputStream to create a Reader from.
111      * @param lenient
112      *            indicates if the charset encoding detection should be relaxed.
113      * @throws IOException
114      *             thrown if there is a problem reading the stream.
115      * @throws XmlStreamReaderException
116      *             thrown if the charset encoding could not be determined according to the specs.
117      * 
118      */
119     public XmlStreamReader( InputStream is, boolean lenient ) throws IOException, XmlStreamReaderException
120     {
121         super( is, lenient );
122     }
123 
124     /**
125      * Creates a Reader using the InputStream of a URL.
126      * <p>
127      * If the URL is not of type HTTP and there is no 'content-type' header in the fetched data it uses the same logic
128      * used for Files.
129      * <p>
130      * If the URL is an HTTP URL or there is a 'content-type' header in the fetched data it uses the same logic used for
131      * an InputStream with content-type.
132      * <p>
133      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
134      * <p>
135      * 
136      * @param url
137      *            URL to create a Reader from.
138      * @throws IOException
139      *             thrown if there is a problem reading the stream of the URL.
140      * 
141      */
142     public XmlStreamReader( URL url ) throws IOException
143     {
144         super( url );
145     }
146 
147     /**
148      * Creates a Reader using the InputStream of a URLConnection.
149      * <p>
150      * If the URLConnection is not of type HttpURLConnection and there is no 'content-type' header in the fetched data
151      * it uses the same logic used for files.
152      * <p>
153      * If the URLConnection is an HTTP URL or there is a 'content-type' header in the fetched data it uses the same logic
154      * used for an InputStream with content-type.
155      * <p>
156      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
157      * <p>
158      * 
159      * @param conn
160      *            URLConnection to create a Reader from.
161      * @throws IOException
162      *             thrown if there is a problem reading the stream of the URLConnection.
163      * 
164      */
165     public XmlStreamReader( URLConnection conn ) throws IOException
166     {
167         super( conn );
168     }
169 
170     /**
171      * Creates a Reader using an InputStream and the associated content-type header.
172      * <p>
173      * First it checks if the stream has a BOM. If there is no BOM it checks the content-type encoding. If there is no
174      * content-type encoding it checks the XML prolog encoding. If there is no XML prolog encoding it uses the default
175      * encoding mandated by the content-type MIME type.
176      * <p>
177      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
178      * <p>
179      * 
180      * @param is
181      *            InputStream to create the reader from.
182      * @param httpContentType
183      *            content-type header to use for the resolution of the charset encoding.
184      * @throws IOException
185      *             thrown if there is a problem reading the stream.
186      * 
187      */
188     public XmlStreamReader( InputStream is, String httpContentType ) throws IOException
189     {
190         super( is, httpContentType );
191     }
192 
193     /**
194      * Creates a Reader using an InputStream and the associated content-type header. The lenient flag selects whether
195      * the encoding detection is relaxed.
196      * <p>
197      * First it checks if the stream has a BOM. If there is no BOM it checks the content-type encoding. If there is no
198      * content-type encoding it checks the XML prolog encoding. If there is no XML prolog encoding it uses the default
199      * encoding mandated by the content-type MIME type.
200      * <p>
201      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
202      * following:
203      * <p>
204      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
205      * <p>
206      * Else if the XML prolog had a charset encoding that encoding is used.
207      * <p>
208      * Else if the content type had a charset encoding that encoding is used.
209      * <p>
210      * Else 'UTF-8' is used.
211      * <p>
212      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
213      * 
214      * @param is
215      *            InputStream to create the reader from.
216      * @param httpContentType
217      *            content-type header to use for the resolution of the charset encoding.
218      * @param lenient
219      *            indicates if the charset encoding detection should be relaxed.
220      * @param defaultEncoding
221      *            passed unchanged to the XmlReader constructor; presumably the fallback encoding - TODO confirm.
222      * @throws IOException
223      *             thrown if there is a problem reading the stream.
224      * @throws XmlStreamReaderException
225      *             thrown if the charset encoding could not be determined according to the specs.
226      */
227     public XmlStreamReader( InputStream is, String httpContentType, boolean lenient, String defaultEncoding )
228         throws IOException, XmlStreamReaderException
229     {
230         super( is, httpContentType, lenient, defaultEncoding );
231     }
232 
233     /**
234      * Creates a Reader using an InputStream and the associated content-type header. The lenient flag selects whether
235      * the encoding detection is relaxed.
236      * <p>
237      * First it checks if the stream has a BOM. If there is no BOM it checks the content-type encoding. If there is no
238      * content-type encoding it checks the XML prolog encoding. If there is no XML prolog encoding it uses the default
239      * encoding mandated by the content-type MIME type.
240      * <p>
241      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
242      * following:
243      * <p>
244      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
245      * <p>
246      * Else if the XML prolog had a charset encoding that encoding is used.
247      * <p>
248      * Else if the content type had a charset encoding that encoding is used.
249      * <p>
250      * Else 'UTF-8' is used.
251      * <p>
252      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
253      * <p>
254      * 
255      * @param is
256      *            InputStream to create the reader from.
257      * @param httpContentType
258      *            content-type header to use for the resolution of the charset encoding.
259      * @param lenient
260      *            indicates if the charset encoding detection should be relaxed.
261      * @throws IOException
262      *             thrown if there is a problem reading the stream.
263      * @throws XmlStreamReaderException
264      *             thrown if the charset encoding could not be determined according to the specs.
265      * 
266      */
267     public XmlStreamReader( InputStream is, String httpContentType, boolean lenient ) throws IOException, XmlStreamReaderException
268     {
269         super( is, httpContentType, lenient );
270     }
271 }
271 }