1 /* 2 * Copyright 2004 Sun Microsystems, Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 * 16 */ 17 package org.codehaus.plexus.util.xml; 18 19 import java.io.File; 20 import java.io.IOException; 21 import java.io.InputStream; 22 import java.net.URL; 23 import java.net.URLConnection; 24 25 /** 26 * Character stream that handles (or at least attemtps to) all the necessary Voodo to figure out the charset encoding of 27 * the XML document within the stream. 28 * <p> 29 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream. 30 * <p> 31 * All this has to be done without consuming characters from the stream, if not the XML parser will not recognized the 32 * document as a valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right 33 * now, XmlReader handles it and things work in all parsers). 34 * <p> 35 * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering 36 * a wide set of constructors. 37 * <P> 38 * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for an script 39 * (following HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog, <a 40 * href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a feed</a>. 41 * <p> 42 * 43 * @author Alejandro Abdelnur 44 * @version revision 1.17 taken on 26/06/2007 from Rome (see https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java) 45 * @since 1.4.4 46 * @deprecated TO BE REMOVED from here when plexus-utils is upgraded to 1.4.5+ (and prerequisite upgraded to Maven 2.0.6) 47 */ 48 public class XmlStreamReader 49 extends XmlReader 50 { 51 /** 52 * Creates a Reader for a File. 53 * <p> 54 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to 55 * UTF-8. 56 * <p> 57 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 58 * <p> 59 * 60 * @param file 61 * File to create a Reader from. 62 * @throws IOException 63 * thrown if there is a problem reading the file. 64 * 65 */ 66 public XmlStreamReader( File file ) throws IOException 67 { 68 super( file ); 69 } 70 71 /** 72 * Creates a Reader for a raw InputStream. 73 * <p> 74 * It follows the same logic used for files. 75 * <p> 76 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 77 * <p> 78 * 79 * @param is 80 * InputStream to create a Reader from. 81 * @throws IOException 82 * thrown if there is a problem reading the stream. 83 * 84 */ 85 public XmlStreamReader( InputStream is ) throws IOException 86 { 87 super( is ); 88 } 89 90 /** 91 * Creates a Reader for a raw InputStream. 92 * <p> 93 * It follows the same logic used for files. 94 * <p> 95 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the 96 * following: 97 * <p> 98 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 99 * <p> 100 * Else if the XML prolog had a charset encoding that encoding is used. 101 * <p> 102 * Else if the content type had a charset encoding that encoding is used. 103 * <p> 104 * Else 'UTF-8' is used. 105 * <p> 106 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 107 * <p> 108 * 109 * @param is 110 * InputStream to create a Reader from. 111 * @param lenient 112 * indicates if the charset encoding detection should be relaxed. 113 * @throws IOException 114 * thrown if there is a problem reading the stream. 115 * @throws XmlStreamReaderException 116 * thrown if the charset encoding could not be determined according to the specs. 117 * 118 */ 119 public XmlStreamReader( InputStream is, boolean lenient ) throws IOException, XmlStreamReaderException 120 { 121 super( is, lenient ); 122 } 123 124 /** 125 * Creates a Reader using the InputStream of a URL. 126 * <p> 127 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic 128 * used for Files. 129 * <p> 130 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for 131 * an InputStream with content-type. 132 * <p> 133 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 134 * <p> 135 * 136 * @param url 137 * URL to create a Reader from. 138 * @throws IOException 139 * thrown if there is a problem reading the stream of the URL. 140 * 141 */ 142 public XmlStreamReader( URL url ) throws IOException 143 { 144 super( url ); 145 } 146 147 /** 148 * Creates a Reader using the InputStream of a URLConnection. 149 * <p> 150 * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data 151 * it uses the same logic used for files. 152 * <p> 153 * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic 154 * used for an InputStream with content-type. 155 * <p> 156 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 157 * <p> 158 * 159 * @param conn 160 * URLConnection to create a Reader from. 161 * @throws IOException 162 * thrown if there is a problem reading the stream of the URLConnection. 163 * 164 */ 165 public XmlStreamReader( URLConnection conn ) throws IOException 166 { 167 super( conn ); 168 } 169 170 /** 171 * Creates a Reader using an InputStream an the associated content-type header. 172 * <p> 173 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not 174 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default 175 * encoding mandated by the content-type MIME type. 176 * <p> 177 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 178 * <p> 179 * 180 * @param is 181 * InputStream to create the reader from. 182 * @param httpContentType 183 * content-type header to use for the resolution of the charset encoding. 184 * @throws IOException 185 * thrown if there is a problem reading the file. 186 * 187 */ 188 public XmlStreamReader( InputStream is, String httpContentType ) throws IOException 189 { 190 super( is, httpContentType ); 191 } 192 193 /** 194 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient 195 * regarding the encoding detection. 196 * <p> 197 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not 198 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default 199 * encoding mandated by the content-type MIME type. 200 * <p> 201 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the 202 * following: 203 * <p> 204 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 205 * <p> 206 * Else if the XML prolog had a charset encoding that encoding is used. 207 * <p> 208 * Else if the content type had a charset encoding that encoding is used. 209 * <p> 210 * Else 'UTF-8' is used. 211 * <p> 212 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 213 * <p> 214 * 215 * @param is 216 * InputStream to create the reader from. 217 * @param httpContentType 218 * content-type header to use for the resolution of the charset encoding. 219 * @param lenient 220 * indicates if the charset encoding detection should be relaxed. 221 * @throws IOException 222 * thrown if there is a problem reading the file. 223 * @throws XmlStreamReaderException 224 * thrown if the charset encoding could not be determined according to the specs. 225 * 226 */ 227 public XmlStreamReader( InputStream is, String httpContentType, boolean lenient, String defaultEncoding ) 228 throws IOException, XmlStreamReaderException 229 { 230 super( is, httpContentType, lenient, defaultEncoding ); 231 } 232 233 /** 234 * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient 235 * regarding the encoding detection. 236 * <p> 237 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not 238 * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default 239 * encoding mandated by the content-type MIME type. 240 * <p> 241 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the 242 * following: 243 * <p> 244 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 245 * <p> 246 * Else if the XML prolog had a charset encoding that encoding is used. 247 * <p> 248 * Else if the content type had a charset encoding that encoding is used. 249 * <p> 250 * Else 'UTF-8' is used. 251 * <p> 252 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 253 * <p> 254 * 255 * @param is 256 * InputStream to create the reader from. 257 * @param httpContentType 258 * content-type header to use for the resolution of the charset encoding. 259 * @param lenient 260 * indicates if the charset encoding detection should be relaxed. 261 * @throws IOException 262 * thrown if there is a problem reading the file. 263 * @throws XmlStreamReaderException 264 * thrown if the charset encoding could not be determined according to the specs. 265 * 266 */ 267 public XmlStreamReader( InputStream is, String httpContentType, boolean lenient ) throws IOException, XmlStreamReaderException 268 { 269 super( is, httpContentType, lenient ); 270 } 271 }