XmlReader xref

View Javadoc

1   /*
2    * Copyright 2004 Sun Microsystems, Inc.
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   *
16   */
17  package org.codehaus.plexus.util.xml;
18  
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
34  
35  /**
36   * Character stream that handles (or at least attempts to) all the necessary voodoo to figure out the charset encoding of
37   * the XML document within the stream.
38   * <p>
39   * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
40   * <p>
41   * All this has to be done without consuming characters from the stream, otherwise the XML parser will not recognize the
42   * document as a valid XML. This is not 100% true, but it's close enough (UTF-8 BOM is not handled by all parsers right
43   * now, XmlReader handles it and things work in all parsers).
44   * <p>
45   * The XmlReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering
46   * a wide set of constructors.
47   * <P>
48   * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for strict detection
49   * (following HTTP MIME and XML specifications). All this is nicely explained by Mark Pilgrim in his blog, <a
50   * href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining the character encoding of a feed</a>.
51   * <p>
52   * 
53   * @author Alejandro Abdelnur
54   * @version revision 1.17 taken on 26/06/2007 from Rome (see https://rome.dev.java.net/source/browse/rome/src/java/com/sun/syndication/io/XmlReader.java)
55   * @deprecated use XmlStreamReader
56   * @since 1.4.3
57   * @deprecated TO BE REMOVED from here when plexus-utils is upgraded to 1.4.5+ (and prerequisite upgraded to Maven 2.0.6)
58   */
59  public class XmlReader extends Reader
60  {
61      private static final int BUFFER_SIZE = 4096;
62  
63      private static final String UTF_8 = "UTF-8";
64  
65      private static final String US_ASCII = "US-ASCII";
66  
67      private static final String UTF_16BE = "UTF-16BE";
68  
69      private static final String UTF_16LE = "UTF-16LE";
70  
71      private static final String UTF_16 = "UTF-16";
72  
73      private static final String EBCDIC = "CP1047";
74  
75      private static String _staticDefaultEncoding = null;
76  
77      private Reader _reader;
78  
79      private String _encoding;
80  
81      private String _defaultEncoding;
82  
83      /**
84       * Sets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on
85       * content-type are not adequate. <p/> If it is set to NULL the content-type based rules are used. <p/> By default
86       * it is NULL. <p/>
87       * 
88       * @param encoding
89       *            charset encoding to default to.
90       */
91      public static void setDefaultEncoding( String encoding )
92      {
93          _staticDefaultEncoding = encoding;
94      }
95  
96      /**
97       * Returns the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on
98       * content-type are not adequate. <p/> If it is NULL the content-type based rules are used. <p/>
99       * 
100      * @return the default encoding to use.
101      */
102     public static String getDefaultEncoding()
103     {
104         return _staticDefaultEncoding;
105     }
106 
107     /**
108      * Creates a Reader for a File.
109      * <p>
110      * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to
111      * UTF-8.
112      * <p>
113      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
114      * <p>
115      * 
116      * @param file
117      *            File to create a Reader from.
118      * @throws IOException
119      *             thrown if there is a problem reading the file.
120      * 
121      */
122     public XmlReader( File file ) throws IOException
123     {
124         this( new FileInputStream( file ) );
125     }
126 
127     /**
128      * Creates a Reader for a raw InputStream.
129      * <p>
130      * It follows the same logic used for files.
131      * <p>
132      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
133      * <p>
134      * 
135      * @param is
136      *            InputStream to create a Reader from.
137      * @throws IOException
138      *             thrown if there is a problem reading the stream.
139      * 
140      */
141     public XmlReader( InputStream is ) throws IOException
142     {
143         this( is, true );
144     }
145 
146     /**
147      * Creates a Reader for a raw InputStream.
148      * <p>
149      * It follows the same logic used for files.
150      * <p>
151      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
152      * following:
153      * <p>
154      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
155      * <p>
156      * Else if the XML prolog had a charset encoding that encoding is used.
157      * <p>
158      * Else if the content type had a charset encoding that encoding is used.
159      * <p>
160      * Else 'UTF-8' is used.
161      * <p>
162      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
163      * <p>
164      * 
165      * @param is
166      *            InputStream to create a Reader from.
167      * @param lenient
168      *            indicates if the charset encoding detection should be relaxed.
169      * @throws IOException
170      *             thrown if there is a problem reading the stream.
171      * @throws XmlStreamReaderException
172      *             thrown if the charset encoding could not be determined according to the specs.
173      * 
174      */
175     public XmlReader( InputStream is, boolean lenient ) throws IOException, XmlStreamReaderException
176     {
177         _defaultEncoding = _staticDefaultEncoding;
178         try
179         {
180             doRawStream( is, lenient );
181         }
182         catch ( XmlStreamReaderException ex )
183         {
184             if ( !lenient )
185             {
186                 throw ex;
187             }
188             else
189             {
190                 doLenientDetection( null, ex );
191             }
192         }
193     }
194 
195     /**
196      * Creates a Reader using the InputStream of a URL.
197      * <p>
198      * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic
199      * used for Files.
200      * <p>
201      * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for
202      * an InputStream with content-type.
203      * <p>
204      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
205      * <p>
206      * 
207      * @param url
208      *            URL to create a Reader from.
209      * @throws IOException
210      *             thrown if there is a problem reading the stream of the URL.
211      * 
212      */
213     public XmlReader( URL url ) throws IOException
214     {
215         this( url.openConnection() );
216     }
217 
218     /**
219      * Creates a Reader using the InputStream of a URLConnection.
220      * <p>
221      * If the URLConnection is not of type HttpURLConnection and there is not 'content-type' header in the fetched data
222      * it uses the same logic used for files.
223      * <p>
224      * If the URLConnection is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic
225      * used for an InputStream with content-type.
226      * <p>
227      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
228      * <p>
229      * 
230      * @param conn
231      *            URLConnection to create a Reader from.
232      * @throws IOException
233      *             thrown if there is a problem reading the stream of the URLConnection.
234      * 
235      */
236     public XmlReader( URLConnection conn ) throws IOException
237     {
238         _defaultEncoding = _staticDefaultEncoding;
239         boolean lenient = true;
240         if ( conn instanceof HttpURLConnection )
241         {
242             try
243             {
244                 doHttpStream( conn.getInputStream(), conn.getContentType(), lenient );
245             }
246             catch ( XmlStreamReaderException ex )
247             {
248                 doLenientDetection( conn.getContentType(), ex );
249             }
250         }
251         else if ( conn.getContentType() != null )
252         {
253             try
254             {
255                 doHttpStream( conn.getInputStream(), conn.getContentType(), lenient );
256             }
257             catch ( XmlStreamReaderException ex )
258             {
259                 doLenientDetection( conn.getContentType(), ex );
260             }
261         }
262         else
263         {
264             try
265             {
266                 doRawStream( conn.getInputStream(), lenient );
267             }
268             catch ( XmlStreamReaderException ex )
269             {
270                 doLenientDetection( null, ex );
271             }
272         }
273     }
274 
275     /**
276      * Creates a Reader using an InputStream an the associated content-type header.
277      * <p>
278      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
279      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
280      * encoding mandated by the content-type MIME type.
281      * <p>
282      * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
283      * <p>
284      * 
285      * @param is
286      *            InputStream to create the reader from.
287      * @param httpContentType
288      *            content-type header to use for the resolution of the charset encoding.
289      * @throws IOException
290      *             thrown if there is a problem reading the file.
291      * 
292      */
293     public XmlReader( InputStream is, String httpContentType ) throws IOException
294     {
295         this( is, httpContentType, true );
296     }
297 
298     /**
299      * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
300      * regarding the encoding detection.
301      * <p>
302      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
303      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
304      * encoding mandated by the content-type MIME type.
305      * <p>
306      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
307      * following:
308      * <p>
309      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
310      * <p>
311      * Else if the XML prolog had a charset encoding that encoding is used.
312      * <p>
313      * Else if the content type had a charset encoding that encoding is used.
314      * <p>
315      * Else 'UTF-8' is used.
316      * <p>
317      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
318      * <p>
319      * 
320      * @param is
321      *            InputStream to create the reader from.
322      * @param httpContentType
323      *            content-type header to use for the resolution of the charset encoding.
324      * @param lenient
325      *            indicates if the charset encoding detection should be relaxed.
326      * @throws IOException
327      *             thrown if there is a problem reading the file.
328      * @throws XmlStreamReaderException
329      *             thrown if the charset encoding could not be determined according to the specs.
330      * 
331      */
332     public XmlReader( InputStream is, String httpContentType, boolean lenient, String defaultEncoding )
333         throws IOException, XmlStreamReaderException
334     {
335         _defaultEncoding = ( defaultEncoding == null ) ? _staticDefaultEncoding : defaultEncoding;
336         try
337         {
338             doHttpStream( is, httpContentType, lenient );
339         }
340         catch ( XmlStreamReaderException ex )
341         {
342             if ( !lenient )
343             {
344                 throw ex;
345             }
346             else
347             {
348                 doLenientDetection( httpContentType, ex );
349             }
350         }
351     }
352 
353     /**
354      * Creates a Reader using an InputStream an the associated content-type header. This constructor is lenient
355      * regarding the encoding detection.
356      * <p>
357      * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not
358      * content-type encoding checks the XML prolog encoding. If there is not XML prolog encoding uses the default
359      * encoding mandated by the content-type MIME type.
360      * <p>
361      * If lenient detection is indicated and the detection above fails as per specifications it then attempts the
362      * following:
363      * <p>
364      * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.
365      * <p>
366      * Else if the XML prolog had a charset encoding that encoding is used.
367      * <p>
368      * Else if the content type had a charset encoding that encoding is used.
369      * <p>
370      * Else 'UTF-8' is used.
371      * <p>
372      * If lenient detection is indicated an XmlStreamReaderException is never thrown.
373      * <p>
374      * 
375      * @param is
376      *            InputStream to create the reader from.
377      * @param httpContentType
378      *            content-type header to use for the resolution of the charset encoding.
379      * @param lenient
380      *            indicates if the charset encoding detection should be relaxed.
381      * @throws IOException
382      *             thrown if there is a problem reading the file.
383      * @throws XmlStreamReaderException
384      *             thrown if the charset encoding could not be determined according to the specs.
385      * 
386      */
387     public XmlReader( InputStream is, String httpContentType, boolean lenient ) throws IOException, XmlStreamReaderException
388     {
389         this( is, httpContentType, lenient, null );
390     }
391 
392     private void doLenientDetection( String httpContentType, XmlStreamReaderException ex ) throws IOException
393     {
394         if ( httpContentType != null )
395         {
396             if ( httpContentType.startsWith( "text/html" ) )
397             {
398                 httpContentType = httpContentType.substring( "text/html".length() );
399                 httpContentType = "text/xml" + httpContentType;
400                 try
401                 {
402                     doHttpStream( ex.getInputStream(), httpContentType, true );
403                     ex = null;
404                 }
405                 catch ( XmlStreamReaderException ex2 )
406                 {
407                     ex = ex2;
408                 }
409             }
410         }
411         if ( ex != null )
412         {
413             String encoding = ex.getXmlEncoding();
414             if ( encoding == null )
415             {
416                 encoding = ex.getContentTypeEncoding();
417             }
418             if ( encoding == null )
419             {
420                 encoding = ( _defaultEncoding == null ) ? UTF_8 : _defaultEncoding;
421             }
422             prepareReader( ex.getInputStream(), encoding );
423         }
424     }
425 
426     /**
427      * Returns the charset encoding of the XmlReader.
428      * <p>
429      * 
430      * @return charset encoding.
431      * 
432      */
433     public String getEncoding()
434     {
435         return _encoding;
436     }
437 
438     public int read( char[] buf, int offset, int len ) throws IOException
439     {
440         return _reader.read( buf, offset, len );
441     }
442 
443     /**
444      * Closes the XmlReader stream.
445      * <p>
446      * 
447      * @throws IOException
448      *             thrown if there was a problem closing the stream.
449      * 
450      */
451     public void close() throws IOException
452     {
453         _reader.close();
454     }
455 
456     private void doRawStream( InputStream is, boolean lenient ) throws IOException
457     {
458         BufferedInputStream pis = new BufferedInputStream( is, BUFFER_SIZE );
459         String bomEnc = getBOMEncoding( pis );
460         String xmlGuessEnc = getXMLGuessEncoding( pis );
461         String xmlEnc = getXmlProlog( pis, xmlGuessEnc );
462         String encoding = calculateRawEncoding( bomEnc, xmlGuessEnc, xmlEnc, pis );
463         prepareReader( pis, encoding );
464     }
465 
466     private void doHttpStream( InputStream is, String httpContentType, boolean lenient ) throws IOException
467     {
468         BufferedInputStream pis = new BufferedInputStream( is, BUFFER_SIZE );
469         String cTMime = getContentTypeMime( httpContentType );
470         String cTEnc = getContentTypeEncoding( httpContentType );
471         String bomEnc = getBOMEncoding( pis );
472         String xmlGuessEnc = getXMLGuessEncoding( pis );
473         String xmlEnc = getXmlProlog( pis, xmlGuessEnc );
474         String encoding = calculateHttpEncoding( cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis, lenient );
475         prepareReader( pis, encoding );
476     }
477 
478     private void prepareReader( InputStream is, String encoding ) throws IOException
479     {
480         _reader = new InputStreamReader( is, encoding );
481         _encoding = encoding;
482     }
483 
484     // InputStream is passed for XmlStreamReaderException creation only
485     private String calculateRawEncoding( String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is )
486         throws IOException
487     {
488         String encoding;
489         if ( bomEnc == null )
490         {
491             if ( xmlGuessEnc == null || xmlEnc == null )
492             {
493                 encoding = ( _defaultEncoding == null ) ? UTF_8 : _defaultEncoding;
494             }
495             else if ( xmlEnc.equals( UTF_16 ) && ( xmlGuessEnc.equals( UTF_16BE ) || xmlGuessEnc.equals( UTF_16LE ) ) )
496             {
497                 encoding = xmlGuessEnc;
498             }
499             else
500             {
501                 encoding = xmlEnc;
502             }
503         }
504         else if ( bomEnc.equals( UTF_8 ) )
505         {
506             if ( xmlGuessEnc != null && !xmlGuessEnc.equals( UTF_8 ) )
507             {
508                 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ), bomEnc,
509                                               xmlGuessEnc, xmlEnc, is );
510             }
511             if ( xmlEnc != null && !xmlEnc.equals( UTF_8 ) )
512             {
513                 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ), bomEnc,
514                                               xmlGuessEnc, xmlEnc, is );
515             }
516             encoding = UTF_8;
517         }
518         else if ( bomEnc.equals( UTF_16BE ) || bomEnc.equals( UTF_16LE ) )
519         {
520             if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc ) )
521             {
522                 throw new IOException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ) );
523             }
524             if ( xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
525             {
526                 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ), bomEnc,
527                                               xmlGuessEnc, xmlEnc, is );
528             }
529             encoding = bomEnc;
530         }
531         else
532         {
533             throw new XmlStreamReaderException( RAW_EX_2.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ), bomEnc,
534                                           xmlGuessEnc, xmlEnc, is );
535         }
536         return encoding;
537     }
538 
539     // InputStream is passed for XmlStreamReaderException creation only
540     private String calculateHttpEncoding( String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc,
541                                           String xmlEnc, InputStream is, boolean lenient ) throws IOException
542     {
543         String encoding;
544         if ( lenient & xmlEnc != null )
545         {
546             encoding = xmlEnc;
547         }
548         else
549         {
550             boolean appXml = isAppXml( cTMime );
551             boolean textXml = isTextXml( cTMime );
552             if ( appXml || textXml )
553             {
554                 if ( cTEnc == null )
555                 {
556                     if ( appXml )
557                     {
558                         encoding = calculateRawEncoding( bomEnc, xmlGuessEnc, xmlEnc, is );
559                     }
560                     else
561                     {
562                         encoding = ( _defaultEncoding == null ) ? US_ASCII : _defaultEncoding;
563                     }
564                 }
565                 else if ( bomEnc != null && ( cTEnc.equals( UTF_16BE ) || cTEnc.equals( UTF_16LE ) ) )
566                 {
567                     throw new XmlStreamReaderException( HTTP_EX_1.format( new Object[] { cTMime, cTEnc, bomEnc, xmlGuessEnc,
568                         xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
569                 }
570                 else if ( cTEnc.equals( UTF_16 ) )
571                 {
572                     if ( bomEnc != null && bomEnc.startsWith( UTF_16 ) )
573                     {
574                         encoding = bomEnc;
575                     }
576                     else
577                     {
578                         throw new XmlStreamReaderException( HTTP_EX_2.format( new Object[] { cTMime, cTEnc, bomEnc,
579                             xmlGuessEnc, xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
580                     }
581                 }
582                 else
583                 {
584                     encoding = cTEnc;
585                 }
586             }
587             else
588             {
589                 throw new XmlStreamReaderException( HTTP_EX_3.format( new Object[] { cTMime, cTEnc, bomEnc, xmlGuessEnc,
590                     xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
591             }
592         }
593         return encoding;
594     }
595 
596     // returns MIME type or NULL if httpContentType is NULL
597     private static String getContentTypeMime( String httpContentType )
598     {
599         String mime = null;
600         if ( httpContentType != null )
601         {
602             int i = httpContentType.indexOf( ";" );
603             mime = ( ( i == -1 ) ? httpContentType : httpContentType.substring( 0, i ) ).trim();
604         }
605         return mime;
606     }
607 
608     private static final Pattern CHARSET_PATTERN = Pattern.compile( "charset=([.[^; ]]*)" );
609 
610     // returns charset parameter value, NULL if not present, NULL if httpContentType is NULL
611     private static String getContentTypeEncoding( String httpContentType )
612     {
613         String encoding = null;
614         if ( httpContentType != null )
615         {
616             int i = httpContentType.indexOf( ";" );
617             if ( i > -1 )
618             {
619                 String postMime = httpContentType.substring( i + 1 );
620                 Matcher m = CHARSET_PATTERN.matcher( postMime );
621                 encoding = ( m.find() ) ? m.group( 1 ) : null;
622                 encoding = ( encoding != null ) ? encoding.toUpperCase() : null;
623             }
624         }
625         return encoding;
626     }
627 
628     // returns the BOM in the stream, NULL if not present,
629     // if there was BOM the in the stream it is consumed
630     private static String getBOMEncoding( BufferedInputStream is ) throws IOException
631     {
632         String encoding = null;
633         int[] bytes = new int[3];
634         is.mark( 3 );
635         bytes[0] = is.read();
636         bytes[1] = is.read();
637         bytes[2] = is.read();
638 
639         if ( bytes[0] == 0xFE && bytes[1] == 0xFF )
640         {
641             encoding = UTF_16BE;
642             is.reset();
643             is.read();
644             is.read();
645         }
646         else if ( bytes[0] == 0xFF && bytes[1] == 0xFE )
647         {
648             encoding = UTF_16LE;
649             is.reset();
650             is.read();
651             is.read();
652         }
653         else if ( bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF )
654         {
655             encoding = UTF_8;
656         }
657         else
658         {
659             is.reset();
660         }
661         return encoding;
662     }
663 
664     // returns the best guess for the encoding by looking the first bytes of the stream, '<?'
665     private static String getXMLGuessEncoding( BufferedInputStream is ) throws IOException
666     {
667         String encoding = null;
668         int[] bytes = new int[4];
669         is.mark( 4 );
670         bytes[0] = is.read();
671         bytes[1] = is.read();
672         bytes[2] = is.read();
673         bytes[3] = is.read();
674         is.reset();
675 
676         if ( bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F )
677         {
678             encoding = UTF_16BE;
679         }
680         else if ( bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00 )
681         {
682             encoding = UTF_16LE;
683         }
684         else if ( bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D )
685         {
686             encoding = UTF_8;
687         }
688         else if ( bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94 )
689         {
690             encoding = EBCDIC;
691         }
692         return encoding;
693     }
694 
695     static final Pattern ENCODING_PATTERN =
696         Pattern.compile( "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE );
697 
698     // returns the encoding declared in the <?xml encoding=...?>, NULL if none
699     private static String getXmlProlog( BufferedInputStream is, String guessedEnc ) throws IOException
700     {
701         String encoding = null;
702         if ( guessedEnc != null )
703         {
704             byte[] bytes = new byte[BUFFER_SIZE];
705             is.mark( BUFFER_SIZE );
706             int offset = 0;
707             int max = BUFFER_SIZE;
708             int c = is.read( bytes, offset, max );
709             int firstGT = -1;
710             String xmlProlog = null;
711             while ( c != -1 && firstGT == -1 && offset < BUFFER_SIZE )
712             {
713                 offset += c;
714                 max -= c;
715                 c = is.read( bytes, offset, max );
716                 xmlProlog = new String( bytes, 0, offset, guessedEnc );
717                 firstGT = xmlProlog.indexOf( '>' );
718             }
719             if ( firstGT == -1 )
720             {
721                 if ( c == -1 )
722                 {
723                     throw new IOException( "Unexpected end of XML stream" );
724                 }
725                 else
726                 {
727                     throw new IOException( "XML prolog or ROOT element not found on first " + offset + " bytes" );
728                 }
729             }
730             int bytesRead = offset;
731             if ( bytesRead > 0 )
732             {
733                 is.reset();
734                 BufferedReader bReader = new BufferedReader( new StringReader( xmlProlog.substring( 0, firstGT + 1 ) ) );
735                 StringBuffer prolog = new StringBuffer();
736                 String line = bReader.readLine();
737                 while ( line != null )
738                 {
739                     prolog.append( line );
740                     line = bReader.readLine();
741                 }
742                 Matcher m = ENCODING_PATTERN.matcher( prolog );
743                 if ( m.find() )
744                 {
745                     encoding = m.group( 1 ).toUpperCase();
746                     encoding = encoding.substring( 1, encoding.length() - 1 );
747                 }
748             }
749         }
750         return encoding;
751     }
752 
753     // indicates if the MIME type belongs to the APPLICATION XML family
754     private static boolean isAppXml( String mime )
755     {
756         return mime != null
757                         && ( mime.equals( "application/xml" ) || mime.equals( "application/xml-dtd" )
758                                         || mime.equals( "application/xml-external-parsed-entity" ) || ( mime.startsWith( "application/" ) && mime.endsWith( "+xml" ) ) );
759     }
760 
761     // indicates if the MIME type belongs to the TEXT XML family
762     private static boolean isTextXml( String mime )
763     {
764         return mime != null
765                         && ( mime.equals( "text/xml" ) || mime.equals( "text/xml-external-parsed-entity" ) || ( mime.startsWith( "text/" ) && mime.endsWith( "+xml" ) ) );
766     }
767 
768     private static final MessageFormat RAW_EX_1 =
769         new MessageFormat( "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch" );
770 
771     private static final MessageFormat RAW_EX_2 =
772         new MessageFormat( "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM" );
773 
774     private static final MessageFormat HTTP_EX_1 =
775         new MessageFormat(
776                            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL" );
777 
778     private static final MessageFormat HTTP_EX_2 =
779         new MessageFormat(
780                            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch" );
781 
782     private static final MessageFormat HTTP_EX_3 =
783         new MessageFormat(
784                            "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME" );
785 
786 }