1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.codehaus.plexus.util.xml;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.FileInputStream;
23 import java.io.IOException;
24 import java.io.InputStream;
25 import java.io.InputStreamReader;
26 import java.io.Reader;
27 import java.io.StringReader;
28 import java.net.URL;
29 import java.net.URLConnection;
30 import java.net.HttpURLConnection;
31 import java.util.regex.Pattern;
32 import java.util.regex.Matcher;
33 import java.text.MessageFormat;
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59 public class XmlReader extends Reader
60 {
61 private static final int BUFFER_SIZE = 4096;
62
63 private static final String UTF_8 = "UTF-8";
64
65 private static final String US_ASCII = "US-ASCII";
66
67 private static final String UTF_16BE = "UTF-16BE";
68
69 private static final String UTF_16LE = "UTF-16LE";
70
71 private static final String UTF_16 = "UTF-16";
72
73 private static final String EBCDIC = "CP1047";
74
75 private static String _staticDefaultEncoding = null;
76
77 private Reader _reader;
78
79 private String _encoding;
80
81 private String _defaultEncoding;
82
83
84
85
86
87
88
89
90
91 public static void setDefaultEncoding( String encoding )
92 {
93 _staticDefaultEncoding = encoding;
94 }
95
96
97
98
99
100
101
102 public static String getDefaultEncoding()
103 {
104 return _staticDefaultEncoding;
105 }
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122 public XmlReader( File file ) throws IOException
123 {
124 this( new FileInputStream( file ) );
125 }
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/**
 * Creates a reader over a raw stream with lenient encoding detection.
 * Equivalent to {@code new XmlReader( is, true )}.
 *
 * @param is the stream containing the XML document
 * @throws IOException if the stream cannot be read while detecting the encoding
 */
public XmlReader( InputStream is ) throws IOException
{
    // lenient detection is the default for raw streams
    this( is, true );
}
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
/**
 * Creates a reader over a raw stream, detecting the encoding from the BOM, the first
 * bytes and the XML prolog.
 *
 * @param is the stream containing the XML document
 * @param lenient when {@code true}, a detection conflict is resolved by falling back to the
 *            best available hint instead of being reported
 * @throws IOException if the stream cannot be read
 * @throws XmlStreamReaderException if the encoding hints conflict and {@code lenient} is {@code false}
 */
public XmlReader( InputStream is, boolean lenient ) throws IOException, XmlStreamReaderException
{
    _defaultEncoding = _staticDefaultEncoding;
    try
    {
        doRawStream( is, lenient );
    }
    catch ( XmlStreamReaderException detectionFailure )
    {
        if ( lenient )
        {
            doLenientDetection( null, detectionFailure );
        }
        else
        {
            throw detectionFailure;
        }
    }
}
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
/**
 * Creates a reader for the resource behind a URL by opening a connection to it and
 * delegating to {@link #XmlReader(URLConnection)}.
 *
 * @param url location of the XML document
 * @throws IOException if the connection cannot be opened or the stream cannot be read
 */
public XmlReader( URL url ) throws IOException
{
    // the URLConnection constructor decides between HTTP-style and raw detection
    this( url.openConnection() );
}
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236 public XmlReader( URLConnection conn ) throws IOException
237 {
238 _defaultEncoding = _staticDefaultEncoding;
239 boolean lenient = true;
240 if ( conn instanceof HttpURLConnection )
241 {
242 try
243 {
244 doHttpStream( conn.getInputStream(), conn.getContentType(), lenient );
245 }
246 catch ( XmlStreamReaderException ex )
247 {
248 doLenientDetection( conn.getContentType(), ex );
249 }
250 }
251 else if ( conn.getContentType() != null )
252 {
253 try
254 {
255 doHttpStream( conn.getInputStream(), conn.getContentType(), lenient );
256 }
257 catch ( XmlStreamReaderException ex )
258 {
259 doLenientDetection( conn.getContentType(), ex );
260 }
261 }
262 else
263 {
264 try
265 {
266 doRawStream( conn.getInputStream(), lenient );
267 }
268 catch ( XmlStreamReaderException ex )
269 {
270 doLenientDetection( null, ex );
271 }
272 }
273 }
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
/**
 * Creates a reader over a stream that came with a content-type header, using lenient detection.
 * Equivalent to {@code new XmlReader( is, httpContentType, true )}.
 *
 * @param is the stream containing the XML document
 * @param httpContentType the content-type header value, may be {@code null}
 * @throws IOException if the stream cannot be read while detecting the encoding
 */
public XmlReader( InputStream is, String httpContentType ) throws IOException
{
    // lenient detection is the default
    this( is, httpContentType, true );
}
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
/**
 * Creates a reader over a stream that came with a content-type header, with an explicit
 * fallback encoding for this instance.
 *
 * @param is the stream containing the XML document
 * @param httpContentType the content-type header value, may be {@code null}
 * @param lenient when {@code true}, a detection conflict is resolved by falling back to the
 *            best available hint instead of being reported
 * @param defaultEncoding fallback encoding for this reader; {@code null} selects the class-wide default
 * @throws IOException if the stream cannot be read
 * @throws XmlStreamReaderException if the encoding hints conflict and {@code lenient} is {@code false}
 */
public XmlReader( InputStream is, String httpContentType, boolean lenient, String defaultEncoding )
    throws IOException, XmlStreamReaderException
{
    if ( defaultEncoding != null )
    {
        _defaultEncoding = defaultEncoding;
    }
    else
    {
        _defaultEncoding = _staticDefaultEncoding;
    }

    try
    {
        doHttpStream( is, httpContentType, lenient );
    }
    catch ( XmlStreamReaderException detectionFailure )
    {
        if ( lenient )
        {
            doLenientDetection( httpContentType, detectionFailure );
        }
        else
        {
            throw detectionFailure;
        }
    }
}
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
/**
 * Creates a reader over a stream that came with a content-type header, using the
 * class-wide default encoding as fallback.
 *
 * @param is the stream containing the XML document
 * @param httpContentType the content-type header value, may be {@code null}
 * @param lenient when {@code true}, a detection conflict is resolved by falling back to the
 *            best available hint instead of being reported
 * @throws IOException if the stream cannot be read
 * @throws XmlStreamReaderException if the encoding hints conflict and {@code lenient} is {@code false}
 */
public XmlReader( InputStream is, String httpContentType, boolean lenient ) throws IOException, XmlStreamReaderException
{
    // a null default encoding makes the delegate use the class-wide default
    this( is, httpContentType, lenient, null );
}
391
392 private void doLenientDetection( String httpContentType, XmlStreamReaderException ex ) throws IOException
393 {
394 if ( httpContentType != null )
395 {
396 if ( httpContentType.startsWith( "text/html" ) )
397 {
398 httpContentType = httpContentType.substring( "text/html".length() );
399 httpContentType = "text/xml" + httpContentType;
400 try
401 {
402 doHttpStream( ex.getInputStream(), httpContentType, true );
403 ex = null;
404 }
405 catch ( XmlStreamReaderException ex2 )
406 {
407 ex = ex2;
408 }
409 }
410 }
411 if ( ex != null )
412 {
413 String encoding = ex.getXmlEncoding();
414 if ( encoding == null )
415 {
416 encoding = ex.getContentTypeEncoding();
417 }
418 if ( encoding == null )
419 {
420 encoding = ( _defaultEncoding == null ) ? UTF_8 : _defaultEncoding;
421 }
422 prepareReader( ex.getInputStream(), encoding );
423 }
424 }
425
426
427
428
429
430
431
432
433 public String getEncoding()
434 {
435 return _encoding;
436 }
437
438 public int read( char[] buf, int offset, int len ) throws IOException
439 {
440 return _reader.read( buf, offset, len );
441 }
442
443
444
445
446
447
448
449
450
451 public void close() throws IOException
452 {
453 _reader.close();
454 }
455
456 private void doRawStream( InputStream is, boolean lenient ) throws IOException
457 {
458 BufferedInputStream pis = new BufferedInputStream( is, BUFFER_SIZE );
459 String bomEnc = getBOMEncoding( pis );
460 String xmlGuessEnc = getXMLGuessEncoding( pis );
461 String xmlEnc = getXmlProlog( pis, xmlGuessEnc );
462 String encoding = calculateRawEncoding( bomEnc, xmlGuessEnc, xmlEnc, pis );
463 prepareReader( pis, encoding );
464 }
465
466 private void doHttpStream( InputStream is, String httpContentType, boolean lenient ) throws IOException
467 {
468 BufferedInputStream pis = new BufferedInputStream( is, BUFFER_SIZE );
469 String cTMime = getContentTypeMime( httpContentType );
470 String cTEnc = getContentTypeEncoding( httpContentType );
471 String bomEnc = getBOMEncoding( pis );
472 String xmlGuessEnc = getXMLGuessEncoding( pis );
473 String xmlEnc = getXmlProlog( pis, xmlGuessEnc );
474 String encoding = calculateHttpEncoding( cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, pis, lenient );
475 prepareReader( pis, encoding );
476 }
477
478 private void prepareReader( InputStream is, String encoding ) throws IOException
479 {
480 _reader = new InputStreamReader( is, encoding );
481 _encoding = encoding;
482 }
483
484
485 private String calculateRawEncoding( String bomEnc, String xmlGuessEnc, String xmlEnc, InputStream is )
486 throws IOException
487 {
488 String encoding;
489 if ( bomEnc == null )
490 {
491 if ( xmlGuessEnc == null || xmlEnc == null )
492 {
493 encoding = ( _defaultEncoding == null ) ? UTF_8 : _defaultEncoding;
494 }
495 else if ( xmlEnc.equals( UTF_16 ) && ( xmlGuessEnc.equals( UTF_16BE ) || xmlGuessEnc.equals( UTF_16LE ) ) )
496 {
497 encoding = xmlGuessEnc;
498 }
499 else
500 {
501 encoding = xmlEnc;
502 }
503 }
504 else if ( bomEnc.equals( UTF_8 ) )
505 {
506 if ( xmlGuessEnc != null && !xmlGuessEnc.equals( UTF_8 ) )
507 {
508 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ), bomEnc,
509 xmlGuessEnc, xmlEnc, is );
510 }
511 if ( xmlEnc != null && !xmlEnc.equals( UTF_8 ) )
512 {
513 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ), bomEnc,
514 xmlGuessEnc, xmlEnc, is );
515 }
516 encoding = UTF_8;
517 }
518 else if ( bomEnc.equals( UTF_16BE ) || bomEnc.equals( UTF_16LE ) )
519 {
520 if ( xmlGuessEnc != null && !xmlGuessEnc.equals( bomEnc ) )
521 {
522 throw new IOException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ) );
523 }
524 if ( xmlEnc != null && !xmlEnc.equals( UTF_16 ) && !xmlEnc.equals( bomEnc ) )
525 {
526 throw new XmlStreamReaderException( RAW_EX_1.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ), bomEnc,
527 xmlGuessEnc, xmlEnc, is );
528 }
529 encoding = bomEnc;
530 }
531 else
532 {
533 throw new XmlStreamReaderException( RAW_EX_2.format( new Object[] { bomEnc, xmlGuessEnc, xmlEnc } ), bomEnc,
534 xmlGuessEnc, xmlEnc, is );
535 }
536 return encoding;
537 }
538
539
540 private String calculateHttpEncoding( String cTMime, String cTEnc, String bomEnc, String xmlGuessEnc,
541 String xmlEnc, InputStream is, boolean lenient ) throws IOException
542 {
543 String encoding;
544 if ( lenient & xmlEnc != null )
545 {
546 encoding = xmlEnc;
547 }
548 else
549 {
550 boolean appXml = isAppXml( cTMime );
551 boolean textXml = isTextXml( cTMime );
552 if ( appXml || textXml )
553 {
554 if ( cTEnc == null )
555 {
556 if ( appXml )
557 {
558 encoding = calculateRawEncoding( bomEnc, xmlGuessEnc, xmlEnc, is );
559 }
560 else
561 {
562 encoding = ( _defaultEncoding == null ) ? US_ASCII : _defaultEncoding;
563 }
564 }
565 else if ( bomEnc != null && ( cTEnc.equals( UTF_16BE ) || cTEnc.equals( UTF_16LE ) ) )
566 {
567 throw new XmlStreamReaderException( HTTP_EX_1.format( new Object[] { cTMime, cTEnc, bomEnc, xmlGuessEnc,
568 xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
569 }
570 else if ( cTEnc.equals( UTF_16 ) )
571 {
572 if ( bomEnc != null && bomEnc.startsWith( UTF_16 ) )
573 {
574 encoding = bomEnc;
575 }
576 else
577 {
578 throw new XmlStreamReaderException( HTTP_EX_2.format( new Object[] { cTMime, cTEnc, bomEnc,
579 xmlGuessEnc, xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
580 }
581 }
582 else
583 {
584 encoding = cTEnc;
585 }
586 }
587 else
588 {
589 throw new XmlStreamReaderException( HTTP_EX_3.format( new Object[] { cTMime, cTEnc, bomEnc, xmlGuessEnc,
590 xmlEnc } ), cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc, is );
591 }
592 }
593 return encoding;
594 }
595
596
597 private static String getContentTypeMime( String httpContentType )
598 {
599 String mime = null;
600 if ( httpContentType != null )
601 {
602 int i = httpContentType.indexOf( ";" );
603 mime = ( ( i == -1 ) ? httpContentType : httpContentType.substring( 0, i ) ).trim();
604 }
605 return mime;
606 }
607
608 private static final Pattern CHARSET_PATTERN = Pattern.compile( "charset=([.[^; ]]*)" );
609
610
611 private static String getContentTypeEncoding( String httpContentType )
612 {
613 String encoding = null;
614 if ( httpContentType != null )
615 {
616 int i = httpContentType.indexOf( ";" );
617 if ( i > -1 )
618 {
619 String postMime = httpContentType.substring( i + 1 );
620 Matcher m = CHARSET_PATTERN.matcher( postMime );
621 encoding = ( m.find() ) ? m.group( 1 ) : null;
622 encoding = ( encoding != null ) ? encoding.toUpperCase() : null;
623 }
624 }
625 return encoding;
626 }
627
628
629
630 private static String getBOMEncoding( BufferedInputStream is ) throws IOException
631 {
632 String encoding = null;
633 int[] bytes = new int[3];
634 is.mark( 3 );
635 bytes[0] = is.read();
636 bytes[1] = is.read();
637 bytes[2] = is.read();
638
639 if ( bytes[0] == 0xFE && bytes[1] == 0xFF )
640 {
641 encoding = UTF_16BE;
642 is.reset();
643 is.read();
644 is.read();
645 }
646 else if ( bytes[0] == 0xFF && bytes[1] == 0xFE )
647 {
648 encoding = UTF_16LE;
649 is.reset();
650 is.read();
651 is.read();
652 }
653 else if ( bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF )
654 {
655 encoding = UTF_8;
656 }
657 else
658 {
659 is.reset();
660 }
661 return encoding;
662 }
663
664
665 private static String getXMLGuessEncoding( BufferedInputStream is ) throws IOException
666 {
667 String encoding = null;
668 int[] bytes = new int[4];
669 is.mark( 4 );
670 bytes[0] = is.read();
671 bytes[1] = is.read();
672 bytes[2] = is.read();
673 bytes[3] = is.read();
674 is.reset();
675
676 if ( bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00 && bytes[3] == 0x3F )
677 {
678 encoding = UTF_16BE;
679 }
680 else if ( bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F && bytes[3] == 0x00 )
681 {
682 encoding = UTF_16LE;
683 }
684 else if ( bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78 && bytes[3] == 0x6D )
685 {
686 encoding = UTF_8;
687 }
688 else if ( bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7 && bytes[3] == 0x94 )
689 {
690 encoding = EBCDIC;
691 }
692 return encoding;
693 }
694
695 static final Pattern ENCODING_PATTERN =
696 Pattern.compile( "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))", Pattern.MULTILINE );
697
698
699 private static String getXmlProlog( BufferedInputStream is, String guessedEnc ) throws IOException
700 {
701 String encoding = null;
702 if ( guessedEnc != null )
703 {
704 byte[] bytes = new byte[BUFFER_SIZE];
705 is.mark( BUFFER_SIZE );
706 int offset = 0;
707 int max = BUFFER_SIZE;
708 int c = is.read( bytes, offset, max );
709 int firstGT = -1;
710 String xmlProlog = null;
711 while ( c != -1 && firstGT == -1 && offset < BUFFER_SIZE )
712 {
713 offset += c;
714 max -= c;
715 c = is.read( bytes, offset, max );
716 xmlProlog = new String( bytes, 0, offset, guessedEnc );
717 firstGT = xmlProlog.indexOf( '>' );
718 }
719 if ( firstGT == -1 )
720 {
721 if ( c == -1 )
722 {
723 throw new IOException( "Unexpected end of XML stream" );
724 }
725 else
726 {
727 throw new IOException( "XML prolog or ROOT element not found on first " + offset + " bytes" );
728 }
729 }
730 int bytesRead = offset;
731 if ( bytesRead > 0 )
732 {
733 is.reset();
734 BufferedReader bReader = new BufferedReader( new StringReader( xmlProlog.substring( 0, firstGT + 1 ) ) );
735 StringBuffer prolog = new StringBuffer();
736 String line = bReader.readLine();
737 while ( line != null )
738 {
739 prolog.append( line );
740 line = bReader.readLine();
741 }
742 Matcher m = ENCODING_PATTERN.matcher( prolog );
743 if ( m.find() )
744 {
745 encoding = m.group( 1 ).toUpperCase();
746 encoding = encoding.substring( 1, encoding.length() - 1 );
747 }
748 }
749 }
750 return encoding;
751 }
752
753
754 private static boolean isAppXml( String mime )
755 {
756 return mime != null
757 && ( mime.equals( "application/xml" ) || mime.equals( "application/xml-dtd" )
758 || mime.equals( "application/xml-external-parsed-entity" ) || ( mime.startsWith( "application/" ) && mime.endsWith( "+xml" ) ) );
759 }
760
761
762 private static boolean isTextXml( String mime )
763 {
764 return mime != null
765 && ( mime.equals( "text/xml" ) || mime.equals( "text/xml-external-parsed-entity" ) || ( mime.startsWith( "text/" ) && mime.endsWith( "+xml" ) ) );
766 }
767
768 private static final MessageFormat RAW_EX_1 =
769 new MessageFormat( "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch" );
770
771 private static final MessageFormat RAW_EX_2 =
772 new MessageFormat( "Invalid encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM" );
773
774 private static final MessageFormat HTTP_EX_1 =
775 new MessageFormat(
776 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be NULL" );
777
778 private static final MessageFormat HTTP_EX_2 =
779 new MessageFormat(
780 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch" );
781
782 private static final MessageFormat HTTP_EX_3 =
783 new MessageFormat(
784 "Invalid encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Invalid MIME" );
785
786 }