1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.io.input.compatibility;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.io.StringReader;
27 import java.net.HttpURLConnection;
28 import java.net.URL;
29 import java.net.URLConnection;
30 import java.nio.charset.StandardCharsets;
31 import java.nio.file.Files;
32 import java.text.MessageFormat;
33 import java.util.Locale;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
36
37 import org.apache.commons.io.IOUtils;
38 import org.apache.commons.io.output.XmlStreamWriter;
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71 public class XmlStreamReader extends Reader {
72
73 private static final String UTF_8 = StandardCharsets.UTF_8.name();
74
75 private static final String US_ASCII = StandardCharsets.US_ASCII.name();
76
77 private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();
78
79 private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();
80
81 private static final String UTF_16 = StandardCharsets.UTF_16.name();
82
83 private static final String UTF_32BE = "UTF-32BE";
84
85 private static final String UTF_32LE = "UTF-32LE";
86
87 private static final String UTF_32 = "UTF-32";
88
89 private static final String EBCDIC = "CP1047";
90
91 private static String staticDefaultEncoding;
92
93 private static final Pattern CHARSET_PATTERN = Pattern
94 .compile("charset=[\"']?([.[^; \"']]*)[\"']?");
95
96 public static final Pattern ENCODING_PATTERN = Pattern.compile(
97 "<\\?xml.*encoding[\\s]*=[\\s]*((?:\".[^\"]*\")|(?:'.[^']*'))",
98 Pattern.MULTILINE);
99
100 private static final MessageFormat RAW_EX_1 = new MessageFormat(
101 "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch");
102
103 private static final MessageFormat RAW_EX_2 = new MessageFormat(
104 "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM");
105
106 private static final MessageFormat HTTP_EX_1 = new MessageFormat(
107 "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null");
108
109 private static final MessageFormat HTTP_EX_2 = new MessageFormat(
110 "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch");
111
112 private static final MessageFormat HTTP_EX_3 = new MessageFormat(
113 "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME");
114
115
116
117 static String getBOMEncoding(final BufferedInputStream is)
118 throws IOException {
119 String encoding = null;
120 final int[] bytes = new int[3];
121 is.mark(3);
122 bytes[0] = is.read();
123 bytes[1] = is.read();
124 bytes[2] = is.read();
125
126 if (bytes[0] == 0xFE && bytes[1] == 0xFF) {
127 encoding = UTF_16BE;
128 is.reset();
129 is.read();
130 is.read();
131 } else if (bytes[0] == 0xFF && bytes[1] == 0xFE) {
132 encoding = UTF_16LE;
133 is.reset();
134 is.read();
135 is.read();
136 } else if (bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
137 encoding = UTF_8;
138 } else {
139 is.reset();
140 }
141 return encoding;
142 }
143
144
145
146 static String getContentTypeEncoding(final String httpContentType) {
147 String encoding = null;
148 if (httpContentType != null) {
149 final int i = httpContentType.indexOf(";");
150 if (i > -1) {
151 final String postMime = httpContentType.substring(i + 1);
152 final Matcher m = CHARSET_PATTERN.matcher(postMime);
153 encoding = m.find() ? m.group(1) : null;
154 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
155 }
156 }
157 return encoding;
158 }
159
160
161 static String getContentTypeMime(final String httpContentType) {
162 String mime = null;
163 if (httpContentType != null) {
164 final int i = httpContentType.indexOf(";");
165 mime = (i == -1 ? httpContentType : httpContentType.substring(0,
166 i)).trim();
167 }
168 return mime;
169 }
170
171
172
173
174
175
176
177
178
179 public static String getDefaultEncoding() {
180 return staticDefaultEncoding;
181 }
182
183
184
185 private static String getXMLGuessEncoding(final BufferedInputStream is)
186 throws IOException {
187 String encoding = null;
188 final int[] bytes = new int[4];
189 is.mark(4);
190 bytes[0] = is.read();
191 bytes[1] = is.read();
192 bytes[2] = is.read();
193 bytes[3] = is.read();
194 is.reset();
195
196 if (bytes[0] == 0x00 && bytes[1] == 0x3C && bytes[2] == 0x00
197 && bytes[3] == 0x3F) {
198 encoding = UTF_16BE;
199 } else if (bytes[0] == 0x3C && bytes[1] == 0x00 && bytes[2] == 0x3F
200 && bytes[3] == 0x00) {
201 encoding = UTF_16LE;
202 } else if (bytes[0] == 0x3C && bytes[1] == 0x3F && bytes[2] == 0x78
203 && bytes[3] == 0x6D) {
204 encoding = UTF_8;
205 } else if (bytes[0] == 0x4C && bytes[1] == 0x6F && bytes[2] == 0xA7
206 && bytes[3] == 0x94) {
207 encoding = EBCDIC;
208 }
209 return encoding;
210 }
211
212
213 private static String getXmlProlog(final BufferedInputStream is, final String guessedEnc)
214 throws IOException {
215 String encoding = null;
216 if (guessedEnc != null) {
217 final byte[] bytes = IOUtils.byteArray();
218 is.mark(IOUtils.DEFAULT_BUFFER_SIZE);
219 int offset = 0;
220 int max = IOUtils.DEFAULT_BUFFER_SIZE;
221 int c = is.read(bytes, offset, max);
222 int firstGT = -1;
223 String xmlProlog = "";
224 while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
225 offset += c;
226 max -= c;
227 c = is.read(bytes, offset, max);
228 xmlProlog = new String(bytes, 0, offset, guessedEnc);
229 firstGT = xmlProlog.indexOf('>');
230 }
231 if (firstGT == -1) {
232 if (c == -1) {
233 throw new IOException("Unexpected end of XML stream");
234 }
235 throw new IOException(
236 "XML prolog or ROOT element not found on first "
237 + offset + " bytes");
238 }
239 final int bytesRead = offset;
240 if (bytesRead > 0) {
241 is.reset();
242 final BufferedReader bReader = new BufferedReader(new StringReader(
243 xmlProlog.substring(0, firstGT + 1)));
244 final StringBuilder prolog = new StringBuilder();
245 String line;
246 while ((line = bReader.readLine()) != null) {
247 prolog.append(line);
248 }
249 final Matcher m = ENCODING_PATTERN.matcher(prolog);
250 if (m.find()) {
251 encoding = m.group(1).toUpperCase(Locale.ROOT);
252 encoding = encoding.substring(1, encoding.length() - 1);
253 }
254 }
255 }
256 return encoding;
257 }
258
259
260 static boolean isAppXml(final String mime) {
261 return mime != null
262 && (mime.equals("application/xml")
263 || mime.equals("application/xml-dtd")
264 || mime
265 .equals("application/xml-external-parsed-entity") || mime
266 .startsWith("application/") && mime.endsWith("+xml"));
267 }
268
269
270 static boolean isTextXml(final String mime) {
271 return mime != null
272 && (mime.equals("text/xml")
273 || mime.equals("text/xml-external-parsed-entity") || mime
274 .startsWith("text/") && mime.endsWith("+xml"));
275 }
276
277
278
279
280
281
282
283
284
285
286
287 public static void setDefaultEncoding(final String encoding) {
288 staticDefaultEncoding = encoding;
289 }
290
291 private Reader reader;
292
293 private String encoding;
294
295 private final String defaultEncoding;
296
297
298
299
300
301
302
303
304
305
306
307
308
309 @SuppressWarnings("resource")
310 public XmlStreamReader(final File file) throws IOException {
311 this(Files.newInputStream(file.toPath()));
312 }
313
314
315
316
317
318
319
320
321
322
323
324
325 public XmlStreamReader(final InputStream inputStream) throws IOException {
326 this(inputStream, true);
327 }
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356 public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException,
357 XmlStreamReaderException {
358 defaultEncoding = staticDefaultEncoding;
359 try {
360 doRawStream(inputStream);
361 } catch (final XmlStreamReaderException ex) {
362 if (!lenient) {
363 throw ex;
364 }
365 doLenientDetection(null, ex);
366 }
367 }
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386 public XmlStreamReader(final InputStream inputStream, final String httpContentType)
387 throws IOException {
388 this(inputStream, httpContentType, true);
389 }
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424 public XmlStreamReader(final InputStream inputStream, final String httpContentType,
425 final boolean lenient) throws IOException, XmlStreamReaderException {
426 this(inputStream, httpContentType, lenient, null);
427 }
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463 public XmlStreamReader(final InputStream inputStream, final String httpContentType,
464 final boolean lenient, final String defaultEncoding) throws IOException,
465 XmlStreamReaderException {
466 this.defaultEncoding = defaultEncoding == null ? staticDefaultEncoding
467 : defaultEncoding;
468 try {
469 doHttpStream(inputStream, httpContentType, lenient);
470 } catch (final XmlStreamReaderException ex) {
471 if (!lenient) {
472 throw ex;
473 }
474 doLenientDetection(httpContentType, ex);
475 }
476 }
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495 public XmlStreamReader(final URL url) throws IOException {
496
497 this(url.openConnection());
498 }
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518 public XmlStreamReader(final URLConnection conn) throws IOException {
519 defaultEncoding = staticDefaultEncoding;
520 final boolean lenient = true;
521 if (conn instanceof HttpURLConnection || conn.getContentType() != null) {
522 try {
523 doHttpStream(conn.getInputStream(), conn.getContentType(),
524 lenient);
525 } catch (final XmlStreamReaderException ex) {
526 doLenientDetection(conn.getContentType(), ex);
527 }
528 } else {
529 try {
530 doRawStream(conn.getInputStream());
531 } catch (final XmlStreamReaderException ex) {
532 doLenientDetection(null, ex);
533 }
534 }
535 }
536
537
538 String calculateHttpEncoding(final String cTMime, final String cTEnc,
539 final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final InputStream is,
540 final boolean lenient) throws IOException {
541 final String encoding;
542 if (lenient && xmlEnc != null) {
543 encoding = xmlEnc;
544 } else {
545 final boolean appXml = isAppXml(cTMime);
546 final boolean textXml = isTextXml(cTMime);
547 if (!appXml && !textXml) {
548 throw new XmlStreamReaderException(HTTP_EX_3
549 .format(new Object[] { cTMime, cTEnc, bomEnc,
550 xmlGuessEnc, xmlEnc }), cTMime, cTEnc, bomEnc,
551 xmlGuessEnc, xmlEnc, is);
552 }
553 if (cTEnc == null) {
554 if (appXml) {
555 encoding = calculateRawEncoding(bomEnc, xmlGuessEnc,
556 xmlEnc, is);
557 } else {
558 encoding = defaultEncoding == null ? US_ASCII
559 : defaultEncoding;
560 }
561 } else if (bomEnc != null
562 && (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE))) {
563 throw new XmlStreamReaderException(HTTP_EX_1
564 .format(new Object[] { cTMime, cTEnc, bomEnc,
565 xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
566 bomEnc, xmlGuessEnc, xmlEnc, is);
567 } else if (cTEnc.equals(UTF_16)) {
568 if (bomEnc == null || !bomEnc.startsWith(UTF_16)) {
569 throw new XmlStreamReaderException(HTTP_EX_2
570 .format(new Object[] { cTMime, cTEnc, bomEnc,
571 xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
572 bomEnc, xmlGuessEnc, xmlEnc, is);
573 }
574 encoding = bomEnc;
575 } else if (bomEnc != null
576 && (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE))) {
577 throw new XmlStreamReaderException(HTTP_EX_1
578 .format(new Object[] { cTMime, cTEnc, bomEnc,
579 xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
580 bomEnc, xmlGuessEnc, xmlEnc, is);
581 } else if (cTEnc.equals(UTF_32)) {
582 if (bomEnc == null || !bomEnc.startsWith(UTF_32)) {
583 throw new XmlStreamReaderException(HTTP_EX_2
584 .format(new Object[] { cTMime, cTEnc, bomEnc,
585 xmlGuessEnc, xmlEnc }), cTMime, cTEnc,
586 bomEnc, xmlGuessEnc, xmlEnc, is);
587 }
588 encoding = bomEnc;
589 } else {
590 encoding = cTEnc;
591 }
592 }
593 return encoding;
594 }
595
596
597 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc,
598 final String xmlEnc, final InputStream is) throws IOException {
599 final String encoding;
600 if (bomEnc == null) {
601 if (xmlGuessEnc == null || xmlEnc == null) {
602 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
603 } else if (xmlEnc.equals(UTF_16)
604 && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc
605 .equals(UTF_16LE))) {
606 encoding = xmlGuessEnc;
607 } else if (xmlEnc.equals(UTF_32)
608 && (xmlGuessEnc.equals(UTF_32BE) || xmlGuessEnc
609 .equals(UTF_32LE))) {
610 encoding = xmlGuessEnc;
611 } else {
612 encoding = xmlEnc;
613 }
614 } else if (bomEnc.equals(UTF_8)) {
615 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
616 throw new XmlStreamReaderException(RAW_EX_1
617 .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
618 bomEnc, xmlGuessEnc, xmlEnc, is);
619 }
620 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
621 throw new XmlStreamReaderException(RAW_EX_1
622 .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
623 bomEnc, xmlGuessEnc, xmlEnc, is);
624 }
625 encoding = UTF_8;
626 } else {
627 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
628 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
629 throw new XmlStreamReaderException(RAW_EX_1.format(new Object[] { bomEnc,
630 xmlGuessEnc, xmlEnc }), bomEnc, xmlGuessEnc, xmlEnc, is);
631 }
632 if (xmlEnc != null && !xmlEnc.equals(UTF_16)
633 && !xmlEnc.equals(bomEnc)) {
634 throw new XmlStreamReaderException(RAW_EX_1
635 .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
636 bomEnc, xmlGuessEnc, xmlEnc, is);
637 }
638 } else if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
639 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
640 throw new XmlStreamReaderException(RAW_EX_1.format(new Object[] { bomEnc,
641 xmlGuessEnc, xmlEnc }), bomEnc, xmlGuessEnc, xmlEnc, is);
642 }
643 if (xmlEnc != null && !xmlEnc.equals(UTF_32)
644 && !xmlEnc.equals(bomEnc)) {
645 throw new XmlStreamReaderException(RAW_EX_1
646 .format(new Object[] { bomEnc, xmlGuessEnc, xmlEnc }),
647 bomEnc, xmlGuessEnc, xmlEnc, is);
648 }
649 } else {
650 throw new XmlStreamReaderException(RAW_EX_2.format(new Object[] {
651 bomEnc, xmlGuessEnc, xmlEnc }), bomEnc, xmlGuessEnc,
652 xmlEnc, is);
653 }
654 encoding = bomEnc;
655 }
656 return encoding;
657 }
658
659
660
661
662
663
664 @Override
665 public void close() throws IOException {
666 reader.close();
667 }
668
669 private void doHttpStream(final InputStream inputStream, final String httpContentType,
670 final boolean lenient) throws IOException {
671 final BufferedInputStream pis = new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE);
672 final String cTMime = getContentTypeMime(httpContentType);
673 final String cTEnc = getContentTypeEncoding(httpContentType);
674 final String bomEnc = getBOMEncoding(pis);
675 final String xmlGuessEnc = getXMLGuessEncoding(pis);
676 final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
677 final String encoding = calculateHttpEncoding(cTMime, cTEnc, bomEnc,
678 xmlGuessEnc, xmlEnc, pis, lenient);
679 prepareReader(pis, encoding);
680 }
681
682 private void doLenientDetection(String httpContentType,
683 XmlStreamReaderException ex) throws IOException {
684 if (httpContentType != null && httpContentType.startsWith("text/html")) {
685 httpContentType = httpContentType.substring("text/html"
686 .length());
687 httpContentType = "text/xml" + httpContentType;
688 try {
689 doHttpStream(ex.getInputStream(), httpContentType, true);
690 ex = null;
691 } catch (final XmlStreamReaderException ex2) {
692 ex = ex2;
693 }
694 }
695 if (ex != null) {
696 String encoding = ex.getXmlEncoding();
697 if (encoding == null) {
698 encoding = ex.getContentTypeEncoding();
699 }
700 if (encoding == null) {
701 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
702 }
703 prepareReader(ex.getInputStream(), encoding);
704 }
705 }
706
707 private void doRawStream(final InputStream inputStream)
708 throws IOException {
709 final BufferedInputStream pis = new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE);
710 final String bomEnc = getBOMEncoding(pis);
711 final String xmlGuessEnc = getXMLGuessEncoding(pis);
712 final String xmlEnc = getXmlProlog(pis, xmlGuessEnc);
713 final String encoding = calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc, pis);
714 prepareReader(pis, encoding);
715 }
716
717
718
719
720
721
722 public String getEncoding() {
723 return encoding;
724 }
725
726 private void prepareReader(final InputStream inputStream, final String encoding)
727 throws IOException {
728 reader = new InputStreamReader(inputStream, encoding);
729 this.encoding = encoding;
730 }
731
732 @Override
733 public int read(final char[] buf, final int offset, final int len) throws IOException {
734 return reader.read(buf, offset, len);
735 }
736
737 }