1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.commons.io.input;
18
19 import java.io.BufferedInputStream;
20 import java.io.BufferedReader;
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.InputStreamReader;
25 import java.io.Reader;
26 import java.io.StringReader;
27 import java.net.HttpURLConnection;
28 import java.net.URL;
29 import java.net.URLConnection;
30 import java.nio.charset.Charset;
31 import java.nio.charset.StandardCharsets;
32 import java.nio.file.Files;
33 import java.nio.file.Path;
34 import java.text.MessageFormat;
35 import java.util.Locale;
36 import java.util.Objects;
37 import java.util.regex.Matcher;
38 import java.util.regex.Pattern;
39
40 import org.apache.commons.io.ByteOrderMark;
41 import org.apache.commons.io.Charsets;
42 import org.apache.commons.io.IOUtils;
43 import org.apache.commons.io.build.AbstractStreamBuilder;
44 import org.apache.commons.io.function.IOConsumer;
45 import org.apache.commons.io.output.XmlStreamWriter;
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75 public class XmlStreamReader extends Reader {
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120 public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> {
121
122 private boolean nullCharset = true;
123 private boolean lenient = true;
124 private String httpContentType;
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148 @SuppressWarnings("resource")
149 @Override
150 public XmlStreamReader get() throws IOException {
151 final String defaultEncoding = nullCharset ? null : getCharset().name();
152
153 return httpContentType == null
154 ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding)
155 : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding);
156
157 }
158
159 @Override
160 public Builder setCharset(final Charset charset) {
161 nullCharset = charset == null;
162 return super.setCharset(charset);
163 }
164
165 @Override
166 public Builder setCharset(final String charset) {
167 nullCharset = charset == null;
168 return super.setCharset(Charsets.toCharset(charset, getCharsetDefault()));
169 }
170
171
172
173
174
175
176
177 public Builder setHttpContentType(final String httpContentType) {
178 this.httpContentType = httpContentType;
179 return this;
180 }
181
182
183
184
185
186
187
188 public Builder setLenient(final boolean lenient) {
189 this.lenient = lenient;
190 return this;
191 }
192
193 }
194
// Charset names compared against detected and declared encodings in this class.
private static final String UTF_8 = StandardCharsets.UTF_8.name();

private static final String US_ASCII = StandardCharsets.US_ASCII.name();

private static final String UTF_16BE = StandardCharsets.UTF_16BE.name();

private static final String UTF_16LE = StandardCharsets.UTF_16LE.name();

// UTF-32 variants have no StandardCharsets constant, hence the literal names.
private static final String UTF_32BE = "UTF-32BE";

private static final String UTF_32LE = "UTF-32LE";

private static final String UTF_16 = StandardCharsets.UTF_16.name();

private static final String UTF_32 = "UTF-32";

// Charset name attached to the last XML_GUESS_BYTES entry below.
private static final String EBCDIC = "CP1047";

/** Byte order marks passed to the first (BOM-detecting) {@code BOMInputStream} built by the constructors. */
private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE,
        ByteOrderMark.UTF_32LE };

/**
 * Leading byte sequences used to guess the encoding when looking for an XML prolog. In the ASCII-compatible
 * entries, 0x3C 0x3F 0x78 0x6D are the characters {@code <?xm}; the UTF-16 entries cover {@code <?} only.
 * NOTE(review): the CP1047 entry is presumably the same four characters in EBCDIC - confirm.
 */
private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D),
        new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00),
        new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D),
        new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00),
        new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) };

/**
 * Matches a {@code charset=} parameter with an optionally quoted value; group 1 captures the value
 * up to the first semicolon, space or quote character. The match is case-sensitive.
 */
private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?");
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241 public static final Pattern ENCODING_PATTERN = Pattern.compile(
242
243 "^<\\?xml\\s+"
244 + "(?:version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+)??"
245 + "encoding\\s*=\\s*"
246 + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")"
247 + "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))",
248 Pattern.MULTILINE);
249
250
// MessageFormat templates for XmlStreamReaderException messages.
// RAW_* take three arguments (BOM, XML guess, XML prolog encodings) and are used by calculateRawEncoding.
private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch";

private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM";

// HTTP_* take five arguments (content-type MIME, content-type encoding, BOM, XML guess, XML prolog)
// and are used by calculateHttpEncoding.
private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null";

private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch";

private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME";
260
261
262
263
264
265
266
267 public static Builder builder() {
268 return new Builder();
269 }
270
271
272
273
274
275
276
277 static String getContentTypeEncoding(final String httpContentType) {
278 String encoding = null;
279 if (httpContentType != null) {
280 final int i = httpContentType.indexOf(";");
281 if (i > -1) {
282 final String postMime = httpContentType.substring(i + 1);
283 final Matcher m = CHARSET_PATTERN.matcher(postMime);
284 encoding = m.find() ? m.group(1) : null;
285 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null;
286 }
287 }
288 return encoding;
289 }
290
291
292
293
294
295
296
297 static String getContentTypeMime(final String httpContentType) {
298 String mime = null;
299 if (httpContentType != null) {
300 final int i = httpContentType.indexOf(";");
301 mime = i >= 0 ? httpContentType.substring(0, i) : httpContentType;
302 mime = mime.trim();
303 }
304 return mime;
305 }
306
307
308
309
310
311
312
313
314
315 private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException {
316 String encoding = null;
317 if (guessedEnc != null) {
318 final byte[] bytes = IOUtils.byteArray();
319 inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE);
320 int offset = 0;
321 int max = IOUtils.DEFAULT_BUFFER_SIZE;
322 int c = inputStream.read(bytes, offset, max);
323 int firstGT = -1;
324 String xmlProlog = "";
325 while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) {
326 offset += c;
327 max -= c;
328 c = inputStream.read(bytes, offset, max);
329 xmlProlog = new String(bytes, 0, offset, guessedEnc);
330 firstGT = xmlProlog.indexOf('>');
331 }
332 if (firstGT == -1) {
333 if (c == -1) {
334 throw new IOException("Unexpected end of XML stream");
335 }
336 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes");
337 }
338 final int bytesRead = offset;
339 if (bytesRead > 0) {
340 inputStream.reset();
341 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1)));
342 final StringBuilder prolog = new StringBuilder();
343 IOConsumer.forEach(bReader.lines(), l -> prolog.append(l).append(' '));
344 final Matcher m = ENCODING_PATTERN.matcher(prolog);
345 if (m.find()) {
346 encoding = m.group(1).toUpperCase(Locale.ROOT);
347 encoding = encoding.substring(1, encoding.length() - 1);
348 }
349 }
350 }
351 return encoding;
352 }
353
354
355
356
357
358
359
360 static boolean isAppXml(final String mime) {
361 return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity")
362 || mime.startsWith("application/") && mime.endsWith("+xml"));
363 }
364
365
366
367
368
369
370
371 static boolean isTextXml(final String mime) {
372 return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml"));
373 }
374
/** Delegate that performs the actual character decoding; all reads and close are forwarded to it. */
private final Reader reader;

/** Encoding selected by the detection logic and used to construct {@link #reader}. */
private final String encoding;

/** Caller-supplied fallback encoding, may be {@code null}; consulted when detection finds nothing. */
private final String defaultEncoding;
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
/**
 * Constructs a reader for a file by delegating to the {@link Path} constructor.
 *
 * @param file the file to read; must not be {@code null}.
 * @throws NullPointerException if {@code file} is {@code null}.
 * @throws IOException if the delegated constructor fails.
 */
@Deprecated
public XmlStreamReader(final File file) throws IOException {
    this(Objects.requireNonNull(file, "file").toPath());
}
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
/**
 * Constructs a lenient reader over a raw stream (no HTTP content type).
 * Delegates with {@code lenient = true}.
 *
 * @param inputStream the stream to read.
 * @throws IOException if the delegated constructor fails.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream) throws IOException {
    this(inputStream, true);
}
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
/**
 * Constructs a reader over a raw stream (no HTTP content type) with no default encoding.
 * Delegates with {@code defaultEncoding = null}.
 *
 * @param inputStream the stream to read.
 * @param lenient     whether encoding-detection failures fall back to lenient detection instead of throwing.
 * @throws IOException if the delegated constructor fails.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException {
    this(inputStream, lenient, null);
}
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
/**
 * Constructs a reader over a raw stream (no HTTP content type).
 * <p>
 * The stream is buffered and wrapped twice: once to detect a byte order mark (not included in
 * the data) and once to match the XML guess bytes (kept in the data). The encoding is then
 * resolved from the BOM, the guess and the XML prolog.
 * </p>
 *
 * @param inputStream     the stream to read; must not be {@code null}.
 * @param lenient         whether encoding-detection failures fall back to lenient detection instead of throwing.
 * @param defaultEncoding the fallback encoding, may be {@code null}.
 * @throws NullPointerException if {@code inputStream} is {@code null}.
 * @throws IOException if reading the stream fails or the encoding cannot be determined.
 */
@Deprecated
@SuppressWarnings("resource")
public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException {
    // Must be set first: the encoding calculation below reads this field.
    this.defaultEncoding = defaultEncoding;
    final InputStream buffered = new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE);
    final BOMInputStream bomStream = new BOMInputStream(buffered, false, BOMS);
    final BOMInputStream prologStream = new BOMInputStream(bomStream, true, XML_GUESS_BYTES);
    this.encoding = processHttpStream(bomStream, prologStream, lenient);
    this.reader = new InputStreamReader(prologStream, this.encoding);
}
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
/**
 * Constructs a lenient reader over a stream with an HTTP content type.
 * Delegates with {@code lenient = true}.
 *
 * @param inputStream     the stream to read.
 * @param httpContentType the content type value, may be {@code null}.
 * @throws IOException if the delegated constructor fails.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException {
    this(inputStream, httpContentType, true);
}
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
/**
 * Constructs a reader over a stream with an HTTP content type and no default encoding.
 * Delegates with {@code defaultEncoding = null}.
 *
 * @param inputStream     the stream to read.
 * @param httpContentType the content type value, may be {@code null}.
 * @param lenient         whether encoding-detection failures fall back to lenient detection instead of throwing.
 * @throws IOException if the delegated constructor fails.
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
    this(inputStream, httpContentType, lenient, null);
}
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
/**
 * Constructs a reader over a stream that comes with an HTTP content type.
 * <p>
 * The stream is buffered and wrapped twice: once to detect a byte order mark (not included in
 * the data) and once to match the XML guess bytes (kept in the data). The encoding is then
 * resolved from the content type, the BOM, the guess and the XML prolog.
 * </p>
 *
 * @param inputStream     the stream to read; must not be {@code null}.
 * @param httpContentType the content type value, may be {@code null}.
 * @param lenient         whether encoding-detection failures fall back to lenient detection instead of throwing.
 * @param defaultEncoding the fallback encoding, may be {@code null}.
 * @throws NullPointerException if {@code inputStream} is {@code null}.
 * @throws IOException if reading the stream fails or the encoding cannot be determined.
 */
@Deprecated
@SuppressWarnings("resource")
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding)
        throws IOException {
    // Must be set first: the encoding calculation below reads this field.
    this.defaultEncoding = defaultEncoding;
    final InputStream buffered = new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE);
    final BOMInputStream bomStream = new BOMInputStream(buffered, false, BOMS);
    final BOMInputStream prologStream = new BOMInputStream(bomStream, true, XML_GUESS_BYTES);
    this.encoding = processHttpStream(bomStream, prologStream, lenient, httpContentType);
    this.reader = new InputStreamReader(prologStream, this.encoding);
}
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
/**
 * Constructs a lenient reader for a file path by opening it with {@link Files#newInputStream}
 * and delegating to the {@link InputStream} constructor.
 * <p>
 * NOTE(review): the stream is opened inline, so it is not closed here if the delegated constructor throws.
 * </p>
 *
 * @param file the path to read; must not be {@code null}.
 * @throws NullPointerException if {@code file} is {@code null}.
 * @throws IOException if the file cannot be opened or the delegated constructor fails.
 */
@Deprecated
@SuppressWarnings("resource")
public XmlStreamReader(final Path file) throws IOException {
    this(Files.newInputStream(Objects.requireNonNull(file, "file")));
}
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
/**
 * Constructs a reader for a URL by opening a connection to it and delegating to the
 * {@link URLConnection} constructor with a {@code null} default encoding.
 *
 * @param url the URL to read; must not be {@code null}.
 * @throws NullPointerException if {@code url} is {@code null}.
 * @throws IOException if the connection cannot be opened or the delegated constructor fails.
 */
public XmlStreamReader(final URL url) throws IOException {
    this(Objects.requireNonNull(url, "url").openConnection(), null);
}
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
/**
 * Constructs a reader over the content of a URL connection; detection is always lenient.
 * <p>
 * The HTTP rules (content type taken into account) apply when the connection is an
 * {@link HttpURLConnection} or reports a content type; otherwise the raw-stream rules apply.
 * </p>
 *
 * @param urlConnection   the connection to read from; must not be {@code null}.
 * @param defaultEncoding the fallback encoding, may be {@code null}.
 * @throws NullPointerException if {@code urlConnection} is {@code null}.
 * @throws IOException if the connection's stream cannot be read or the encoding cannot be determined.
 */
public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException {
    Objects.requireNonNull(urlConnection, "urlConnection");
    // Must be set first: the encoding calculation below reads this field.
    this.defaultEncoding = defaultEncoding;
    final String contentType = urlConnection.getContentType();
    final InputStream connectionInput = urlConnection.getInputStream();

    // First wrapper: detects a byte order mark and leaves it out of the data.
    @SuppressWarnings("resource")
    final BOMInputStream bomStream = BOMInputStream.builder()
        .setInputStream(new BufferedInputStream(connectionInput, IOUtils.DEFAULT_BUFFER_SIZE))
        .setInclude(false)
        .setByteOrderMarks(BOMS)
        .get();
    // Second wrapper: matches the XML guess bytes and keeps them in the data.
    @SuppressWarnings("resource")
    final BOMInputStream prologStream = BOMInputStream.builder()
        .setInputStream(new BufferedInputStream(bomStream, IOUtils.DEFAULT_BUFFER_SIZE))
        .setInclude(true)
        .setByteOrderMarks(XML_GUESS_BYTES)
        .get();

    final boolean useHttpRules = urlConnection instanceof HttpURLConnection || contentType != null;
    this.encoding = useHttpRules
        ? processHttpStream(bomStream, prologStream, true, contentType)
        : processHttpStream(bomStream, prologStream, true);
    this.reader = new InputStreamReader(prologStream, this.encoding);
}
689
690
691
692
693
694
695
696
697
698
699
700
701 String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType)
702 throws IOException {
703
704
705 if (lenient && xmlEnc != null) {
706 return xmlEnc;
707 }
708
709
710 final String cTMime = getContentTypeMime(httpContentType);
711 final String cTEnc = getContentTypeEncoding(httpContentType);
712 final boolean appXml = isAppXml(cTMime);
713 final boolean textXml = isTextXml(cTMime);
714
715
716 if (!appXml && !textXml) {
717 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
718 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
719 }
720
721
722 if (cTEnc == null) {
723 if (appXml) {
724 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
725 }
726 return defaultEncoding == null ? US_ASCII : defaultEncoding;
727 }
728
729
730 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) {
731 if (bomEnc != null) {
732 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
733 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
734 }
735 return cTEnc;
736 }
737
738
739 if (cTEnc.equals(UTF_16)) {
740 if (bomEnc != null && bomEnc.startsWith(UTF_16)) {
741 return bomEnc;
742 }
743 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
744 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
745 }
746
747
748 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) {
749 if (bomEnc != null) {
750 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
751 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
752 }
753 return cTEnc;
754 }
755
756
757 if (cTEnc.equals(UTF_32)) {
758 if (bomEnc != null && bomEnc.startsWith(UTF_32)) {
759 return bomEnc;
760 }
761 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
762 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc);
763 }
764
765 return cTEnc;
766 }
767
768
769
770
771
772
773
774
775
776
777 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException {
778
779
780 if (bomEnc == null) {
781 if (xmlGuessEnc == null || xmlEnc == null) {
782 return defaultEncoding == null ? UTF_8 : defaultEncoding;
783 }
784 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) {
785 return xmlGuessEnc;
786 }
787 return xmlEnc;
788 }
789
790
791 if (bomEnc.equals(UTF_8)) {
792 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) {
793 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
794 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
795 }
796 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) {
797 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
798 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
799 }
800 return bomEnc;
801 }
802
803
804 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) {
805 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
806 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
807 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
808 }
809 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) {
810 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
811 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
812 }
813 return bomEnc;
814 }
815
816
817 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) {
818 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) {
819 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
820 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
821 }
822 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) {
823 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc);
824 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
825 }
826 return bomEnc;
827 }
828
829
830 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc);
831 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc);
832 }
833
834
835
836
837
838
839 @Override
840 public void close() throws IOException {
841 reader.close();
842 }
843
844
845
846
847
848
849
850
851
852 private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException {
853 if (httpContentType != null && httpContentType.startsWith("text/html")) {
854 httpContentType = httpContentType.substring("text/html".length());
855 httpContentType = "text/xml" + httpContentType;
856 try {
857 return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType);
858 } catch (final XmlStreamReaderException ex2) {
859 ex = ex2;
860 }
861 }
862 String encoding = ex.getXmlEncoding();
863 if (encoding == null) {
864 encoding = ex.getContentTypeEncoding();
865 }
866 if (encoding == null) {
867 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding;
868 }
869 return encoding;
870 }
871
872
873
874
875
876
877
878
879
880 public String getDefaultEncoding() {
881 return defaultEncoding;
882 }
883
884
885
886
887
888
889 public String getEncoding() {
890 return encoding;
891 }
892
893
894
895
896
897
898
899
900
901
902 private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException {
903 final String bomEnc = bomInput.getBOMCharsetName();
904 final String xmlGuessEnc = piInput.getBOMCharsetName();
905 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
906 try {
907 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc);
908 } catch (final XmlStreamReaderException ex) {
909 if (lenient) {
910 return doLenientDetection(null, ex);
911 }
912 throw ex;
913 }
914 }
915
916
917
918
919
920
921
922
923
924
925
926 private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
927 throws IOException {
928 final String bomEnc = bomInput.getBOMCharsetName();
929 final String xmlGuessEnc = piInput.getBOMCharsetName();
930 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc);
931 try {
932 return calculateHttpEncoding(bomEnc, xmlGuessEnc, xmlEnc, lenient, httpContentType);
933 } catch (final XmlStreamReaderException ex) {
934 if (lenient) {
935 return doLenientDetection(httpContentType, ex);
936 }
937 throw ex;
938 }
939 }
940
941
942
943
944
945
946
947
948
949
950 @Override
951 public int read(final char[] buf, final int offset, final int len) throws IOException {
952 return reader.read(buf, offset, len);
953 }
954
955 }