Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
XMLCleanser |
|
| 6.2;6.2 |
1 | /* | |
2 | * Copyright 1999,2004 The Apache Software Foundation. | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.commons.feedparser.tools; | |
18 | ||
19 | /** | |
20 | * Class that can cleanse a string so that nothing can be present to break an | |
21 | * XML parser. This is a VERY non-portable class as it is meant to work just | |
22 | * with Xalan/Xerces and may remove more text and replace things that are | |
23 | * non-XML centric. | |
24 | * | |
25 | * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a> | |
26 | * @version $Id: XMLCleanser.java 159211 2005-03-27 23:24:21Z burton $ | |
27 | */ | |
public class XMLCleanser {

    /** Lowest UTF-16 high (leading) surrogate code unit, U+D800. */
    private static final char HIGH_SURROGATE_MIN = 0xD800;

    /** Highest UTF-16 high (leading) surrogate code unit, U+DBFF. */
    private static final char HIGH_SURROGATE_MAX = 0xDBFF;

    /** Lowest UTF-16 low (trailing) surrogate code unit, U+DC00. */
    private static final char LOW_SURROGATE_MIN = 0xDC00;

    /** Highest UTF-16 low (trailing) surrogate code unit, U+DFFF. */
    private static final char LOW_SURROGATE_MAX = 0xDFFF;

    /**
     * Returns a copy of <code>content</code> with every character that is not
     * legal in an XML 1.0 document removed.
     *
     * A well-formed surrogate pair encodes a code point in the range
     * [#x10000-#x10FFFF], which production 2 allows, so such pairs are kept.
     * Unpaired surrogates are removed.
     *
     * @param content text to cleanse; must not be <code>null</code>.
     * @return the cleansed text, never <code>null</code>.
     */
    public static String cleanse( String content ) {

        StringBuffer buff = new StringBuffer( content.length() );

        appendCleansed( buff, content );

        return buff.toString();

    }

    /**
     * Decodes <code>content</code> using <code>encoding</code> and returns the
     * decoded text with every character that is not legal in an XML 1.0
     * document removed. Surrogate pairs are handled as in
     * {@link #cleanse(String)}.
     *
     * @param content encoded bytes; must not be <code>null</code>.
     * @param encoding name of the charset used to decode <code>content</code>.
     * @return the cleansed, decoded text.
     * @throws Exception if <code>encoding</code> is not supported
     * (<code>java.io.UnsupportedEncodingException</code>).
     */
    public static String cleanse( byte[] content, String encoding ) throws Exception {

        String s = new String( content, encoding );

        StringBuffer buff = new StringBuffer( s.length() );

        appendCleansed( buff, s );

        return buff.toString();

    }

    /**
     * Converts each byte to the character with the same unsigned value
     * (U+0000 to U+00FF) and returns only those that are legal XML 1.0
     * characters.
     *
     * The returned array is exactly as long as the number of characters kept;
     * it carries no NUL padding for the characters that were removed.
     *
     * @param content bytes to cleanse; must not be <code>null</code>.
     * @return the surviving characters, in their original order.
     */
    public static char[] cleanseToCharArray( byte[] content ) {

        char[] buff = new char[ content.length ];

        int index = 0;

        for ( int i = 0; i < content.length; ++i ) {

            // Mask to avoid sign extension: a plain (char) cast would turn
            // byte 0xE9 into U+FFE9 and byte 0xFF into the illegal U+FFFF.
            char c = (char)( content[ i ] & 0xFF );

            if ( isXMLCharacter( c ) ) {

                buff[ index ] = c;
                ++index;

            }

        }

        if ( index == buff.length ) {
            return buff;
        }

        // Trim the scratch buffer so no trailing NUL characters leak out.
        char[] result = new char[ index ];
        System.arraycopy( buff, 0, result, 0, index );

        return result;

    }

    /**
     * Returns the bytes of <code>content</code> whose unsigned value (U+0000 to
     * U+00FF) is a legal XML 1.0 character. In practice this removes the
     * control bytes below 0x20 other than tab, line feed and carriage return.
     *
     * The returned array is exactly as long as the number of bytes kept; it
     * carries no zero padding for the bytes that were removed.
     *
     * @param content bytes to cleanse; must not be <code>null</code>.
     * @return the surviving bytes, in their original order.
     */
    public static byte[] cleanseToByteArray( byte[] content ) {

        byte[] buff = new byte[ content.length ];

        int index = 0;

        for ( int i = 0; i < content.length; ++i ) {

            // Mask to avoid sign extension; otherwise bytes 0xFE and 0xFF
            // would be tested as U+FFFE / U+FFFF and wrongly dropped.
            char c = (char)( content[ i ] & 0xFF );

            if ( isXMLCharacter( c ) ) {

                buff[ index ] = content[ i ];
                ++index;

            }

        }

        if ( index == buff.length ) {
            return buff;
        }

        // Trim the scratch buffer so no trailing zero bytes leak out.
        byte[] result = new byte[ index ];
        System.arraycopy( buff, 0, result, 0, index );

        return result;

    }

    /**
     * This is a utility function for determining whether a specified character
     * is a character according to production 2 of the XML 1.0 specification:
     *
     * <pre>
     * [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD]
     *            | [#x10000-#x10FFFF]
     * </pre>
     *
     * A <code>char</code> is a single UTF-16 code unit, so the
     * [#x10000-#x10FFFF] range can never match here; surrogate code units
     * (U+D800 to U+DFFF) are reported as not legal on their own.
     *
     * @param c <code>char</code> to check for XML compliance.
     * @return <code>boolean</code> - true if it's a character, false otherwise.
     */
    public static boolean isXMLCharacter( char c ) {

        if ( c == '\n' || c == '\r' || c == '\t' ) {
            return true;
        }

        // Remaining control characters are not legal XML.
        if ( c < 0x20 ) {
            return false;
        }

        // NOTE: 0x80 to 0xFF must stay legal here; rejecting that range once
        // caused Latin characters to be dropped.
        if ( c <= 0xD7FF ) {
            return true;
        }

        // Surrogate block.
        if ( c < 0xE000 ) {
            return false;
        }

        // Everything up to U+FFFD is legal; U+FFFE and U+FFFF are not.
        return c <= 0xFFFD;

    }

    /**
     * Appends to <code>buff</code> every legal XML character of
     * <code>text</code>, keeping well-formed surrogate pairs intact.
     */
    private static void appendCleansed( StringBuffer buff, String text ) {

        int length = text.length();

        for ( int i = 0; i < length; ++i ) {

            char c = text.charAt( i );

            if ( c >= HIGH_SURROGATE_MIN && c <= HIGH_SURROGATE_MAX && i + 1 < length ) {

                char next = text.charAt( i + 1 );

                if ( next >= LOW_SURROGATE_MIN && next <= LOW_SURROGATE_MAX ) {

                    // Valid pair == code point in [#x10000-#x10FFFF].
                    buff.append( c );
                    buff.append( next );
                    ++i;
                    continue;

                }

            }

            // Unpaired surrogates fall through and are rejected here.
            if ( isXMLCharacter( c ) ) {

                buff.append( c );

            }

        }

    }

}