Coverage Report

Coverage Report - org.apache.commons.feedparser.tools.XMLCleanser

Classes in this File

Line Coverage

Branch Coverage

Complexity

XMLCleanser

0/37

0/34

6.2

 /*
  * Copyright 1999,2004 The Apache Software Foundation.
  * 
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  * 
  *      http://www.apache.org/licenses/LICENSE-2.0
  * 
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.commons.feedparser.tools;
 
 /**
  * Class that can cleanse a string so that nothing can be present to break an
  * XML parser.  This is a VERY non-portable class as it is meant to work just
  * with Xalan/Xerces and may remove more text and replace things that are
  * non-XML centric.
  *
  * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
  * @version $Id: XMLCleanser.java 159211 2005-03-27 23:24:21Z burton $
  */
 public class XMLCleanser {
 
     public static String cleanse( String content ) {
 
         StringBuffer buff = new StringBuffer( content.length() );
 
         for ( int i = 0; i < content.length(); ++i ) {
 
             char c = content.charAt( i );
             
             if ( isXMLCharacter( c ) ) {
 
                 buff.append( c );
                 
             } 
 
         }
 
         return buff.toString();
 
     }
 
     /**
      * Copy based on a byte array.  
      *
      * 
      */
     public static String cleanse( byte[] content, String encoding ) throws Exception {
 
         String s = new String( content, encoding );
         
         StringBuffer buff = new StringBuffer( content.length );
 
         for ( int i = 0; i < s.length(); ++i ) {
 
             char c = s.charAt( i );
             
             if ( isXMLCharacter( c ) ) {
 
                 buff.append( c );
                 
             } 
 
         }
 
         return buff.toString();
 
     }
 
     public static char[] cleanseToCharArray( byte[] content ) {
 
         char[] buff = new char[content.length];
 
         int index = 0;
 
         for ( int i = 0; i < content.length; ++i ) {
 
             char c = (char)content[ i ];
             
             if ( isXMLCharacter( c ) ) {
 
                 buff[index] = c;
                 
                 ++index;
             } 
 
         }
 
         return buff;
 
     }
     
     /**
      * Copy based on a byte array.  
      *
      * 
      */
     public static byte[] cleanseToByteArray( byte[] content ) {
 
         byte[] buff = new byte[ content.length ];
 
         int index = 0;
         for ( int i = 0; i < content.length; ++i ) {
 
             char c = (char)content[ i ];
             
             if ( isXMLCharacter( c ) ) {
 
                 //buff.append( c );
                 buff[index] = content[ i ];
                 ++index;
             } 
 
         }
 
         return buff;
 
     }
 
     /*
      * This is a utility function for determining whether a specified character
      * is a character according to production 2 of the XML 1.0 specification.
      *
      * @param c <code>char</code> to check for XML compliance.
 
      * @return <code>boolean</code> - true if it's a character, false otherwise.
      */
     public static boolean isXMLCharacter( char c ) {
 
         // A parsed entity contains text, a sequence of characters, which may
         // represent markup or character data. A character is an atomic unit of
         // text as specified by ISO/IEC 10646 [ISO/IEC 10646]. Legal characters
         // are tab, carriage return, line feed, and the legal graphic characters
         // of Unicode and ISO/IEC 10646. The use of "compatibility characters",
         // as defined in section 6.8 of [Unicode], is discouraged.
 
         // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
         // [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate
         // blocks, FFFE, and FFFF. */
         
         if (c == '\n') return true;
         if (c == '\r') return true;
         if (c == '\t') return true;
 
         //NOTE: this was BROKEN!  The range between 0x80 and 0xFF is valid XML
         //and would end up dropping latin characters in UTF-8.  Why did I want
         //to return false here again?
         
         //if (c < 0x20) return false;  if (c < 0x80) return true;
         //if (c < 0xFF) return false; if (c <= 0xD7FF) return true;
 
         if (c < 0x20) return false;  if (c <= 0xD7FF) return true;
         if (c < 0xE000) return false;  if (c <= 0xFFFD) return true;
         if (c < 0x10000) return false;  if (c <= 0x10FFFF) return true;
         
         return false;
 
     }
 
 }

1		/*
2		* Copyright 1999,2004 The Apache Software Foundation.
3		*
4		* Licensed under the Apache License, Version 2.0 (the "License");
5		* you may not use this file except in compliance with the License.
6		* You may obtain a copy of the License at
7		*
8		* http://www.apache.org/licenses/LICENSE-2.0
9		*
10		* Unless required by applicable law or agreed to in writing, software
11		* distributed under the License is distributed on an "AS IS" BASIS,
12		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13		* See the License for the specific language governing permissions and
14		* limitations under the License.
15		*/
16
17		package org.apache.commons.feedparser.tools;
18
19		/**
20		* Class that can cleanse a string so that nothing can be present to break an
21		* XML parser. This is a VERY non-portable class as it is meant to work just
22		* with Xalan/Xerces and may remove more text and replace things that are
23		* non-XML centric.
24		*
25		* @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
26		* @version $Id: XMLCleanser.java 159211 2005-03-27 23:24:21Z burton $
27		*/
28	0	public class XMLCleanser {
29
30		public static String cleanse( String content ) {
31
32	0	StringBuffer buff = new StringBuffer( content.length() );
33
34	0	for ( int i = 0; i < content.length(); ++i ) {
35
36	0	char c = content.charAt( i );
37
38	0	if ( isXMLCharacter( c ) ) {
39
40	0	buff.append( c );
41
42		}
43
44		}
45
46	0	return buff.toString();
47
48		}
49
50		/**
51		* Copy based on a byte array.
52		*
53		*
54		*/
55		public static String cleanse( byte[] content, String encoding ) throws Exception {
56
57	0	String s = new String( content, encoding );
58
59	0	StringBuffer buff = new StringBuffer( content.length );
60
61	0	for ( int i = 0; i < s.length(); ++i ) {
62
63	0	char c = s.charAt( i );
64
65	0	if ( isXMLCharacter( c ) ) {
66
67	0	buff.append( c );
68
69		}
70
71		}
72
73	0	return buff.toString();
74
75		}
76
77		public static char[] cleanseToCharArray( byte[] content ) {
78
79	0	char[] buff = new char[content.length];
80
81	0	int index = 0;
82
83	0	for ( int i = 0; i < content.length; ++i ) {
84
85	0	char c = (char)content[ i ];
86
87	0	if ( isXMLCharacter( c ) ) {
88
89	0	buff[index] = c;
90
91	0	++index;
92		}
93
94		}
95
96	0	return buff;
97
98		}
99
100		/**
101		* Copy based on a byte array.
102		*
103		*
104		*/
105		public static byte[] cleanseToByteArray( byte[] content ) {
106
107	0	byte[] buff = new byte[ content.length ];
108
109	0	int index = 0;
110	0	for ( int i = 0; i < content.length; ++i ) {
111
112	0	char c = (char)content[ i ];
113
114	0	if ( isXMLCharacter( c ) ) {
115
116		//buff.append( c );
117	0	buff[index] = content[ i ];
118	0	++index;
119		}
120
121		}
122
123	0	return buff;
124
125		}
126
127		/*
128		* This is a utility function for determining whether a specified character
129		* is a character according to production 2 of the XML 1.0 specification.
130		*
131		* @param c <code>char</code> to check for XML compliance.
132
133		* @return <code>boolean</code> - true if it's a character, false otherwise.
134		*/
135		public static boolean isXMLCharacter( char c ) {
136
137		// A parsed entity contains text, a sequence of characters, which may
138		// represent markup or character data. A character is an atomic unit of
139		// text as specified by ISO/IEC 10646 [ISO/IEC 10646]. Legal characters
140		// are tab, carriage return, line feed, and the legal graphic characters
141		// of Unicode and ISO/IEC 10646. The use of "compatibility characters",
142		// as defined in section 6.8 of [Unicode], is discouraged.
143
144		// [2] Char ::= #x9 \| #xA \| #xD \| [#x20-#xD7FF] \| [#xE000-#xFFFD] \|
145		// [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate
146		// blocks, FFFE, and FFFF. */
147
148	0	if (c == '\n') return true;
149	0	if (c == '\r') return true;
150	0	if (c == '\t') return true;
151
152		//NOTE: this was BROKEN! The range between 0x80 and 0xFF is valid XML
153		//and would end up dropping latin characters in UTF-8. Why did I want
154		//to return false here again?
155
156		//if (c < 0x20) return false; if (c < 0x80) return true;
157		//if (c < 0xFF) return false; if (c <= 0xD7FF) return true;
158
159	0	if (c < 0x20) return false; if (c <= 0xD7FF) return true;
160	0	if (c < 0xE000) return false; if (c <= 0xFFFD) return true;
161	0	if (c < 0x10000) return false; if (c <= 0x10FFFF) return true;
162
163	0	return false;
164
165		}
166
167		}