Coverage Report - org.apache.commons.feedparser.tools.XMLCleanser
 
Classes in this File Line Coverage Branch Coverage Complexity
XMLCleanser
0%
0/37
0%
0/34
6.2
 
 1  
 /*
 2  
  * Copyright 1999,2004 The Apache Software Foundation.
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License");
 5  
  * you may not use this file except in compliance with the License.
 6  
  * You may obtain a copy of the License at
 7  
  * 
 8  
  *      http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS,
 12  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13  
  * See the License for the specific language governing permissions and
 14  
  * limitations under the License.
 15  
  */
 16  
 
 17  
 package org.apache.commons.feedparser.tools;
 18  
 
 19  
 /**
 20  
  * Class that can cleanse a string so that nothing can be present to break an
 21  
  * XML parser.  This is a VERY non-portable class as it is meant to work just
 22  
  * with Xalan/Xerces and may remove more text and replace things that are
 23  
  * non-XML centric.
 24  
  *
 25  
  * @author <a href="mailto:burton@peerfear.org">Kevin A. Burton</a>
 26  
  * @version $Id: XMLCleanser.java 159211 2005-03-27 23:24:21Z burton $
 27  
  */
 28  0
 public class XMLCleanser {
 29  
 
 30  
     public static String cleanse( String content ) {
 31  
 
 32  0
         StringBuffer buff = new StringBuffer( content.length() );
 33  
 
 34  0
         for ( int i = 0; i < content.length(); ++i ) {
 35  
 
 36  0
             char c = content.charAt( i );
 37  
             
 38  0
             if ( isXMLCharacter( c ) ) {
 39  
 
 40  0
                 buff.append( c );
 41  
                 
 42  
             } 
 43  
 
 44  
         }
 45  
 
 46  0
         return buff.toString();
 47  
 
 48  
     }
 49  
 
 50  
     /**
 51  
      * Copy based on a byte array.  
 52  
      *
 53  
      * 
 54  
      */
 55  
     public static String cleanse( byte[] content, String encoding ) throws Exception {
 56  
 
 57  0
         String s = new String( content, encoding );
 58  
         
 59  0
         StringBuffer buff = new StringBuffer( content.length );
 60  
 
 61  0
         for ( int i = 0; i < s.length(); ++i ) {
 62  
 
 63  0
             char c = s.charAt( i );
 64  
             
 65  0
             if ( isXMLCharacter( c ) ) {
 66  
 
 67  0
                 buff.append( c );
 68  
                 
 69  
             } 
 70  
 
 71  
         }
 72  
 
 73  0
         return buff.toString();
 74  
 
 75  
     }
 76  
 
 77  
     public static char[] cleanseToCharArray( byte[] content ) {
 78  
 
 79  0
         char[] buff = new char[content.length];
 80  
 
 81  0
         int index = 0;
 82  
 
 83  0
         for ( int i = 0; i < content.length; ++i ) {
 84  
 
 85  0
             char c = (char)content[ i ];
 86  
             
 87  0
             if ( isXMLCharacter( c ) ) {
 88  
 
 89  0
                 buff[index] = c;
 90  
                 
 91  0
                 ++index;
 92  
             } 
 93  
 
 94  
         }
 95  
 
 96  0
         return buff;
 97  
 
 98  
     }
 99  
     
 100  
     /**
 101  
      * Copy based on a byte array.  
 102  
      *
 103  
      * 
 104  
      */
 105  
     public static byte[] cleanseToByteArray( byte[] content ) {
 106  
 
 107  0
         byte[] buff = new byte[ content.length ];
 108  
 
 109  0
         int index = 0;
 110  0
         for ( int i = 0; i < content.length; ++i ) {
 111  
 
 112  0
             char c = (char)content[ i ];
 113  
             
 114  0
             if ( isXMLCharacter( c ) ) {
 115  
 
 116  
                 //buff.append( c );
 117  0
                 buff[index] = content[ i ];
 118  0
                 ++index;
 119  
             } 
 120  
 
 121  
         }
 122  
 
 123  0
         return buff;
 124  
 
 125  
     }
 126  
 
 127  
     /*
 128  
      * This is a utility function for determining whether a specified character
 129  
      * is a character according to production 2 of the XML 1.0 specification.
 130  
      *
 131  
      * @param c <code>char</code> to check for XML compliance.
 132  
 
 133  
      * @return <code>boolean</code> - true if it's a character, false otherwise.
 134  
      */
 135  
     public static boolean isXMLCharacter( char c ) {
 136  
 
 137  
         // A parsed entity contains text, a sequence of characters, which may
 138  
         // represent markup or character data. A character is an atomic unit of
 139  
         // text as specified by ISO/IEC 10646 [ISO/IEC 10646]. Legal characters
 140  
         // are tab, carriage return, line feed, and the legal graphic characters
 141  
         // of Unicode and ISO/IEC 10646. The use of "compatibility characters",
 142  
         // as defined in section 6.8 of [Unicode], is discouraged.
 143  
 
 144  
         // [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
 145  
         // [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate
 146  
         // blocks, FFFE, and FFFF. */
 147  
         
 148  0
         if (c == '\n') return true;
 149  0
         if (c == '\r') return true;
 150  0
         if (c == '\t') return true;
 151  
 
 152  
         //NOTE: this was BROKEN!  The range between 0x80 and 0xFF is valid XML
 153  
         //and would end up dropping latin characters in UTF-8.  Why did I want
 154  
         //to return false here again?
 155  
         
 156  
         //if (c < 0x20) return false;  if (c < 0x80) return true;
 157  
         //if (c < 0xFF) return false; if (c <= 0xD7FF) return true;
 158  
 
 159  0
         if (c < 0x20) return false;  if (c <= 0xD7FF) return true;
 160  0
         if (c < 0xE000) return false;  if (c <= 0xFFFD) return true;
 161  0
         if (c < 0x10000) return false;  if (c <= 0x10FFFF) return true;
 162  
         
 163  0
         return false;
 164  
 
 165  
     }
 166  
 
 167  
 }