Coverage Report - org.apache.commons.feedparser.FeedFilter
 
Classes in this File Line Coverage Branch Coverage Complexity
FeedFilter
0%
0/164
0%
0/24
2.857
 
 1  
 /*
 2  
  * Copyright 1999,2004 The Apache Software Foundation.
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License");
 5  
  * you may not use this file except in compliance with the License.
 6  
  * You may obtain a copy of the License at
 7  
  * 
 8  
  *      http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS,
 12  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13  
  * See the License for the specific language governing permissions and
 14  
  * limitations under the License.
 15  
  */
 16  
 
 17  
 package org.apache.commons.feedparser;
 18  
 
 19  
 import java.util.HashMap;
 20  
 import java.util.regex.Matcher;
 21  
 import java.util.regex.Pattern;
 22  
 
 23  
 import org.apache.log4j.Logger;
 24  
 
 25  
 /**
 26  
  *
 27  
  * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
 28  
  * @version $Id: FeedFilter.java 373614 2006-01-30 22:31:21Z mvdb $
 29  
  */
 30  0
 public class FeedFilter {
 31  
 
 32  0
     private static Logger log = Logger.getLogger( FeedFilter.class );
 33  
 
 34  0
     public static boolean DO_REMOVE_LEADING_PROLOG = true;
 35  0
     public static boolean DO_DECODE_ENTITIES = true;
 36  
 
 37  0
     public static HashMap LATIN1_ENTITIES = new HashMap();
 38  
 
 39  0
     private static Pattern entity_pattern = Pattern.compile( "&([a-zA-Z]+);" );
 40  
 
 41  
     /**
 42  
      * This is probably the wrong behavior.  I shouldn't call this method I
 43  
      * think because assuming a content type is bad form.
 44  
      *
 45  
      * @deprecated Specify an encoding with #parse( bytes[], encoding )
 46  
      * 
 47  
      */
 48  
     public static byte[] parse( byte[] bytes )
 49  
         throws Exception {
 50  
 
 51  0
         return parse( bytes, "UTF-8" );
 52  
 
 53  
     }
 54  
 
 55  
     public static byte[] parse( byte[] bytes, String encoding )
 56  
         throws Exception {
 57  
 
 58  0
         String content = new String( bytes, encoding );
 59  
 
 60  0
         return parse( content, encoding );
 61  
 
 62  
     }
 63  
 
 64  
     /**
 65  
      * Parse out an input string of content.
 66  
      * 
 67  
      * http://wiki.apache.org/jakarta-commons/FeedParser_2fStringAllocationConsideredHelpful
 68  
      *
 69  
      * 
 70  
      */
 71  
     public static byte[] parse( String content, String encoding )
 72  
         throws Exception {
 73  
 
 74  
         //FIXME: return an object here so that I can flag a bozo bit.
 75  
         
 76  
         //remove leading prolog...
 77  0
         if ( DO_REMOVE_LEADING_PROLOG )
 78  0
             content = doRemoveLeadingProlog( content, encoding );
 79  
 
 80  
         //decode HTML entities that are referenced.
 81  0
         if ( DO_DECODE_ENTITIES )
 82  0
             content = doDecodeEntities( content );
 83  
 
 84  
         //TODO: undeclared namespace prefixes should be expanded to their common
 85  
         //form. 'rdf, 'atom', 'xhtml' etc. Considering that they're will only be
 86  
         //a handful H and then 4^36 different possibilities the probability will
 87  
         //only be H in 4^36 which is pretty good that we won't have a false
 88  
         //positive.
 89  
         
 90  0
         return content.getBytes( encoding );
 91  
 
 92  
     }
 93  
         
 94  
     /**
 95  
      * Removing prolog whitespace, comments, and other garbage from the
 96  
      * beginning of a feed.
 97  
      *
 98  
      * 
 99  
      */
 100  
     private static String doRemoveLeadingProlog( String content, String encoding ) {
 101  
 
 102  
         // if we're a UTF-16 or UTF-32 feed we need to LEAVE the prolog because
 103  
         // it triggers a UTF-16 parse due to the BOM.
 104  
         //
 105  
         // FIXME: this isn't actually true.  We should leave the BOM and remove
 106  
         // the prolog anyway due to the fact that this will still break the
 107  
         // parser.  Come up with some tests for UTF-16 to see if I can get it to
 108  
         // break and then update this method.
 109  
 
 110  0
         if ( "UTF-16".equals( encoding ) ||
 111  
              "UTF-32".equals( encoding ) )
 112  0
             return content;
 113  
         
 114  
         //move to the beginning of the first element or comment.  When this is a
 115  
         //processing instruction we will move to that
 116  0
         int begin = content.indexOf( "<" );
 117  
 
 118  0
         if ( begin > 0 ) {
 119  0
             content = content.substring( begin, content.length() );
 120  0
             log.warn( "Skipped whitespace in prolog and moved towards first element." );
 121  
         }
 122  
 
 123  
         //now skip to the XML processing instruction when necessary.  This is
 124  
         //used to remove comments prior to <?xml which are not allowed.
 125  
         
 126  0
         begin = content.indexOf( "<?xml" );
 127  
 
 128  0
         if ( begin > 0 ) {
 129  0
             content = content.substring( begin, content.length() );
 130  0
             log.warn( "Removed prolog towards first processing instruction." );
 131  
         }
 132  
 
 133  0
         content = doRemoveElementProlog( content );
 134  
 
 135  0
         return content;
 136  
         
 137  
     }
 138  
 
 139  
     /**
 140  
      * Remove element content between:
 141  
      *
 142  
      * <?xml version="1.0"?>
 143  
      *
 144  
      * THIS IS BROKEN PROLOG
 145  
      *
 146  
      * <foo>
 147  
      *
 148  
      * 
 149  
      */
 150  
     private static String doRemoveElementProlog( String content ) {
 151  
 
 152  0
         int end = content.lastIndexOf( "?>", 100 );
 153  
 
 154  0
         if ( end == -1 )
 155  0
             return content;
 156  
 
 157  0
         StringBuffer buff = new StringBuffer( content.length() );
 158  0
         end = end + 2;
 159  0
         buff.append( content.substring( 0, end ) );
 160  
 
 161  0
         int begin = content.indexOf( "<", end );
 162  
 
 163  0
         if ( begin != -1 ) {
 164  
 
 165  0
             buff.append( "\n" );
 166  0
             buff.append( content.substring( begin, content.length() ) );
 167  
             
 168  
         }
 169  
         
 170  0
         return buff.toString();
 171  
         
 172  
     }
 173  
     
 174  
     private static String doDecodeEntities( String content ) {
 175  
 
 176  0
         StringBuffer buff = new StringBuffer( content.length() + 1000 );
 177  
 
 178  0
         Matcher m = entity_pattern.matcher( content );
 179  
 
 180  0
         int begin = 0;
 181  
 
 182  0
         boolean hasFilterDecodedEntities = false;
 183  0
         boolean hasFilterFoundUnknownEntity = false;
 184  
 
 185  
         //FIXME: note that when I was benchmarking this code that this showed up
 186  
         //as a MAJOR bottleneck so we might want to optimize it a little more.
 187  
 
 188  0
         while ( m.find() ) {
 189  
 
 190  0
             buff.append( content.substring( begin, m.start() ) );
 191  
             
 192  0
             String entity = m.group( 1 );
 193  
 
 194  0
             String value = (String)LATIN1_ENTITIES.get( entity );
 195  
 
 196  0
             if ( value != null ) {
 197  0
                 buff.append( "&#" );
 198  0
                 buff.append( value );
 199  0
                 buff.append( ";" );
 200  
 
 201  0
                 hasFilterDecodedEntities = true;
 202  
 
 203  
             } else {
 204  
 
 205  
                 //This is not a known entity so we have no way to correct it.
 206  
                 //If this is done then we have a problem and the feed probably
 207  
                 //still won't parse
 208  0
                 buff.append( "&" );
 209  0
                 buff.append( entity );
 210  0
                 buff.append( ";" );
 211  
 
 212  0
                 hasFilterFoundUnknownEntity = true;
 213  
             }
 214  
 
 215  0
             begin = m.end( 0 );
 216  
             
 217  0
         } 
 218  
 
 219  0
         buff.append( content.substring( begin, content.length() ) );
 220  
 
 221  0
         if ( hasFilterFoundUnknownEntity ) 
 222  0
             log.warn( "Filter encountered unknown entities" );
 223  
 
 224  0
         if ( hasFilterDecodedEntities ) 
 225  0
             log.warn( "Filter has decoded latin1 entities." );
 226  
 
 227  0
         return buff.toString();
 228  
         
 229  
     }
 230  
     
 231  
     public static void main( String[] args ) throws Exception {
 232  
 
 233  0
         byte[] b = parse( "hello &eacute; world".getBytes() );
 234  
 
 235  0
         String v = new String( b );
 236  
 
 237  0
         System.out.println( "v: " + v );
 238  
         
 239  0
     }
 240  
     
 241  
     static {
 242  
 
 243  
         // load the latin1 entity map.  We will replace latin1 entities with
 244  
         // their char references directly.  For example if someone incorrectly
 245  
         // references:
 246  
         //
 247  
         // &auml;
 248  
         //
 249  
         // we replace it with:
 250  
         //
 251  
         // &#228;
 252  
         //
 253  
         // Which is correct in Latin1
 254  
 
 255  
         // http://my.netscape.com/publish/formats/rss-0.91.dtd
 256  
 
 257  0
         LATIN1_ENTITIES.put( "nbsp",      "160" );
 258  0
         LATIN1_ENTITIES.put( "iexcl",     "161" );
 259  0
         LATIN1_ENTITIES.put( "cent",      "162" );
 260  0
         LATIN1_ENTITIES.put( "pound",     "163" );
 261  0
         LATIN1_ENTITIES.put( "curren",    "164" );
 262  0
         LATIN1_ENTITIES.put( "yen",       "165" );
 263  0
         LATIN1_ENTITIES.put( "brvbar",    "166" );
 264  0
         LATIN1_ENTITIES.put( "sect",      "167" );
 265  0
         LATIN1_ENTITIES.put( "uml",       "168" );
 266  0
         LATIN1_ENTITIES.put( "copy",      "169" );
 267  0
         LATIN1_ENTITIES.put( "ordf",      "170" );
 268  0
         LATIN1_ENTITIES.put( "laquo",     "171" );
 269  0
         LATIN1_ENTITIES.put( "not",       "172" );
 270  0
         LATIN1_ENTITIES.put( "shy",       "173" );
 271  0
         LATIN1_ENTITIES.put( "reg",       "174" );
 272  0
         LATIN1_ENTITIES.put( "macr",      "175" );
 273  0
         LATIN1_ENTITIES.put( "deg",       "176" );
 274  0
         LATIN1_ENTITIES.put( "plusmn",    "177" );
 275  0
         LATIN1_ENTITIES.put( "sup2",      "178" );
 276  0
         LATIN1_ENTITIES.put( "sup3",      "179" );
 277  0
         LATIN1_ENTITIES.put( "acute",     "180" );
 278  0
         LATIN1_ENTITIES.put( "micro",     "181" );
 279  0
         LATIN1_ENTITIES.put( "para",      "182" );
 280  0
         LATIN1_ENTITIES.put( "middot",    "183" );
 281  0
         LATIN1_ENTITIES.put( "cedil",     "184" );
 282  0
         LATIN1_ENTITIES.put( "sup1",      "185" );
 283  0
         LATIN1_ENTITIES.put( "ordm",      "186" );
 284  0
         LATIN1_ENTITIES.put( "raquo",     "187" );
 285  0
         LATIN1_ENTITIES.put( "frac14",    "188" );
 286  0
         LATIN1_ENTITIES.put( "frac12",    "189" );
 287  0
         LATIN1_ENTITIES.put( "frac34",    "190" );
 288  0
         LATIN1_ENTITIES.put( "iquest",    "191" );
 289  0
         LATIN1_ENTITIES.put( "Agrave",    "192" );
 290  0
         LATIN1_ENTITIES.put( "Aacute",    "193" );
 291  0
         LATIN1_ENTITIES.put( "Acirc",     "194" );
 292  0
         LATIN1_ENTITIES.put( "Atilde",    "195" );
 293  0
         LATIN1_ENTITIES.put( "Auml",      "196" );
 294  0
         LATIN1_ENTITIES.put( "Aring",     "197" );
 295  0
         LATIN1_ENTITIES.put( "AElig",     "198" );
 296  0
         LATIN1_ENTITIES.put( "Ccedil",    "199" );
 297  0
         LATIN1_ENTITIES.put( "Egrave",    "200" );
 298  0
         LATIN1_ENTITIES.put( "Eacute",    "201" );
 299  0
         LATIN1_ENTITIES.put( "Ecirc",     "202" );
 300  0
         LATIN1_ENTITIES.put( "Euml",      "203" );
 301  0
         LATIN1_ENTITIES.put( "Igrave",    "204" );
 302  0
         LATIN1_ENTITIES.put( "Iacute",    "205" );
 303  0
         LATIN1_ENTITIES.put( "Icirc",     "206" );
 304  0
         LATIN1_ENTITIES.put( "Iuml",      "207" );
 305  0
         LATIN1_ENTITIES.put( "ETH",       "208" );
 306  0
         LATIN1_ENTITIES.put( "Ntilde",    "209" );
 307  0
         LATIN1_ENTITIES.put( "Ograve",    "210" );
 308  0
         LATIN1_ENTITIES.put( "Oacute",    "211" );
 309  0
         LATIN1_ENTITIES.put( "Ocirc",     "212" );
 310  0
         LATIN1_ENTITIES.put( "Otilde",    "213" );
 311  0
         LATIN1_ENTITIES.put( "Ouml",      "214" );
 312  0
         LATIN1_ENTITIES.put( "times",     "215" );
 313  0
         LATIN1_ENTITIES.put( "Oslash",    "216" );
 314  0
         LATIN1_ENTITIES.put( "Ugrave",    "217" );
 315  0
         LATIN1_ENTITIES.put( "Uacute",    "218" );
 316  0
         LATIN1_ENTITIES.put( "Ucirc",     "219" );
 317  0
         LATIN1_ENTITIES.put( "Uuml",      "220" );
 318  0
         LATIN1_ENTITIES.put( "Yacute",    "221" );
 319  0
         LATIN1_ENTITIES.put( "THORN",     "222" );
 320  0
         LATIN1_ENTITIES.put( "szlig",     "223" );
 321  0
         LATIN1_ENTITIES.put( "agrave",    "224" );
 322  0
         LATIN1_ENTITIES.put( "aacute",    "225" );
 323  0
         LATIN1_ENTITIES.put( "acirc",     "226" );
 324  0
         LATIN1_ENTITIES.put( "atilde",    "227" );
 325  0
         LATIN1_ENTITIES.put( "auml",      "228" );
 326  0
         LATIN1_ENTITIES.put( "aring",     "229" );
 327  0
         LATIN1_ENTITIES.put( "aelig",     "230" );
 328  0
         LATIN1_ENTITIES.put( "ccedil",    "231" );
 329  0
         LATIN1_ENTITIES.put( "egrave",    "232" );
 330  0
         LATIN1_ENTITIES.put( "eacute",    "233" );
 331  0
         LATIN1_ENTITIES.put( "ecirc",     "234" );
 332  0
         LATIN1_ENTITIES.put( "euml",      "235" );
 333  0
         LATIN1_ENTITIES.put( "igrave",    "236" );
 334  0
         LATIN1_ENTITIES.put( "iacute",    "237" );
 335  0
         LATIN1_ENTITIES.put( "icirc",     "238" );
 336  0
         LATIN1_ENTITIES.put( "iuml",      "239" );
 337  0
         LATIN1_ENTITIES.put( "eth",       "240" );
 338  0
         LATIN1_ENTITIES.put( "ntilde",    "241" );
 339  0
         LATIN1_ENTITIES.put( "ograve",    "242" );
 340  0
         LATIN1_ENTITIES.put( "oacute",    "243" );
 341  0
         LATIN1_ENTITIES.put( "ocirc",     "244" );
 342  0
         LATIN1_ENTITIES.put( "otilde",    "245" );
 343  0
         LATIN1_ENTITIES.put( "ouml",      "246" );
 344  0
         LATIN1_ENTITIES.put( "divide",    "247" );
 345  0
         LATIN1_ENTITIES.put( "oslash",    "248" );
 346  0
         LATIN1_ENTITIES.put( "ugrave",    "249" );
 347  0
         LATIN1_ENTITIES.put( "uacute",    "250" );
 348  0
         LATIN1_ENTITIES.put( "ucirc",     "251" );
 349  0
         LATIN1_ENTITIES.put( "uuml",      "252" );
 350  0
         LATIN1_ENTITIES.put( "yacute",    "253" );
 351  0
         LATIN1_ENTITIES.put( "thorn",     "254" );
 352  0
         LATIN1_ENTITIES.put( "yuml",      "255" );
 353  
 
 354  0
     }
 355  
     
 356  
 }