Coverage Report - org.apache.commons.feedparser.network.URLResourceRequest
 
Classes in this File Line Coverage Branch Coverage Complexity
URLResourceRequest
0%
0/91
0%
0/38
4.875
 
 1  
 /*
 2  
  * Copyright 1999,2004 The Apache Software Foundation.
 3  
  * 
 4  
  * Licensed under the Apache License, Version 2.0 (the "License");
 5  
  * you may not use this file except in compliance with the License.
 6  
  * You may obtain a copy of the License at
 7  
  * 
 8  
  *      http://www.apache.org/licenses/LICENSE-2.0
 9  
  * 
 10  
  * Unless required by applicable law or agreed to in writing, software
 11  
  * distributed under the License is distributed on an "AS IS" BASIS,
 12  
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13  
  * See the License for the specific language governing permissions and
 14  
  * limitations under the License.
 15  
  */
 16  
 
 17  
 package org.apache.commons.feedparser.network;
 18  
 
 19  
 import java.io.FileNotFoundException;
 20  
 import java.io.IOException;
 21  
 import java.io.InputStream;
 22  
 import java.net.ProtocolException;
 23  
 import java.net.URL;
 24  
 import java.net.URLConnection;
 25  
 import java.util.Iterator;
 26  
 import java.util.zip.GZIPInputStream;
 27  
 
 28  
 import org.apache.log4j.Logger;
 29  
 
 30  
 import sun.net.www.protocol.http.HttpURLConnection;
 31  
 
 32  
 /**
 33  
  * ResourceRequest implementation that uses java.net.URL as the backend.
 34  
  *
 35  
  * Differences from other ResourceRequests.
 36  
  *
 37  
  * setRequestMethod() - Allows us to change the request type (HEAD, etc).
 38  
  * 
 39  
  * getContentLength() - Returns the length/size of the content represented by
 40  
  * this resource.  Can be used by clients with setRequestMethod( "HEAD" ) to
 41  
  * find the size of a remote resource without doing a full fetch.
 42  
  *
 43  
  * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a>
 44  
  * @version $Id: URLResourceRequest.java 561366 2007-07-31 15:58:29Z rahul $
 45  
  */
 46  0
 public class URLResourceRequest extends BaseResourceRequest implements ResourceRequest {
 47  
 
 48  0
     private static Logger log = Logger.getLogger( URLResourceRequest.class.getName() );
 49  
 
 50  
     public static final String ACCEPT_ENCODING_HEADER = "Accept-Encoding";
 51  
     public static final String IF_NONE_MATCH_HEADER = "If-None-Match";
 52  
     public static final String GZIP_ENCODING = "gzip";
 53  
     public static final String USER_AGENT_HEADER = "User-Agent";
 54  
 
 55  
     /**
 56  
      *
 57  
      * Enable RFC 3228 HTTP Delta for feeds.
 58  
      * 
 59  
      * http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html
 60  
      * 
 61  
      *  http://bobwyman.pubsub.com/main/2004/09/implementations.html
 62  
      * 
 63  
      */
 64  0
     public static boolean ENABLE_HTTP_DELTA_FEED_IM = false;
 65  
     
 66  0
     public static String USER_AGENT
 67  
         = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1; aggregator:FeedParser; http://commons.apache.org/feedparser/) Gecko/20021130";
 68  
 
 69  0
     public static String USER_AGENT_MOZILLA
 70  
         = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1) Gecko/20021130";
 71  
 
 72  
     /**
 73  
      * Not used anymore.  Provided for historical reasons.
 74  
      */
 75  
     public static final String REFERER
 76  
         = "http://commons.apache.org/feedparser/?isAggregator=true";
 77  
     
 78  
     public static final int MAX_CONTENT_LENGTH = 1000000;
 79  
     
 80  0
     private URL _url = null;
 81  
 
 82  0
     private URLConnection _urlConnection = null;
 83  
 
 84  0
     private InputStream inputStream = null;
 85  
 
 86  0
     private boolean initConnection = false;
 87  
     
 88  
     /**
 89  
      * 
 90  
      * 
 91  
      */
 92  
     public void init() throws IOException {
 93  
 
 94  0
         String resource = this.getResource();
 95  
 
 96  
         //if we are offline... we don't need to init.
 97  0
         if ( ResourceRequestFactory.isOffline() ) { return; } 
 98  
 
 99  
         //pull from the HTCache if it is enabled and then short-circuit so that
 100  
         //we don't fetch from the network.
 101  
 
 102  
         //NOTE: currently removed because the htcache wasn't portable. I can OSS
 103  
         //this in the future if necessary
 104  
 
 105  
         // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() &&
 106  
         //     HTCache.hasContentInCache( this.getResource() ) ) {
 107  
         //
 108  
         //    //get the input stream we can use from the HTCache.
 109  
         //    this.inputStream = HTCache.getContentAsInputStream( resource );
 110  
         //  return;
 111  
         //    
 112  
         // }
 113  
 
 114  0
         _url = new URL( this.getResource() );
 115  0
         _urlConnection = _url.openConnection();
 116  
 
 117  0
     }
 118  
 
 119  
     /**
 120  
      * Init the actual connection.  Should be called AFTER init() but before
 121  
      * getInputStream() so that we can set any runtime params requestMethod,
 122  
      * etc.  If getInputStream() is called without an initConnection() we do
 123  
      * this automatically.  initConnection() might not want to be called when
 124  
      * doing a HEAD request.
 125  
      * 
 126  
      * 
 127  
      */
 128  
     public void initConnection() throws NetworkException {
 129  
 
 130  0
         long before = System.currentTimeMillis();
 131  
 
 132  0
         initConnection = true;
 133  
 
 134  0
         this.fireInit();
 135  
 
 136  
         //FIXME: do smart user agent detection.  if this is a .html file we can
 137  
         //set it to us Mozilla and if not we can use NewsMonster
 138  
         //_urlConnection.setRequestProperty( "Referer", REFERER );
 139  
 
 140  0
         String resource = this.getResource();
 141  
 
 142  
         //set the user agent if it hasn't ALREADY been set by the caller.
 143  0
         if ( getRequestHeaderField( USER_AGENT_HEADER ) == null ) {
 144  0
             _urlConnection.setRequestProperty( USER_AGENT_HEADER, USER_AGENT );
 145  
         } 
 146  
 
 147  0
         _urlConnection.setRequestProperty( ACCEPT_ENCODING_HEADER, GZIP_ENCODING );
 148  
 
 149  
         //copy over any headers set in the request..
 150  
 
 151  0
         Iterator it = getRequestHeaderFields();
 152  
 
 153  0
         while ( it.hasNext() ) {
 154  
 
 155  0
             String key = (String)it.next();
 156  
 
 157  0
             _urlConnection.setRequestProperty( key, getRequestHeaderField( key ) );
 158  
             
 159  0
         } 
 160  
 
 161  0
         if ( _urlConnection instanceof HttpURLConnection ) {
 162  
 
 163  0
             HttpURLConnection httpURLConn = (HttpURLConnection)_urlConnection;
 164  
 
 165  0
             httpURLConn.setFollowRedirects( getFollowRedirects() );
 166  0
             httpURLConn.setInstanceFollowRedirects( getFollowRedirects() );
 167  
 
 168  0
             if ( this.getIfModifiedSince() != -1 )
 169  0
                 httpURLConn.setIfModifiedSince( this.getIfModifiedSince() );
 170  
 
 171  0
             if ( getEtag() != null ) {
 172  0
                 httpURLConn.setRequestProperty( IF_NONE_MATCH_HEADER, getEtag() );
 173  
 
 174  
                 //now support RFC3229 HTTP Delta
 175  
                 //A-IM: feed, gzip
 176  
 
 177  0
                 if ( ENABLE_HTTP_DELTA_FEED_IM ) {
 178  
 
 179  
                     //note that this will return HTTP 226 if used.
 180  
                     //
 181  
                     
 182  0
                     httpURLConn.setRequestProperty( "A-IM", "feed, gzip" );
 183  
 
 184  
                 }
 185  
 
 186  
             }
 187  
             
 188  
             try {
 189  
 
 190  0
                 httpURLConn.connect();
 191  
 
 192  
                 //setResource( getRedirectedResource() );
 193  
                 
 194  0
                 this.setResponseCode( httpURLConn.getResponseCode() ); 
 195  
 
 196  0
             } catch ( IOException e ) {
 197  0
                 throw new NetworkException( e );
 198  0
             }
 199  
 
 200  
         } 
 201  
 
 202  0
         int contentLength = _urlConnection.getContentLength();
 203  
 
 204  
         //bigger than 1 meg and it is a remote document (it is safe to process
 205  
         //local documents)
 206  0
         if ( contentLength > MAX_CONTENT_LENGTH &&
 207  
              this.getResource().startsWith( "file:" ) == false ) {
 208  
 
 209  
             //NOTE: make 100% sure this doens't just go ahead and download the
 210  
             //file FIRST before doing a HEAD.  I think that's what happens but I
 211  
             //might be wrong.
 212  
             
 213  0
             throw new NetworkException( "Content is too large - " + contentLength + " - " + getResource() );
 214  
             
 215  
         } 
 216  
 
 217  0
         long after = System.currentTimeMillis();
 218  
         
 219  0
         log.debug( getResource() + " - init duration: " + (after-before) );
 220  
         
 221  0
     }
 222  
 
 223  0
     java.lang.reflect.Field FIELD_HTTP_URL_CONNECTION_HTTP = null;
 224  0
     java.lang.reflect.Field FIELD_HTTP_CLIENT_URL = null;
 225  
     
 226  
     /**
 227  
      * This method used Reflection to pull out the redirected URL in
 228  
      * java.net.URL.  Internally sun.net.www.protocol.http.HttpURLConnection
 229  
      * stores a reference to sun.net.www.http.HttpClient which then in turn does
 230  
      * all the redirection and stores the redirect java.net.URL.  We just use
 231  
      * reflection to FETCH this URL and then call toString to get the correct
 232  
      * value.
 233  
      * 
 234  
      * Java needs the concept of readonly private variables.
 235  
      *
 236  
      * 
 237  
      */
 238  
     public String getResourceFromRedirect() {
 239  
 
 240  
         try {
 241  
 
 242  0
             if ( FIELD_HTTP_URL_CONNECTION_HTTP == null ) {
 243  
 
 244  
                 //Note: when using a FILE URL this won't work!                
 245  0
                 FIELD_HTTP_URL_CONNECTION_HTTP = _urlConnection.getClass().getDeclaredField( "http" );
 246  0
                 FIELD_HTTP_URL_CONNECTION_HTTP.setAccessible( true );
 247  
                 
 248  
             }
 249  
 
 250  0
             Object http = FIELD_HTTP_URL_CONNECTION_HTTP.get( _urlConnection );
 251  
 
 252  
             //when java.net.URL has already cleaned itself up 'http' will be
 253  
             //null here.
 254  0
             if ( http == null )
 255  0
                 return getResource();
 256  
 
 257  0
             if ( FIELD_HTTP_CLIENT_URL == null ) {
 258  
 
 259  0
                 FIELD_HTTP_CLIENT_URL = http.getClass().getDeclaredField( "url" );
 260  0
                 FIELD_HTTP_CLIENT_URL.setAccessible( true );
 261  
                 
 262  
             }
 263  
             
 264  0
             Object url = FIELD_HTTP_CLIENT_URL.get( http );
 265  
 
 266  
             //this will be a java.net.URL and now I can call the toString method
 267  
             //on it which will return our full URI.
 268  0
             return url.toString();
 269  
             
 270  0
         } catch ( Throwable t ) {
 271  
             //log.error( t );
 272  0
             return getResource();
 273  
         }
 274  
         
 275  
     }
 276  
 
 277  
     public InputStream getInputStream() throws NetworkException {
 278  
 
 279  
         try {
 280  0
             return _getInputStream();
 281  
 
 282  0
         } catch ( IOException e ) {
 283  
 
 284  0
             String message = null;
 285  
             
 286  
             //the modern VM buries the FileNotFoundException which prevents a
 287  
             //catch.  Very very ugly.
 288  0
             if ( e.getCause() instanceof FileNotFoundException ) {
 289  0
                 message = "File not found: " + e.getCause().getMessage();
 290  
             } else {
 291  0
                 message = e.getMessage();
 292  
             }
 293  
 
 294  0
             throw new NetworkException( message, e, this, _url, _urlConnection );
 295  
         }
 296  
 
 297  
     }
 298  
     
 299  
     /**
 300  
      * 
 301  
      *
 302  
      * 
 303  
      */
 304  
     public InputStream _getInputStream() throws IOException {
 305  
 
 306  0
         if ( ! initConnection ) { initConnection(); } 
 307  
 
 308  0
         String resource = this.getResource();
 309  
 
 310  
         //if we haven't pulled from the cache (as above) and we are offline we
 311  
         //need to throw an exception.
 312  0
         if ( ResourceRequestFactory.isOffline() ) {
 313  
 
 314  
             //see if we can return from the HTCache.
 315  
             // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() &&
 316  
             //     HTCache.hasContentInCache( resource ) )
 317  
             //    return HTCache.getContentAsInputStream( resource );
 318  
 
 319  
             //if not we should throw an exception
 320  0
             throw new IOException( "ResourceRequestFactory is offline and content was not in cache - " +
 321  
                                    resource );
 322  
 
 323  
         }
 324  
 
 325  
         //if we are using an input stream NOT from init() 
 326  0
         if ( this.inputStream == null ) {
 327  
             
 328  0
             this.inputStream = _urlConnection.getInputStream();
 329  0
             this.inputStream = new AdvancedInputStream( this.inputStream, this );
 330  
 
 331  
             //first decompress
 332  0
             if ( GZIP_ENCODING.equals( _urlConnection.getContentEncoding() ) ) {
 333  
 
 334  
                 //note.  the advanced input stream must be wrapped by a GZIP
 335  
                 //input stream and not vice-versa or we will end up with
 336  
                 //incorrect results.
 337  
                 
 338  0
                 this.inputStream = new GZIPInputStream( this.inputStream );
 339  
 
 340  
             }
 341  
         
 342  
             // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() ) {
 343  
                 
 344  
             //     System.out.println( "cache store for: " +
 345  
             //                         resource + " as " +
 346  
             //                         HTCache.getContentAsPath( resource ) );
 347  
 
 348  
             //     //FIXME: performance improvement... don't write do disk and then
 349  
             //     //read from disk.?
 350  
                 
 351  
             //     //store this content from the network and save it in the cache.  Then fetch it and return
 352  
             //     HTCache.store( resource, this.inputStream );
 353  
                 
 354  
             //     return HTCache.getContentAsInputStream( resource );
 355  
                 
 356  
             // }
 357  
 
 358  
         }
 359  
 
 360  0
         setResource( getResourceFromRedirect() );
 361  
 
 362  
         //this is potentially teh cached input stream created if we have used
 363  
         //the HTCache.
 364  0
         return inputStream;
 365  
         
 366  
     }
 367  
 
 368  
     /**
 369  
      * Set the RequestMethod of this URLConnection.
 370  
      *
 371  
      * 
 372  
      */
 373  
     public void setRequestMethod( String method ) throws NetworkException {
 374  
 
 375  
         try { 
 376  
             
 377  0
             if ( _urlConnection instanceof HttpURLConnection ) {
 378  
                 
 379  0
                 ((HttpURLConnection)_urlConnection).setRequestMethod( method );
 380  
                 
 381  
             } 
 382  
             
 383  0
         } catch ( ProtocolException pe ) {
 384  
             
 385  0
             NetworkException ne = new NetworkException( pe.getMessage() );
 386  0
             ne.initCause( pe );
 387  0
             throw ne;
 388  
             
 389  0
         }
 390  
 
 391  0
     }
 392  
 
 393  
     /**
 394  
      * 
 395  
      *
 396  
      * 
 397  
      */
 398  
     public int getContentLength() throws IOException {
 399  
 
 400  0
         if ( ! initConnection ) { initConnection(); } 
 401  
 
 402  
         //if ( _urlConnection instanceof HttpURLConnection ) {
 403  
 
 404  0
         return  _urlConnection.getContentLength();
 405  
         
 406  
     }
 407  
     
 408  
     public String getHeaderField( String name ) {
 409  0
         return  _urlConnection.getHeaderField( name );
 410  
     }
 411  
 
 412  
 }