Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
URLResourceRequest |
|
| 4.875;4.875 |
1 | /* | |
2 | * Copyright 1999,2004 The Apache Software Foundation. | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | package org.apache.commons.feedparser.network; | |
18 | ||
19 | import java.io.FileNotFoundException; | |
20 | import java.io.IOException; | |
21 | import java.io.InputStream; | |
22 | import java.net.ProtocolException; | |
23 | import java.net.URL; | |
24 | import java.net.URLConnection; | |
25 | import java.util.Iterator; | |
26 | import java.util.zip.GZIPInputStream; | |
27 | ||
28 | import org.apache.log4j.Logger; | |
29 | ||
30 | import sun.net.www.protocol.http.HttpURLConnection; | |
31 | ||
32 | /** | |
33 | * ResourceRequest implementation that uses java.net.URL as the backend. | |
34 | * | |
35 | * Differences from other ResourceRequests. | |
36 | * | |
37 | * setRequestMethod() - Allows us to change the request type (HEAD, etc). | |
38 | * | |
39 | * getContentLength() - Returns the length/size of the content represented by | |
40 | * this resource. Can be used by clients with setRequestMethod( "HEAD" ) to | |
41 | * find the size of a remote resource without doing a full fetch. | |
42 | * | |
43 | * @author <a href="mailto:burton@openprivacy.org">Kevin A. Burton</a> | |
44 | * @version $Id: URLResourceRequest.java 561366 2007-07-31 15:58:29Z rahul $ | |
45 | */ | |
46 | 0 | public class URLResourceRequest extends BaseResourceRequest implements ResourceRequest { |
47 | ||
48 | 0 | private static Logger log = Logger.getLogger( URLResourceRequest.class.getName() ); |
49 | ||
50 | public static final String ACCEPT_ENCODING_HEADER = "Accept-Encoding"; | |
51 | public static final String IF_NONE_MATCH_HEADER = "If-None-Match"; | |
52 | public static final String GZIP_ENCODING = "gzip"; | |
53 | public static final String USER_AGENT_HEADER = "User-Agent"; | |
54 | ||
55 | /** | |
56 | * | |
57 | * Enable RFC 3228 HTTP Delta for feeds. | |
58 | * | |
59 | * http://bobwyman.pubsub.com/main/2004/09/using_rfc3229_w.html | |
60 | * | |
61 | * http://bobwyman.pubsub.com/main/2004/09/implementations.html | |
62 | * | |
63 | */ | |
64 | 0 | public static boolean ENABLE_HTTP_DELTA_FEED_IM = false; |
65 | ||
66 | 0 | public static String USER_AGENT |
67 | = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1; aggregator:FeedParser; http://commons.apache.org/feedparser/) Gecko/20021130"; | |
68 | ||
69 | 0 | public static String USER_AGENT_MOZILLA |
70 | = "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.2.1) Gecko/20021130"; | |
71 | ||
72 | /** | |
73 | * Not used anymore. Provided for historical reasons. | |
74 | */ | |
75 | public static final String REFERER | |
76 | = "http://commons.apache.org/feedparser/?isAggregator=true"; | |
77 | ||
78 | public static final int MAX_CONTENT_LENGTH = 1000000; | |
79 | ||
80 | 0 | private URL _url = null; |
81 | ||
82 | 0 | private URLConnection _urlConnection = null; |
83 | ||
84 | 0 | private InputStream inputStream = null; |
85 | ||
86 | 0 | private boolean initConnection = false; |
87 | ||
88 | /** | |
89 | * | |
90 | * | |
91 | */ | |
92 | public void init() throws IOException { | |
93 | ||
94 | 0 | String resource = this.getResource(); |
95 | ||
96 | //if we are offline... we don't need to init. | |
97 | 0 | if ( ResourceRequestFactory.isOffline() ) { return; } |
98 | ||
99 | //pull from the HTCache if it is enabled and then short-circuit so that | |
100 | //we don't fetch from the network. | |
101 | ||
102 | //NOTE: currently removed because the htcache wasn't portable. I can OSS | |
103 | //this in the future if necessary | |
104 | ||
105 | // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() && | |
106 | // HTCache.hasContentInCache( this.getResource() ) ) { | |
107 | // | |
108 | // //get the input stream we can use from the HTCache. | |
109 | // this.inputStream = HTCache.getContentAsInputStream( resource ); | |
110 | // return; | |
111 | // | |
112 | // } | |
113 | ||
114 | 0 | _url = new URL( this.getResource() ); |
115 | 0 | _urlConnection = _url.openConnection(); |
116 | ||
117 | 0 | } |
118 | ||
119 | /** | |
120 | * Init the actual connection. Should be called AFTER init() but before | |
121 | * getInputStream() so that we can set any runtime params requestMethod, | |
122 | * etc. If getInputStream() is called without an initConnection() we do | |
123 | * this automatically. initConnection() might not want to be called when | |
124 | * doing a HEAD request. | |
125 | * | |
126 | * | |
127 | */ | |
128 | public void initConnection() throws NetworkException { | |
129 | ||
130 | 0 | long before = System.currentTimeMillis(); |
131 | ||
132 | 0 | initConnection = true; |
133 | ||
134 | 0 | this.fireInit(); |
135 | ||
136 | //FIXME: do smart user agent detection. if this is a .html file we can | |
137 | //set it to us Mozilla and if not we can use NewsMonster | |
138 | //_urlConnection.setRequestProperty( "Referer", REFERER ); | |
139 | ||
140 | 0 | String resource = this.getResource(); |
141 | ||
142 | //set the user agent if it hasn't ALREADY been set by the caller. | |
143 | 0 | if ( getRequestHeaderField( USER_AGENT_HEADER ) == null ) { |
144 | 0 | _urlConnection.setRequestProperty( USER_AGENT_HEADER, USER_AGENT ); |
145 | } | |
146 | ||
147 | 0 | _urlConnection.setRequestProperty( ACCEPT_ENCODING_HEADER, GZIP_ENCODING ); |
148 | ||
149 | //copy over any headers set in the request.. | |
150 | ||
151 | 0 | Iterator it = getRequestHeaderFields(); |
152 | ||
153 | 0 | while ( it.hasNext() ) { |
154 | ||
155 | 0 | String key = (String)it.next(); |
156 | ||
157 | 0 | _urlConnection.setRequestProperty( key, getRequestHeaderField( key ) ); |
158 | ||
159 | 0 | } |
160 | ||
161 | 0 | if ( _urlConnection instanceof HttpURLConnection ) { |
162 | ||
163 | 0 | HttpURLConnection httpURLConn = (HttpURLConnection)_urlConnection; |
164 | ||
165 | 0 | httpURLConn.setFollowRedirects( getFollowRedirects() ); |
166 | 0 | httpURLConn.setInstanceFollowRedirects( getFollowRedirects() ); |
167 | ||
168 | 0 | if ( this.getIfModifiedSince() != -1 ) |
169 | 0 | httpURLConn.setIfModifiedSince( this.getIfModifiedSince() ); |
170 | ||
171 | 0 | if ( getEtag() != null ) { |
172 | 0 | httpURLConn.setRequestProperty( IF_NONE_MATCH_HEADER, getEtag() ); |
173 | ||
174 | //now support RFC3229 HTTP Delta | |
175 | //A-IM: feed, gzip | |
176 | ||
177 | 0 | if ( ENABLE_HTTP_DELTA_FEED_IM ) { |
178 | ||
179 | //note that this will return HTTP 226 if used. | |
180 | // | |
181 | ||
182 | 0 | httpURLConn.setRequestProperty( "A-IM", "feed, gzip" ); |
183 | ||
184 | } | |
185 | ||
186 | } | |
187 | ||
188 | try { | |
189 | ||
190 | 0 | httpURLConn.connect(); |
191 | ||
192 | //setResource( getRedirectedResource() ); | |
193 | ||
194 | 0 | this.setResponseCode( httpURLConn.getResponseCode() ); |
195 | ||
196 | 0 | } catch ( IOException e ) { |
197 | 0 | throw new NetworkException( e ); |
198 | 0 | } |
199 | ||
200 | } | |
201 | ||
202 | 0 | int contentLength = _urlConnection.getContentLength(); |
203 | ||
204 | //bigger than 1 meg and it is a remote document (it is safe to process | |
205 | //local documents) | |
206 | 0 | if ( contentLength > MAX_CONTENT_LENGTH && |
207 | this.getResource().startsWith( "file:" ) == false ) { | |
208 | ||
209 | //NOTE: make 100% sure this doens't just go ahead and download the | |
210 | //file FIRST before doing a HEAD. I think that's what happens but I | |
211 | //might be wrong. | |
212 | ||
213 | 0 | throw new NetworkException( "Content is too large - " + contentLength + " - " + getResource() ); |
214 | ||
215 | } | |
216 | ||
217 | 0 | long after = System.currentTimeMillis(); |
218 | ||
219 | 0 | log.debug( getResource() + " - init duration: " + (after-before) ); |
220 | ||
221 | 0 | } |
222 | ||
223 | 0 | java.lang.reflect.Field FIELD_HTTP_URL_CONNECTION_HTTP = null; |
224 | 0 | java.lang.reflect.Field FIELD_HTTP_CLIENT_URL = null; |
225 | ||
226 | /** | |
227 | * This method used Reflection to pull out the redirected URL in | |
228 | * java.net.URL. Internally sun.net.www.protocol.http.HttpURLConnection | |
229 | * stores a reference to sun.net.www.http.HttpClient which then in turn does | |
230 | * all the redirection and stores the redirect java.net.URL. We just use | |
231 | * reflection to FETCH this URL and then call toString to get the correct | |
232 | * value. | |
233 | * | |
234 | * Java needs the concept of readonly private variables. | |
235 | * | |
236 | * | |
237 | */ | |
238 | public String getResourceFromRedirect() { | |
239 | ||
240 | try { | |
241 | ||
242 | 0 | if ( FIELD_HTTP_URL_CONNECTION_HTTP == null ) { |
243 | ||
244 | //Note: when using a FILE URL this won't work! | |
245 | 0 | FIELD_HTTP_URL_CONNECTION_HTTP = _urlConnection.getClass().getDeclaredField( "http" ); |
246 | 0 | FIELD_HTTP_URL_CONNECTION_HTTP.setAccessible( true ); |
247 | ||
248 | } | |
249 | ||
250 | 0 | Object http = FIELD_HTTP_URL_CONNECTION_HTTP.get( _urlConnection ); |
251 | ||
252 | //when java.net.URL has already cleaned itself up 'http' will be | |
253 | //null here. | |
254 | 0 | if ( http == null ) |
255 | 0 | return getResource(); |
256 | ||
257 | 0 | if ( FIELD_HTTP_CLIENT_URL == null ) { |
258 | ||
259 | 0 | FIELD_HTTP_CLIENT_URL = http.getClass().getDeclaredField( "url" ); |
260 | 0 | FIELD_HTTP_CLIENT_URL.setAccessible( true ); |
261 | ||
262 | } | |
263 | ||
264 | 0 | Object url = FIELD_HTTP_CLIENT_URL.get( http ); |
265 | ||
266 | //this will be a java.net.URL and now I can call the toString method | |
267 | //on it which will return our full URI. | |
268 | 0 | return url.toString(); |
269 | ||
270 | 0 | } catch ( Throwable t ) { |
271 | //log.error( t ); | |
272 | 0 | return getResource(); |
273 | } | |
274 | ||
275 | } | |
276 | ||
277 | public InputStream getInputStream() throws NetworkException { | |
278 | ||
279 | try { | |
280 | 0 | return _getInputStream(); |
281 | ||
282 | 0 | } catch ( IOException e ) { |
283 | ||
284 | 0 | String message = null; |
285 | ||
286 | //the modern VM buries the FileNotFoundException which prevents a | |
287 | //catch. Very very ugly. | |
288 | 0 | if ( e.getCause() instanceof FileNotFoundException ) { |
289 | 0 | message = "File not found: " + e.getCause().getMessage(); |
290 | } else { | |
291 | 0 | message = e.getMessage(); |
292 | } | |
293 | ||
294 | 0 | throw new NetworkException( message, e, this, _url, _urlConnection ); |
295 | } | |
296 | ||
297 | } | |
298 | ||
299 | /** | |
300 | * | |
301 | * | |
302 | * | |
303 | */ | |
304 | public InputStream _getInputStream() throws IOException { | |
305 | ||
306 | 0 | if ( ! initConnection ) { initConnection(); } |
307 | ||
308 | 0 | String resource = this.getResource(); |
309 | ||
310 | //if we haven't pulled from the cache (as above) and we are offline we | |
311 | //need to throw an exception. | |
312 | 0 | if ( ResourceRequestFactory.isOffline() ) { |
313 | ||
314 | //see if we can return from the HTCache. | |
315 | // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() && | |
316 | // HTCache.hasContentInCache( resource ) ) | |
317 | // return HTCache.getContentAsInputStream( resource ); | |
318 | ||
319 | //if not we should throw an exception | |
320 | 0 | throw new IOException( "ResourceRequestFactory is offline and content was not in cache - " + |
321 | resource ); | |
322 | ||
323 | } | |
324 | ||
325 | //if we are using an input stream NOT from init() | |
326 | 0 | if ( this.inputStream == null ) { |
327 | ||
328 | 0 | this.inputStream = _urlConnection.getInputStream(); |
329 | 0 | this.inputStream = new AdvancedInputStream( this.inputStream, this ); |
330 | ||
331 | //first decompress | |
332 | 0 | if ( GZIP_ENCODING.equals( _urlConnection.getContentEncoding() ) ) { |
333 | ||
334 | //note. the advanced input stream must be wrapped by a GZIP | |
335 | //input stream and not vice-versa or we will end up with | |
336 | //incorrect results. | |
337 | ||
338 | 0 | this.inputStream = new GZIPInputStream( this.inputStream ); |
339 | ||
340 | } | |
341 | ||
342 | // if ( ResourceRequestFactory.isTransparentHTCacheEnabled() ) { | |
343 | ||
344 | // System.out.println( "cache store for: " + | |
345 | // resource + " as " + | |
346 | // HTCache.getContentAsPath( resource ) ); | |
347 | ||
348 | // //FIXME: performance improvement... don't write do disk and then | |
349 | // //read from disk.? | |
350 | ||
351 | // //store this content from the network and save it in the cache. Then fetch it and return | |
352 | // HTCache.store( resource, this.inputStream ); | |
353 | ||
354 | // return HTCache.getContentAsInputStream( resource ); | |
355 | ||
356 | // } | |
357 | ||
358 | } | |
359 | ||
360 | 0 | setResource( getResourceFromRedirect() ); |
361 | ||
362 | //this is potentially teh cached input stream created if we have used | |
363 | //the HTCache. | |
364 | 0 | return inputStream; |
365 | ||
366 | } | |
367 | ||
368 | /** | |
369 | * Set the RequestMethod of this URLConnection. | |
370 | * | |
371 | * | |
372 | */ | |
373 | public void setRequestMethod( String method ) throws NetworkException { | |
374 | ||
375 | try { | |
376 | ||
377 | 0 | if ( _urlConnection instanceof HttpURLConnection ) { |
378 | ||
379 | 0 | ((HttpURLConnection)_urlConnection).setRequestMethod( method ); |
380 | ||
381 | } | |
382 | ||
383 | 0 | } catch ( ProtocolException pe ) { |
384 | ||
385 | 0 | NetworkException ne = new NetworkException( pe.getMessage() ); |
386 | 0 | ne.initCause( pe ); |
387 | 0 | throw ne; |
388 | ||
389 | 0 | } |
390 | ||
391 | 0 | } |
392 | ||
393 | /** | |
394 | * | |
395 | * | |
396 | * | |
397 | */ | |
398 | public int getContentLength() throws IOException { | |
399 | ||
400 | 0 | if ( ! initConnection ) { initConnection(); } |
401 | ||
402 | //if ( _urlConnection instanceof HttpURLConnection ) { | |
403 | ||
404 | 0 | return _urlConnection.getContentLength(); |
405 | ||
406 | } | |
407 | ||
408 | public String getHeaderField( String name ) { | |
409 | 0 | return _urlConnection.getHeaderField( name ); |
410 | } | |
411 | ||
412 | } |