Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
HttpFileDownloadStage |
|
| 0.0;0 |
1 | /* | |
2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
3 | * contributor license agreements. See the NOTICE file distributed with | |
4 | * this work for additional information regarding copyright ownership. | |
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
6 | * (the "License"); you may not use this file except in compliance with | |
7 | * the License. You may obtain a copy of the License at | |
8 | * | |
9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
10 | * | |
11 | * Unless required by applicable law or agreed to in writing, software | |
12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
14 | * See the License for the specific language governing permissions and | |
15 | * limitations under the License. | |
16 | */ | |
17 | ||
18 | package org.apache.commons.pipeline.stage; | |
19 | ||
20 | import java.io.BufferedInputStream; | |
21 | import java.io.BufferedOutputStream; | |
22 | import java.io.File; | |
23 | import java.io.FileOutputStream; | |
24 | import java.io.IOException; | |
25 | import java.io.InputStream; | |
26 | import java.io.OutputStream; | |
27 | import java.net.HttpURLConnection; | |
28 | import java.net.MalformedURLException; | |
29 | import java.net.URL; | |
30 | ||
31 | import org.apache.commons.logging.Log; | |
32 | import org.apache.commons.logging.LogFactory; | |
33 | import org.apache.commons.pipeline.StageException; | |
34 | import org.apache.commons.pipeline.validation.ConsumedTypes; | |
35 | import org.apache.commons.pipeline.validation.ProducedTypes; | |
36 | ||
37 | ||
38 | /** | |
39 | * This {@link org.apache.commons.pipeline.Pipeline$Stage Stage} provides the | |
40 | * functionality needed to retrieve data from an HTTP URL. Multipart responses | |
41 | * are not yet supported. | |
42 | */ | |
43 | @ConsumedTypes({URL.class, String.class}) | |
44 | @ProducedTypes({File.class}) | |
45 | public class HttpFileDownloadStage extends BaseStage { | |
46 | private static final int BUFFER_SIZE = 10000; | |
47 | 2 | private String workDir = null; |
48 | 2 | private Log log = LogFactory.getLog(HttpFileDownloadStage.class); |
49 | ||
50 | 4 | public HttpFileDownloadStage() { } |
51 | ||
52 | /** | |
53 | * Creates a new HttpFileDownloadStage which will download files to the | |
54 | * specified work directory. | |
55 | * @param workDir the path to which files will be downloaded. | |
56 | */ | |
57 | 0 | public HttpFileDownloadStage(String workDir) { |
58 | 0 | this.workDir = workDir; |
59 | 0 | } |
60 | ||
61 | /** | |
62 | * Removes a java.net.URL (an HTTP URL) or string representing a URL from | |
63 | * the input queue, and then retrieves the data at that URL and stores it | |
64 | * in a temporary file. The file is stored in the directory specified by | |
65 | * {@link #setWorkDir(String) setWorkDir()}, or to the system default | |
66 | * temporary directory if no work directory is set. | |
67 | * | |
68 | * @param obj The URL from which to download data. | |
69 | * @throws IllegalArgumentException if the parameter obj is not a string or | |
70 | * an instance of {@link java.net.URL}. | |
71 | * @throws StageException if there is an error retrieving data from the | |
72 | * URL specified. | |
73 | */ | |
74 | public void process(Object obj) throws StageException { | |
75 | //Map params = new HashMap(); | |
76 | ||
77 | URL url; | |
78 | try { | |
79 | 2 | if (obj instanceof String) { |
80 | // String loc = (String) obj; | |
81 | // int paramIndex = loc.indexOf('?'); | |
82 | // if (paramIndex > 0) { | |
83 | // url = new URL(loc.substring(0, paramIndex)); | |
84 | // for (StringTokenizer st = new StringTokenizer(loc.substring(paramIndex + 1), "&"); st.hasMoreTokens();) { | |
85 | // String tok = st.nextToken(); | |
86 | // int eqIndex = tok.indexOf('='); | |
87 | // if (eqIndex > 0) { | |
88 | // params.put(tok.substring(0, eqIndex), tok.substring(eqIndex + 1)); | |
89 | // } | |
90 | // else { | |
91 | // params.put(tok, null); | |
92 | // } | |
93 | // } | |
94 | // } | |
95 | // else { | |
96 | 1 | url = new URL((String) obj); |
97 | // } | |
98 | 1 | } else if (obj instanceof URL) { |
99 | 1 | url = (URL) obj; |
100 | } else { | |
101 | 0 | throw new IllegalArgumentException("Unrecognized parameter class to process() for HttpFileDownload: " + obj.getClass().getName() + "; must be URL or String"); |
102 | } | |
103 | 0 | } catch (MalformedURLException e) { |
104 | 0 | throw new StageException(this, "Malformed URL: " + obj, e); |
105 | 2 | } |
106 | ||
107 | 2 | log.debug("Retrieving data from " + url.toString()); |
108 | ||
109 | // try { | |
110 | // url = handleRedirects(url); | |
111 | // } | |
112 | // catch (Exception e) { //catches MalformedURLException, IOException | |
113 | // throw new StageException("An error was encountered attempting to follow URL redirects from " + url.toString(), e); | |
114 | // } | |
115 | ||
116 | 2 | HttpURLConnection con = null; |
117 | try { | |
118 | 2 | con = (java.net.HttpURLConnection) url.openConnection(); |
119 | // if (!params.isEmpty()) { | |
120 | // con.setRequestMethod("GET"); | |
121 | // for (Iterator iter = params.entrySet().iterator(); iter.hasNext();) { | |
122 | // Map.Entry entry = (Map.Entry) iter.next(); | |
123 | // con.setRequestProperty((String) entry.getKey(), (String) entry.getValue()); | |
124 | // } | |
125 | // } | |
126 | ||
127 | 2 | File workDir = (this.workDir == null) ? null : new File(this.workDir); |
128 | 2 | File workFile = File.createTempFile("http-file-download","tmp", workDir); |
129 | ||
130 | 2 | InputStream in = new BufferedInputStream(con.getInputStream()); |
131 | 2 | OutputStream out = new BufferedOutputStream(new FileOutputStream(workFile, false)); |
132 | 2 | byte[] buffer = new byte[BUFFER_SIZE]; //attempt to read 10k at a time |
133 | 2 | for (int results = 0; (results = in.read(buffer)) != -1;) { |
134 | 2 | out.write(buffer, 0, results); |
135 | } | |
136 | 2 | out.close(); |
137 | 2 | in.close(); |
138 | ||
139 | 2 | this.emit(workFile); |
140 | 0 | } catch (IOException e) { |
141 | 0 | throw new StageException(this, "An error occurred downloading a data file from " + url.toString(), e); |
142 | } finally { | |
143 | 2 | con.disconnect(); |
144 | 2 | } |
145 | 2 | } |
146 | ||
147 | ||
148 | /** | |
149 | * Sets the working directory for the file download. If the directory does | |
150 | * not already exist, it will be created during the preprocess() step. | |
151 | * If you do not set this directory, the work directory will be the | |
152 | * default temporary directory for your machine type. | |
153 | */ | |
154 | public void setWorkDir(String workDir) { | |
155 | 0 | this.workDir = workDir; |
156 | 0 | } |
157 | ||
158 | /** | |
159 | * Returns the name of the file download directory. | |
160 | */ | |
161 | public String getWorkDir() { | |
162 | 0 | return this.workDir; |
163 | } | |
164 | ||
165 | /** | |
166 | * Follows redirects from the specified URL and recursively returns the destination | |
167 | * URL. This method does not check for circular redirects, so it is possible that a malicious | |
168 | * site could force this method into infinite recursion. | |
169 | * | |
170 | * TODO: Add a max_hops parameterized version | |
171 | */ | |
172 | public URL handleRedirects(URL url) throws IOException, MalformedURLException { | |
173 | 0 | java.net.HttpURLConnection.setFollowRedirects(false); |
174 | 0 | HttpURLConnection con = (HttpURLConnection) url.openConnection(); |
175 | 0 | int response = con.getResponseCode(); |
176 | 0 | log.debug("Response code for " + url + " = " + response); |
177 | ||
178 | 0 | if (response == java.net.HttpURLConnection.HTTP_MOVED_PERM || response == java.net.HttpURLConnection.HTTP_MOVED_TEMP) { |
179 | 0 | String location = con.getHeaderField("Location"); |
180 | 0 | log.debug("Handling redirect to location: " + location); |
181 | ||
182 | 0 | if (location.startsWith("http:")) { |
183 | 0 | url = new URL(location); |
184 | 0 | } else if (location.startsWith("/")) { |
185 | 0 | url = new URL("http://" + url.getHost() + location); |
186 | } else { | |
187 | 0 | url = new URL(con.getURL(), location); |
188 | } | |
189 | ||
190 | 0 | url = handleRedirects(url); // to handle nested redirections |
191 | } | |
192 | ||
193 | 0 | return url; |
194 | } | |
195 | } |