/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.plugin.crawler;

import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Pattern;

/**
 * A basic <em>site crawler</em> used to extract semantic content
 * from small/medium sized sites.
 *
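 * <p>A minimal usage sketch (the storage path and seed URL below are
 * illustrative placeholders, not values mandated by this class):</p>
 * <pre>{@code
 * SiteCrawler crawler = new SiteCrawler(new File("/tmp/any23-crawler")); // temporary crawl storage
 * crawler.setMaxDepth(2);    // follow links at most two levels deep
 * crawler.setMaxPages(100);  // stop after 100 fetched pages
 * crawler.start(new URL("http://example.org/"), true); // true: block until the crawl terminates
 * }</pre>
 *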
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class SiteCrawler {

    public static final String DEFAULT_PAGE_FILTER_RE =
            ".*(\\.(" +
                "css|js" +
                "|bmp|gif|jpe?g|png|tiff?" +
                "|mid|mp2|mp3|mp4|wav|wma" +
                "|avi|mov|mpeg|ram|m4v|wmv|rm|smil" +
                "|pdf" +
                "|swf" +
                "|zip|rar|gz" +
                "|xml|txt" +
            "))$";
51 | ||
52 | /** | |
53 | * Default number of crawler instances. | |
54 | */ | |
55 | public static final int DEFAULT_NUM_OF_CRAWLERS = 10; | |
56 | ||
57 | /** | |
58 | * Default crawler implementation. | |
59 | */ | |
60 | 0 | public static final Class<? extends WebCrawler> DEFAULT_WEB_CRAWLER = DefaultWebCrawler.class; |
61 | ||
62 | /** | |
63 | * Default filter applied to skip contents. | |
64 | */ | |
65 | 0 | public final Pattern defaultFilters = Pattern.compile(DEFAULT_PAGE_FILTER_RE); |
66 | ||
67 | /** | |
68 | * The crawler threads controller. | |
69 | */ | |
70 | private final CrawlController controller; | |
71 | ||
72 | /** | |
73 | * Crawler listeners. | |
74 | */ | |
75 | 0 | private final List<CrawlerListener> listeners = new ArrayList<CrawlerListener>(); |
76 | ||
77 | /** | |
78 | * Actual number of crawler instances. | |
79 | */ | |
80 | 0 | private int numOfCrawlers = DEFAULT_NUM_OF_CRAWLERS; |
81 | ||
82 | /** | |
83 | * Actual web crawler. | |
84 | */ | |
85 | 0 | private Class<? extends WebCrawler> webCrawler = DEFAULT_WEB_CRAWLER; |
86 | ||
87 | /** | |
88 | * Max allowed depth, <code>-1</code> means no limit. | |
89 | */ | |
90 | 0 | private int maxDepth = -1; |
91 | ||
92 | /** | |
93 | * Max allowed pages, <code>-1</code> means no limit. | |
94 | */ | |
95 | 0 | private int maxPages = -1; |
96 | ||
    /**
     * Politeness delay between subsequent requests, in milliseconds; <code>-1</code> means no delay is enforced.
     */
    private int politenessDelay = -1;

    /**
     * Internal executor service.
     */
    private ExecutorService service;

    /**
     * Constructor.
     *
     * @param storageFolder location used to store the temporary data structures used by the crawler.
     */
    public SiteCrawler(File storageFolder) {
        try {
            controller = new CrawlController( storageFolder.getAbsolutePath() );
        } catch (Exception e) {
            throw new IllegalArgumentException("Error while initializing crawler controller.", e);
        }
    }

    /**
     * @return number of crawler instances.
     */
    public int getNumOfCrawlers() {
        return numOfCrawlers;
    }

    /**
     * Sets the number of crawler instances.
     *
     * @param n an integer > 0.
     */
    public void setNumOfCrawlers(int n) {
        if(n <= 0) throw new IllegalArgumentException("Invalid number of crawlers, must be > 0.");
        this.numOfCrawlers = n;
    }
136 | ||
137 | public Class<? extends WebCrawler> getWebCrawler() { | |
138 | 0 | return webCrawler; |
139 | } | |
140 | ||
    /**
     * Sets the actual crawler class.
     *
     * @param c a not <code>null</code> class.
     */
    public void setWebCrawler(Class<? extends WebCrawler> c) {
        if(c == null) throw new NullPointerException("c cannot be null.");
        this.webCrawler = c;
    }
150 | ||
151 | /** | |
152 | * @return the max allowed crawl depth, <code>-1</code> means no limit. | |
153 | */ | |
154 | public int getMaxDepth() { | |
155 | 0 | return maxDepth; |
156 | } | |
157 | ||
158 | /** | |
159 | * Sets the maximum depth. | |
160 | * | |
161 | * @param maxDepth maximum allowed depth. <code>-1</code> means no limit. | |
162 | */ | |
163 | public void setMaxDepth(int maxDepth) { | |
164 | 0 | if(maxDepth < -1 || maxDepth == 0) throw new IllegalArgumentException("Invalid maxDepth, must be -1 or > 0"); |
165 | 0 | if(maxDepth > 0) try { |
166 | 0 | controller.setMaximumCrawlDepth(maxDepth); |
167 | 0 | } catch (Exception e) { |
168 | 0 | throw new IllegalArgumentException("Error while setting maxDepth.", e); |
169 | 0 | } |
170 | 0 | this.maxDepth = maxDepth; |
171 | 0 | } |
172 | ||
    /**
     * @return max number of allowed pages, <code>-1</code> means no limit.
     */
    public int getMaxPages() {
        return maxPages;
    }

    /**
     * Sets the maximum number of collected pages.
     *
     * @param maxPages maximum allowed pages. <code>-1</code> means no limit.
     */
    public void setMaxPages(int maxPages) {
        if(maxPages < -1 || maxPages == 0) throw new IllegalArgumentException("Invalid maxPages, must be -1 or > 0");
        if(maxPages > 0) controller.setMaximumPagesToFetch(maxPages);
        this.maxPages = maxPages;
    }
190 | ||
191 | /** | |
192 | * @return the politeness delay in milliseconds. | |
193 | */ | |
194 | public int getPolitenessDelay() { | |
195 | 0 | return politenessDelay; |
196 | } | |
197 | ||
198 | /** | |
199 | * Sets the politeness delay. <code>-1</code> means no politeness. | |
200 | * | |
201 | * @param millis delay in milliseconds. | |
202 | */ | |
203 | public void setPolitenessDelay(int millis) { | |
204 | 0 | if(millis < -1) throw new IllegalArgumentException("Invalid politenessDelay, must be >= -1"); |
205 | 0 | if(millis >= 0) controller.setPolitenessDelay(millis); |
206 | 0 | this.politenessDelay = millis; |
207 | 0 | } |
208 | ||
    /**
     * Registers a {@link CrawlerListener} to this crawler.
     *
     * @param listener the listener to register.
     */
    public void addListener(CrawlerListener listener) {
        listeners.add(listener);
    }

    /**
     * Deregisters a {@link CrawlerListener} from this crawler.
     *
     * @param listener the listener to deregister.
     */
    public void removeListener(CrawlerListener listener) {
        listeners.remove(listener);
    }
226 | ||
227 | /** | |
228 | * Starts the crawling process. | |
229 | * | |
230 | * @param seed the starting URL for the crawler process. | |
231 | * @param filters filters to be applied to the crawler process. Can be <code>null</code>. | |
232 | * @param wait if <code>true</code> the process will wait for the crawler termination. | |
233 | * @throws Exception | |
234 | */ | |
    public synchronized void start(
            final URL seed, final Pattern filters, final boolean wait
    ) throws Exception {
        SharedData.setCrawlData(seed.toExternalForm(), filters, Collections.synchronizedList(listeners) );
        controller.addSeed(seed.toExternalForm());
        final Runnable internalRunnable = new Runnable() {
            @Override
            public void run() {
                controller.start(getWebCrawler(), getNumOfCrawlers());
            }
        };
        if(wait) {
            internalRunnable.run();
        } else {
            if(service != null) throw new IllegalStateException("Another service seems to run.");
            service = Executors.newSingleThreadExecutor();
            service.execute(internalRunnable);
        }
    }
254 | ||
255 | /** | |
256 | * Starts the crawler process with the {@link #defaultFilters}. | |
257 | * | |
258 | * @param seed the starting URL for the crawler process. | |
259 | * @param wait if <code>true</code> the process will wait for the crawler termination. | |
260 | * @throws Exception | |
261 | */ | |
262 | public void start(final URL seed, final boolean wait) throws Exception { | |
263 | 0 | start(seed, defaultFilters, wait); |
264 | 0 | } |
265 | ||
266 | /** | |
267 | * Interrupts the crawler process if started with <code>wait</code> flag == <code>false</code>. | |
268 | */ | |
269 | public synchronized void stop() { | |
270 | 0 | service.shutdownNow(); |
271 | 0 | } |
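
    // A minimal non-blocking sketch (the storage folder and seed URL below are
    // illustrative assumptions, not values required by this class):
    //
    //   SiteCrawler crawler = new SiteCrawler(storageFolder);
    //   crawler.setPolitenessDelay(500);                      // wait 500 ms between requests
    //   crawler.start(new URL("http://example.org/"), false); // returns immediately
    //   // ... later, from the same or another thread ...
    //   crawler.stop();                                       // interrupts the background crawl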
272 | ||
273 | } |