Coverage Report

Coverage Report - org.apache.any23.plugin.crawler.SiteCrawler

Classes in this File

Line Coverage

Branch Coverage

Complexity

SiteCrawler

0/59

0/24

2.294

SiteCrawler$1

0/3

N/A

2.294

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 package org.apache.any23.plugin.crawler;
 
 import edu.uci.ics.crawler4j.crawler.CrawlController;
 import edu.uci.ics.crawler4j.crawler.WebCrawler;
 
 import java.io.File;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.regex.Pattern;
 
 /**
  * A basic <em>site crawler</em> to extract semantic content
  * of small/medium size sites.
  *
  * @author Michele Mostarda (mostarda@fbk.eu)
  */
 public class SiteCrawler {
 
     public static final String DEFAULT_PAGE_FILTER_RE =
         ".*(\\.(" +
                     "css|js"                            +
                     "|bmp|gif|jpe?g|png|tiff?"          +
                     "|mid|mp2|mp3|mp4|wav|wma"          +
                     "|avi|mov|mpeg|ram|m4v|wmv|rm|smil" +
                     "|pdf"        +
                     "|swf"        +
                     "|zip|rar|gz" +
                     "|xml|txt"    +
         "))$";
 
     /**
      * Default number of crawler instances.
      */
     public static final int DEFAULT_NUM_OF_CRAWLERS = 10;
 
     /**
      * Default crawler implementation.
      */
     public static final Class<? extends WebCrawler> DEFAULT_WEB_CRAWLER = DefaultWebCrawler.class;
 
     /**
      * Default filter applied to skip contents.
      */
     public final Pattern defaultFilters = Pattern.compile(DEFAULT_PAGE_FILTER_RE);
 
     /**
      * The crawler threads controller.
      */
     private final CrawlController controller;
 
     /**
      * Crawler listeners.
      */
     private final List<CrawlerListener> listeners = new ArrayList<CrawlerListener>();
 
     /**
      * Actual number of crawler instances.
      */
     private int numOfCrawlers = DEFAULT_NUM_OF_CRAWLERS;
 
     /**
      * Actual web crawler.
      */
     private Class<? extends WebCrawler> webCrawler = DEFAULT_WEB_CRAWLER;
 
     /**
      * Max allowed depth, <code>-1</code> means no limit.
      */
     private int maxDepth = -1;
 
     /**
      *  Max allowed pages, <code>-1</code> means no limit.
      */
     private int maxPages = -1;
 
     /**
      * Subsequent call politeness delay, <code>-1</code> means no limit.
      */
     private int politenessDelay = -1;
 
     /**
      * Internal executor service.
      */
     private ExecutorService service;
 
     /**
      * Constructor.
      *
      * @param storageFolder location used to store the temporary data structures used by the crawler.
      */
     public SiteCrawler(File storageFolder) {
         try {
             controller = new CrawlController( storageFolder.getAbsolutePath() );
         } catch (Exception e) {
             throw new IllegalArgumentException("Error while initializing crawler controller.", e);
         }
     }
 
     /**
      * @return number of crawler instances.
      */
     public int getNumOfCrawlers() {
         return numOfCrawlers;
     }
 
     /**
      * Sets the number of crawler instances.
      *
      * @param n an integer &gt;= 0.
      */
     public void setNumOfCrawlers(int n) {
         if(n <=0) throw new IllegalArgumentException("Invalid number of crawlers, must be > 0 .");
         this.numOfCrawlers = n;
     }
 
     public Class<? extends WebCrawler> getWebCrawler() {
         return webCrawler;
     }
 
     /**
      * Sets the actual crawler clas.
      *
      * @param c a not <code>class</code>.
      */
     public void setWebCrawler(Class<? extends WebCrawler> c) {
         if(c == null) throw new NullPointerException("c cannot be null.");
         this.webCrawler = c;
     }
 
     /**
      * @return the max allowed crawl depth, <code>-1</code> means no limit.
      */
     public int getMaxDepth() {
         return maxDepth;
     }
 
     /**
      * Sets the maximum depth.
      *
      * @param maxDepth maximum allowed depth. <code>-1</code> means no limit.
      */
     public void setMaxDepth(int maxDepth) {
         if(maxDepth < -1 || maxDepth == 0) throw new IllegalArgumentException("Invalid maxDepth, must be -1 or > 0");
         if(maxDepth > 0) try {
             controller.setMaximumCrawlDepth(maxDepth);
         } catch (Exception e) {
             throw new IllegalArgumentException("Error while setting maxDepth.", e);
         }
         this.maxDepth = maxDepth;
     }
 
     /**
      * @return max number of allowed pages.
      */
     public int getMaxPages() {
         return maxPages;
     }
 
     /**
      * Sets the maximum collected pages.
      *
      * @param maxPages maximum allowed pages. <code>-1</code> means no limit.
      */
     public void setMaxPages(int maxPages) {
         if(maxPages < -1 || maxPages == 0) throw new IllegalArgumentException("Invalid maxPages, must be -1 or > 0");
         if(maxPages > 0) controller.setMaximumPagesToFetch(maxPages);
         this.maxPages = maxPages;
     }
 
     /**
      * @return the politeness delay in milliseconds.
      */
     public int getPolitenessDelay() {
         return politenessDelay;
     }
 
     /**
      * Sets the politeness delay. <code>-1</code> means no politeness.
      *
      * @param millis delay in milliseconds.
      */
     public void setPolitenessDelay(int millis) {
         if(millis < -1) throw new IllegalArgumentException("Invalid politenessDelay, must be >= -1");
         if(millis >= 0) controller.setPolitenessDelay(millis);
         this.politenessDelay = millis;
     }
 
     /**
      * Registers a {@link CrawlerListener} to this crawler.
      *
      * @param listener
      */
     public void addListener(CrawlerListener listener) {
         listeners.add(listener);
     }
 
     /**
      * Deregisters a {@link CrawlerListener} from this crawler.
      *
      * @param listener
      */
     public void removeListener(CrawlerListener listener) {
         listeners.remove(listener);
     }
 
     /**
      * Starts the crawling process.
      *
      * @param seed the starting URL for the crawler process.
      * @param filters filters to be applied to the crawler process. Can be <code>null</code>.
      * @param wait if <code>true</code> the process will wait for the crawler termination.
      * @throws Exception
      */
     public synchronized void start(
             final URL seed, final Pattern filters, final boolean wait
     ) throws Exception {
         SharedData.setCrawlData(seed.toExternalForm(), filters, Collections.synchronizedList(listeners) );
         controller.addSeed(seed.toExternalForm());
         final Runnable internalRunnable = new Runnable() {
             @Override
             public void run() {
                 controller.start(getWebCrawler(), getNumOfCrawlers());
             }
         };
         if(wait) {
             internalRunnable.run();
         } else {
             if(service != null) throw new IllegalStateException("Another service seems to run.");
             service = Executors.newSingleThreadExecutor();
             service.execute(internalRunnable);
         }
     }
 
     /**
      * Starts the crawler process with the {@link #defaultFilters}.
      *
      * @param seed the starting URL for the crawler process.
      * @param wait if <code>true</code> the process will wait for the crawler termination.
      * @throws Exception
      */
     public void start(final URL seed, final boolean wait) throws Exception {
         start(seed, defaultFilters, wait);
     }
 
     /**
      * Interrupts the crawler process if started with <code>wait</code> flag == <code>false</code>.
      */
     public synchronized void stop() {
         service.shutdownNow();
     }
 
 }

1		/*
2		* Licensed to the Apache Software Foundation (ASF) under one or more
3		* contributor license agreements. See the NOTICE file distributed with
4		* this work for additional information regarding copyright ownership.
5		* The ASF licenses this file to You under the Apache License, Version 2.0
6		* (the "License"); you may not use this file except in compliance with
7		* the License. You may obtain a copy of the License at
8		*
9		* http://www.apache.org/licenses/LICENSE-2.0
10		*
11		* Unless required by applicable law or agreed to in writing, software
12		* distributed under the License is distributed on an "AS IS" BASIS,
13		* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14		* See the License for the specific language governing permissions and
15		* limitations under the License.
16		*/
17
18		package org.apache.any23.plugin.crawler;
19
20		import edu.uci.ics.crawler4j.crawler.CrawlController;
21		import edu.uci.ics.crawler4j.crawler.WebCrawler;
22
23		import java.io.File;
24		import java.net.URL;
25		import java.util.ArrayList;
26		import java.util.Collections;
27		import java.util.List;
28		import java.util.concurrent.ExecutorService;
29		import java.util.concurrent.Executors;
30		import java.util.regex.Pattern;
31
32		/**
33		* A basic <em>site crawler</em> to extract semantic content
34		* of small/medium size sites.
35		*
36		* @author Michele Mostarda (mostarda@fbk.eu)
37		*/
38	0	public class SiteCrawler {
39
40		public static final String DEFAULT_PAGE_FILTER_RE =
41		".*(\\.(" +
42		"css\|js" +
43		"\|bmp\|gif\|jpe?g\|png\|tiff?" +
44		"\|mid\|mp2\|mp3\|mp4\|wav\|wma" +
45		"\|avi\|mov\|mpeg\|ram\|m4v\|wmv\|rm\|smil" +
46		"\|pdf" +
47		"\|swf" +
48		"\|zip\|rar\|gz" +
49		"\|xml\|txt" +
50		"))$";
51
52		/**
53		* Default number of crawler instances.
54		*/
55		public static final int DEFAULT_NUM_OF_CRAWLERS = 10;
56
57		/**
58		* Default crawler implementation.
59		*/
60	0	public static final Class<? extends WebCrawler> DEFAULT_WEB_CRAWLER = DefaultWebCrawler.class;
61
62		/**
63		* Default filter applied to skip contents.
64		*/
65	0	public final Pattern defaultFilters = Pattern.compile(DEFAULT_PAGE_FILTER_RE);
66
67		/**
68		* The crawler threads controller.
69		*/
70		private final CrawlController controller;
71
72		/**
73		* Crawler listeners.
74		*/
75	0	private final List<CrawlerListener> listeners = new ArrayList<CrawlerListener>();
76
77		/**
78		* Actual number of crawler instances.
79		*/
80	0	private int numOfCrawlers = DEFAULT_NUM_OF_CRAWLERS;
81
82		/**
83		* Actual web crawler.
84		*/
85	0	private Class<? extends WebCrawler> webCrawler = DEFAULT_WEB_CRAWLER;
86
87		/**
88		* Max allowed depth, <code>-1</code> means no limit.
89		*/
90	0	private int maxDepth = -1;
91
92		/**
93		* Max allowed pages, <code>-1</code> means no limit.
94		*/
95	0	private int maxPages = -1;
96
97		/**
98		* Subsequent call politeness delay, <code>-1</code> means no limit.
99		*/
100	0	private int politenessDelay = -1;
101
102		/**
103		* Internal executor service.
104		*/
105		private ExecutorService service;
106
107		/**
108		* Constructor.
109		*
110		* @param storageFolder location used to store the temporary data structures used by the crawler.
111		*/
112	0	public SiteCrawler(File storageFolder) {
113		try {
114	0	controller = new CrawlController( storageFolder.getAbsolutePath() );
115	0	} catch (Exception e) {
116	0	throw new IllegalArgumentException("Error while initializing crawler controller.", e);
117	0	}
118	0	}
119
120		/**
121		* @return number of crawler instances.
122		*/
123		public int getNumOfCrawlers() {
124	0	return numOfCrawlers;
125		}
126
127		/**
128		* Sets the number of crawler instances.
129		*
130		* @param n an integer >= 0.
131		*/
132		public void setNumOfCrawlers(int n) {
133	0	if(n <=0) throw new IllegalArgumentException("Invalid number of crawlers, must be > 0 .");
134	0	this.numOfCrawlers = n;
135	0	}
136
137		public Class<? extends WebCrawler> getWebCrawler() {
138	0	return webCrawler;
139		}
140
141		/**
142		* Sets the actual crawler clas.
143		*
144		* @param c a not <code>class</code>.
145		*/
146		public void setWebCrawler(Class<? extends WebCrawler> c) {
147	0	if(c == null) throw new NullPointerException("c cannot be null.");
148	0	this.webCrawler = c;
149	0	}
150
151		/**
152		* @return the max allowed crawl depth, <code>-1</code> means no limit.
153		*/
154		public int getMaxDepth() {
155	0	return maxDepth;
156		}
157
158		/**
159		* Sets the maximum depth.
160		*
161		* @param maxDepth maximum allowed depth. <code>-1</code> means no limit.
162		*/
163		public void setMaxDepth(int maxDepth) {
164	0	if(maxDepth < -1 \|\| maxDepth == 0) throw new IllegalArgumentException("Invalid maxDepth, must be -1 or > 0");
165	0	if(maxDepth > 0) try {
166	0	controller.setMaximumCrawlDepth(maxDepth);
167	0	} catch (Exception e) {
168	0	throw new IllegalArgumentException("Error while setting maxDepth.", e);
169	0	}
170	0	this.maxDepth = maxDepth;
171	0	}
172
173		/**
174		* @return max number of allowed pages.
175		*/
176		public int getMaxPages() {
177	0	return maxPages;
178		}
179
180		/**
181		* Sets the maximum collected pages.
182		*
183		* @param maxPages maximum allowed pages. <code>-1</code> means no limit.
184		*/
185		public void setMaxPages(int maxPages) {
186	0	if(maxPages < -1 \|\| maxPages == 0) throw new IllegalArgumentException("Invalid maxPages, must be -1 or > 0");
187	0	if(maxPages > 0) controller.setMaximumPagesToFetch(maxPages);
188	0	this.maxPages = maxPages;
189	0	}
190
191		/**
192		* @return the politeness delay in milliseconds.
193		*/
194		public int getPolitenessDelay() {
195	0	return politenessDelay;
196		}
197
198		/**
199		* Sets the politeness delay. <code>-1</code> means no politeness.
200		*
201		* @param millis delay in milliseconds.
202		*/
203		public void setPolitenessDelay(int millis) {
204	0	if(millis < -1) throw new IllegalArgumentException("Invalid politenessDelay, must be >= -1");
205	0	if(millis >= 0) controller.setPolitenessDelay(millis);
206	0	this.politenessDelay = millis;
207	0	}
208
209		/**
210		* Registers a {@link CrawlerListener} to this crawler.
211		*
212		* @param listener
213		*/
214		public void addListener(CrawlerListener listener) {
215	0	listeners.add(listener);
216	0	}
217
218		/**
219		* Deregisters a {@link CrawlerListener} from this crawler.
220		*
221		* @param listener
222		*/
223		public void removeListener(CrawlerListener listener) {
224	0	listeners.remove(listener);
225	0	}
226
227		/**
228		* Starts the crawling process.
229		*
230		* @param seed the starting URL for the crawler process.
231		* @param filters filters to be applied to the crawler process. Can be <code>null</code>.
232		* @param wait if <code>true</code> the process will wait for the crawler termination.
233		* @throws Exception
234		*/
235		public synchronized void start(
236		final URL seed, final Pattern filters, final boolean wait
237		) throws Exception {
238	0	SharedData.setCrawlData(seed.toExternalForm(), filters, Collections.synchronizedList(listeners) );
239	0	controller.addSeed(seed.toExternalForm());
240	0	final Runnable internalRunnable = new Runnable() {
241		@Override
242		public void run() {
243	0	controller.start(getWebCrawler(), getNumOfCrawlers());
244	0	}
245		};
246	0	if(wait) {
247	0	internalRunnable.run();
248		} else {
249	0	if(service != null) throw new IllegalStateException("Another service seems to run.");
250	0	service = Executors.newSingleThreadExecutor();
251	0	service.execute(internalRunnable);
252		}
253	0	}
254
255		/**
256		* Starts the crawler process with the {@link #defaultFilters}.
257		*
258		* @param seed the starting URL for the crawler process.
259		* @param wait if <code>true</code> the process will wait for the crawler termination.
260		* @throws Exception
261		*/
262		public void start(final URL seed, final boolean wait) throws Exception {
263	0	start(seed, defaultFilters, wait);
264	0	}
265
266		/**
267		* Interrupts the crawler process if started with <code>wait</code> flag == <code>false</code>.
268		*/
269		public synchronized void stop() {
270	0	service.shutdownNow();
271	0	}
272
273		}