public class SiteCrawler extends Object
Modifier and Type | Field and Description |
---|---|
static int |
DEFAULT_NUM_OF_CRAWLERS
Default number of crawler instances.
|
static String |
DEFAULT_PAGE_FILTER_RE |
static Class<? extends edu.uci.ics.crawler4j.crawler.WebCrawler> |
DEFAULT_WEB_CRAWLER
Default crawler implementation.
|
Pattern |
defaultFilters
Default filter applied to skip contents.
|
Constructor and Description |
---|
SiteCrawler(File storageFolder)
Constructor.
|
Modifier and Type | Method and Description |
---|---|
void |
addListener(CrawlerListener listener)
Registers a
CrawlerListener to this crawler. |
int |
getMaxDepth() |
int |
getMaxPages() |
int |
getNumOfCrawlers() |
int |
getPolitenessDelay() |
Class<? extends edu.uci.ics.crawler4j.crawler.WebCrawler> |
getWebCrawler() |
void |
removeListener(CrawlerListener listener)
Deregisters a
CrawlerListener from this crawler. |
void |
setMaxDepth(int maxDepth)
Sets the maximum depth.
|
void |
setMaxPages(int maxPages)
Sets the maximum collected pages.
|
void |
setNumOfCrawlers(int n)
Sets the number of crawler instances.
|
void |
setPolitenessDelay(int millis)
Sets the politeness delay.
|
void |
setWebCrawler(Class<? extends edu.uci.ics.crawler4j.crawler.WebCrawler> c)
Sets the actual crawler class.
|
void |
start(URL seed,
boolean wait)
Starts the crawler process with the
defaultFilters. |
void |
start(URL seed,
Pattern filters,
boolean wait)
Starts the crawling process.
|
void |
stop()
Interrupts the crawler process if started with
wait flag == false. |
public static final String DEFAULT_PAGE_FILTER_RE
public static final int DEFAULT_NUM_OF_CRAWLERS
public static final Class<? extends edu.uci.ics.crawler4j.crawler.WebCrawler> DEFAULT_WEB_CRAWLER
public final Pattern defaultFilters
public SiteCrawler(File storageFolder)
storageFolder
- location used to store the temporary data structures used by the crawler.
public int getNumOfCrawlers()
public void setNumOfCrawlers(int n)
n
- an integer >= 0.
public Class<? extends edu.uci.ics.crawler4j.crawler.WebCrawler> getWebCrawler()
public void setWebCrawler(Class<? extends edu.uci.ics.crawler4j.crawler.WebCrawler> c)
c
- a non-null class.
public int getMaxDepth()
-1 means no limit.
public void setMaxDepth(int maxDepth)
maxDepth
- maximum allowed depth. -1 means no limit.
public int getMaxPages()
public void setMaxPages(int maxPages)
maxPages
- maximum allowed pages. -1 means no limit.
public int getPolitenessDelay()
public void setPolitenessDelay(int millis)
millis
- delay in milliseconds.
public void addListener(CrawlerListener listener)
Registers a CrawlerListener to this crawler.
listener
- the listener to register.
public void removeListener(CrawlerListener listener)
Deregisters a CrawlerListener from this crawler.
listener
- the listener to deregister.
public void start(URL seed, Pattern filters, boolean wait) throws Exception
seed
- the starting URL for the crawler process.
filters
- filters to be applied to the crawler process. Can be null.
wait
- if true the process will wait for the crawler termination.
Exception
public void start(URL seed, boolean wait) throws Exception
Starts the crawler process with the defaultFilters.
seed
- the starting URL for the crawler process.
wait
- if true the process will wait for the crawler termination.
Exception
public void stop()
Interrupts the crawler process if started with wait flag == false.
Copyright © 2010–2019 The Apache Software Foundation. All rights reserved.