public class Generator
extends org.apache.hadoop.conf.Configured
implements org.apache.hadoop.util.Tool
Modifier and Type | Class and Description |
---|---|
static class |
Generator.CrawlDbUpdater
Update the CrawlDB so that the next generate won't include the same URLs.
|
static class |
Generator.DecreasingFloatComparator |
static class |
Generator.GeneratorOutputFormat |
static class |
Generator.HashComparator
Sort fetch lists by hash of URL.
|
static class |
Generator.PartitionReducer |
static class |
Generator.Selector
Selects entries due for fetch.
|
static class |
Generator.SelectorEntry |
static class |
Generator.SelectorInverseMapper |
Modifier and Type | Field and Description |
---|---|
static String |
GENERATE_MAX_PER_HOST_BY_IP |
static String |
GENERATE_UPDATE_CRAWLDB |
static String |
GENERATOR_COUNT_MODE |
static String |
GENERATOR_COUNT_VALUE_DOMAIN |
static String |
GENERATOR_COUNT_VALUE_HOST |
static String |
GENERATOR_CUR_TIME |
static String |
GENERATOR_DELAY |
static String |
GENERATOR_FILTER |
static String |
GENERATOR_MAX_COUNT |
static String |
GENERATOR_MAX_NUM_SEGMENTS |
static String |
GENERATOR_MIN_INTERVAL |
static String |
GENERATOR_MIN_SCORE |
static String |
GENERATOR_NORMALISE |
static String |
GENERATOR_RESTRICT_STATUS |
static String |
GENERATOR_TOP_N |
static org.slf4j.Logger |
LOG |
Constructor and Description |
---|
Generator() |
Generator(org.apache.hadoop.conf.Configuration conf) |
Modifier and Type | Method and Description |
---|---|
org.apache.hadoop.fs.Path[] |
generate(org.apache.hadoop.fs.Path dbDir,
org.apache.hadoop.fs.Path segments,
int numLists,
long topN,
long curTime) |
org.apache.hadoop.fs.Path[] |
generate(org.apache.hadoop.fs.Path dbDir,
org.apache.hadoop.fs.Path segments,
int numLists,
long topN,
long curTime,
boolean filter,
boolean force)
Old signature kept for compatibility: it does not specify whether or not to
normalise, and it sets the number of segments to 1.
|
org.apache.hadoop.fs.Path[] |
generate(org.apache.hadoop.fs.Path dbDir,
org.apache.hadoop.fs.Path segments,
int numLists,
long topN,
long curTime,
boolean filter,
boolean norm,
boolean force,
int maxNumSegments)
Generate fetchlists in one or more segments.
|
static String |
generateSegmentName() |
static void |
main(String[] args)
Generate a fetchlist from the crawldb.
|
int |
run(String[] args) |
public static final org.slf4j.Logger LOG
public static final String GENERATE_UPDATE_CRAWLDB
public static final String GENERATOR_MIN_SCORE
public static final String GENERATOR_MIN_INTERVAL
public static final String GENERATOR_RESTRICT_STATUS
public static final String GENERATOR_FILTER
public static final String GENERATOR_NORMALISE
public static final String GENERATOR_MAX_COUNT
public static final String GENERATOR_COUNT_MODE
public static final String GENERATOR_COUNT_VALUE_DOMAIN
public static final String GENERATOR_COUNT_VALUE_HOST
public static final String GENERATOR_TOP_N
public static final String GENERATOR_CUR_TIME
public static final String GENERATOR_DELAY
public static final String GENERATOR_MAX_NUM_SEGMENTS
public static final String GENERATE_MAX_PER_HOST_BY_IP
public Generator()
public Generator(org.apache.hadoop.conf.Configuration conf)
public org.apache.hadoop.fs.Path[] generate(org.apache.hadoop.fs.Path dbDir, org.apache.hadoop.fs.Path segments, int numLists, long topN, long curTime) throws IOException
Throws: IOException
public org.apache.hadoop.fs.Path[] generate(org.apache.hadoop.fs.Path dbDir, org.apache.hadoop.fs.Path segments, int numLists, long topN, long curTime, boolean filter, boolean force) throws IOException
Throws: IOException
public org.apache.hadoop.fs.Path[] generate(org.apache.hadoop.fs.Path dbDir, org.apache.hadoop.fs.Path segments, int numLists, long topN, long curTime, boolean filter, boolean norm, boolean force, int maxNumSegments) throws IOException
Parameters:
dbDir - Crawl database directory
segments - Segments directory
numLists - Number of reduce tasks
topN - Number of top URLs to be selected
curTime - Current time in milliseconds
Throws:
IOException - When an I/O error occurs
public static String generateSegmentName()
public static void main(String[] args) throws Exception
Throws: Exception
Copyright © 2014 The Apache Software Foundation