public class CrawlDatum extends Object implements org.apache.hadoop.io.WritableComparable<CrawlDatum>, Cloneable
Modifier and Type | Class and Description |
---|---|
static class |
CrawlDatum.Comparator
A Comparator optimized for CrawlDatum.
|
Modifier and Type | Field and Description |
---|---|
static String |
FETCH_DIR_NAME |
static String |
GENERATE_DIR_NAME |
static String |
PARSE_DIR_NAME |
static HashMap<Byte,String> |
statNames |
static byte |
STATUS_DB_DUPLICATE |
static byte |
STATUS_DB_FETCHED
Page was successfully fetched.
|
static byte |
STATUS_DB_GONE
Page no longer exists.
|
static byte |
STATUS_DB_MAX
Maximum value of DB-related status.
|
static byte |
STATUS_DB_NOTMODIFIED
Page was successfully fetched and found not modified.
|
static byte |
STATUS_DB_REDIR_PERM
Page permanently redirects to other page.
|
static byte |
STATUS_DB_REDIR_TEMP
Page temporarily redirects to other page.
|
static byte |
STATUS_DB_UNFETCHED
Page was not fetched yet.
|
static byte |
STATUS_FETCH_GONE
Fetching unsuccessful - page is gone.
|
static byte |
STATUS_FETCH_MAX
Maximum value of fetch-related status.
|
static byte |
STATUS_FETCH_NOTMODIFIED
Fetching successful - page is not modified.
|
static byte |
STATUS_FETCH_REDIR_PERM
Fetching permanently redirected to other page.
|
static byte |
STATUS_FETCH_REDIR_TEMP
Fetching temporarily redirected to other page.
|
static byte |
STATUS_FETCH_RETRY
Fetching unsuccessful, needs to be retried (transient errors).
|
static byte |
STATUS_FETCH_SUCCESS
Fetching was successful.
|
static byte |
STATUS_INJECTED
Page was newly injected.
|
static byte |
STATUS_LINKED
Page discovered through a link.
|
static byte |
STATUS_PARSE_META
Page got metadata from a parser.
|
static byte |
STATUS_SIGNATURE
Page signature.
|
Constructor and Description |
---|
CrawlDatum() |
CrawlDatum(int status,
int fetchInterval) |
CrawlDatum(int status,
int fetchInterval,
float score) |
Modifier and Type | Method and Description |
---|---|
Object |
clone() |
int |
compareTo(CrawlDatum that)
Sort by decreasing score.
|
boolean |
equals(Object o) |
int |
getFetchInterval() |
long |
getFetchTime()
Returns either the time of the last fetch, or the next fetch time,
depending on whether Fetcher or CrawlDbReducer set the time.
|
org.apache.hadoop.io.MapWritable |
getMetaData()
Returns a MapWritable if it was set or read in readFields(DataInput);
returns an empty map if the CrawlDatum was freshly created (lazily instantiated).
|
long |
getModifiedTime() |
byte |
getRetriesSinceFetch() |
float |
getScore() |
byte[] |
getSignature() |
byte |
getStatus() |
static String |
getStatusName(byte value) |
static boolean |
hasDbStatus(CrawlDatum datum) |
static boolean |
hasFetchStatus(CrawlDatum datum) |
int |
hashCode() |
void |
putAllMetaData(CrawlDatum other)
Add all metadata from other CrawlDatum to this CrawlDatum.
|
static CrawlDatum |
read(DataInput in) |
void |
readFields(DataInput in) |
void |
set(CrawlDatum that)
Copy the contents of another instance into this instance.
|
void |
setFetchInterval(float fetchInterval) |
void |
setFetchInterval(int fetchInterval) |
void |
setFetchTime(long fetchTime)
Sets either the time of the last fetch or the next fetch time,
depending on whether Fetcher or CrawlDbReducer set the time.
|
void |
setMetaData(org.apache.hadoop.io.MapWritable mapWritable) |
void |
setModifiedTime(long modifiedTime) |
void |
setRetriesSinceFetch(int retries) |
void |
setScore(float score) |
void |
setSignature(byte[] signature) |
void |
setStatus(int status) |
String |
toString() |
void |
write(DataOutput out) |
public static final String GENERATE_DIR_NAME
public static final String FETCH_DIR_NAME
public static final String PARSE_DIR_NAME
public static final byte STATUS_DB_UNFETCHED
public static final byte STATUS_DB_FETCHED
public static final byte STATUS_DB_GONE
public static final byte STATUS_DB_REDIR_TEMP
public static final byte STATUS_DB_REDIR_PERM
public static final byte STATUS_DB_NOTMODIFIED
public static final byte STATUS_DB_DUPLICATE
public static final byte STATUS_DB_MAX
public static final byte STATUS_FETCH_SUCCESS
public static final byte STATUS_FETCH_RETRY
public static final byte STATUS_FETCH_REDIR_TEMP
public static final byte STATUS_FETCH_REDIR_PERM
public static final byte STATUS_FETCH_GONE
public static final byte STATUS_FETCH_NOTMODIFIED
public static final byte STATUS_FETCH_MAX
public static final byte STATUS_SIGNATURE
public static final byte STATUS_INJECTED
public static final byte STATUS_LINKED
public static final byte STATUS_PARSE_META
public CrawlDatum()
public CrawlDatum(int status, int fetchInterval)
public CrawlDatum(int status, int fetchInterval, float score)
public static boolean hasDbStatus(CrawlDatum datum)
public static boolean hasFetchStatus(CrawlDatum datum)
public byte getStatus()
public static String getStatusName(byte value)
public void setStatus(int status)
public long getFetchTime()
public void setFetchTime(long fetchTime)
public long getModifiedTime()
public void setModifiedTime(long modifiedTime)
public byte getRetriesSinceFetch()
public void setRetriesSinceFetch(int retries)
public int getFetchInterval()
public void setFetchInterval(int fetchInterval)
public void setFetchInterval(float fetchInterval)
public float getScore()
public void setScore(float score)
public byte[] getSignature()
public void setSignature(byte[] signature)
public void setMetaData(org.apache.hadoop.io.MapWritable mapWritable)
public void putAllMetaData(CrawlDatum other)
other - CrawlDatum
public org.apache.hadoop.io.MapWritable getMetaData()
public static CrawlDatum read(DataInput in) throws IOException
Throws: IOException
public void readFields(DataInput in) throws IOException
Specified by: readFields in interface org.apache.hadoop.io.Writable
Throws: IOException
public void write(DataOutput out) throws IOException
Specified by: write in interface org.apache.hadoop.io.Writable
Throws: IOException
public void set(CrawlDatum that)
public int compareTo(CrawlDatum that)
Specified by: compareTo in interface Comparable<CrawlDatum>
Copyright © 2014 The Apache Software Foundation