Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | DEFAULT_FILE_NAME | "subcollections.xml" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | TAG_BLACKLIST | "blacklist" |
public static final String | TAG_COLLECTION | "subcollection" |
public static final String | TAG_COLLECTIONS | "subcollections" |
public static final String | TAG_ID | "id" |
public static final String | TAG_KEY | "key" |
public static final String | TAG_NAME | "name" |
public static final String | TAG_WHITELIST | "whitelist" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | FETCH_DIR_NAME | "crawl_fetch" |
public static final String | GENERATE_DIR_NAME | "crawl_generate" |
public static final String | PARSE_DIR_NAME | "crawl_parse" |
public static final byte | STATUS_DB_DUPLICATE | 7 |
public static final byte | STATUS_DB_FETCHED | 2 |
public static final byte | STATUS_DB_GONE | 3 |
public static final byte | STATUS_DB_MAX | 31 |
public static final byte | STATUS_DB_NOTMODIFIED | 6 |
public static final byte | STATUS_DB_REDIR_PERM | 5 |
public static final byte | STATUS_DB_REDIR_TEMP | 4 |
public static final byte | STATUS_DB_UNFETCHED | 1 |
public static final byte | STATUS_FETCH_GONE | 37 |
public static final byte | STATUS_FETCH_MAX | 63 |
public static final byte | STATUS_FETCH_NOTMODIFIED | 38 |
public static final byte | STATUS_FETCH_REDIR_PERM | 36 |
public static final byte | STATUS_FETCH_REDIR_TEMP | 35 |
public static final byte | STATUS_FETCH_RETRY | 34 |
public static final byte | STATUS_FETCH_SUCCESS | 33 |
public static final byte | STATUS_INJECTED | 66 |
public static final byte | STATUS_LINKED | 67 |
public static final byte | STATUS_PARSE_META | 68 |
public static final byte | STATUS_SIGNATURE | 65 |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | CRAWLDB_ADDITIONS_ALLOWED | "db.update.additions.allowed" |
public static final String | CRAWLDB_PURGE_404 | "db.update.purge.404" |
public static final String | CURRENT_NAME | "current" |
public static final String | LOCK_NAME | ".locked" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | URL_FILTERING | "crawldb.url.filters" |
public static final String | URL_NORMALIZING | "crawldb.url.normalizers" |
public static final String | URL_NORMALIZING_SCOPE | "crawldb.url.normalizers.scope" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final int | SECONDS_PER_DAY | 86400 |
public static final int | STATUS_MODIFIED | 1 |
public static final int | STATUS_NOTMODIFIED | 2 |
public static final int | STATUS_UNKNOWN | 0 |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | GENERATE_UPDATE_CRAWLDB | "generate.update.crawldb" |
public static final String | GENERATOR_COUNT_MODE | "generate.count.mode" |
public static final String | GENERATOR_COUNT_VALUE_DOMAIN | "domain" |
public static final String | GENERATOR_COUNT_VALUE_HOST | "host" |
public static final String | GENERATOR_CUR_TIME | "generate.curTime" |
public static final String | GENERATOR_DELAY | "crawl.gen.delay" |
public static final String | GENERATOR_EXPR | "generate.expr" |
public static final String | GENERATOR_FILTER | "generate.filter" |
public static final String | GENERATOR_MAX_COUNT | "generate.max.count" |
public static final String | GENERATOR_MAX_NUM_SEGMENTS | "generate.max.num.segments" |
public static final String | GENERATOR_MIN_INTERVAL | "generate.min.interval" |
public static final String | GENERATOR_MIN_SCORE | "generate.min.score" |
public static final String | GENERATOR_NORMALISE | "generate.normalise" |
public static final String | GENERATOR_RESTRICT_STATUS | "generate.restrict.status" |
public static final String | GENERATOR_TOP_N | "generate.topN" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | EQUAL_CHARACTER | "=" |
public static final String | TAB_CHARACTER | "\t" |
public static final String | URL_NORMALIZING_SCOPE | "crawldb.url.normalizers.scope" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | CURRENT_NAME | "current" |
public static final String | IGNORE_EXTERNAL_LINKS | "linkdb.ignore.external.links" |
public static final String | IGNORE_INTERNAL_LINKS | "linkdb.ignore.internal.links" |
public static final String | LOCK_NAME | ".locked" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | URL_FILTERING | "linkdb.url.filters" |
public static final String | URL_NORMALIZING | "linkdb.url.normalizer" |
public static final String | URL_NORMALIZING_SCOPE | "linkdb.url.normalizer.scope" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | SCHEDULE_DEC_RATE | "db.fetch.schedule.adaptive.dec_rate" |
public static final String | SCHEDULE_INC_RATE | "db.fetch.schedule.adaptive.inc_rate" |
public static final String | SCHEDULE_MIME_FILE | "db.fetch.schedule.mime.file" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | PARTITION_MODE_DOMAIN | "byDomain" |
public static final String | PARTITION_MODE_HOST | "byHost" |
public static final String | PARTITION_MODE_IP | "byIP" |
public static final String | PARTITION_MODE_KEY | "partition.url.mode" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | CONTENT_REDIR | "content" |
public static final int | PERM_REFRESH_TIME | 5 |
public static final String | PROTOCOL_REDIR | "protocol" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | DEFAULT_ID | "default" |
public static final String | QUEUE_MODE_DOMAIN | "byDomain" |
public static final String | QUEUE_MODE_HOST | "byHost" |
public static final String | QUEUE_MODE_IP | "byIP" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | HOSTDB_DUMP_HOMEPAGES | "hostdb.dump.homepages" |
public static final String | HOSTDB_DUMP_HOSTNAMES | "hostdb.dump.hostnames" |
public static final String | HOSTDB_FILTER_EXPRESSION | "hostdb.filter.expression" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | HOSTDB_CHECK_FAILED | "hostdb.check.failed" |
public static final String | HOSTDB_CHECK_KNOWN | "hostdb.check.known" |
public static final String | HOSTDB_CHECK_NEW | "hostdb.check.new" |
public static final String | HOSTDB_FORCE_CHECK | "hostdb.force.check" |
public static final String | HOSTDB_NUM_RESOLVER_THREADS | "hostdb.num.resolvers.threads" |
public static final String | HOSTDB_NUMERIC_FIELDS | "hostdb.numeric.fields" |
public static final String | HOSTDB_PERCENTILES | "hostdb.percentiles" |
public static final String | HOSTDB_PURGE_FAILED_HOSTS_THRESHOLD | "hostdb.purge.failed.hosts.threshold" |
public static final String | HOSTDB_RECHECK_INTERVAL | "hostdb.recheck.interval" |
public static final String | HOSTDB_STRING_FIELDS | "hostdb.string.fields" |
public static final String | HOSTDB_URL_FILTERING | "hostdb.url.filter" |
public static final String | HOSTDB_URL_NORMALIZING | "hostdb.url.normalize" |
public static final String | LOCK_NAME | ".locked" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | INDEXER_BINARY_AS_BASE64 | "indexer.binary.base64" |
public static final String | INDEXER_DELETE | "indexer.delete" |
public static final String | INDEXER_DELETE_ROBOTS_NOINDEX | "indexer.delete.robots.noindex" |
public static final String | INDEXER_DELETE_SKIPPED | "indexer.delete.skipped.by.indexingfilter" |
public static final String | INDEXER_PARAMS | "indexer.additional.params" |
public static final String | INDEXER_SKIP_NOTMODIFIED | "indexer.skip.notmodified" |
public static final String | URL_FILTERING | "indexer.url.filters" |
public static final String | URL_NORMALIZING | "indexer.url.normalizers" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | INDEXINGFILTER_ORDER | "indexingfilter.order" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final byte | VERSION | 2 |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final byte | ADD | 0 |
public static final byte | DELETE | 1 |
public static final byte | UPDATE | 2 |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | dateFormatStr | "yyyyMMddHHmm" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | MIMEFILTER_REGEX_FILE | "mimetype.filter.file" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | LINKS_INLINKS_HOST | "index.links.inlinks.host.ignore" |
public static final String | LINKS_ONLY_HOSTS | "index.links.hosts.only" |
public static final String | LINKS_OUTLINKS_HOST | "index.links.outlinks.host.ignore" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | BULK_CLOSE_TIMEOUT | "elastic.bulk.close.timeout" |
public static final String | CLUSTER | "elastic.cluster" |
public static final String | ELASTIC_PREFIX | "elastic." |
public static final String | EXPONENTIAL_BACKOFF_MILLIS | "elastic.exponential.backoff.millis" |
public static final String | EXPONENTIAL_BACKOFF_RETRIES | "elastic.exponential.backoff.retries" |
public static final String | HOSTS | "elastic.host" |
public static final String | INDEX | "elastic.index" |
public static final String | MAX_BULK_DOCS | "elastic.max.bulk.docs" |
public static final String | MAX_BULK_LENGTH | "elastic.max.bulk.size" |
public static final String | PORT | "elastic.port" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | BOOST_FIELD | "boost" |
public static final String | COLLECTION | "solr.collection" |
public static final String | COMMIT_INDEX | "solr.commit.index" |
public static final String | COMMIT_SIZE | "solr.commit.size" |
public static final String | DIGEST_FIELD | "digest" |
public static final String | ID_FIELD | "id" |
public static final String | MAPPING_FILE | "solr.mapping.file" |
public static final String | PARAMS | "solr.params" |
public static final String | PASSWORD | "solr.auth.password" |
public static final String | SERVER_URL | "solr.server.url" |
public static final String | SOLR_PREFIX | "solr." |
public static final String | TIMESTAMP_FIELD | "tstamp" |
public static final String | URL_FIELD | "url" |
public static final String | USE_AUTH | "solr.auth" |
public static final String | USERNAME | "solr.auth.username" |
public static final String | ZOOKEEPER_HOSTS | "solr.zookeeper.hosts" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | LICENSE_LOCATION | "License-Location" |
public static final String | LICENSE_URL | "License-Url" |
public static final String | WORK_TYPE | "Work-Type" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | CONTRIBUTOR | "contributor" |
public static final String | COVERAGE | "coverage" |
public static final String | CREATOR | "creator" |
public static final String | DATE | "date" |
public static final String | DESCRIPTION | "description" |
public static final String | FORMAT | "format" |
public static final String | IDENTIFIER | "identifier" |
public static final String | LANGUAGE | "language" |
public static final String | MODIFIED | "modified" |
public static final String | PUBLISHER | "publisher" |
public static final String | RELATION | "relation" |
public static final String | RIGHTS | "rights" |
public static final String | SOURCE | "source" |
public static final String | SUBJECT | "subject" |
public static final String | TITLE | "title" |
public static final String | TYPE | "type" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | FEED | "feed" |
public static final String | FEED_AUTHOR | "author" |
public static final String | FEED_PUBLISHED | "published" |
public static final String | FEED_TAGS | "tag" |
public static final String | FEED_UPDATED | "updated" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | CONTENT_DISPOSITION | "Content-Disposition" |
public static final String | CONTENT_ENCODING | "Content-Encoding" |
public static final String | CONTENT_LANGUAGE | "Content-Language" |
public static final String | CONTENT_LENGTH | "Content-Length" |
public static final String | CONTENT_LOCATION | "Content-Location" |
public static final String | CONTENT_MD5 | "Content-MD5" |
public static final String | CONTENT_TYPE | "Content-Type" |
public static final String | LAST_MODIFIED | "Last-Modified" |
public static final String | LOCATION | "Location" |
public static final String | TRANSFER_ENCODING | "Transfer-Encoding" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | ARG_CRAWLDB | "crawldb" |
public static final String | ARG_LINKDB | "linkdb" |
public static final String | ARG_SEEDDIR | "url_dir" |
public static final String | ARG_SEEDNAME | "seedName" |
public static final String | ARG_SEGMENT | "segment" |
public static final String | ARG_SEGMENTDIR | "segment_dir" |
public static final String | CACHING_FORBIDDEN_ALL | "all" |
public static final String | CACHING_FORBIDDEN_CONTENT | "content" |
public static final String | CACHING_FORBIDDEN_KEY | "caching.forbidden" |
public static final String | CACHING_FORBIDDEN_NONE | "none" |
public static final String | CHAR_ENCODING_FOR_CONVERSION | "CharEncodingForConversion" |
public static final String | CRAWL_ID_KEY | "storage.crawl.id" |
public static final String | FETCH_EVENT_CONTENTLANG | "content-language" |
public static final String | FETCH_EVENT_CONTENTTYPE | "content-type" |
public static final String | FETCH_EVENT_FETCHTIME | "fetchTime" |
public static final String | FETCH_EVENT_SCORE | "score" |
public static final String | FETCH_EVENT_TITLE | "title" |
public static final String | FETCH_STATUS_KEY | "_fst_" |
public static final String | FETCH_TIME_KEY | "_ftk_" |
public static final String | FIXED_INTERVAL_KEY | "fixedInterval" |
public static final String | GENERATE_TIME_KEY | "_ngt_" |
public static final String | ORIGINAL_CHAR_ENCODING | "OriginalCharEncoding" |
public static final String | PROTO_STATUS_KEY | "_pst_" |
public static final String | REPR_URL_KEY | "_repr_" |
public static final String | SCORE_KEY | "nutch.crawl.score" |
public static final String | SEGMENT_NAME_KEY | "nutch.segment.name" |
public static final String | SIGNATURE_KEY | "nutch.content.digest" |
public static final String | STAT_PROGRESS | "progress" |
public static final String | VAL_RESULT | "result" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | REL_TAG | "Rel-Tag" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | URLFILTER_ORDER | "urlfilter.order" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | SCOPE_CRAWLDB | "crawldb" |
public static final String | SCOPE_DEFAULT | "default" |
public static final String | SCOPE_FETCHER | "fetcher" |
public static final String | SCOPE_GENERATE_HOST_COUNT | "generate_host_count" |
public static final String | SCOPE_INDEXER | "indexer" |
public static final String | SCOPE_INJECT | "inject" |
public static final String | SCOPE_LINKDB | "linkdb" |
public static final String | SCOPE_OUTLINK | "outlink" |
public static final String | SCOPE_PARTITION | "partition" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | HTMLPARSEFILTER_ORDER | "htmlparsefilter.order" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | DIR_NAME | "parse_data" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | DEFAULT_PLUGIN | "*" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | SKIP_TRUNCATED | "parser.skip.truncated" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final byte | FAILED | 2 |
public static final short | FAILED_EXCEPTION | 200 |
public static final short | FAILED_INVALID_FORMAT | 203 |
public static final short | FAILED_MISSING_CONTENT | 205 |
public static final short | FAILED_MISSING_PARTS | 204 |
public static final short | FAILED_TRUNCATED | 202 |
public static final byte | NOTPARSED | 0 |
public static final byte | SUCCESS | 1 |
public static final short | SUCCESS_REDIRECT | 100 |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | DIR_NAME | "parse_text" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | CHARSET_UTF8 | "charset=UTF-8" |
public static final String | TEXT_PLAIN_CONTENT_TYPE | "text/plain; charset=UTF-8" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | DICTFILE_MODELFILTER | "parsefilter.naivebayes.wordlist" |
public static final String | TRAINFILE_MODELFILTER | "parsefilter.naivebayes.trainfile" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | DIR_NAME | "content" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final int | ACCESS_DENIED | 17 |
public static final int | BLOCKED | 23 |
public static final int | EXCEPTION | 16 |
public static final int | FAILED | 2 |
public static final int | GONE | 11 |
public static final int | MOVED | 12 |
public static final int | NOTFETCHING | 20 |
public static final int | NOTFOUND | 14 |
public static final int | NOTMODIFIED | 21 |
public static final int | PROTO_NOT_FOUND | 10 |
public static final int | REDIR_EXCEEDED | 19 |
public static final int | RETRY | 15 |
public static final int | ROBOTS_DENIED | 18 |
public static final int | SUCCESS | 1 |
public static final int | TEMP_MOVED | 13 |
public static final int | WOULDBLOCK | 22 |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final int | BUFFER_SIZE | 8192 |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | WWW_AUTHENTICATE | "WWW-Authenticate" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final int | DEFAULT_MAX_DEPTH | 1000 |
public static final String | DEPTH_KEY | "_depth_" |
public static final String | MAX_DEPTH_KEY | "_maxdepth_" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final byte | INLINK | 1 |
public static final byte | OUTLINK | 2 |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | DUMP_DIR | "linkdump" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | INLINK_DIR | "inlinks" |
public static final String | LOCK_NAME | ".locked" |
public static final String | NODE_DIR | "nodes" |
public static final String | OLD_OUTLINK_DIR | "outlinks/old" |
public static final String | OUTLINK_DIR | "outlinks/current" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | URL_FILTERING | "webgraph.url.filters" |
public static final String | URL_NORMALIZING | "webgraph.url.normalizers" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | DEFAULT | "default" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | MAX_WARC_FILE_SIZE | "warc.file.size.max" |
public static final String | TEMPLATE | "${prefix}-${timestamp17}-${serialno}" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | CONFORMS_TO | "conformsTo" |
public static final String | FORMAT | "format" |
public static final String | HOSTNAME | "hostname" |
public static final String | HTTP_HEADER_FROM | "http-header-from" |
public static final String | HTTP_HEADER_USER_AGENT | "http-header-user-agent" |
public static final String | IP | "ip" |
public static final String | OPERATOR | "operator" |
public static final String | ROBOTS | "robots" |
public static final String | SOFTWARE | "software" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | URL_VERSION | "arc.url.version" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | URLFILTER_AUTOMATON_FILE | "urlfilter.automaton.file" |
public static final String | URLFILTER_AUTOMATON_RULES | "urlfilter.automaton.rules" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE | "db.ignore.external.exemptions.file" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | URLFILTER_REGEX_FILE | "urlfilter.regex.file" |
public static final String | URLFILTER_REGEX_RULES | "urlfilter.regex.rules" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | MIN_CONFIDENCE_KEY | "encodingdetector.charset.min.confidence" |
public static final int | NO_THRESHOLD | -1 |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final String | UUID_KEY | "nutch.conf.uuid" |
Modifier and Type | Constant Field | Value |
---|---|---|
public static final float | DEFAULT_BOOST | 1.0f |
Copyright © 2017 The Apache Software Foundation