{
  "name": "WebPage",
  "type": "record",
  "namespace": "org.apache.nutch.storage",
  "doc": "WebPage is the primary data structure in Nutch representing crawl data for a given WebPage at some point in time",
  "fields": [
    {
      "name": "baseUrl",
      "type": ["null", "string"],
      "doc": "The original URL associated with this WebPage.",
      "default": null
    },
    {
      "name": "status",
      "type": "int",
      "doc": "A crawl status associated with the WebPage, can be of value STATUS_UNFETCHED - WebPage was not fetched yet, STATUS_FETCHED - WebPage was successfully fetched, STATUS_GONE - WebPage no longer exists, STATUS_REDIR_TEMP - WebPage temporarily redirects to other page, STATUS_REDIR_PERM - WebPage permanently redirects to other page, STATUS_RETRY - Fetching unsuccessful, needs to be retried e.g. transient errors and STATUS_NOTMODIFIED - fetching successful - page is not modified",
      "default": 0
    },
    {
      "name": "fetchTime",
      "type": "long",
      "doc": "The system time in milliseconds for when the page was fetched.",
      "default": 0
    },
    {
      "name": "prevFetchTime",
      "type": "long",
      "doc": "The system time in milliseconds for when the page was last fetched if it was previously fetched which can be used to calculate time delta within a fetching schedule implementation",
      "default": 0
    },
    {
      "name": "fetchInterval",
      "type": "int",
      "doc": "The default number of seconds between re-fetches of a page. The default is considered as 30 days unless a custom fetch schedule is implemented.",
      "default": 0
    },
    {
      "name": "retriesSinceFetch",
      "type": "int",
      "doc": "The number of retried attempts at fetching the WebPage since it was last successfully fetched.",
      "default": 0
    },
    {
      "name": "modifiedTime",
      "type": "long",
      "doc": "The system time in milliseconds for when this WebPage was modified by the WebPage author, if this is not available we default to the server for this information. This is important to understand the changing nature of the WebPage.",
      "default": 0
    },
    {
      "name": "prevModifiedTime",
      "type": "long",
      "doc": "The system time in milliseconds for when this WebPage was previously modified by the author, if this is not available then we default to the server for this information. This is important to understand the changing nature of a WebPage.",
      "default": 0
    },
    {
      "name": "protocolStatus",
      "type": [
        "null",
        {
          "name": "ProtocolStatus",
          "type": "record",
          "namespace": "org.apache.nutch.storage",
          "doc": "A nested container representing data captured from web server responses.",
          "fields": [
            {
              "name": "code",
              "type": "int",
              "doc": "A protocol response code which can be one of SUCCESS - content was retrieved without errors, FAILED - Content was not retrieved. Any further errors may be indicated in args, PROTO_NOT_FOUND - This protocol was not found. Application may attempt to retry later, GONE - Resource is gone, MOVED - Resource has moved permanently. New url should be found in args, TEMP_MOVED - Resource has moved temporarily. New url should be found in args., NOTFOUND - Resource was not found, RETRY - Temporary failure. Application may retry immediately., EXCEPTION - Unspecified exception occurred. Further information may be provided in args., ACCESS_DENIED - Access denied - authorization required, but missing/incorrect., ROBOTS_DENIED - Access denied by robots.txt rules., REDIR_EXCEEDED - Too many redirects., NOTFETCHING - Not fetching., NOTMODIFIED - Unchanged since the last fetch., WOULDBLOCK - Request was refused by protocol plugins, because it would block. The expected number of milliseconds to wait before retry may be provided in args., BLOCKED - Thread was blocked http.max.delays times during fetching.",
              "default": 0
            },
            {
              "name": "args",
              "type": {
                "type": "array",
                "items": "string"
              },
              "doc": "Optional arguments supplied to complement and/or justify the response code.",
              "default": []
            },
            {
              "name": "lastModified",
              "type": "long",
              "doc": "A server response indicating when this page was last modified, this can be unreliable at times hence this is used as a default fall back value for the preferred 'modifiedTime' and 'prevModifiedTime' obtained from the WebPage itself.",
              "default": 0
            }
          ]
        }
      ],
      "default": null
    },
    {
      "name": "content",
      "type": ["null", "bytes"],
      "doc": "The entire raw document content e.g. raw XHTML",
      "default": null
    },
    {
      "name": "contentType",
      "type": ["null", "string"],
      "doc": "The type of the content contained within the document itself. ContentType is an alias for MimeType. Historically, this parameter was only called MimeType, but since this is actually the value included in the HTTP Content-Type header, it can also include the character set encoding, which makes it more than just a MimeType specification. If MimeType is specified e.g. not None, that value is used. Otherwise, ContentType is used. If neither is given, the DEFAULT_CONTENT_TYPE setting is used.",
      "default": null
    },
    {
      "name": "prevSignature",
      "type": ["null", "bytes"],
      "doc": "An implementation of a WebPage's previous signature from which it can be identified and referenced at any point in time. This can be used to uniquely identify WebPage deltas based on page fingerprints.",
      "default": null
    },
    {
      "name": "signature",
      "type": ["null", "bytes"],
      "doc": "An implementation of a WebPage's signature from which it can be identified and referenced at any point in time. This is essentially the WebPage's fingerprint representing its state for any point in time.",
      "default": null
    },
    {
      "name": "title",
      "type": ["null", "string"],
      "doc": "The title of the WebPage.",
      "default": null
    },
    {
      "name": "text",
      "type": ["null", "string"],
      "doc": "The textual content of the WebPage devoid of native markup.",
      "default": null
    },
    {
      "name": "parseStatus",
      "type": [
        "null",
        {
          "name": "ParseStatus",
          "type": "record",
          "namespace": "org.apache.nutch.storage",
          "doc": "A nested container representing parse status data captured from invocation of parsers on fetch of a WebPage",
          "fields": [
            {
              "name": "majorCode",
              "type": "int",
              "doc": "Major parsing statuses including NOTPARSED (Parsing was not performed), SUCCESS (Parsing succeeded), FAILED (General failure. There may be a more specific error message in arguments.)",
              "default": 0
            },
            {
              "name": "minorCode",
              "type": "int",
              "doc": "Minor parsing statuses including SUCCESS_OK - Successful parse devoid of anomalies or issues, SUCCESS_REDIRECT - Parsed content contains a directive to redirect to another URL. The target URL can be retrieved from the arguments., FAILED_EXCEPTION - Parsing failed. An Exception occurred which may be retrieved from the arguments., FAILED_TRUNCATED - Parsing failed. Content was truncated, but the parser cannot handle incomplete content., FAILED_INVALID_FORMAT - Parsing failed. Invalid format e.g. the content may be corrupted or of wrong type., FAILED_MISSING_PARTS - Parsing failed. Other related parts of the content are needed to complete parsing. The list of URLs to missing parts may be provided in arguments. The Fetcher may decide to fetch these parts at once, then put them into Content.metadata, and supply them for re-parsing., FAILED_MISSING_CONTENT - Parsing failed. There was no content to be parsed - probably caused by errors at protocol stage.",
              "default": 0
            },
            {
              "name": "args",
              "type": {
                "type": "array",
                "items": "string"
              },
              "doc": "Optional arguments supplied to complement and/or justify the parse status code.",
              "default": []
            }
          ]
        }
      ],
      "default": null
    },
    {
      "name": "score",
      "type": "float",
      "doc": "A score used to determine a WebPage's relevance within the web graph it is part of. This score may change over time based on graph characteristics.",
      "default": 0
    },
    {
      "name": "reprUrl",
      "type": ["null", "string"],
      "doc": "In the case where we are given two urls, a source and a destination of a redirect, we should determine and persist the representative url. The logic used to determine this is based largely on Yahoo!'s Slurp Crawler",
      "default": null
    },
    {
      "name": "headers",
      "type": {
        "type": "map",
        "values": ["null", "string"]
      },
      "doc": "Header information returned from the web server used to serve the content which is subsequently fetched. This includes keys such as TRANSFER_ENCODING, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_LOCATION, CONTENT_DISPOSITION, CONTENT_MD5, CONTENT_TYPE, LAST_MODIFIED and LOCATION.",
      "default": {}
    },
    {
      "name": "outlinks",
      "type": {
        "type": "map",
        "values": ["null", "string"]
      },
      "doc": "Embedded hyperlinks which direct outside of the current domain.",
      "default": {}
    },
    {
      "name": "inlinks",
      "type": {
        "type": "map",
        "values": ["null", "string"]
      },
      "doc": "Embedded hyperlinks which link to pages within the current domain.",
      "default": {}
    },
    {
      "name": "markers",
      "type": {
        "type": "map",
        "values": ["null", "string"]
      },
      "doc": "Marker flags which represent user and machine decisions which have affected or influenced a WebPage's current state. Markers can be system specific and user machine driven in nature. They are assigned to a WebPage on a job-by-job basis and their values are indicative of what actions should be associated with a WebPage.",
      "default": {}
    },
    {
      "name": "metadata",
      "type": {
        "type": "map",
        "values": ["null", "bytes"]
      },
      "doc": "A multi-valued metadata container used for storing everything from structured WebPage characteristics, to ad-hoc extraction and metadata augmentation for any given WebPage.",
      "default": {}
    },
    {
      "name": "batchId",
      "type": ["null", "string"],
      "doc": "A batchId that this WebPage is assigned to. WebPages are fetched in batches, called fetchlists. Pages are partitioned but can always be associated and fetched alongside pages of similar value (within a crawl cycle) based on batchId.",
      "default": null
    },
    {
      "name": "sitemaps",
      "type": {
        "type": "map",
        "values": ["null", "string"]
      },
      "doc": "Sitemap urls in robots.txt",
      "default": {}
    },
    {
      "name": "stmPriority",
      "type": "float",
      "doc": "",
      "default": 0
    }
  ]
}