org.apache.nutch.crawl
Class CrawlDbReader

java.lang.Object
  extended by org.apache.nutch.crawl.CrawlDbReader
All Implemented Interfaces:
Closeable

public class CrawlDbReader
extends Object
implements Closeable

Utility for reading the CrawlDb: computes statistics, dumps its contents, lists the top-N entries, and looks up individual URLs.

Author:
Andrzej Bialecki

Nested Class Summary
static class CrawlDbReader.CrawlDatumCsvOutputFormat
           
static class CrawlDbReader.CrawlDbStatCombiner
           
static class CrawlDbReader.CrawlDbStatMapper
           
static class CrawlDbReader.CrawlDbStatReducer
           
static class CrawlDbReader.CrawlDbTopNMapper
           
static class CrawlDbReader.CrawlDbTopNReducer
           
 
Field Summary
static int CSV_FORMAT
           
static org.apache.commons.logging.Log LOG
           
static int STD_FORMAT
           
 
Constructor Summary
CrawlDbReader()
           
 
Method Summary
 void close()
           
 CrawlDatum get(String crawlDb, String url, Configuration config)
           
static void main(String[] args)
           
 void processDumpJob(String crawlDb, String output, Configuration config, int format)
           
 void processStatJob(String crawlDb, Configuration config, boolean sort)
           
 void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config)
           
 void readUrl(String crawlDb, String url, Configuration config)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

LOG

public static final org.apache.commons.logging.Log LOG

STD_FORMAT

public static final int STD_FORMAT
See Also:
Constant Field Values

CSV_FORMAT

public static final int CSV_FORMAT
See Also:
Constant Field Values
Constructor Detail

CrawlDbReader

public CrawlDbReader()
Method Detail

close

public void close()
Specified by:
close in interface Closeable

processStatJob

public void processStatJob(String crawlDb,
                           Configuration config,
                           boolean sort)
                    throws IOException
Throws:
IOException

get

public CrawlDatum get(String crawlDb,
                      String url,
                      Configuration config)
               throws IOException
Throws:
IOException

readUrl

public void readUrl(String crawlDb,
                    String url,
                    Configuration config)
             throws IOException
Throws:
IOException

processDumpJob

public void processDumpJob(String crawlDb,
                           String output,
                           Configuration config,
                           int format)
                    throws IOException
Throws:
IOException

processTopNJob

public void processTopNJob(String crawlDb,
                           long topN,
                           float min,
                           String output,
                           Configuration config)
                    throws IOException
Throws:
IOException

main

public static void main(String[] args)
                 throws IOException
Throws:
IOException


Copyright © 2011 The Apache Software Foundation