org.apache.nutch.parse
Class ParseSegment

java.lang.Object
  extended by org.apache.hadoop.conf.Configured
      extended by org.apache.nutch.parse.ParseSegment
All Implemented Interfaces:
Closeable, Configurable, JobConfigurable, Mapper<WritableComparable,Content,Text,ParseImpl>, Reducer<Text,Writable,Text,Writable>, Tool

public class ParseSegment
extends Configured
implements Tool, Mapper<WritableComparable,Content,Text,ParseImpl>, Reducer<Text,Writable,Text,Writable>


Field Summary
static org.slf4j.Logger LOG
           
static String SKIP_TRUNCATED
           
 
Constructor Summary
ParseSegment()
           
ParseSegment(Configuration conf)
           
 
Method Summary
 void close()
           
 void configure(JobConf job)
           
static boolean isTruncated(Content content)
          Checks if the page's content is truncated.
static void main(String[] args)
           
 void map(WritableComparable key, Content content, OutputCollector<Text,ParseImpl> output, Reporter reporter)
           
 void parse(Path segment)
           
 void reduce(Text key, Iterator<Writable> values, OutputCollector<Text,Writable> output, Reporter reporter)
           
 int run(String[] args)
           
 
Methods inherited from class org.apache.hadoop.conf.Configured
getConf, setConf
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 
Methods inherited from interface org.apache.hadoop.conf.Configurable
getConf, setConf
 

Field Detail

LOG

public static final org.slf4j.Logger LOG

SKIP_TRUNCATED

public static final String SKIP_TRUNCATED
See Also:
Constant Field Values
Constructor Detail

ParseSegment

public ParseSegment()

ParseSegment

public ParseSegment(Configuration conf)
Method Detail

configure

public void configure(JobConf job)
Specified by:
configure in interface JobConfigurable

close

public void close()
Specified by:
close in interface Closeable

map

public void map(WritableComparable key,
                Content content,
                OutputCollector<Text,ParseImpl> output,
                Reporter reporter)
         throws IOException
Specified by:
map in interface Mapper<WritableComparable,Content,Text,ParseImpl>
Throws:
IOException

isTruncated

public static boolean isTruncated(Content content)
Checks if the page's content is truncated.

Parameters:
content - the page content to check for truncation
Returns:
true if the page is truncated; false if it is not truncated, or if truncation could not be determined.

reduce

public void reduce(Text key,
                   Iterator<Writable> values,
                   OutputCollector<Text,Writable> output,
                   Reporter reporter)
            throws IOException
Specified by:
reduce in interface Reducer<Text,Writable,Text,Writable>
Throws:
IOException

parse

public void parse(Path segment)
           throws IOException
Throws:
IOException

main

public static void main(String[] args)
                 throws Exception
Throws:
Exception

run

public int run(String[] args)
        throws Exception
Specified by:
run in interface Tool
Throws:
Exception


Copyright © 2012 The Apache Software Foundation