/[Apache-SVN]/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
ViewVC logotype

Diff of /lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java	2005/12/29 15:25:20	359821
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java	2005/12/29 15:28:30	359822
@@ -21,6 +21,7 @@ import java.io.File;
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.fs.*;
 import org.apache.nutch.net.*;
 import org.apache.nutch.util.*;
@@ -36,7 +37,7 @@ public class Fetcher extends NutchConfig
   public static final Logger LOG =
     LogFormatter.getLogger("org.apache.nutch.fetcher.Fetcher");
   
-  public static final String DIGEST_KEY = "nutch.content.digest";
+  public static final String SIGNATURE_KEY = "nutch.content.digest";
   public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
   public static final String SCORE_KEY = "nutch.crawl.score";
 
@@ -93,6 +94,7 @@ public class Fetcher extends NutchConfig
               break;                              // at eof, exit
             }
           } catch (IOException e) {
+            e.printStackTrace();
             LOG.severe("fetcher caught:"+e.toString());
             break;
           }
@@ -173,6 +175,7 @@ public class Fetcher extends NutchConfig
         }
 
       } catch (Throwable e) {
+        e.printStackTrace();
         LOG.severe("fetcher caught:"+e.toString());
       } finally {
         synchronized (Fetcher.this) {activeThreads--;} // count threads
@@ -193,11 +196,9 @@ public class Fetcher extends NutchConfig
 
       if (content == null) {
         String url = key.toString();
-        content = new Content(url,url,new byte[0],"",new ContentProperties());
+        content = new Content(url, url, new byte[0], "", new ContentProperties());
       }
 
-      content.getMetadata().setProperty           // add digest to metadata
-        (DIGEST_KEY, MD5Hash.digest(content.getContent()).toString());
       content.getMetadata().setProperty           // add segment to metadata
         (SEGMENT_NAME_KEY, segmentName);
       content.getMetadata().setProperty           // add score to metadata
@@ -213,9 +214,14 @@ public class Fetcher extends NutchConfig
           parseStatus = new ParseStatus(e);
         }
         if (!parseStatus.isSuccess()) {
-          LOG.warning("Error parsing: "+key+": "+parseStatus);
-          parse = null;
+          LOG.warning("Error parsing: " + key + ": " + parseStatus);
+          parse = parseStatus.getEmptyParse();
         }
+        // Calculate page signature. For non-parsing fetchers this will
+        // be done in ParseSegment
+        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
+        parse.getData().getMetadata().setProperty(SIGNATURE_KEY, StringUtil.toHexString(signature));
+        datum.setSignature(signature);
       }
 
       try {
@@ -225,6 +231,7 @@ public class Fetcher extends NutchConfig
                              storingContent ? content : null,
                              parse != null ? new ParseImpl(parse) : null));
       } catch (IOException e) {
+        e.printStackTrace();
         LOG.severe("fetcher caught:"+e.toString());
       }
     }
@@ -310,7 +317,7 @@ public class Fetcher extends NutchConfig
     
   }
 
-  public void fetch(File segment, int threads)
+  public void fetch(File segment, int threads, boolean parsing)
     throws IOException {
 
     LOG.info("Fetcher: starting");
@@ -320,6 +327,7 @@ public class Fetcher extends NutchConfig
 
     job.setInt("fetcher.threads.fetch", threads);
     job.set(SEGMENT_NAME_KEY, segment.getName());
+    job.setBoolean("fetcher.parse", parsing);
 
     job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME));
     job.setInputFormat(InputFormat.class);
@@ -341,7 +349,7 @@ public class Fetcher extends NutchConfig
   /** Run the fetcher. */
   public static void main(String[] args) throws Exception {
 
-    String usage = "Usage: Fetcher <segment> [-threads n]";
+    String usage = "Usage: Fetcher <segment> [-threads n] [-noParsing]";
 
     if (args.length < 1) {
       System.err.println(usage);
@@ -353,16 +361,21 @@ public class Fetcher extends NutchConfig
     NutchConf conf = NutchConf.get();
 
     int threads = conf.getInt("fetcher.threads.fetch", 10);
+    boolean parsing = true;
 
     for (int i = 1; i < args.length; i++) {       // parse command line
       if (args[i].equals("-threads")) {           // found -threads option
         threads =  Integer.parseInt(args[++i]);
-      }
+      } else if (args[i].equals("-noParsing")) parsing = false;
     }
 
+    conf.setInt("fetcher.threads.fetch", threads);
+    if (!parsing) {
+      conf.setBoolean("fetcher.parse", parsing);
+    }
     Fetcher fetcher = new Fetcher(conf);          // make a Fetcher
     
-    fetcher.fetch(segment, threads);              // run the Fetcher
+    fetcher.fetch(segment, threads, parsing);              // run the Fetcher
 
   }
 }

 

infrastructure at apache.org
ViewVC Help
Powered by ViewVC 1.1.26