/[Apache-SVN]/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
ViewVC logotype

Diff of /lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java

Parent Directory | Revision Log | View Patch

--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java	2005/12/29 15:25:20	359821
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java	2005/12/29 15:28:30	359822
@@ -22,6 +22,7 @@ import org.apache.nutch.fetcher.Fetcher;
 import org.apache.nutch.fs.*;
 import org.apache.nutch.mapred.*;
 import org.apache.nutch.parse.*;
+import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.net.*;
 
 import java.io.*;
@@ -63,6 +64,18 @@ public class ParseOutputFormat implement
           
           textOut.append(key, new ParseText(parse.getText()));
           dataOut.append(key, parse.getData());
+          
+          // recover the signature prepared by Fetcher or ParseSegment
+          String sig = parse.getData().getMetadata().getProperty(Fetcher.SIGNATURE_KEY);
+          if (sig != null) {
+            byte[] signature = StringUtil.fromHexString(sig);
+            if (signature != null) {
+              // append a CrawlDatum with a signature
+              CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0.0f);
+              d.setSignature(signature);
+              crawlOut.append(key, d);
+            }
+          }
 
           // collect outlinks for subsequent db update
           Outlink[] links = parse.getData().getOutlinks();

 

infrastructure at apache.org
ViewVC Help
Powered by ViewVC 1.1.26