You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/03/17 22:50:31 UTC

svn commit: r1578616 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Author: snagel
Date: Mon Mar 17 21:50:30 2014
New Revision: 1578616

URL: http://svn.apache.org/r1578616
Log:
NUTCH-1671 indexchecker to add digest field

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1578616&r1=1578615&r2=1578616&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Mon Mar 17 21:50:30 2014
@@ -2,7 +2,7 @@ Nutch Change Log
 
 Nutch Current Development
 
-*
+* NUTCH-1671 indexchecker to add digest field (snagel, lufeng)
 
 Nutch 1.8      - 11/03/2014 (dd/mm/yyyy)
 Release Report - http://s.apache.org/oHY

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1578616&r1=1578615&r2=1578616&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Mon Mar 17 21:50:30 2014
@@ -17,11 +17,8 @@
  
 package org.apache.nutch.indexer;
 
-import java.util.Arrays;
 import java.util.List;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.io.Text;
@@ -29,10 +26,9 @@ import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
-import org.apache.nutch.indexer.IndexingException;
-import org.apache.nutch.indexer.IndexingFilters;
-import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.Nutch;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.parse.ParseSegment;
@@ -42,7 +38,10 @@ import org.apache.nutch.protocol.Protoco
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Reads and parses a URL and runs the indexers on it. Displays the fields obtained and the first
@@ -122,6 +121,14 @@ public class IndexingFiltersChecker exte
 
     Inlinks inlinks = null;
     Parse parse = parseResult.get(urlText);
+
+    byte[] signature = SignatureFactory.getSignature(conf).calculate(content,
+        parse);
+    parse.getData().getContentMeta()
+        .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
+    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
+    doc.add("digest", digest);
+
     try {
       doc = indexers.filter(doc, parse, urlText, datum, inlinks);
     } catch (IndexingException e) {