You are viewing a plain-text version of this content. (The link to the canonical version was not preserved in this plain-text copy.)
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/07/15 11:34:38 UTC

svn commit: r1610631 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReducer.java src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java src/test/org/apache/nutch/crawl/TestCrawlDbStates.java

Author: jnioche
Date: Tue Jul 15 09:34:38 2014
New Revision: 1610631

URL: http://svn.apache.org/r1610631
Log:
NUTCH-1422 Bypass signature comparison when a document is redirected (snagel)

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
    nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
    nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1610631&r1=1610630&r2=1610631&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 15 09:34:38 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1422 Bypass signature comparison when a document is redirected (snagel)
+
 * NUTCH-1502 Test for CrawlDatum state transitions (snagel)
 
 * NUTCH-1804 Move JUnit dependency to test scope (jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=1610631&r1=1610630&r2=1610631&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Tue Jul 15 09:34:38 2014
@@ -206,7 +206,9 @@ public class CrawlDbReducer implements R
       int modified = FetchSchedule.STATUS_UNKNOWN;
       if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
         modified = FetchSchedule.STATUS_NOTMODIFIED;
-      } else {
+      } else if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
+        // only successful fetches (but not redirects, NUTCH-1422)
+        // are detected as "not modified" by signature comparison
         if (oldSet && old.getSignature() != null && signature != null) {
           if (SignatureComparator._compare(old.getSignature(), signature) != 0) {
             modified = FetchSchedule.STATUS_MODIFIED;

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java?rev=1610631&r1=1610630&r2=1610631&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java Tue Jul 15 09:34:38 2014
@@ -165,78 +165,4 @@ public class TODOTestCrawlDbStates exten
 
   }
 
-  /**
-   * Test whether signatures are reset for "content-less" states
-   * (gone, redirect, etc.): otherwise, if this state is temporary
-   * and the document appears again with the old content, it may
-   * get marked as not_modified in CrawlDb just after the redirect
-   * state. In this case we cannot expect content in segments.
-   * Cf. NUTCH-1422: reset signature for redirects.
-   */
-  // TODO: can only test if solution is done in CrawlDbReducer
-  @Test
-  public void testSignatureReset() {
-    LOG.info("NUTCH-1422 must reset signature for redirects and similar states");
-    Configuration conf = CrawlDBTestUtil.createConfiguration();
-    for (String sched : schedules) {
-      LOG.info("Testing reset signature with " + sched);
-      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl."+sched);
-      ContinuousCrawlTestUtil crawlUtil = new CrawlTestSignatureReset(conf);
-      if (!crawlUtil.run(20)) {
-        fail("failed: signature not reset");
-      }
-    }
-  }
-
-  private class CrawlTestSignatureReset extends ContinuousCrawlTestUtil {
-
-    byte[][] noContentStates = {
-        { STATUS_FETCH_GONE,       STATUS_DB_GONE },
-        { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
-        { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM } };
-
-    int counter = 0;
-    byte fetchState;
-
-    public CrawlTestSignatureReset(Configuration conf) {
-      super(conf);
-    }
-
-    @Override
-    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
-      datum = super.fetch(datum, currentTime);
-      counter++;
-      // flip-flopping between successful fetch and one of content-less states
-      if (counter%2 == 1) {
-        fetchState = STATUS_FETCH_SUCCESS;
-      } else {
-        fetchState = noContentStates[(counter%6)/2][0];
-      }
-      LOG.info("Step " + counter + ": fetched with "
-          + getStatusName(fetchState));
-      datum.setStatus(fetchState);
-     return datum;
-    }
-
-    @Override
-    protected boolean check(CrawlDatum result) {
-      if (result.getStatus() == STATUS_DB_NOTMODIFIED
-          && !(fetchState == STATUS_FETCH_SUCCESS || fetchState == STATUS_FETCH_NOTMODIFIED)) {
-        LOG.error("Should never get into state "
-            + getStatusName(STATUS_DB_NOTMODIFIED) + " from "
-            + getStatusName(fetchState));
-        return false;
-      }
-      if (result.getSignature() != null
-          && !(result.getStatus() == STATUS_DB_FETCHED || result.getStatus() == STATUS_DB_NOTMODIFIED)) {
-        LOG.error("Signature not reset in state "
-            + getStatusName(result.getStatus()));
-        // ok here: since it's not the problem itself (the db_notmodified), but
-        // the reason for it
-      }
-      return true;
-    }
-
-  }
-
 }

Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java?rev=1610631&r1=1610630&r2=1610631&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java Tue Jul 15 09:34:38 2014
@@ -27,11 +27,14 @@ import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.util.StringUtils;
 
 import org.apache.nutch.crawl.CrawlDatum;
+
 import static org.apache.nutch.crawl.CrawlDatum.*;
+
 import org.apache.nutch.scoring.ScoringFilterException;
 import org.apache.nutch.scoring.ScoringFilters;
 
 import static org.junit.Assert.*;
+
 import org.junit.Test;
 
 import org.slf4j.Logger;
@@ -482,5 +485,81 @@ public class TestCrawlDbStates {
     }
   }
 
+
+  /**
+   * Test whether signatures are reset for "content-less" states
+   * (gone, redirect, etc.): otherwise, if this state is temporary
+   * and the document appears again with the old content, it may
+   * get marked as not_modified in CrawlDb just after the redirect
+   * state. In this case we cannot expect content in segments.
+   * Cf. NUTCH-1422: reset signature for redirects.
+   */
+  // TODO: can only test if solution is done in CrawlDbReducer
+  @Test
+  public void testSignatureReset() {
+    LOG.info("NUTCH-1422 must reset signature for redirects and similar states");
+    Configuration conf = CrawlDBTestUtil.createConfiguration();
+    for (String sched : schedules) {
+      LOG.info("Testing reset signature with " + sched);
+      conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl."+sched);
+      ContinuousCrawlTestUtil crawlUtil = new CrawlTestSignatureReset(conf);
+      if (!crawlUtil.run(20)) {
+        fail("failed: signature not reset");
+      }
+    }
+  }
+
+  private class CrawlTestSignatureReset extends ContinuousCrawlTestUtil {
+
+    byte[][] noContentStates = {
+        { STATUS_FETCH_GONE,       STATUS_DB_GONE },
+        { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
+        { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM } };
+
+    int counter = 0;
+    byte fetchState;
+
+    public CrawlTestSignatureReset(Configuration conf) {
+      super(conf);
+    }
+
+    @Override
+    protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+      datum = super.fetch(datum, currentTime);
+      counter++;
+      // flip-flopping between successful fetch and one of content-less states
+      if (counter%2 == 1) {
+        fetchState = STATUS_FETCH_SUCCESS;
+      } else {
+        fetchState = noContentStates[(counter%6)/2][0];
+      }
+      LOG.info("Step " + counter + ": fetched with "
+          + getStatusName(fetchState));
+      datum.setStatus(fetchState);
+     return datum;
+    }
+
+    @Override
+    protected boolean check(CrawlDatum result) {
+      if (result.getStatus() == STATUS_DB_NOTMODIFIED
+          && !(fetchState == STATUS_FETCH_SUCCESS || fetchState == STATUS_FETCH_NOTMODIFIED)) {
+        LOG.error("Should never get into state "
+            + getStatusName(STATUS_DB_NOTMODIFIED) + " from "
+            + getStatusName(fetchState));
+        return false;
+      }
+      if (result.getSignature() != null
+          && !(result.getStatus() == STATUS_DB_FETCHED || result.getStatus() == STATUS_DB_NOTMODIFIED)) {
+        LOG.error("Signature not reset in state "
+            + getStatusName(result.getStatus()));
+        // ok here: since it's not the problem itself (the db_notmodified), but
+        // the reason for it
+      }
+      return true;
+    }
+
+  }
+
+  
 }