You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/07/15 11:34:38 UTC
svn commit: r1610631 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/crawl/CrawlDbReducer.java
src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
Author: jnioche
Date: Tue Jul 15 09:34:38 2014
New Revision: 1610631
URL: http://svn.apache.org/r1610631
Log:
NUTCH-1422 Bypass signature comparison when a document is redirected (snagel)
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1610631&r1=1610630&r2=1610631&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue Jul 15 09:34:38 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1422 Bypass signature comparison when a document is redirected (snagel)
+
* NUTCH-1502 Test for CrawlDatum state transitions (snagel)
* NUTCH-1804 Move JUnit dependency to test scope (jnioche)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=1610631&r1=1610630&r2=1610631&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Tue Jul 15 09:34:38 2014
@@ -206,7 +206,9 @@ public class CrawlDbReducer implements R
int modified = FetchSchedule.STATUS_UNKNOWN;
if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
modified = FetchSchedule.STATUS_NOTMODIFIED;
- } else {
+ } else if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_SUCCESS) {
+ // only successful fetches (but not redirects, NUTCH-1422)
+ // are detected as "not modified" by signature comparison
if (oldSet && old.getSignature() != null && signature != null) {
if (SignatureComparator._compare(old.getSignature(), signature) != 0) {
modified = FetchSchedule.STATUS_MODIFIED;
Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java?rev=1610631&r1=1610630&r2=1610631&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TODOTestCrawlDbStates.java Tue Jul 15 09:34:38 2014
@@ -165,78 +165,4 @@ public class TODOTestCrawlDbStates exten
}
- /**
- * Test whether signatures are reset for "content-less" states
- * (gone, redirect, etc.): otherwise, if this state is temporary
- * and the document appears again with the old content, it may
- * get marked as not_modified in CrawlDb just after the redirect
- * state. In this case we cannot expect content in segments.
- * Cf. NUTCH-1422: reset signature for redirects.
- */
- // TODO: can only test if solution is done in CrawlDbReducer
- @Test
- public void testSignatureReset() {
- LOG.info("NUTCH-1422 must reset signature for redirects and similar states");
- Configuration conf = CrawlDBTestUtil.createConfiguration();
- for (String sched : schedules) {
- LOG.info("Testing reset signature with " + sched);
- conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl."+sched);
- ContinuousCrawlTestUtil crawlUtil = new CrawlTestSignatureReset(conf);
- if (!crawlUtil.run(20)) {
- fail("failed: signature not reset");
- }
- }
- }
-
- private class CrawlTestSignatureReset extends ContinuousCrawlTestUtil {
-
- byte[][] noContentStates = {
- { STATUS_FETCH_GONE, STATUS_DB_GONE },
- { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
- { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM } };
-
- int counter = 0;
- byte fetchState;
-
- public CrawlTestSignatureReset(Configuration conf) {
- super(conf);
- }
-
- @Override
- protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
- datum = super.fetch(datum, currentTime);
- counter++;
- // flip-flopping between successful fetch and one of content-less states
- if (counter%2 == 1) {
- fetchState = STATUS_FETCH_SUCCESS;
- } else {
- fetchState = noContentStates[(counter%6)/2][0];
- }
- LOG.info("Step " + counter + ": fetched with "
- + getStatusName(fetchState));
- datum.setStatus(fetchState);
- return datum;
- }
-
- @Override
- protected boolean check(CrawlDatum result) {
- if (result.getStatus() == STATUS_DB_NOTMODIFIED
- && !(fetchState == STATUS_FETCH_SUCCESS || fetchState == STATUS_FETCH_NOTMODIFIED)) {
- LOG.error("Should never get into state "
- + getStatusName(STATUS_DB_NOTMODIFIED) + " from "
- + getStatusName(fetchState));
- return false;
- }
- if (result.getSignature() != null
- && !(result.getStatus() == STATUS_DB_FETCHED || result.getStatus() == STATUS_DB_NOTMODIFIED)) {
- LOG.error("Signature not reset in state "
- + getStatusName(result.getStatus()));
- // ok here: since it's not the problem itself (the db_notmodified), but
- // the reason for it
- }
- return true;
- }
-
- }
-
}
Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java?rev=1610631&r1=1610630&r2=1610631&view=diff
==============================================================================
--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java (original)
+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbStates.java Tue Jul 15 09:34:38 2014
@@ -27,11 +27,14 @@ import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
+
import static org.apache.nutch.crawl.CrawlDatum.*;
+
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import static org.junit.Assert.*;
+
import org.junit.Test;
import org.slf4j.Logger;
@@ -482,5 +485,81 @@ public class TestCrawlDbStates {
}
}
+
+ /**
+ * Test whether signatures are reset for "content-less" states
+ * (gone, redirect, etc.): otherwise, if this state is temporary
+ * and the document appears again with the old content, it may
+ * get marked as not_modified in CrawlDb just after the redirect
+ * state. In this case we cannot expect content in segments.
+ * Cf. NUTCH-1422: reset signature for redirects.
+ */
+ // TODO: can only test if solution is done in CrawlDbReducer
+ @Test
+ public void testSignatureReset() {
+ LOG.info("NUTCH-1422 must reset signature for redirects and similar states");
+ Configuration conf = CrawlDBTestUtil.createConfiguration();
+ for (String sched : schedules) {
+ LOG.info("Testing reset signature with " + sched);
+ conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl."+sched);
+ ContinuousCrawlTestUtil crawlUtil = new CrawlTestSignatureReset(conf);
+ if (!crawlUtil.run(20)) {
+ fail("failed: signature not reset");
+ }
+ }
+ }
+
+ private class CrawlTestSignatureReset extends ContinuousCrawlTestUtil {
+
+ byte[][] noContentStates = {
+ { STATUS_FETCH_GONE, STATUS_DB_GONE },
+ { STATUS_FETCH_REDIR_TEMP, STATUS_DB_REDIR_TEMP },
+ { STATUS_FETCH_REDIR_PERM, STATUS_DB_REDIR_PERM } };
+
+ int counter = 0;
+ byte fetchState;
+
+ public CrawlTestSignatureReset(Configuration conf) {
+ super(conf);
+ }
+
+ @Override
+ protected CrawlDatum fetch(CrawlDatum datum, long currentTime) {
+ datum = super.fetch(datum, currentTime);
+ counter++;
+ // flip-flopping between successful fetch and one of content-less states
+ if (counter%2 == 1) {
+ fetchState = STATUS_FETCH_SUCCESS;
+ } else {
+ fetchState = noContentStates[(counter%6)/2][0];
+ }
+ LOG.info("Step " + counter + ": fetched with "
+ + getStatusName(fetchState));
+ datum.setStatus(fetchState);
+ return datum;
+ }
+
+ @Override
+ protected boolean check(CrawlDatum result) {
+ if (result.getStatus() == STATUS_DB_NOTMODIFIED
+ && !(fetchState == STATUS_FETCH_SUCCESS || fetchState == STATUS_FETCH_NOTMODIFIED)) {
+ LOG.error("Should never get into state "
+ + getStatusName(STATUS_DB_NOTMODIFIED) + " from "
+ + getStatusName(fetchState));
+ return false;
+ }
+ if (result.getSignature() != null
+ && !(result.getStatus() == STATUS_DB_FETCHED || result.getStatus() == STATUS_DB_NOTMODIFIED)) {
+ LOG.error("Signature not reset in state "
+ + getStatusName(result.getStatus()));
+ // ok here: since it's not the problem itself (the db_notmodified), but
+ // the reason for it
+ }
+ return true;
+ }
+
+ }
+
+
}