You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2015/07/01 08:56:32 UTC

svn commit: r1688561 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Author: markus
Date: Wed Jul  1 06:56:32 2015
New Revision: 1688561

URL: http://svn.apache.org/r1688561
Log:
NUTCH-1684 ParseMeta to be added before fetch schedulers are run

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1688561&r1=1688560&r2=1688561&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Jul  1 06:56:32 2015
@@ -2,6 +2,8 @@ Nutch Change Log
   
 Nutch Current Development 1.11-SNAPSHOT
 
+* NUTCH-1684 ParseMeta to be added before fetch schedulers are run (markus)
+
 * NUTCH-2038 fix for NUTCH-2038: Naive Bayes classifier based html Parse filter (for filtering outlinks) 
   (Asitang Mishra, snagel via mattmann)
 

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=1688561&r1=1688560&r2=1688561&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Wed Jul  1 06:56:32 2015
@@ -209,6 +209,13 @@ public class CrawlDbReducer implements
     case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
     case CrawlDatum.STATUS_FETCH_REDIR_PERM:
     case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified
+      // https://issues.apache.org/jira/browse/NUTCH-1656
+      if (metaFromParse != null) {
+        for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
+          result.getMetaData().put(e.getKey(), e.getValue());
+        }
+      }
+      
       // determine the modification status
       int modified = FetchSchedule.STATUS_UNKNOWN;
       if (fetch.getStatus() == CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
@@ -260,13 +267,6 @@ public class CrawlDbReducer implements
         result.setSignature(signature);
       }
 
-      // https://issues.apache.org/jira/browse/NUTCH-1656
-      if (metaFromParse != null) {
-        for (Entry<Writable, Writable> e : metaFromParse.entrySet()) {
-          result.getMetaData().put(e.getKey(), e.getValue());
-        }
-      }
-
       // if fetchInterval is larger than the system-wide maximum, trigger
       // an unconditional recrawl. This prevents the page to be stuck at
       // NOTMODIFIED state, when the old fetched copy was already removed with