You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/03/07 19:15:50 UTC

svn commit: r1575351 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Author: snagel
Date: Fri Mar  7 18:15:50 2014
New Revision: 1575351

URL: http://svn.apache.org/r1575351
Log:
NUTCH-1706 IndexerMapReduce does not remove db_redir_temp

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1575351&r1=1575350&r2=1575351&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Mar  7 18:15:50 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1706 IndexerMapReduce does not remove db_redir_temp (markus, snagel)
+
 * NUTCH-1113 SegmentMerger can now be safely used to merge segments (Edward Drapkin, markus, snagel)
 
 * NUTCH-1729 Upgrade to Tika 1.5 (jnioche)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1575351&r1=1575350&r2=1575351&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Mar  7 18:15:50 2014
@@ -180,36 +180,10 @@ implements Mapper<Text, Writable, Text, 
           dbDatum = datum;
         }
         else if (CrawlDatum.hasFetchStatus(datum)) {
-
           // don't index unmodified (empty) pages
           if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
             fetchDatum = datum;
-
-            /**
-             * Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT.
-             */
-            if (delete) {
-              if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
-                reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
-
-                NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
-                output.collect(key, action);
-                return;
-              }
-              if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
-                  fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
-                  dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
-                  dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
-                reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
-                reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
-
-                NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
-                output.collect(key, action);
-                return;
-              }
-            }
           }
-
         } else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
                    CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
                    CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
@@ -239,6 +213,29 @@ implements Mapper<Text, Writable, Text, 
         LOG.warn("Unrecognized type: "+value.getClass());
       }
     }
+    
+    // Whether to delete GONE or REDIRECTS
+    if (delete && fetchDatum != null && dbDatum != null) {    
+      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
+        reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
+
+        NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
+        output.collect(key, action);
+        return;
+      }
+      
+      if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
+          fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
+          dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
+          dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
+        reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
+        reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
+
+        NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
+        output.collect(key, action);
+        return;
+      }
+    }
 
     if (fetchDatum == null || dbDatum == null
         || parseText == null || parseData == null) {