You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/06/08 09:37:43 UTC

svn commit: r1347909 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Author: markus
Date: Fri Jun  8 07:37:42 2012
New Revision: 1347909

URL: http://svn.apache.org/viewvc?rev=1347909&view=rev
Log:
NUTCH-1336 Optionally not index db_notmodified pages

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1347909&r1=1347908&r2=1347909&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jun  8 07:37:42 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1336 Optionally not index db_notmodified pages (markus)
+
 * NUTCH-1346 Follow outlinks to ignore external (markus)
 
 * NUTCH-1320 IndexChecker and ParseChecker choke on IDN's (markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1347909&r1=1347908&r2=1347909&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jun  8 07:37:42 2012
@@ -875,6 +875,13 @@
   </description>
 </property>
 
+<property>
+  <name>indexer.skip.notmodified</name>
+  <value>false</value>
+  <description>If true, the indexer skips (does not index) records that have a db_notmodified status.
+  </description>
+</property>
+
 <!-- URL normalizer properties -->
 
 <property>

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1347909&r1=1347908&r2=1347909&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Jun  8 07:37:42 2012
@@ -55,7 +55,9 @@ implements Mapper<Text, Writable, Text, 
   public static final Logger LOG = LoggerFactory.getLogger(IndexerMapReduce.class);
 
   public static final String INDEXER_DELETE = "indexer.delete";
+  public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
 
+  private boolean skip = false;
   private boolean delete = false;
   private IndexingFilters filters;
   private ScoringFilters scfilters;
@@ -65,6 +67,7 @@ implements Mapper<Text, Writable, Text, 
     this.filters = new IndexingFilters(getConf());
     this.scfilters = new ScoringFilters(getConf());
     this.delete = job.getBoolean(INDEXER_DELETE, false);
+    this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
   }
 
   public void map(Text key, Writable value,
@@ -87,8 +90,15 @@ implements Mapper<Text, Writable, Text, 
         inlinks = (Inlinks)value;
       } else if (value instanceof CrawlDatum) {
         final CrawlDatum datum = (CrawlDatum)value;
-        if (CrawlDatum.hasDbStatus(datum))
+        if (CrawlDatum.hasDbStatus(datum)) {
           dbDatum = datum;
+
+          // Whether to skip DB_NOTMODIFIED pages
+          if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+            reporter.incrCounter("IndexerStatus", "Skipped", 1);
+            return;
+          }
+        }
         else if (CrawlDatum.hasFetchStatus(datum)) {
 
           // don't index unmodified (empty) pages
@@ -104,14 +114,14 @@ implements Mapper<Text, Writable, Text, 
 
                 NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
                 output.collect(key, action);
-                continue;
+                return;
               }
               if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) {
                 reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
 
                 NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
                 output.collect(key, action);
-                continue;
+                return;
               }
             }
           }