You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/08/23 09:43:08 UTC

svn commit: r1376394 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Author: markus
Date: Thu Aug 23 07:43:08 2012
New Revision: 1376394

URL: http://svn.apache.org/viewvc?rev=1376394&view=rev
Log:
NUTCH-1434 Indexer to delete robots noindex

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1376394&r1=1376393&r2=1376394&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Aug 23 07:43:08 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1434 Indexer to delete robots noindex (markus)
+
 * NUTCH-1443 Solr schema version is invalid (markus)
 
 * NUTCH-1417 Remove o.a.n.metadata.Office (lewismc)

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1376394&r1=1376393&r2=1376394&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Thu Aug 23 07:43:08 2012
@@ -57,15 +57,17 @@ implements Mapper<Text, Writable, Text, 
   public static final Logger LOG = LoggerFactory.getLogger(IndexerMapReduce.class);
 
   public static final String INDEXER_DELETE = "indexer.delete";
+  public static final String INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex";
   public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
   public static final String URL_FILTERING = "indexer.url.filters";
   public static final String URL_NORMALIZING = "indexer.url.normalizers";
 
   private boolean skip = false;
   private boolean delete = false;
+  private boolean deleteRobotsNoIndex = false;
   private IndexingFilters filters;
   private ScoringFilters scfilters;
-  
+
   // using normalizers and/or filters
   private boolean normalize = false;
   private boolean filter = false;
@@ -79,6 +81,7 @@ implements Mapper<Text, Writable, Text, 
     this.filters = new IndexingFilters(getConf());
     this.scfilters = new ScoringFilters(getConf());
     this.delete = job.getBoolean(INDEXER_DELETE, false);
+    this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX, false);
     this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
 
     normalize = job.getBoolean(URL_NORMALIZING, false);
@@ -174,12 +177,6 @@ implements Mapper<Text, Writable, Text, 
         final CrawlDatum datum = (CrawlDatum)value;
         if (CrawlDatum.hasDbStatus(datum)) {
           dbDatum = datum;
-
-          // Whether to skip DB_NOTMODIFIED pages
-          if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
-            reporter.incrCounter("IndexerStatus", "Skipped", 1);
-            return;
-          }
         }
         else if (CrawlDatum.hasFetchStatus(datum)) {
 
@@ -217,6 +214,20 @@ implements Mapper<Text, Writable, Text, 
         }
       } else if (value instanceof ParseData) {
         parseData = (ParseData)value;
+
+        // Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434
+        if (deleteRobotsNoIndex) {
+          // Get the robots meta data
+          String robotsMeta = parseData.getMeta("robots");
+
+          // Does the robots meta value contain a noindex directive for this URL?
+          if (robotsMeta != null && robotsMeta.toLowerCase().indexOf("noindex") != -1) {
+            // Delete it!
+            NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
+            output.collect(key, action);
+            return;
+          }
+        }
       } else if (value instanceof ParseText) {
         parseText = (ParseText)value;
       } else if (LOG.isWarnEnabled()) {
@@ -229,6 +240,12 @@ implements Mapper<Text, Writable, Text, 
       return;                                     // only have inlinks
     }
 
+    // Whether to skip DB_NOTMODIFIED pages
+    if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+      reporter.incrCounter("IndexerStatus", "Skipped", 1);
+      return;
+    }
+
     if (!parseData.getStatus().isSuccess() ||
         fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
       return;