You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/01/08 12:10:39 UTC

svn commit: r1723688 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Author: markus
Date: Fri Jan  8 11:10:38 2016
New Revision: 1723688

URL: http://svn.apache.org/viewvc?rev=1723688&view=rev
Log:
NUTCH-1449 Optionally delete documents skipped by IndexingFilters


Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723688&r1=1723687&r2=1723688&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan  8 11:10:38 2016
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus)
+
 * NUTCH-2189 Domain filter must deactivate if no rules are present (markus)
 
 * NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency (joyce)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1723688&r1=1723687&r2=1723688&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jan  8 11:10:38 2016
@@ -1043,6 +1043,20 @@
   </description>
 </property>
 
+<property>
+  <name>indexer.delete.robots.noindex</name>
+  <value>false</value>
+  <description>Whether the indexer will delete documents that are marked with robots=noindex.
+  </description>
+</property>
+
+<property>
+  <name>indexer.delete.skipped.by.indexingfilter</name>
+  <value>false</value>
+  <description>Whether the indexer will delete documents that were discarded by indexing filters, instead of only skipping them.
+  </description>
+</property>
+
 <!-- URL normalizer properties -->
 
 <property>

Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1723688&r1=1723687&r2=1723688&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Jan  8 11:10:38 2016
@@ -63,6 +63,7 @@ public class IndexerMapReduce extends Co
   public static final String INDEXER_PARAMS = "indexer.additional.params";
   public static final String INDEXER_DELETE = "indexer.delete";
   public static final String INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex";
+  public static final String INDEXER_DELETE_SKIPPED = "indexer.delete.skipped.by.indexingfilter";
   public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
   public static final String URL_FILTERING = "indexer.url.filters";
   public static final String URL_NORMALIZING = "indexer.url.normalizers";
@@ -71,6 +72,7 @@ public class IndexerMapReduce extends Co
   private boolean skip = false;
   private boolean delete = false;
   private boolean deleteRobotsNoIndex = false;
+  private boolean deleteSkippedByIndexingFilter = false;
   private boolean base64 = false;
   private IndexingFilters filters;
   private ScoringFilters scfilters;
@@ -94,6 +96,8 @@ public class IndexerMapReduce extends Co
     this.delete = job.getBoolean(INDEXER_DELETE, false);
     this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX,
         false);
+    this.deleteSkippedByIndexingFilter = job.getBoolean(INDEXER_DELETE_SKIPPED,
+        false);
     this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
     this.base64 = job.getBoolean(INDEXER_BINARY_AS_BASE64, false);
 
@@ -245,7 +249,7 @@ public class IndexerMapReduce extends Co
           || fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
           || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
           || dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
-        reporter.incrCounter("IndexerStatus", "deleted redirects", 1);
+        reporter.incrCounter("IndexerStatus", "deleted (redirects)", 1);
         output.collect(key, DELETE_ACTION);
         return;
       }
@@ -258,7 +262,7 @@ public class IndexerMapReduce extends Co
 
     // Whether to delete pages marked as duplicates
     if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
-      reporter.incrCounter("IndexerStatus", "deleted duplicates", 1);
+      reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1);
       output.collect(key, DELETE_ACTION);
       return;
     }
@@ -284,8 +288,25 @@ public class IndexerMapReduce extends Co
 
     // add digest, used by dedup
     doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));
-
+    
     final Parse parse = new ParseImpl(parseText, parseData);
+    float boost = 1.0f;
+    // run scoring filters
+    try {
+      boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse,
+          inlinks, boost);
+    } catch (final ScoringFilterException e) {
+      reporter.incrCounter("IndexerStatus", "errors (ScoringFilter)", 1);
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Error calculating score {}: {}", key, e);
+      }
+      return;
+    }
+    // apply boost to all indexed fields.
+    doc.setWeight(boost);
+    // store boost for use by explain and dedup
+    doc.add("boost", Float.toString(boost));
+
     try {
       // Indexing filters may also be interested in the signature
       fetchDatum.setSignature(dbDatum.getSignature());
@@ -317,26 +338,16 @@ public class IndexerMapReduce extends Co
 
     // skip documents discarded by indexing filters
     if (doc == null) {
-      reporter.incrCounter("IndexerStatus", "skipped by indexing filters", 1);
-      return;
-    }
-
-    float boost = 1.0f;
-    // run scoring filters
-    try {
-      boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse,
-          inlinks, boost);
-    } catch (final ScoringFilterException e) {
-      reporter.incrCounter("IndexerStatus", "errors (ScoringFilter)", 1);
-      if (LOG.isWarnEnabled()) {
-        LOG.warn("Error calculating score {}: {}", key, e);
+      // https://issues.apache.org/jira/browse/NUTCH-1449
+      if (deleteSkippedByIndexingFilter) {
+        NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
+        output.collect(key, action);
+        reporter.incrCounter("IndexerStatus", "deleted (IndexingFilter)", 1);
+      } else {
+        reporter.incrCounter("IndexerStatus", "skipped (IndexingFilter)", 1);
       }
       return;
     }
-    // apply boost to all indexed fields.
-    doc.setWeight(boost);
-    // store boost for use by explain and dedup
-    doc.add("boost", Float.toString(boost));
 
     if (content != null) {
       // Get the original unencoded content