You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2016/01/08 12:10:39 UTC
svn commit: r1723688 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: markus
Date: Fri Jan 8 11:10:38 2016
New Revision: 1723688
URL: http://svn.apache.org/viewvc?rev=1723688&view=rev
Log:
NUTCH-1449 Optionally delete documents skipped by IndexingFilters
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1723688&r1=1723687&r2=1723688&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jan 8 11:10:38 2016
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-1449 Optionally delete documents skipped by IndexingFilters (markus)
+
* NUTCH-2189 Domain filter must deactivate if no rules are present (markus)
* NUTCH-2182 Make reverseUrlDirs file dumper option hash the URL for consistency (joyce)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1723688&r1=1723687&r2=1723688&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jan 8 11:10:38 2016
@@ -1043,6 +1043,20 @@
</description>
</property>
+<property>
+ <name>indexer.delete.robots.noindex</name>
+ <value>false</value>
+ <description>Whether the indexer will delete documents marked by robots=noindex
+ </description>
+</property>
+
+<property>
+ <name>indexer.delete.skipped.by.indexingfilter</name>
+ <value>false</value>
+ <description>Whether the indexer will delete documents that were skipped by indexing filters
+ </description>
+</property>
+
<!-- URL normalizer properties -->
<property>
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1723688&r1=1723687&r2=1723688&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Jan 8 11:10:38 2016
@@ -63,6 +63,7 @@ public class IndexerMapReduce extends Co
public static final String INDEXER_PARAMS = "indexer.additional.params";
public static final String INDEXER_DELETE = "indexer.delete";
public static final String INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex";
+ public static final String INDEXER_DELETE_SKIPPED = "indexer.delete.skipped.by.indexingfilter";
public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
public static final String URL_FILTERING = "indexer.url.filters";
public static final String URL_NORMALIZING = "indexer.url.normalizers";
@@ -71,6 +72,7 @@ public class IndexerMapReduce extends Co
private boolean skip = false;
private boolean delete = false;
private boolean deleteRobotsNoIndex = false;
+ private boolean deleteSkippedByIndexingFilter = false;
private boolean base64 = false;
private IndexingFilters filters;
private ScoringFilters scfilters;
@@ -94,6 +96,8 @@ public class IndexerMapReduce extends Co
this.delete = job.getBoolean(INDEXER_DELETE, false);
this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX,
false);
+ this.deleteSkippedByIndexingFilter = job.getBoolean(INDEXER_DELETE_SKIPPED,
+ false);
this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
this.base64 = job.getBoolean(INDEXER_BINARY_AS_BASE64, false);
@@ -245,7 +249,7 @@ public class IndexerMapReduce extends Co
|| fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
|| dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
- reporter.incrCounter("IndexerStatus", "deleted redirects", 1);
+ reporter.incrCounter("IndexerStatus", "deleted (redirects)", 1);
output.collect(key, DELETE_ACTION);
return;
}
@@ -258,7 +262,7 @@ public class IndexerMapReduce extends Co
// Whether to delete pages marked as duplicates
if (delete && dbDatum.getStatus() == CrawlDatum.STATUS_DB_DUPLICATE) {
- reporter.incrCounter("IndexerStatus", "deleted duplicates", 1);
+ reporter.incrCounter("IndexerStatus", "deleted (duplicates)", 1);
output.collect(key, DELETE_ACTION);
return;
}
@@ -284,8 +288,25 @@ public class IndexerMapReduce extends Co
// add digest, used by dedup
doc.add("digest", metadata.get(Nutch.SIGNATURE_KEY));
-
+
final Parse parse = new ParseImpl(parseText, parseData);
+ float boost = 1.0f;
+ // run scoring filters
+ try {
+ boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse,
+ inlinks, boost);
+ } catch (final ScoringFilterException e) {
+ reporter.incrCounter("IndexerStatus", "errors (ScoringFilter)", 1);
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("Error calculating score {}: {}", key, e);
+ }
+ return;
+ }
+ // apply boost to all indexed fields.
+ doc.setWeight(boost);
+ // store boost for use by explain and dedup
+ doc.add("boost", Float.toString(boost));
+
try {
// Indexing filters may also be interested in the signature
fetchDatum.setSignature(dbDatum.getSignature());
@@ -317,26 +338,16 @@ public class IndexerMapReduce extends Co
// skip documents discarded by indexing filters
if (doc == null) {
- reporter.incrCounter("IndexerStatus", "skipped by indexing filters", 1);
- return;
- }
-
- float boost = 1.0f;
- // run scoring filters
- try {
- boost = this.scfilters.indexerScore(key, doc, dbDatum, fetchDatum, parse,
- inlinks, boost);
- } catch (final ScoringFilterException e) {
- reporter.incrCounter("IndexerStatus", "errors (ScoringFilter)", 1);
- if (LOG.isWarnEnabled()) {
- LOG.warn("Error calculating score {}: {}", key, e);
+ // https://issues.apache.org/jira/browse/NUTCH-1449
+ if (deleteSkippedByIndexingFilter) {
+ NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
+ output.collect(key, action);
+ reporter.incrCounter("IndexerStatus", "deleted (IndexingFilter)", 1);
+ } else {
+ reporter.incrCounter("IndexerStatus", "skipped (IndexingFilter)", 1);
}
return;
}
- // apply boost to all indexed fields.
- doc.setWeight(boost);
- // store boost for use by explain and dedup
- doc.add("boost", Float.toString(boost));
if (content != null) {
// Get the original unencoded content