You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/03/07 19:15:50 UTC
svn commit: r1575351 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: snagel
Date: Fri Mar 7 18:15:50 2014
New Revision: 1575351
URL: http://svn.apache.org/r1575351
Log:
NUTCH-1706 IndexerMapReduce does not remove db_redir_temp
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1575351&r1=1575350&r2=1575351&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Mar 7 18:15:50 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Development Trunk
+* NUTCH-1706 IndexerMapReduce does not remove db_redir_temp (markus, snagel)
+
* NUTCH-1113 SegmentMerger can now be safely used to merge segments (Edward Drapkin, markus, snagel)
* NUTCH-1729 Upgrade to Tika 1.5 (jnioche)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1575351&r1=1575350&r2=1575351&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Mar 7 18:15:50 2014
@@ -180,36 +180,10 @@ implements Mapper<Text, Writable, Text,
dbDatum = datum;
}
else if (CrawlDatum.hasFetchStatus(datum)) {
-
// don't index unmodified (empty) pages
if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
fetchDatum = datum;
-
- /**
- * Check if we need to delete 404 NOT FOUND and 301 PERMANENT REDIRECT.
- */
- if (delete) {
- if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
- reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
-
- NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
- output.collect(key, action);
- return;
- }
- if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
- fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
- dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
- dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
- reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
- reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
-
- NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
- output.collect(key, action);
- return;
- }
- }
}
-
} else if (CrawlDatum.STATUS_LINKED == datum.getStatus() ||
CrawlDatum.STATUS_SIGNATURE == datum.getStatus() ||
CrawlDatum.STATUS_PARSE_META == datum.getStatus()) {
@@ -239,6 +213,29 @@ implements Mapper<Text, Writable, Text,
LOG.warn("Unrecognized type: "+value.getClass());
}
}
+
+ // Whether to delete GONE or REDIRECTS
+ if (delete && fetchDatum != null && dbDatum != null) {
+ if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE || dbDatum.getStatus() == CrawlDatum.STATUS_DB_GONE) {
+ reporter.incrCounter("IndexerStatus", "Documents deleted", 1);
+
+ NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
+ output.collect(key, action);
+ return;
+ }
+
+ if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM ||
+ fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_TEMP ||
+ dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM ||
+ dbDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
+ reporter.incrCounter("IndexerStatus", "Deleted redirects", 1);
+ reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
+
+ NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
+ output.collect(key, action);
+ return;
+ }
+ }
if (fetchDatum == null || dbDatum == null
|| parseText == null || parseData == null) {