You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/06/08 09:37:43 UTC
svn commit: r1347909 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml
src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: markus
Date: Fri Jun 8 07:37:42 2012
New Revision: 1347909
URL: http://svn.apache.org/viewvc?rev=1347909&view=rev
Log:
NUTCH-1336 Optionally not index db_notmodified pages
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1347909&r1=1347908&r2=1347909&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jun 8 07:37:42 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1336 Optionally not index db_notmodified pages (markus)
+
* NUTCH-1346 Follow outlinks to ignore external (markus)
* NUTCH-1320 IndexChecker and ParseChecker choke on IDN's (markus)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1347909&r1=1347908&r2=1347909&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jun 8 07:37:42 2012
@@ -875,6 +875,13 @@
</description>
</property>
+<property>
+ <name>indexer.skip.notmodified</name>
+ <value>false</value>
+ <description>If true, the indexer skips (does not index) records with a db_notmodified status.
+ </description>
+</property>
+
<!-- URL normalizer properties -->
<property>
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1347909&r1=1347908&r2=1347909&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Fri Jun 8 07:37:42 2012
@@ -55,7 +55,9 @@ implements Mapper<Text, Writable, Text,
public static final Logger LOG = LoggerFactory.getLogger(IndexerMapReduce.class);
public static final String INDEXER_DELETE = "indexer.delete";
+ public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
+ private boolean skip = false;
private boolean delete = false;
private IndexingFilters filters;
private ScoringFilters scfilters;
@@ -65,6 +67,7 @@ implements Mapper<Text, Writable, Text,
this.filters = new IndexingFilters(getConf());
this.scfilters = new ScoringFilters(getConf());
this.delete = job.getBoolean(INDEXER_DELETE, false);
+ this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
}
public void map(Text key, Writable value,
@@ -87,8 +90,15 @@ implements Mapper<Text, Writable, Text,
inlinks = (Inlinks)value;
} else if (value instanceof CrawlDatum) {
final CrawlDatum datum = (CrawlDatum)value;
- if (CrawlDatum.hasDbStatus(datum))
+ if (CrawlDatum.hasDbStatus(datum)) {
dbDatum = datum;
+
+ // Whether to skip DB_NOTMODIFIED pages
+ if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+ reporter.incrCounter("IndexerStatus", "Skipped", 1);
+ return;
+ }
+ }
else if (CrawlDatum.hasFetchStatus(datum)) {
// don't index unmodified (empty) pages
@@ -104,14 +114,14 @@ implements Mapper<Text, Writable, Text,
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
- continue;
+ return;
}
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) {
reporter.incrCounter("IndexerStatus", "Perm redirects deleted", 1);
NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
output.collect(key, action);
- continue;
+ return;
}
}
}