You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/08/23 09:43:08 UTC
svn commit: r1376394 - in /nutch/trunk: CHANGES.txt
src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Author: markus
Date: Thu Aug 23 07:43:08 2012
New Revision: 1376394
URL: http://svn.apache.org/viewvc?rev=1376394&view=rev
Log:
NUTCH-1434 Indexer to delete robots noindex
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1376394&r1=1376393&r2=1376394&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Aug 23 07:43:08 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1434 Indexer to delete robots noindex (markus)
+
* NUTCH-1443 Solr schema version is invalid (markus)
* NUTCH-1417 Remove o.a.n.metadata.Office (lewismc)
Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java?rev=1376394&r1=1376393&r2=1376394&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexerMapReduce.java Thu Aug 23 07:43:08 2012
@@ -57,15 +57,17 @@ implements Mapper<Text, Writable, Text,
public static final Logger LOG = LoggerFactory.getLogger(IndexerMapReduce.class);
public static final String INDEXER_DELETE = "indexer.delete";
+ public static final String INDEXER_DELETE_ROBOTS_NOINDEX = "indexer.delete.robots.noindex";
public static final String INDEXER_SKIP_NOTMODIFIED = "indexer.skip.notmodified";
public static final String URL_FILTERING = "indexer.url.filters";
public static final String URL_NORMALIZING = "indexer.url.normalizers";
private boolean skip = false;
private boolean delete = false;
+ private boolean deleteRobotsNoIndex = false;
private IndexingFilters filters;
private ScoringFilters scfilters;
-
+
// using normalizers and/or filters
private boolean normalize = false;
private boolean filter = false;
@@ -79,6 +81,7 @@ implements Mapper<Text, Writable, Text,
this.filters = new IndexingFilters(getConf());
this.scfilters = new ScoringFilters(getConf());
this.delete = job.getBoolean(INDEXER_DELETE, false);
+ this.deleteRobotsNoIndex = job.getBoolean(INDEXER_DELETE_ROBOTS_NOINDEX, false);
this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
normalize = job.getBoolean(URL_NORMALIZING, false);
@@ -174,12 +177,6 @@ implements Mapper<Text, Writable, Text,
final CrawlDatum datum = (CrawlDatum)value;
if (CrawlDatum.hasDbStatus(datum)) {
dbDatum = datum;
-
- // Whether to skip DB_NOTMODIFIED pages
- if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
- reporter.incrCounter("IndexerStatus", "Skipped", 1);
- return;
- }
}
else if (CrawlDatum.hasFetchStatus(datum)) {
@@ -217,6 +214,20 @@ implements Mapper<Text, Writable, Text,
}
} else if (value instanceof ParseData) {
parseData = (ParseData)value;
+
+ // Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434
+ if (deleteRobotsNoIndex) {
+ // Get the robots meta data
+ String robotsMeta = parseData.getMeta("robots");
+
+ // Does the robots meta tag contain a noindex directive for this URL?
+ if (robotsMeta != null && robotsMeta.toLowerCase().indexOf("noindex") != -1) {
+ // Delete it!
+ NutchIndexAction action = new NutchIndexAction(null, NutchIndexAction.DELETE);
+ output.collect(key, action);
+ return;
+ }
+ }
} else if (value instanceof ParseText) {
parseText = (ParseText)value;
} else if (LOG.isWarnEnabled()) {
@@ -229,6 +240,12 @@ implements Mapper<Text, Writable, Text,
return; // only have inlinks
}
+ // Whether to skip DB_NOTMODIFIED pages
+ if (skip && dbDatum.getStatus() == CrawlDatum.STATUS_DB_NOTMODIFIED) {
+ reporter.incrCounter("IndexerStatus", "Skipped", 1);
+ return;
+ }
+
if (!parseData.getStatus().isSuccess() ||
fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
return;