You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/11/22 15:56:47 UTC
svn commit: r1412573 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Injector.java
Author: lewismc
Date: Thu Nov 22 14:56:46 2012
New Revision: 1412573
URL: http://svn.apache.org/viewvc?rev=1412573&view=rev
Log:
NUTCH-1370 Expose exact number of urls injected @runtime
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1412573&r1=1412572&r2=1412573&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Nov 22 14:56:46 2012
@@ -2,6 +2,8 @@ Nutch Change Log
(trunk) Current Development:
+* NUTCH-1370 Expose exact number of urls injected @runtime (snagel via lewismc)
+
* NUTCH-1117 JUnit test for index-anchor (lewismc)
* NUTCH-1451 Upgrade automaton jar to 1.11-8 (lewismc)
Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1412573&r1=1412572&r2=1412573&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Nov 22 14:56:46 2012
@@ -134,7 +134,9 @@ public class Injector extends Configured
if (LOG.isWarnEnabled()) { LOG.warn("Skipping " +url+":"+e); }
url = null;
}
- if (url != null) { // if it passes
+ if (url == null) {
+ reporter.getCounter("injector", "urls_filtered").increment(1);
+ } else { // if it passes
value.set(url); // collect it
CrawlDatum datum = new CrawlDatum();
datum.setStatus(CrawlDatum.STATUS_INJECTED);
@@ -166,6 +168,7 @@ public class Injector extends Configured
+ ", using default (" + e.getMessage() + ")");
}
}
+ reporter.getCounter("injector", "urls_injected").increment(1);
output.collect(value, datum);
}
}
@@ -275,7 +278,13 @@ public class Injector extends Configured
sortJob.setOutputKeyClass(Text.class);
sortJob.setOutputValueClass(CrawlDatum.class);
sortJob.setLong("injector.current.time", System.currentTimeMillis());
- JobClient.runJob(sortJob);
+ RunningJob mapJob = JobClient.runJob(sortJob);
+
+ long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
+ long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
+ LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
+ LOG.info("Injector: total number of urls injected after normalization and filtering: "
+ + urlsInjected);
// merge with existing crawl db
if (LOG.isInfoEnabled()) {