You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2012/11/22 15:56:47 UTC

svn commit: r1412573 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Injector.java

Author: lewismc
Date: Thu Nov 22 14:56:46 2012
New Revision: 1412573

URL: http://svn.apache.org/viewvc?rev=1412573&view=rev
Log:
NUTCH-1370 Expose exact number of urls injected @runtime

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1412573&r1=1412572&r2=1412573&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Nov 22 14:56:46 2012
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk) Current Development:
 
+* NUTCH-1370 Expose exact number of urls injected @runtime (snagel via lewismc)
+
 * NUTCH-1117 JUnit test for index-anchor (lewismc)
 
 * NUTCH-1451 Upgrade automaton jar to 1.11-8 (lewismc)

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1412573&r1=1412572&r2=1412573&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Thu Nov 22 14:56:46 2012
@@ -134,7 +134,9 @@ public class Injector extends Configured
         if (LOG.isWarnEnabled()) { LOG.warn("Skipping " +url+":"+e); }
         url = null;
       }
-      if (url != null) {                          // if it passes
+      if (url == null) {
+        reporter.getCounter("injector", "urls_filtered").increment(1);
+      } else {                                   // if it passes
         value.set(url);                           // collect it
         CrawlDatum datum = new CrawlDatum();
         datum.setStatus(CrawlDatum.STATUS_INJECTED);
@@ -166,6 +168,7 @@ public class Injector extends Configured
         				+ ", using default (" + e.getMessage() + ")");
         	}
         }
+        reporter.getCounter("injector", "urls_injected").increment(1);
         output.collect(value, datum);
       }
     }
@@ -275,7 +278,13 @@ public class Injector extends Configured
     sortJob.setOutputKeyClass(Text.class);
     sortJob.setOutputValueClass(CrawlDatum.class);
     sortJob.setLong("injector.current.time", System.currentTimeMillis());
-    JobClient.runJob(sortJob);
+    RunningJob mapJob = JobClient.runJob(sortJob);
+
+    long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue();
+    long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue();
+    LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered);
+    LOG.info("Injector: total number of urls injected after normalization and filtering: "
+        + urlsInjected);
 
     // merge with existing crawl db
     if (LOG.isInfoEnabled()) {