You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/17 15:15:05 UTC

[nutch] branch master updated: NUTCH-2034 CrawlDB update job to count documents in CrawlDb rejected by URL filters (patch contributed by Luis Lopez)

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 961c725  NUTCH-2034 CrawlDB update job to count documents in CrawlDb rejected by URL filters (patch contributed by Luis Lopez)
961c725 is described below

commit 961c725aba2d6013a343dca66f595d6f28293a7b
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Sun Dec 17 16:13:09 2017 +0100

    NUTCH-2034 CrawlDB update job to count documents in CrawlDb rejected by URL filters
    (patch contributed by Luis Lopez)
---
 src/java/org/apache/nutch/crawl/CrawlDb.java       | 12 +++++++++++-
 src/java/org/apache/nutch/crawl/CrawlDbFilter.java |  5 ++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java
index 080b037..9f37447 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -115,8 +115,9 @@ public class CrawlDb extends NutchTool implements Tool {
     if (LOG.isInfoEnabled()) {
       LOG.info("CrawlDb update: Merging segment data into db.");
     }
+    RunningJob crawlDBJob = null;
     try {
-      JobClient.runJob(job);
+      crawlDBJob = JobClient.runJob(job);
     } catch (IOException e) {
       FileSystem fs = crawlDb.getFileSystem(getConf());
       LockUtil.removeLockFile(fs, lock);
@@ -127,6 +128,15 @@ public class CrawlDb extends NutchTool implements Tool {
     }
 
     CrawlDb.install(job, crawlDb);
+
+    if (filter) {
+      long urlsFiltered = crawlDBJob.getCounters()
+          .findCounter("CrawlDB filter", "URLs filtered").getValue();
+      LOG.info(
+          "CrawlDb update: Total number of existing URLs in CrawlDb rejected by URL filters: {}",
+          urlsFiltered);
+    }
+
     long end = System.currentTimeMillis();
     LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
         + TimingUtil.elapsedTime(start, end));
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
index 7b2aa80..8b46ecb 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
@@ -111,7 +111,10 @@ public class CrawlDbFilter implements
         url = null;
       }
     }
-    if (url != null) { // if it passes
+    if (url == null) {
+      reporter.getCounter("CrawlDB filter", "URLs filtered").increment(1);
+    } else {
+      // URL has passed filters
       newKey.set(url); // collect it
       output.collect(newKey, value);
     }

-- 
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.