You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/12/18 15:50:06 UTC
[nutch] 22/23: NUTCH-2034 CrawlDB update job to count documents in CrawlDb rejected by URL filters (patch contributed by Luis Lopez)
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit e0a27c7870d632966d584cf45399b98ba77e2bd6
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Sun Dec 17 16:13:09 2017 +0100
NUTCH-2034 CrawlDB update job to count documents in CrawlDb rejected by URL filters
(patch contributed by Luis Lopez)
---
src/java/org/apache/nutch/crawl/CrawlDb.java | 12 +++++++++++-
src/java/org/apache/nutch/crawl/CrawlDbFilter.java | 5 ++++-
2 files changed, 15 insertions(+), 2 deletions(-)
diff --git a/src/java/org/apache/nutch/crawl/CrawlDb.java b/src/java/org/apache/nutch/crawl/CrawlDb.java
index 080b037..9f37447 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDb.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDb.java
@@ -115,8 +115,9 @@ public class CrawlDb extends NutchTool implements Tool {
if (LOG.isInfoEnabled()) {
LOG.info("CrawlDb update: Merging segment data into db.");
}
+ RunningJob crawlDBJob = null;
try {
- JobClient.runJob(job);
+ crawlDBJob = JobClient.runJob(job);
} catch (IOException e) {
FileSystem fs = crawlDb.getFileSystem(getConf());
LockUtil.removeLockFile(fs, lock);
@@ -127,6 +128,15 @@ public class CrawlDb extends NutchTool implements Tool {
}
CrawlDb.install(job, crawlDb);
+
+ if (filter) {
+ long urlsFiltered = crawlDBJob.getCounters()
+ .findCounter("CrawlDB filter", "URLs filtered").getValue();
+ LOG.info(
+ "CrawlDb update: Total number of existing URLs in CrawlDb rejected by URL filters: {}",
+ urlsFiltered);
+ }
+
long end = System.currentTimeMillis();
LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
+ TimingUtil.elapsedTime(start, end));
diff --git a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
index 7b2aa80..8b46ecb 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDbFilter.java
@@ -111,7 +111,10 @@ public class CrawlDbFilter implements
url = null;
}
}
- if (url != null) { // if it passes
+ if (url == null) {
+ reporter.getCounter("CrawlDB filter", "URLs filtered").increment(1);
+ } else {
+ // URL has passed filters
newKey.set(url); // collect it
output.collect(newKey, value);
}
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.