You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/06/11 11:21:29 UTC
[nutch] branch master updated: NUTCH-2791 Handle GCS URLs in stats commands
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 6b6e74c NUTCH-2791 Handle GCS URLs in stats commands
new 59d0d95 Merge pull request #533 from pmezard/NUTCH-2791
6b6e74c is described below
commit 6b6e74c5a33c74ec30e3691c04797e6742350456
Author: Patrick Mezard <pa...@mezard.eu>
AuthorDate: Tue Jun 9 17:39:41 2020 +0200
NUTCH-2791 Handle GCS URLs in stats commands
- Handle Google Cloud Storage URLs as crawldb inputs in domainstats,
protocolstats and crawlcomplete commands.
- Correctly resolve numReducers in protocolstats.
- Align crawlcomplete -inputDirs behaviour with the other commands: expect
directories containing "current", not "crawldb/current".
---
src/java/org/apache/nutch/util/CrawlCompletionStats.java | 6 ++----
src/java/org/apache/nutch/util/ProtocolStatusStatistics.java | 7 +++----
src/java/org/apache/nutch/util/domain/DomainStatistics.java | 3 +--
3 files changed, 6 insertions(+), 10 deletions(-)
diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
index f3e808b..8a23fbf 100644
--- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
+++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
@@ -74,7 +74,7 @@ public class CrawlCompletionStats extends Configured implements Tool {
Option inDirs = OptionBuilder
.withArgName("inputDirs")
.isRequired()
- .withDescription("Comma separated list of crawl directories (e.g., \"./crawl1,./crawl2\")")
+ .withDescription("Comma separated list of crawldb directories (e.g., \"./crawl1/crawldb,./crawl2/crawldb\")")
.hasArgs()
.create("inputDirs");
@SuppressWarnings("static-access")
@@ -153,9 +153,7 @@ public class CrawlCompletionStats extends Configured implements Tool {
String[] inputDirsSpecs = inputDir.split(",");
for (int i = 0; i < inputDirsSpecs.length; i++) {
- File completeInputPath = new File(new File(inputDirsSpecs[i]), "crawldb/current");
- FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
-
+ FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i], "current"));
}
job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
index f52a9c5..213c1c2 100644
--- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
+++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
@@ -82,8 +82,8 @@ public class ProtocolStatusStatistics extends Configured implements Tool {
int numOfReducers = 1;
- if (args.length > 3) {
- numOfReducers = Integer.parseInt(args[3]);
+ if (args.length > 2) {
+ numOfReducers = Integer.parseInt(args[2]);
}
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
@@ -100,8 +100,7 @@ public class ProtocolStatusStatistics extends Configured implements Tool {
String[] inputDirsSpecs = inputDir.split(",");
for (int i = 0; i < inputDirsSpecs.length; i++) {
- File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
- FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
+ FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i], "current"));
}
job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index fd2f940..24e7a1c 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -119,8 +119,7 @@ public class DomainStatistics extends Configured implements Tool {
String[] inputDirsSpecs = inputDir.split(",");
for (int i = 0; i < inputDirsSpecs.length; i++) {
- File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
- FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
+ FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i], "current"));
}
job.setInputFormatClass(SequenceFileInputFormat.class);