You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/06/11 11:21:29 UTC

[nutch] branch master updated: NUTCH-2791 Handle GCS URLs in stats commands

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 6b6e74c  NUTCH-2791 Handle GCS URLs in stats commands
     new 59d0d95  Merge pull request #533 from pmezard/NUTCH-2791
6b6e74c is described below

commit 6b6e74c5a33c74ec30e3691c04797e6742350456
Author: Patrick Mezard <pa...@mezard.eu>
AuthorDate: Tue Jun 9 17:39:41 2020 +0200

    NUTCH-2791 Handle GCS URLs in stats commands
    
    - Handle Google Cloud Storage URLs as crawldb inputs in domainstats,
      protocolstats and crawlcomplete commands.
    - Correctly resolve numReducers in protocolstats: read it from args[2]
      instead of args[3].
    - Align crawlcomplete -inputDirs behaviour with the other commands: expect
      crawldb directories containing "current", rather than crawl directories
      containing "crawldb/current".
---
 src/java/org/apache/nutch/util/CrawlCompletionStats.java     | 6 ++----
 src/java/org/apache/nutch/util/ProtocolStatusStatistics.java | 7 +++----
 src/java/org/apache/nutch/util/domain/DomainStatistics.java  | 3 +--
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/src/java/org/apache/nutch/util/CrawlCompletionStats.java b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
index f3e808b..8a23fbf 100644
--- a/src/java/org/apache/nutch/util/CrawlCompletionStats.java
+++ b/src/java/org/apache/nutch/util/CrawlCompletionStats.java
@@ -74,7 +74,7 @@ public class CrawlCompletionStats extends Configured implements Tool {
     Option inDirs = OptionBuilder
         .withArgName("inputDirs")
         .isRequired()
-        .withDescription("Comma separated list of crawl directories (e.g., \"./crawl1,./crawl2\")")
+        .withDescription("Comma separated list of crawldb directories (e.g., \"./crawl1/crawldb,./crawl2/crawldb\")")
         .hasArgs()
         .create("inputDirs");
     @SuppressWarnings("static-access")
@@ -153,9 +153,7 @@ public class CrawlCompletionStats extends Configured implements Tool {
 
     String[] inputDirsSpecs = inputDir.split(",");
     for (int i = 0; i < inputDirsSpecs.length; i++) {
-      File completeInputPath = new File(new File(inputDirsSpecs[i]), "crawldb/current");
-      FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
-      
+      FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i], "current"));
     }
 
     job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
index f52a9c5..213c1c2 100644
--- a/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
+++ b/src/java/org/apache/nutch/util/ProtocolStatusStatistics.java
@@ -82,8 +82,8 @@ public class ProtocolStatusStatistics extends Configured implements Tool {
 
     int numOfReducers = 1;
 
-    if (args.length > 3) {
-      numOfReducers = Integer.parseInt(args[3]);
+    if (args.length > 2) {
+      numOfReducers = Integer.parseInt(args[2]);
     }
 
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
@@ -100,8 +100,7 @@ public class ProtocolStatusStatistics extends Configured implements Tool {
 
     String[] inputDirsSpecs = inputDir.split(",");
     for (int i = 0; i < inputDirsSpecs.length; i++) {
-      File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
-      FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
+      FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i], "current"));
     }
 
     job.setInputFormatClass(SequenceFileInputFormat.class);
diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
index fd2f940..24e7a1c 100644
--- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java
+++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java
@@ -119,8 +119,7 @@ public class DomainStatistics extends Configured implements Tool {
 
     String[] inputDirsSpecs = inputDir.split(",");
     for (int i = 0; i < inputDirsSpecs.length; i++) {
-      File completeInputPath = new File(new File(inputDirsSpecs[i]), "current");
-      FileInputFormat.addInputPath(job, new Path(completeInputPath.toString()));
+      FileInputFormat.addInputPath(job, new Path(inputDirsSpecs[i], "current"));
     }
 
     job.setInputFormatClass(SequenceFileInputFormat.class);