You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/01/19 20:34:04 UTC

[nutch] branch master updated: NUTCH-2759 bin/crawl: Rename option --num-slaves - renamed to --num-fetchers

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 040d71d  NUTCH-2759 bin/crawl: Rename option --num-slaves - renamed to --num-fetchers
     new a118c85  Merge pull request #491 from sebastian-nagel/NUTCH-2759-bin-crawl-rename-num-slaves
040d71d is described below

commit 040d71d6ece7fcbf76a1ad1849d76fa21133be3f
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Thu Jan 9 13:32:30 2020 +0100

    NUTCH-2759 bin/crawl: Rename option --num-slaves
    - renamed to --num-fetchers
---
 src/bin/crawl | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/bin/crawl b/src/bin/crawl
index 81d30cc..56bb237 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -34,8 +34,9 @@
 #   --hostdbupdate                        Boolean indicator if we call hostdbupdate or not
 #   --hostdbgenerate                      Boolean indicator if we use hostdb in generate or not
 #
-#   --num-slaves <num_slaves>             Number of slave nodes [default: 1]
-#                                         Note: This can only be set when running in distribution mode
+#   --num-fetchers <num_fetchers>         Number of tasks used for fetching (fetcher map tasks) [default: 1]
+#                                         Note: This can only be set when running in distributed mode and
+#                                               should correspond to the number of worker nodes in the cluster.
 #   --num-tasks <num_tasks>               Number of reducer tasks [default: 2]
 #   --size-fetchlist <size_fetchlist>     Number of URLs to fetch in one iteration [default: 50000]
 #   --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
@@ -83,10 +84,11 @@ function __print_usage {
   echo -e "  \t\t\t\t\tspecified second is used by default. [default: -1]"
   echo -e "  -s <seed_dir>\t\t\t\tPath to seeds file(s)"
   echo -e "  -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)"
-  echo -e "  --hostdbupdate\t\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
+  echo -e "  --hostdbupdate\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
   echo -e "  --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not"
-  echo -e "  --num-slaves <num_slaves>\t\tNumber of slave nodes [default: 1]"
-  echo -e "  \t\t\t\t\tNote: This can only be set when running in distribution mode"
+  echo -e "  --num-fetchers <num_fetchers>\t\tNumber of tasks used for fetching (fetcher map tasks) [default: 1]"
+  echo -e "  \t\t\t\t\tNote: This can only be set when running in distributed mode and"
+  echo -e "  \t\t\t\t\t      should correspond to the number of worker nodes in the cluster."
   echo -e "  --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]"
   echo -e "  --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in one iteration [default: 50000]"
   echo -e "  --time-limit-fetch <time_limit_fetch>\tNumber of minutes allocated to the fetching [default: 180]"
@@ -107,8 +109,8 @@ HOSTDBGENERATE=false
 JAVA_PROPERTIES=""
 WAIT=-1 # don't wait if there are no URLs to fetch
 SEEDDIR=""
-NUM_SLAVES=1
-NUM_TASKS=2 # 2 x NUM_SLAVES
+NUM_FETCHERS=1
+NUM_TASKS=2 # 2 x NUM_FETCHERS
 SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
 TIME_LIMIT_FETCH=180
 NUM_THREADS=50
@@ -138,7 +140,12 @@ do
             shift 2
             ;;
         --num-slaves)
-            NUM_SLAVES="${2}"
+            # backward compatibility: NUTCH-2759 renamed option --num-slaves to --num-fetchers
+            NUM_FETCHERS="${2}"
+            shift 2
+            ;;
+        --num-fetchers)
+            NUM_FETCHERS="${2}"
             shift 2
             ;;
         --num-tasks)
@@ -203,7 +210,10 @@ if [ -f "${bin}"/../*nutch*.job ]; then
   mode=distributed
 fi
 if [[ "$mode" = "local" ]]; then
-  NUM_SLAVES=1
+  if [[ "$NUM_FETCHERS" -ne 1 ]]; then
+    echo "Ignoring configured number of fetchers (--num_fetchers): a single fetcher task is used when running in local mode."
+  fi
+  NUM_FETCHERS=1
 fi
 
 # note that some of the options listed here could be set in the
@@ -296,9 +306,9 @@ do
 
   echo "Generating a new segment"
   if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
-   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter -hostdb "$CRAWL_PATH"/hostdb)
+   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter -hostdb "$CRAWL_PATH"/hostdb)
   else
-   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter)
+   generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter)
   fi
 
   echo "$bin/nutch generate ${generate_args[@]}"