You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2020/01/19 20:34:04 UTC
[nutch] branch master updated: NUTCH-2759 bin/crawl: Rename option
--num-slaves - renamed to --num-fetchers
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 040d71d NUTCH-2759 bin/crawl: Rename option --num-slaves - renamed to --num-fetchers
new a118c85 Merge pull request #491 from sebastian-nagel/NUTCH-2759-bin-crawl-rename-num-slaves
040d71d is described below
commit 040d71d6ece7fcbf76a1ad1849d76fa21133be3f
Author: Sebastian Nagel <sn...@apache.org>
AuthorDate: Thu Jan 9 13:32:30 2020 +0100
NUTCH-2759 bin/crawl: Rename option --num-slaves
- renamed to --num-fetchers
---
src/bin/crawl | 32 +++++++++++++++++++++-----------
1 file changed, 21 insertions(+), 11 deletions(-)
diff --git a/src/bin/crawl b/src/bin/crawl
index 81d30cc..56bb237 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -34,8 +34,9 @@
# --hostdbupdate Boolean indicator if we call hostdbupdate or not
# --hostdbgenerate Boolean indicator if we use hostdb in generate or not
#
-# --num-slaves <num_slaves> Number of slave nodes [default: 1]
-# Note: This can only be set when running in distribution mode
+# --num-fetchers <num_fetchers> Number of tasks used for fetching (fetcher map tasks) [default: 1]
+# Note: This can only be set when running in distributed mode and
+# should correspond to the number of worker nodes in the cluster.
# --num-tasks <num_tasks> Number of reducer tasks [default: 2]
# --size-fetchlist <size_fetchlist> Number of URLs to fetch in one iteration [default: 50000]
# --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
@@ -83,10 +84,11 @@ function __print_usage {
echo -e " \t\t\t\t\tspecified second is used by default. [default: -1]"
echo -e " -s <seed_dir>\t\t\t\tPath to seeds file(s)"
echo -e " -sm <sitemap_dir>\t\t\tPath to sitemap URL file(s)"
- echo -e " --hostdbupdate\t\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
+ echo -e " --hostdbupdate\t\t\tBoolean flag showing if we either update or not update hostdb for each round"
echo -e " --hostdbgenerate\t\t\tBoolean flag showing if we use hostdb in generate or not"
- echo -e " --num-slaves <num_slaves>\t\tNumber of slave nodes [default: 1]"
- echo -e " \t\t\t\t\tNote: This can only be set when running in distribution mode"
+ echo -e " --num-fetchers <num_fetchers>\t\tNumber of tasks used for fetching (fetcher map tasks) [default: 1]"
+ echo -e " \t\t\t\t\tNote: This can only be set when running in distributed mode and"
+ echo -e " \t\t\t\t\t should correspond to the number of worker nodes in the cluster."
echo -e " --num-tasks <num_tasks>\t\tNumber of reducer tasks [default: 2]"
echo -e " --size-fetchlist <size_fetchlist>\tNumber of URLs to fetch in one iteration [default: 50000]"
echo -e " --time-limit-fetch <time_limit_fetch>\tNumber of minutes allocated to the fetching [default: 180]"
@@ -107,8 +109,8 @@ HOSTDBGENERATE=false
JAVA_PROPERTIES=""
WAIT=-1 # don't wait if there are no URLs to fetch
SEEDDIR=""
-NUM_SLAVES=1
-NUM_TASKS=2 # 2 x NUM_SLAVES
+NUM_FETCHERS=1
+NUM_TASKS=2 # 2 x NUM_FETCHERS
SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
TIME_LIMIT_FETCH=180
NUM_THREADS=50
@@ -138,7 +140,12 @@ do
shift 2
;;
--num-slaves)
- NUM_SLAVES="${2}"
+ # backward compatibility: NUTCH-2759 renamed option --num-slaves to --num-fetchers
+ NUM_FETCHERS="${2}"
+ shift 2
+ ;;
+ --num-fetchers)
+ NUM_FETCHERS="${2}"
shift 2
;;
--num-tasks)
@@ -203,7 +210,10 @@ if [ -f "${bin}"/../*nutch*.job ]; then
mode=distributed
fi
if [[ "$mode" = "local" ]]; then
- NUM_SLAVES=1
+ if [[ "$NUM_FETCHERS" -ne 1 ]]; then
+ echo "Ignoring configured number of fetchers (--num-fetchers): a single fetcher task is used when running in local mode."
+ fi
+ NUM_FETCHERS=1
fi
# note that some of the options listed here could be set in the
@@ -296,9 +306,9 @@ do
echo "Generating a new segment"
if [[ "$HOSTDBGENERATE" == "true" ]] && __directory_exists "$CRAWL_PATH"/hostdb; then
- generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter -hostdb "$CRAWL_PATH"/hostdb)
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter -hostdb "$CRAWL_PATH"/hostdb)
else
- generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_SLAVES -noFilter)
+ generate_args=($commonOptions "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments -topN $SIZE_FETCHLIST -numFetchers $NUM_FETCHERS -noFilter)
fi
echo "$bin/nutch generate ${generate_args[@]}"