You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2020/12/17 16:59:40 UTC
[nutch] branch master updated: Add possibility to setup
deduplication group mode in crawl script (#557)
This is an automated email from the ASF dual-hosted git repository.
lewismc pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 88a17f2 Add possibility to setup deduplication group mode in crawl script (#557)
88a17f2 is described below
commit 88a17f26b4160720bacb3ead1cad71ae24a559bc
Author: Jakob Berlin <j....@ewerk.com>
AuthorDate: Thu Dec 17 17:59:30 2020 +0100
Add possibility to setup deduplication group mode in crawl script (#557)
---
src/bin/crawl | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/src/bin/crawl b/src/bin/crawl
index 23a2940..db42218 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -48,6 +48,8 @@
# --time-limit-fetch <time_limit_fetch> Number of minutes allocated to the fetching [default: 180]
# --num-threads <num_threads> Number of threads for fetching / sitemap processing [default: 50]
#
+# --dedup-group <none|host|domain> Deduplication group method [default: none]
+#
function __to_seconds() {
NUMBER=$(echo $1 | tr -dc '0-9')
@@ -107,6 +109,7 @@ function __print_usage {
echo -e " \t\t\t\t\t - never [default]"
echo -e " \t\t\t\t\t - always (processing takes place in every iteration)"
echo -e " \t\t\t\t\t - once (processing only takes place in the first iteration)"
+ echo -e " -dedup-group <none|host|domain>\tDeduplication group method [default: none]"
exit 1
}
@@ -124,6 +127,7 @@ SIZE_FETCHLIST=50000 # 25K x NUM_TASKS
TIME_LIMIT_FETCH=180
NUM_THREADS=50
SITEMAPS_FROM_HOSTDB_FREQUENCY=never
+DEDUP_GROUP=none
while [[ $# > 0 ]]
do
@@ -177,6 +181,10 @@ do
SITEMAPS_FROM_HOSTDB_FREQUENCY="${2}"
shift 2
;;
+ --dedup-group)
+ DEDUP_GROUP="${2}"
+ shift 2
+ ;;
--hostdbupdate)
HOSTDBUPDATE=true
shift
@@ -197,6 +205,12 @@ if [[ ! "$SITEMAPS_FROM_HOSTDB_FREQUENCY" =~ ^(never|always|once)$ ]]; then
__print_usage
fi
+if [[ ! "$DEDUP_GROUP" =~ ^(none|host|domain)$ ]]; then
+ echo "Error: --dedup-group <mode> has to be one of none, host, domain."
+ echo -e ""
+ __print_usage
+fi
+
if [[ $# != 2 ]]; then
__print_usage
fi
@@ -385,7 +399,7 @@ do
__bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -noNormalize -nofilter
echo "Dedup on crawldb"
- __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb
+ __bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb -group "$DEDUP_GROUP"
if $INDEXFLAG; then
echo "Indexing $SEGMENT to index"