You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2018/03/12 11:53:10 UTC
[nutch] branch master updated: fixed hdfs file checks in crawl script
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
The following commit(s) were added to refs/heads/master by this push:
new 0e28af6 fixed hdfs file checks in crawl script
0e28af6 is described below
commit 0e28af65d8594e92d5818fe1c5a83b8f3c491d28
Author: Semyon Semyonov <se...@mail.com>
AuthorDate: Tue Mar 6 11:19:04 2018 +0100
fixed hdfs file checks in crawl script
---
src/bin/crawl | 15 +++++++++++++--
1 file changed, 13 insertions(+), 2 deletions(-)
diff --git a/src/bin/crawl b/src/bin/crawl
index 7a32be2..dc32367 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -219,8 +219,19 @@ function __bin_nutch {
fi
}
+#check if directory exists locally or on hdfs
+function __directory_exists {
+ if [[ "$mode" == local && -d "$1" ]]; then
+ return 0
+ elif [[ "$mode" == distributed ]] && hadoop fs -test -d "$1"; then
+ return 0
+ else
+ return 1
+ fi
+}
+
function __update_hostdb {
- if [[ -d "$CRAWL_PATH"/crawldb ]]; then
+ if __directory_exists "$CRAWL_PATH"/crawldb; then
echo "Updating HostDB"
__bin_nutch updatehostdb -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
fi
@@ -261,7 +272,7 @@ do
[[ $a -eq 1 ]] && __update_hostdb
# sitemap processing based on HostDB
- if [[ -d "$CRAWL_PATH"/hostdb ]]; then
+ if __directory_exists "$CRAWL_PATH"/hostdb; then
echo "Processing sitemaps based on hosts in HostDB"
__bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $NUM_THREADS
fi
--
To stop receiving notification emails like this one, please contact
snagel@apache.org.