You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2018/03/12 11:53:10 UTC

[nutch] branch master updated: fixed hdfs file checks in crawl script

This is an automated email from the ASF dual-hosted git repository.

snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git


The following commit(s) were added to refs/heads/master by this push:
     new 0e28af6  fixed hdfs file checks in crawl script
0e28af6 is described below

commit 0e28af65d8594e92d5818fe1c5a83b8f3c491d28
Author: Semyon Semyonov <se...@mail.com>
AuthorDate: Tue Mar 6 11:19:04 2018 +0100

    fixed hdfs file checks in crawl script
---
 src/bin/crawl | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/bin/crawl b/src/bin/crawl
index 7a32be2..dc32367 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -219,8 +219,19 @@ function __bin_nutch {
     fi
 }
 
+#check if directory exists locally or on hdfs
+function __directory_exists {
+  if [[ "$mode" == local  &&  -d "$1" ]]; then
+    return 0
+  elif [[ "$mode" == distributed ]] && hadoop fs -test -d "$1"; then
+    return 0
+  else
+    return 1
+  fi
+}
+
 function __update_hostdb {
-  if [[ -d "$CRAWL_PATH"/crawldb ]]; then
+  if __directory_exists "$CRAWL_PATH"/crawldb; then
     echo "Updating HostDB"
     __bin_nutch updatehostdb -crawldb "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb
   fi
@@ -261,7 +272,7 @@ do
     [[ $a -eq 1 ]] && __update_hostdb
 
     # sitemap processing based on HostDB
-    if [[ -d "$CRAWL_PATH"/hostdb ]]; then
+    if __directory_exists "$CRAWL_PATH"/hostdb; then
       echo "Processing sitemaps based on hosts in HostDB"
       __bin_nutch sitemap "$CRAWL_PATH"/crawldb -hostdb "$CRAWL_PATH"/hostdb -threads $NUM_THREADS
     fi

-- 
To stop receiving notification emails like this one, please contact
snagel@apache.org.