You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/03/22 03:47:54 UTC

[2/4] impala git commit: IMPALA-6394: Restart HDFS only when no replication progress is made

IMPALA-6394: Restart HDFS only when no replication progress is made

In wait-hdfs-replication, restarting HDFS frequently and eagerly might slow
HDFS replication down. HDFS should be restarted only if no progress is
made in a certain amount of time (i.e. the number of under-replicated
blocks is unchanged between two consecutive fsck checks), and we should
wait longer before failing the data loading.

Testing: Tested with a fake HDFS fsck script.

Change-Id: Ib059480254643dc032731b4b3c55204a93b61e77
Reviewed-on: http://gerrit.cloudera.org:8080/9698
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/d03b66ca
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/d03b66ca
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/d03b66ca

Branch: refs/heads/master
Commit: d03b66ca3550a525282e33a8002f4bbdc9a2cb60
Parents: c9cb593
Author: Tianyi Wang <tw...@cloudera.com>
Authored: Thu Mar 15 13:32:16 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Mar 22 00:41:16 2018 +0000

----------------------------------------------------------------------
 testdata/bin/create-load-data.sh | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/d03b66ca/testdata/bin/create-load-data.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 787baca..311029d 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -450,25 +450,34 @@ function copy-and-load-ext-data-source {
 }
 
 function wait-hdfs-replication {
-  MAX_RETRIES=6
-  for ((RESTART_COUNT = 0; RESTART_COUNT <= MAX_RETRIES; ++RESTART_COUNT)); do
-    sleep "$((RESTART_COUNT * 10))"
+  MAX_FSCK=30
+  SLEEP_SEC=120
+  LAST_NUMBER_UNDER_REPLICATED=-1
+  for ((FSCK_COUNT = 0; FSCK_COUNT <= MAX_FSCK; FSCK_COUNT++)); do
     FSCK_OUTPUT="$(hdfs fsck /test-warehouse)"
     echo "$FSCK_OUTPUT"
-    if grep "Under-replicated blocks:[[:space:]]*0" <<< "$FSCK_OUTPUT"; then
+    NUMBER_UNDER_REPLICATED=$(
+        grep -oP "Under-replicated blocks:[[:space:]]*\K[[:digit:]]*" <<< "$FSCK_OUTPUT")
+    if [[ "$NUMBER_UNDER_REPLICATED" -eq 0 ]] ; then
       # All the blocks are fully-replicated. The data loading can continue.
       return
     fi
-    if [[ "$RESTART_COUNT" -eq "$MAX_RETRIES" ]] ; then
-      echo "Some HDFS blocks are still under-replicated after restarting HDFS"\
-          "$MAX_RETRIES times."
+    if [[ $(($FSCK_COUNT + 1)) -eq "$MAX_FSCK" ]] ; then
+      echo "Some HDFS blocks are still under-replicated after running HDFS fsck"\
+          "$MAX_FSCK times."
       echo "Some tests cannot pass without fully-replicated blocks (IMPALA-3887)."
       echo "Failing the data loading."
       exit 1
     fi
-    echo "There are under-replicated blocks in HDFS. Attempting to restart HDFS to"\
-        "resolve this issue."
-    ${IMPALA_HOME}/testdata/bin/run-mini-dfs.sh
+    if [[ "$NUMBER_UNDER_REPLICATED" -eq "$LAST_NUMBER_UNDER_REPLICATED" ]] ; then
+      echo "There are under-replicated blocks in HDFS and HDFS is not making progress"\
+          "in $SLEEP_SEC seconds. Attempting to restart HDFS to resolve this issue."
+      ${IMPALA_HOME}/testdata/bin/run-mini-dfs.sh
+    fi
+    LAST_NUMBER_UNDER_REPLICATED="$NUMBER_UNDER_REPLICATED"
+    echo "$NUMBER_UNDER_REPLICATED under replicated blocks remaining."
+    echo "Sleeping for $SLEEP_SEC seconds before rechecking."
+    sleep "$SLEEP_SEC"
   done
 }