You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/03/22 03:47:54 UTC
[2/4] impala git commit: IMPALA-6394: Restart HDFS only when no replication progress is made
IMPALA-6394: Restart HDFS only when no replication progress is made
In wait-hdfs-replication, restarting HDFS frequently and eagerly can slow
HDFS replication down. HDFS should be restarted only if no progress has been
made within a certain amount of time, and we should wait longer before
failing the data loading.
Testing: Tested with a fake HDFS fsck script.
Change-Id: Ib059480254643dc032731b4b3c55204a93b61e77
Reviewed-on: http://gerrit.cloudera.org:8080/9698
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/d03b66ca
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/d03b66ca
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/d03b66ca
Branch: refs/heads/master
Commit: d03b66ca3550a525282e33a8002f4bbdc9a2cb60
Parents: c9cb593
Author: Tianyi Wang <tw...@cloudera.com>
Authored: Thu Mar 15 13:32:16 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Mar 22 00:41:16 2018 +0000
----------------------------------------------------------------------
testdata/bin/create-load-data.sh | 29 +++++++++++++++++++----------
1 file changed, 19 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/d03b66ca/testdata/bin/create-load-data.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 787baca..311029d 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -450,25 +450,34 @@ function copy-and-load-ext-data-source {
}
function wait-hdfs-replication {
- MAX_RETRIES=6
- for ((RESTART_COUNT = 0; RESTART_COUNT <= MAX_RETRIES; ++RESTART_COUNT)); do
- sleep "$((RESTART_COUNT * 10))"
+ MAX_FSCK=30
+ SLEEP_SEC=120
+ LAST_NUMBER_UNDER_REPLICATED=-1
+ for ((FSCK_COUNT = 0; FSCK_COUNT <= MAX_FSCK; FSCK_COUNT++)); do
FSCK_OUTPUT="$(hdfs fsck /test-warehouse)"
echo "$FSCK_OUTPUT"
- if grep "Under-replicated blocks:[[:space:]]*0" <<< "$FSCK_OUTPUT"; then
+ NUMBER_UNDER_REPLICATED=$(
+ grep -oP "Under-replicated blocks:[[:space:]]*\K[[:digit:]]*" <<< "$FSCK_OUTPUT")
+ if [[ "$NUMBER_UNDER_REPLICATED" -eq 0 ]] ; then
# All the blocks are fully-replicated. The data loading can continue.
return
fi
- if [[ "$RESTART_COUNT" -eq "$MAX_RETRIES" ]] ; then
- echo "Some HDFS blocks are still under-replicated after restarting HDFS"\
- "$MAX_RETRIES times."
+ if [[ $(($FSCK_COUNT + 1)) -eq "$MAX_FSCK" ]] ; then
+ echo "Some HDFS blocks are still under-replicated after running HDFS fsck"\
+ "$MAX_FSCK times."
echo "Some tests cannot pass without fully-replicated blocks (IMPALA-3887)."
echo "Failing the data loading."
exit 1
fi
- echo "There are under-replicated blocks in HDFS. Attempting to restart HDFS to"\
- "resolve this issue."
- ${IMPALA_HOME}/testdata/bin/run-mini-dfs.sh
+ if [[ "$NUMBER_UNDER_REPLICATED" -eq "$LAST_NUMBER_UNDER_REPLICATED" ]] ; then
+ echo "There are under-replicated blocks in HDFS and HDFS is not making progress"\
+ "in $SLEEP_SEC seconds. Attempting to restart HDFS to resolve this issue."
+ ${IMPALA_HOME}/testdata/bin/run-mini-dfs.sh
+ fi
+ LAST_NUMBER_UNDER_REPLICATED="$NUMBER_UNDER_REPLICATED"
+ echo "$NUMBER_UNDER_REPLICATED under replicated blocks remaining."
+ echo "Sleeping for $SLEEP_SEC seconds before rechecking."
+ sleep "$SLEEP_SEC"
done
}