You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/16 23:01:48 UTC
[impala] 03/03: IMPALA-8841: Try to fix Tez related dataload flakiness
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit df2c6f200f66e6849e17ef177c99adf035766d6a
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Fri Aug 16 16:18:29 2019 +0200
IMPALA-8841: Try to fix Tez related dataload flakiness
The flakiness may be related to starting Hive queries in parallel, which
triggers initialization of Tez resources in parallel (this is only needed
for the first statement that uses Tez). Running a single non-parallel
statement first may solve the issue.
Also includes a fix for a recent issue in 'build-and-copy-hive-udfs'
that was introduced by the version bump in
https://gerrit.cloudera.org/#/c/14043/
Change-Id: Id21d57483fe7a4f72f450fb71f8f53b3c1ef6327
Reviewed-on: http://gerrit.cloudera.org:8080/14081
Reviewed-by: Vihang Karajgaonkar <vi...@cloudera.com>
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Tim Armstrong <ta...@cloudera.com>
---
testdata/bin/create-load-data.sh | 15 +++++++++++++++
tests/test-hive-udfs/pom.xml | 4 ++++
2 files changed, 19 insertions(+)
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 74f0f63..a081280 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -51,6 +51,7 @@ SNAPSHOT_FILE=""
LOAD_DATA_ARGS=""
EXPLORATION_STRATEGY="exhaustive"
export JDBC_URL="jdbc:hive2://${HS2_HOST_PORT}/default;"
+HIVE_CMD="beeline -n $USER -u $JDBC_URL"
# For logging when using run-step.
LOG_DIR=${IMPALA_DATA_LOADING_LOGS_DIR}
@@ -588,6 +589,15 @@ function check-hdfs-health {
done
}
+function warm-up-hive {
+ echo "Running warm up Hive statements"
+ $HIVE_CMD -e "create database if not exists functional;"
+ $HIVE_CMD -e "create table if not exists hive_warm_up_tbl (i int);"
+ # The insert below starts a Tez session (if Hive uses Tez) and initializes the
+ # .hiveJars directory in HDFS; see IMPALA-8841.
+ $HIVE_CMD -e "insert overwrite table hive_warm_up_tbl values (1);"
+}
+
# For kerberized clusters, use kerberos
if ${CLUSTER_DIR}/admin is_kerberized; then
LOAD_DATA_ARGS="${LOAD_DATA_ARGS} --use_kerberos --principal=${MINIKDC_PRINC_HIVE}"
@@ -607,6 +617,11 @@ if [[ "${TARGET_FILESYSTEM}" == "hdfs" ]]; then
fi
if [ $SKIP_METADATA_LOAD -eq 0 ]; then
+ # Using Hive in non-parallel mode before starting parallel execution may help with some
+ # flakiness during data load; see IMPALA-8841. The problem only occurs in Hive 3
+ # environments, but always doing the warm-up shouldn't hurt much and may make it easier
+ # to investigate future issues where Hive doesn't work at all.
+ warm-up-hive
run-step "Loading custom schemas" load-custom-schemas.log load-custom-schemas
# Run some steps in parallel, with run-step-backgroundable / run-step-wait-all.
# This is effective on steps that take a long time and don't depend on each
diff --git a/tests/test-hive-udfs/pom.xml b/tests/test-hive-udfs/pom.xml
index eb3ff82..e51d598 100644
--- a/tests/test-hive-udfs/pom.xml
+++ b/tests/test-hive-udfs/pom.xml
@@ -68,6 +68,10 @@ under the License.
<groupId>net.minidev</groupId>
<artifactId>json-smart</artifactId>
</exclusion>
+ <exclusion>
+ <groupId>org.apache.hive.shims</groupId>
+ <artifactId>hive-shims-0.20</artifactId>
+ </exclusion>
</exclusions>
</dependency>
<dependency>