You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/16 23:01:48 UTC

[impala] 03/03: IMPALA-8841: Try to fix Tez related dataload flakiness

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit df2c6f200f66e6849e17ef177c99adf035766d6a
Author: Csaba Ringhofer <cs...@cloudera.com>
AuthorDate: Fri Aug 16 16:18:29 2019 +0200

    IMPALA-8841: Try to fix Tez related dataload flakiness
    
    The flakiness may be related to starting Hive queries in parallel which
    triggers initializing Tez resources in parallel (only needed at the
    first statement that uses Tez). Running a single non-parallel statement
    first may solve the issue.
    
    Also includes a fix for a recent issue in 'build-and-copy-hive-udfs'
    introduced by the version bump in https://gerrit.cloudera.org/#/c/14043/
    
    Change-Id: Id21d57483fe7a4f72f450fb71f8f53b3c1ef6327
    Reviewed-on: http://gerrit.cloudera.org:8080/14081
    Reviewed-by: Vihang Karajgaonkar <vi...@cloudera.com>
    Reviewed-by: Tim Armstrong <ta...@cloudera.com>
    Tested-by: Tim Armstrong <ta...@cloudera.com>
---
 testdata/bin/create-load-data.sh | 15 +++++++++++++++
 tests/test-hive-udfs/pom.xml     |  4 ++++
 2 files changed, 19 insertions(+)

diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 74f0f63..a081280 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -51,6 +51,7 @@ SNAPSHOT_FILE=""
 LOAD_DATA_ARGS=""
 EXPLORATION_STRATEGY="exhaustive"
 export JDBC_URL="jdbc:hive2://${HS2_HOST_PORT}/default;"
+HIVE_CMD="beeline -n $USER -u $JDBC_URL"
 
 # For logging when using run-step.
 LOG_DIR=${IMPALA_DATA_LOADING_LOGS_DIR}
@@ -588,6 +589,15 @@ function check-hdfs-health {
   done
 }
 
+function warm-up-hive {
+  echo "Running warm up Hive statements"
+  $HIVE_CMD -e "create database if not exists functional;"
+  $HIVE_CMD -e "create table if not exists hive_warm_up_tbl (i int);"
+  # The insert below starts a Tez session (if Hive uses Tez) and initializes
+  # .hiveJars directory in HDFS, see IMPALA-8841.
+  $HIVE_CMD -e "insert overwrite table hive_warm_up_tbl values (1);"
+}
+
 # For kerberized clusters, use kerberos
 if ${CLUSTER_DIR}/admin is_kerberized; then
   LOAD_DATA_ARGS="${LOAD_DATA_ARGS} --use_kerberos --principal=${MINIKDC_PRINC_HIVE}"
@@ -607,6 +617,11 @@ if [[ "${TARGET_FILESYSTEM}" == "hdfs" ]]; then
 fi
 
 if [ $SKIP_METADATA_LOAD -eq 0 ]; then
+  # Using Hive in non-parallel mode before starting parallel execution may help with some
+  # flakiness during data load, see IMPALA-8841. The problem only occurs in Hive 3
+  # environment, but always doing the warm up shouldn't hurt much and may make it easier
+  # to investigate future issues where Hive doesn't work at all.
+  warm-up-hive
   run-step "Loading custom schemas" load-custom-schemas.log load-custom-schemas
   # Run some steps in parallel, with run-step-backgroundable / run-step-wait-all.
   # This is effective on steps that take a long time and don't depend on each
diff --git a/tests/test-hive-udfs/pom.xml b/tests/test-hive-udfs/pom.xml
index eb3ff82..e51d598 100644
--- a/tests/test-hive-udfs/pom.xml
+++ b/tests/test-hive-udfs/pom.xml
@@ -68,6 +68,10 @@ under the License.
           <groupId>net.minidev</groupId>
           <artifactId>json-smart</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>org.apache.hive.shims</groupId>
+          <artifactId>hive-shims-0.20</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>