You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by as...@apache.org on 2022/02/19 01:29:46 UTC

[impala] 01/03: IMPALA-11124: Reuse local TPCH/TPCDS data in testdata loading

This is an automated email from the ASF dual-hosted git repository.

asherman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 1697af02d6c96b82f19cc75235719afcb864ebe2
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Tue Feb 15 16:21:15 2022 +0800

    IMPALA-11124: Reuse local TPCH/TPCDS data in testdata loading
    
    When loading testdata for TPC-H/TPC-DS, we first run a preload script to
    generate local data, and then upload it to HDFS to be used by Hive.
    The preload script currently always regenerates the data, which is
    time-consuming at large scale factors.
    
    This patch modifies the preload scripts to check whether the last run
    succeeded, and to reuse the existing data if it did. Otherwise, they
    generate the data and leave a SUCCESS marker file in the data directory.
    
    Tests:
     - Verified the scripts locally.
    
    Change-Id: Ied40e599cda009ae0ad88ad13385e7bb86428bb4
    Reviewed-on: http://gerrit.cloudera.org:8080/18233
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 testdata/datasets/tpcds/preload | 7 +++++++
 testdata/datasets/tpch/preload  | 7 +++++++
 2 files changed, 14 insertions(+)

diff --git a/testdata/datasets/tpcds/preload b/testdata/datasets/tpcds/preload
index ba9358d..99b1f7b 100755
--- a/testdata/datasets/tpcds/preload
+++ b/testdata/datasets/tpcds/preload
@@ -30,6 +30,11 @@ then
   TPC_DS_DATA=${TPC_DS_DATA}${SCALE_FACTOR}
 fi
 
+if [ -f ${TPC_DS_DATA}/SUCCESS ]; then
+  echo "Reuse existing TPC-DS data in ${TPC_DS_DATA}"
+  exit 0
+fi
+
 TPC_DS_DIRNAME=tpc-ds-${IMPALA_TPC_DS_VERSION}
 TPC_DS_HOME=${IMPALA_TOOLCHAIN_PACKAGES_HOME}/${TPC_DS_DIRNAME}
 
@@ -65,3 +70,5 @@ for FILE in *.dat; do
   mkdir -p ${FILE_DIR}
   mv ${FILE} ${FILE_DIR}
 done
+
+touch SUCCESS
diff --git a/testdata/datasets/tpch/preload b/testdata/datasets/tpch/preload
index 5b2d0e1..4bbfe06 100755
--- a/testdata/datasets/tpch/preload
+++ b/testdata/datasets/tpch/preload
@@ -30,6 +30,11 @@ then
   TPC_H_DATA=${TPC_H_DATA}${SCALE_FACTOR}
 fi
 
+if [ -f ${TPC_H_DATA}/SUCCESS ]; then
+  echo "Reuse existing TPC-H data in ${TPC_H_DATA}"
+  exit 0
+fi
+
 TPC_H_HOME=${IMPALA_TOOLCHAIN_PACKAGES_HOME}/tpc-h-${IMPALA_TPC_H_VERSION}
 TPC_H_DBGEN=${TPC_H_HOME}/bin/dbgen
 
@@ -63,3 +68,5 @@ for FILE in *.tbl; do
   mkdir -p ${FILE_DIR}
   mv ${FILE} ${FILE_DIR}
 done
+
+touch SUCCESS