Posted to commits@impala.apache.org by ta...@apache.org on 2020/04/09 15:45:08 UTC

[impala] 02/02: IMPALA-9618: fix some usability issues with dev env

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 5989900ae81a98d6977bdd60f2281da47e9f69b7
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Tue Apr 7 17:08:00 2020 -0500

    IMPALA-9618: fix some usability issues with dev env
    
    Automatically assume IMPALA_HOME is the source directory
    in a couple of places.
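    
    A minimal sketch of the new default (the same pattern the diff below
    applies in bin/bootstrap_system.sh and, without the "/..", in buildall.sh;
    only the comment is added here for illustration):
    
        # If IMPALA_HOME is not already set, derive it from the script's own
        # location (assuming the script sits one level below the repo root)
        # instead of assuming ~/Impala.
        : ${IMPALA_HOME:=$(cd "$(dirname $0)"/..; pwd)}
        export IMPALA_HOME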
    
    Delete the cache_tables.py script and the MINI_DFS_BASE_DATA_DIR
    config var, both of which had bit-rotted and were unused.
    
    Allow setting IMPALA_CLUSTER_NODES_DIR to put the minicluster
    nodes, most importantly the data, in a different location, e.g.
    on a different filesystem.
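    
    For example (the path below is purely illustrative), the node
    directories, and with them the DFS data, can be placed on a
    different disk by exporting the variable before the environment
    config is sourced:
    
        # Put the minicluster node directories (and their data) elsewhere.
        export IMPALA_CLUSTER_NODES_DIR=/data/impala-cluster-nodes
        source bin/impala-config.sh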
    
    Testing:
    I set up a dev environment using this code and was able to
    load data and run some tests.
    
    Change-Id: Ibd8b42a6d045d73e3ea29015aa6ccbbde278eec7
    Reviewed-on: http://gerrit.cloudera.org:8080/15687
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 bin/bootstrap_system.sh      |   3 +-
 bin/impala-config.sh         |   4 +-
 buildall.sh                  |   4 ++
 testdata/bin/cache_tables.py | 105 -------------------------------------------
 testdata/cluster/admin       |  23 +++++-----
 tests/comparison/cluster.py  |   5 +--
 6 files changed, 21 insertions(+), 123 deletions(-)

diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh
index 115a9eb..3f32a5e 100755
--- a/bin/bootstrap_system.sh
+++ b/bin/bootstrap_system.sh
@@ -44,7 +44,8 @@
 
 set -eu -o pipefail
 
-: ${IMPALA_HOME:=~/Impala}
+: ${IMPALA_HOME:=$(cd "$(dirname $0)"/..; pwd)}
+export IMPALA_HOME
 
 if [[ -t 1 ]] # if on an interactive terminal
 then
diff --git a/bin/impala-config.sh b/bin/impala-config.sh
index be6d025..66bf94b 100755
--- a/bin/impala-config.sh
+++ b/bin/impala-config.sh
@@ -366,6 +366,7 @@ export EXTERNAL_LISTEN_HOST="${EXTERNAL_LISTEN_HOST-0.0.0.0}"
 export DEFAULT_FS="${DEFAULT_FS-hdfs://${INTERNAL_LISTEN_HOST}:20500}"
 export WAREHOUSE_LOCATION_PREFIX="${WAREHOUSE_LOCATION_PREFIX-}"
 export LOCAL_FS="file:${WAREHOUSE_LOCATION_PREFIX}"
+export IMPALA_CLUSTER_NODES_DIR="${IMPALA_CLUSTER_NODES_DIR-$IMPALA_HOME/testdata/cluster/cdh$CDH_MAJOR_VERSION}"
 
 ESCAPED_IMPALA_HOME=$(sed "s/[^0-9a-zA-Z]/_/g" <<< "$IMPALA_HOME")
 if $USE_CDP_HIVE; then
@@ -612,7 +613,6 @@ HADOOP_CLASSPATH="$LZO_JAR_PATH"
 # minicluster.
 HADOOP_CLASSPATH="${HADOOP_CLASSPATH}:${HADOOP_HOME}/share/hadoop/tools/lib/*"
 
-export MINI_DFS_BASE_DATA_DIR="$IMPALA_HOME/cdh-${CDH_MAJOR_VERSION}-hdfs-data"
 export PATH="$HADOOP_HOME/bin:$PATH"
 
 export SENTRY_HOME="$CDH_COMPONENTS_HOME/sentry-${IMPALA_SENTRY_VERSION}"
@@ -802,7 +802,7 @@ echo "HADOOP_HOME             = $HADOOP_HOME"
 echo "HADOOP_CONF_DIR         = $HADOOP_CONF_DIR"
 echo "HADOOP_INCLUDE_DIR      = $HADOOP_INCLUDE_DIR"
 echo "HADOOP_LIB_DIR          = $HADOOP_LIB_DIR"
-echo "MINI_DFS_BASE_DATA_DIR  = $MINI_DFS_BASE_DATA_DIR"
+echo "IMPALA_CLUSTER_NODES_DIR= $IMPALA_CLUSTER_NODES_DIR"
 echo "HIVE_HOME               = $HIVE_HOME"
 echo "HIVE_CONF_DIR           = $HIVE_CONF_DIR"
 echo "HIVE_SRC_DIR            = $HIVE_SRC_DIR"
diff --git a/buildall.sh b/buildall.sh
index 08dfa3b..a697c20 100755
--- a/buildall.sh
+++ b/buildall.sh
@@ -18,6 +18,10 @@
 # under the License.
 
 set -euo pipefail
+
+: ${IMPALA_HOME:=$(cd "$(dirname $0)"; pwd)}
+export IMPALA_HOME
+
 . $IMPALA_HOME/bin/report_build_error.sh
 setup_report_build_error
 
diff --git a/testdata/bin/cache_tables.py b/testdata/bin/cache_tables.py
deleted file mode 100755
index 36eb0df..0000000
--- a/testdata/bin/cache_tables.py
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/usr/bin/env impala-python
-##############################################################################
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-##############################################################################
-#
-# This script will warm up the buffer cache with the tables required to run the input
-# query.  This only works on a mini-dfs cluster.  This is remarkably difficult to do
-# since hdfs tries to hide the details of the block locations from users.
-# The only way to do this is to
-#   1. use the java APIs (deprecated, of course) to extract the block ids.
-#   2. find the files with those block ids on the file system and read them
-#
-# First run testdata/bin/generate-block-ids.sh.  This will output the block locations
-# to testdata/block-ids.  This file is good as long as the mini-dfs cluster does not
-# get new files.  If the block-ids file is not there, this script will run
-# generate-block-ids.sh.
-#
-# Run this script, passing it the query and it will go read every replica of every
-# block of every table in the query.
-import math
-import os
-import re
-import sys
-import subprocess
-import tempfile
-from optparse import OptionParser
-
-# Options
-parser = OptionParser()
-parser.add_option("-q", "--query", dest="query", default = "",
-                  help="Query to run.  If none specified, runs all queries.")
-
-(options, args) = parser.parse_args()
-
-block_ids_file = 'testdata/block-ids'
-data_node_root = os.environ['MINI_DFS_BASE_DATA_DIR'] + '/dfs/data'
-block_ids = {}
-
-# Parse the block ids file to all the block ids for all the tables
-# the format of the file is:
-# <table name>: <block_id1> <block_id2> <etc>
-def parse_block_ids():
-  full_path = os.environ['IMPALA_HOME'] + "/" + block_ids_file;
-  if not os.path.isfile(full_path):
-    cmd = os.environ['IMPALA_HOME'] + '/testdata/bin/generate-block-ids.sh'
-    os.system(cmd)
-
-  if not os.path.isfile(full_path):
-    raise Exception("Could not find/generate block id files: " + full_path)
-
-  f = open(full_path);
-  for line in f:
-    tokens = line.split(':')
-    blocks = tokens[1].strip().split(' ')
-    block_ids[tokens[0].strip()] = blocks
-  
-# Parse for the tables used in this query
-def parse_tables(query):
-  table_predecessor = ['from', 'join']
-  tokens = query.split(' ')
-  tables = []
-  next_is_table = False
-  for t in tokens:
-    t = t.lower()
-    if next_is_table:
-      tables.append(t)
-      next_is_table = False
-    if t in table_predecessor:
-      next_is_table = True
-  return tables
-
-# Warm the buffer cache by cat-ing all the blocks to /dev/null
-def warm_buffer_cache(table):
-  if table not in block_ids:
-    raise Exception("Table not found: " + table)
-
-  blocks = block_ids[table]
-  for block in blocks:
-    cmd = 'find %s -type f -name blk_%s* -exec cat {} > /dev/null \;' % \
-          (data_node_root, block)
-    os.system(cmd)
-
-tables = parse_tables(options.query)
-parse_block_ids()
-
-if len(tables) == 0:
-  raise Exception("Could not parse tables in: " + options.query)
-
-for table in tables:
-  warm_buffer_cache(table)
diff --git a/testdata/cluster/admin b/testdata/cluster/admin
index dee8dbf..d777133 100755
--- a/testdata/cluster/admin
+++ b/testdata/cluster/admin
@@ -48,7 +48,6 @@ done
 shift $(($OPTIND-1))
 
 DIR=$(dirname $0)
-NODES_DIR="$DIR/cdh$CDH_MAJOR_VERSION"
 NODE_COUNT=3
 if [[ "$TARGET_FILESYSTEM" == "hdfs" && "$ERASURE_CODING" = true ]]; then
   NODE_COUNT=5
@@ -191,13 +190,13 @@ function is_kerberized {
 
 function cluster_exists {
   # Just use the first node as an indicator...
-  if [[ ! -e "$NODES_DIR/${NODE_PREFIX}1" ]]; then
+  if [[ ! -e "$IMPALA_CLUSTER_NODES_DIR/${NODE_PREFIX}1" ]]; then
     return 1
   fi
 }
 
 function create_cluster {
-  mkdir -p "$NODES_DIR"
+  mkdir -p "$IMPALA_CLUSTER_NODES_DIR"
 
   # Used to populate config templates later
   GROUP=$(id -gn)
@@ -384,7 +383,7 @@ function exec_init_script {
   local CMD="$1"
 
   local PIDS=()
-  for SCRIPT in $(find "$NODES_DIR" -path "*/$NODE_PREFIX*/etc/init.d/$SCRIPT_NAME" \
+  for SCRIPT in $(find "$IMPALA_CLUSTER_NODES_DIR" -path "*/$NODE_PREFIX*/etc/init.d/$SCRIPT_NAME" \
       $FIND_EXECUTABLE_FILTER -type f); do
     if "$SCRIPT" status &>/dev/null; then
       RUNNING=true
@@ -419,7 +418,7 @@ function check_cluster_status {
 
   ROLE_COUNT=0
   NOT_RUNNING=()
-  for NODE_DIR in "$NODES_DIR/$NODE_PREFIX"*; do
+  for NODE_DIR in "$IMPALA_CLUSTER_NODES_DIR/$NODE_PREFIX"*; do
     for SERVICE in ${SUPPORTED_SERVICES[@]-}; do
       for SCRIPT in $(find "$NODE_DIR" -path "*/etc/init.d/$SERVICE*" $FIND_EXECUTABLE_FILTER \
           -type f); do
@@ -472,30 +471,30 @@ function restart {
 
 function delete_data {
   # Delete namenode, datanode and KMS data while preserving directory structure.
-  rm -rf "$NODES_DIR/$NODE_PREFIX"*/data/dfs/{nn,dn}/*
-  rm -f "$NODES_DIR/$NODE_PREFIX"*/data/kms.keystore
+  rm -rf "$IMPALA_CLUSTER_NODES_DIR/$NODE_PREFIX"*/data/dfs/{nn,dn}/*
+  rm -f "$IMPALA_CLUSTER_NODES_DIR/$NODE_PREFIX"*/data/kms.keystore
   delete_kudu_data
 }
 
 function delete_kudu_data {
-  rm -rf "$NODES_DIR/$NODE_PREFIX"*/var/lib/kudu/{master,ts}/*
+  rm -rf "$IMPALA_CLUSTER_NODES_DIR/$NODE_PREFIX"*/var/lib/kudu/{master,ts}/*
 }
 
 function delete_cluster {
   pkill -u $USER -f $KILL_CLUSTER_MARKER || true
-  rm -rf "$NODES_DIR"
+  rm -rf "$IMPALA_CLUSTER_NODES_DIR"
 }
 
 function get_node_dir {
   if $IS_OSX; then
-    greadlink -f "$NODES_DIR/$1"
+    greadlink -f "$IMPALA_CLUSTER_NODES_DIR/$1"
   else
-    readlink -f "$NODES_DIR/$1"
+    readlink -f "$IMPALA_CLUSTER_NODES_DIR/$1"
   fi
 }
 
 function get_hadoop_client_conf_dir {
-  echo "$NODES_DIR/$NODE_PREFIX"1/etc/hadoop/conf
+  echo "$IMPALA_CLUSTER_NODES_DIR/$NODE_PREFIX"1/etc/hadoop/conf
 }
 
 COMMAND=$1
diff --git a/tests/comparison/cluster.py b/tests/comparison/cluster.py
index 4951d51..6f850e2 100644
--- a/tests/comparison/cluster.py
+++ b/tests/comparison/cluster.py
@@ -209,9 +209,8 @@ class MiniCluster(Cluster):
       shutil.copy(os.path.join(other_conf_dir, file_name), self._local_hadoop_conf_dir)
 
   def _get_node_conf_dir(self):
-    return os.path.join(os.environ["IMPALA_HOME"], "testdata", "cluster",
-                        "cdh%s" % os.environ["CDH_MAJOR_VERSION"], "node-1",
-                        "etc", "hadoop", "conf")
+    return os.path.join(os.environ["IMPALA_CLUSTER_NODES_DIR"],
+                        "node-1", "etc", "hadoop", "conf")
 
   def _get_other_conf_dir(self):
     return os.path.join(os.environ["IMPALA_HOME"], "fe", "src", "test",