Posted to commits@impala.apache.org by ta...@apache.org on 2017/09/21 17:38:21 UTC

[1/2] incubator-impala git commit: IMPALA-5927: Fix enable_distcc for zsh

Repository: incubator-impala
Updated Branches:
  refs/heads/master fc275fab6 -> f87da848f


IMPALA-5927: Fix enable_distcc for zsh

enable_distcc no longer worked on zsh, because it relied on automatic
word splitting of unquoted variables, which bash performs but zsh does
not.

This change moves the clean-up actions for CMake files into a separate
bash script.

This change also unifies variable quoting in clean.sh.
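
A minimal sketch of the splitting difference, using a hypothetical
variable that holds a flag plus its argument:

  opts="-maxdepth 1"              # hypothetical value containing a space
  find . $opts -name '*.cmake'
  # bash word-splits the unquoted expansion and runs:
  #   find . -maxdepth 1 -name '*.cmake'
  # zsh passes "-maxdepth 1" as a single word, so find rejects it
  # as an unknown predicate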

Change-Id: I88284e4f68c309bb46ce4b5a842ccc576cd487ed
Reviewed-on: http://gerrit.cloudera.org:8080/8049
Reviewed-by: Lars Volker <lv...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/fa93a47d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/fa93a47d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/fa93a47d

Branch: refs/heads/master
Commit: fa93a47dd70ad23d45dfc8b9498d00a4c52e54da
Parents: fc275fa
Author: Lars Volker <lv...@cloudera.com>
Authored: Tue Sep 12 21:32:58 2017 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Thu Sep 21 00:56:36 2017 +0000

----------------------------------------------------------------------
 bin/clean-cmake.sh       | 36 ++++++++++++++++++++++++++++++++++++
 bin/clean.sh             | 30 +++++++++++++-----------------
 bin/distcc/distcc_env.sh |  7 +------
 3 files changed, 50 insertions(+), 23 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/fa93a47d/bin/clean-cmake.sh
----------------------------------------------------------------------
diff --git a/bin/clean-cmake.sh b/bin/clean-cmake.sh
new file mode 100755
index 0000000..a708391
--- /dev/null
+++ b/bin/clean-cmake.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Removes artifacts generated by cmake.
+
+set -euo pipefail
+trap 'echo Error in ${0} at line ${LINENO}: $(cd "'${PWD}'" && awk "NR == ${LINENO}" \
+  ${0})' ERR
+
+if [[ -z "${IMPALA_HOME:-}" || ! -d "${IMPALA_HOME:-}" ]]; then
+  echo "IMPALA_HOME=${IMPALA_HOME:-} is not valid." 1>&2
+  exit 1
+fi
+# Remove trailing /
+ROOT_DIR=${IMPALA_HOME%%/}
+for loc in "${ROOT_DIR}/ -maxdepth 1" "${ROOT_DIR}/be/" "${ROOT_DIR}/fe/" "${ROOT_DIR}/common/"\
+           "${ROOT_DIR}/ext-data-source/"; do
+  find ${loc} \( -iname CMakeCache.txt -o -iname CMakeFiles \
+       -o -iname CTestTestfile.cmake -o -iname cmake_install.cmake \) -exec rm -Rf {} +
+done
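
Note that the unquoted ${loc} in the find invocation above is deliberate:
the first list entry carries "-maxdepth 1" next to the path and relies on
bash word splitting to separate the two, which is why this logic now lives
in a script with a bash shebang rather than being sourced into the user's
(possibly zsh) interactive shell. Roughly:

  loc="${ROOT_DIR}/ -maxdepth 1"          # one string, two find arguments
  find ${loc} -iname CMakeCache.txt       # bash: find <root>/ -maxdepth 1 -iname ...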

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/fa93a47d/bin/clean.sh
----------------------------------------------------------------------
diff --git a/bin/clean.sh b/bin/clean.sh
index aeb6067..a8a42c1 100755
--- a/bin/clean.sh
+++ b/bin/clean.sh
@@ -22,36 +22,37 @@
 # branch to a non-toolchain branch due to caching in CMake generated files.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+trap 'echo Error in ${0} at line ${LINENO}: $(cd "'${PWD}'" && awk "NR == ${LINENO}" \
+  ${0})' ERR
 
 # If the project was never built, no Makefile will exist, and make clean would fail.
 # Combine the make command with the bash no-op (:) to always return true.
 "${MAKE_CMD:-make}" clean || :
 
 # clean the external data source project
-pushd ${IMPALA_HOME}/ext-data-source
+pushd "${IMPALA_HOME}/ext-data-source"
 rm -rf api/generated-sources/*
 ${IMPALA_HOME}/bin/mvn-quiet.sh clean
 popd
 
 # clean fe
 # don't use git clean because we need to retain Eclipse conf files
-pushd $IMPALA_FE_DIR
+pushd "${IMPALA_FE_DIR}"
 rm -rf target
 rm -f src/test/resources/{core,hbase,hive}-site.xml
 rm -rf generated-sources/*
-[ -z "$IMPALA_LOGS_DIR" ] || rm -rf "${IMPALA_LOGS_DIR}"/*
-mkdir -p $IMPALA_ALL_LOGS_DIRS
+[ -z "${IMPALA_LOGS_DIR}" ] || rm -rf "${IMPALA_LOGS_DIR}"/*
+mkdir -p "${IMPALA_ALL_LOGS_DIRS}"
 popd
 
 # clean be
-pushd "$IMPALA_HOME/be"
+pushd "${IMPALA_HOME}/be"
 # remove everything listed in .gitignore
 git rev-parse 2>/dev/null && git clean -Xdfq
 popd
 
 # clean shell build artifacts
-pushd "$IMPALA_HOME/shell"
+pushd "${IMPALA_HOME}/shell"
 # remove everything listed in .gitignore
 git rev-parse 2>/dev/null && git clean -Xdfq
 popd
@@ -63,20 +64,15 @@ find . -type d -name "__pycache__" -delete
 popd
 
 # clean llvm
-rm -f "$IMPALA_HOME/llvm-ir/"impala*.ll
-rm -f "$IMPALA_HOME/be/generated-sources/impala-ir/"*
+rm -f "${IMPALA_HOME}/llvm-ir/"impala*.ll
+rm -f "${IMPALA_HOME}/be/generated-sources/impala-ir/"*
 
 # Cleanup Impala-lzo
-if [ -e "$IMPALA_LZO" ]; then
-  pushd "$IMPALA_LZO"
+if [ -e "${IMPALA_LZO}" ]; then
+  pushd "${IMPALA_LZO}"
   git rev-parse 2>/dev/null && git clean -fdx
   popd
 fi
 
 # When switching to and from toolchain, make sure to remove all CMake generated files
-ROOT_DIR=${IMPALA_HOME%%/}
-for loc in "${ROOT_DIR}/ -maxdepth 1" "$ROOT_DIR/be/" "$ROOT_DIR/fe/" "$ROOT_DIR/common/"\
-           "$ROOT_DIR/ext-data-source/"; do
-  find $loc \( -iname CMakeCache.txt -o -iname CMakeFiles \
-       -o -iname CTestTestfile.cmake -o -iname cmake_install.cmake \) -exec rm -Rf {} +
-done
+"${IMPALA_HOME}/bin/clean-cmake.sh"

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/fa93a47d/bin/distcc/distcc_env.sh
----------------------------------------------------------------------
diff --git a/bin/distcc/distcc_env.sh b/bin/distcc/distcc_env.sh
index e1129e3..92e27a7 100644
--- a/bin/distcc/distcc_env.sh
+++ b/bin/distcc/distcc_env.sh
@@ -114,12 +114,7 @@ function clean_cmake_files {
     echo IMPALA_HOME=$IMPALA_HOME is not valid. 1>&2
     return 1
   fi
-  ROOT_DIR=${IMPALA_HOME%%/}
-  for loc in "${ROOT_DIR}/ -maxdepth 1" "$ROOT_DIR/be/" "$ROOT_DIR/fe/" \
-             "$ROOT_DIR/common/" "$ROOT_DIR/ext-data-source/"; do
-    find $loc \( -iname CMakeCache.txt -o -iname CMakeFiles \
-         -o -iname CTestTestfile.cmake -o -iname cmake_install.cmake \) -exec rm -Rf {} +
-  done
+  "${IMPALA_HOME}/bin/clean-cmake.sh"
 }
 
 function switch_compiler {


[2/2] incubator-impala git commit: IMPALA-4863/IMPALA-5311: Correctly account for the file type and compression codec

Posted by ta...@apache.org.
IMPALA-4863/IMPALA-5311: Correctly account for the file type and compression codec

If a scan range is skipped at runtime, the scan node never reads the
range and therefore never discovers the compression codec used to
compress the underlying files. In such a scenario we used to default
the compression codec to NONE, which can be misleading. This change
marks these files as skipped in the scan node profile, e.g.:

  File Formats: TEXT/NONE:364 TEXT/NONE(Skipped):1460
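One way to eyeball the new counter (a sketch, assuming a running cluster
and that impala-shell's "profile" command can be chained after the query):

  impala-shell -q "select count(*) from tpcds_parquet.store_sales; profile;" \
      | grep 'File Formats:'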

Change-Id: I797916505f62e568f4159e07099481b8ff571da2
Reviewed-on: http://gerrit.cloudera.org:8080/7245
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Tim Armstrong <ta...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/f87da848
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/f87da848
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/f87da848

Branch: refs/heads/master
Commit: f87da848f5f204ae0dc84ffd9de64007e197c4d9
Parents: fa93a47
Author: aphadke <ap...@cloudera.com>
Authored: Mon Jun 19 16:34:57 2017 -0700
Committer: Tim Armstrong <ta...@cloudera.com>
Committed: Thu Sep 21 17:38:08 2017 +0000

----------------------------------------------------------------------
 be/src/exec/hdfs-parquet-scanner.cc             |  9 ++++--
 be/src/exec/hdfs-scan-node-base.cc              | 28 ++++++++++++++----
 be/src/exec/hdfs-scan-node-base.h               | 12 ++++++--
 be/src/exec/hdfs-scan-node.cc                   |  4 +--
 be/src/exec/hdfs-scan-node.h                    |  2 +-
 .../queries/QueryTest/hdfs_scanner_profile.test | 30 ++++++++++++++++++--
 tests/query_test/test_scanners.py               |  6 ++--
 7 files changed, 71 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f87da848/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index 4cd4340..57f1e24 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -315,9 +315,12 @@ void HdfsParquetScanner::Close(RowBatch* row_batch) {
   assemble_rows_timer_.ReleaseCounter();
 
   // If this was a metadata only read (i.e. count(*)), there are no columns.
-  if (compression_types.empty()) compression_types.push_back(THdfsCompression::NONE);
-  scan_node_->RangeComplete(THdfsFileFormat::PARQUET, compression_types);
-
+  if (compression_types.empty()) {
+    compression_types.push_back(THdfsCompression::NONE);
+    scan_node_->RangeComplete(THdfsFileFormat::PARQUET, compression_types, true);
+  } else {
+    scan_node_->RangeComplete(THdfsFileFormat::PARQUET, compression_types);
+  }
   if (schema_resolver_.get() != nullptr) schema_resolver_.reset();
 
   ScalarExprEvaluator::Close(min_max_conjunct_evals_, state_);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f87da848/be/src/exec/hdfs-scan-node-base.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node-base.cc b/be/src/exec/hdfs-scan-node-base.cc
index b5169a8..e74efcd 100644
--- a/be/src/exec/hdfs-scan-node-base.cc
+++ b/be/src/exec/hdfs-scan-node-base.cc
@@ -553,7 +553,7 @@ bool HdfsScanNodeBase::FilePassesFilterPredicates(
           filter_ctxs)) {
     for (int j = 0; j < file->splits.size(); ++j) {
       // Mark range as complete to ensure progress.
-      RangeComplete(format, file->file_compression);
+      RangeComplete(format, file->file_compression, true);
     }
     return false;
   }
@@ -775,18 +775,18 @@ bool HdfsScanNodeBase::PartitionPassesFilters(int32_t partition_id,
 }
 
 void HdfsScanNodeBase::RangeComplete(const THdfsFileFormat::type& file_type,
-    const THdfsCompression::type& compression_type) {
+    const THdfsCompression::type& compression_type, bool skipped) {
   vector<THdfsCompression::type> types;
   types.push_back(compression_type);
-  RangeComplete(file_type, types);
+  RangeComplete(file_type, types, skipped);
 }
 
 void HdfsScanNodeBase::RangeComplete(const THdfsFileFormat::type& file_type,
-    const vector<THdfsCompression::type>& compression_types) {
+    const vector<THdfsCompression::type>& compression_types, bool skipped) {
   scan_ranges_complete_counter()->Add(1);
   progress_.Update(1);
   for (int i = 0; i < compression_types.size(); ++i) {
-    ++file_type_counts_[make_pair(file_type, compression_types[i])];
+    ++file_type_counts_[std::make_tuple(file_type, skipped, compression_types[i])];
   }
 }
 
@@ -871,7 +871,23 @@ void HdfsScanNodeBase::StopAndFinalizeCounters() {
     {
       for (FileTypeCountsMap::const_iterator it = file_type_counts_.begin();
           it != file_type_counts_.end(); ++it) {
-        ss << it->first.first << "/" << it->first.second << ":" << it->second << " ";
+
+        THdfsFileFormat::type file_format = std::get<0>(it->first);
+        bool skipped = std::get<1>(it->first);
+        THdfsCompression::type compression_type = std::get<2>(it->first);
+
+        if (skipped) {
+          if (file_format == THdfsFileFormat::PARQUET) {
+            // If a scan range stored as parquet is skipped, its compression type
+            // cannot be figured out without reading the data.
+            ss << file_format << "/" << "Unknown" << "(Skipped):" << it->second << " ";
+          } else {
+            ss << file_format << "/" << compression_type << "(Skipped):"
+               << it->second << " ";
+          }
+        } else {
+          ss << file_format << "/" << compression_type << ":" << it->second << " ";
+        }
       }
     }
     runtime_profile_->AddInfoString("File Formats", ss.str());

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f87da848/be/src/exec/hdfs-scan-node-base.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node-base.h b/be/src/exec/hdfs-scan-node-base.h
index e33de5a..7e9d322 100644
--- a/be/src/exec/hdfs-scan-node-base.h
+++ b/be/src/exec/hdfs-scan-node-base.h
@@ -23,6 +23,7 @@
 #include <memory>
 #include <unordered_set>
 #include <vector>
+#include <tuple>
 
 #include <boost/unordered_map.hpp>
 #include <boost/scoped_ptr.hpp>
@@ -252,11 +253,15 @@ class HdfsScanNodeBase : public ScanNode {
   /// Otherwise, scan nodes using a RowBatch queue may lose the last batch due
   /// to racing with shutting down the queue.
   void RangeComplete(const THdfsFileFormat::type& file_type,
-      const THdfsCompression::type& compression_type);
+      const THdfsCompression::type& compression_type, bool skipped = false);
+
   /// Same as above except for when multiple compression codecs were used
   /// in the file. The metrics are incremented for each compression_type.
+  /// 'skipped' is set to true in the following cases:
+  /// 1. when a scan range is filtered out at runtime
+  /// 2. when a scan range is a metadata-only read (e.g. count(*) on parquet files)
   virtual void RangeComplete(const THdfsFileFormat::type& file_type,
-      const std::vector<THdfsCompression::type>& compression_type);
+      const std::vector<THdfsCompression::type>& compression_type, bool skipped = false);
 
   /// Utility function to compute the order in which to materialize slots to allow for
   /// computing conjuncts as slots get materialized (on partial tuples).
@@ -492,7 +497,8 @@ class HdfsScanNodeBase : public ScanNode {
   /// Mapping of (file type, whether the range was skipped, compression type)
   /// to the number of splits of that kind, and the lock protecting it.
   typedef std::map<
-      std::pair<THdfsFileFormat::type, THdfsCompression::type>, int> FileTypeCountsMap;
+     std::tuple<THdfsFileFormat::type, bool, THdfsCompression::type>,
+     int> FileTypeCountsMap;
   FileTypeCountsMap file_type_counts_;
 
   /// Performs dynamic partition pruning, i.e., applies runtime filters to files, and

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f87da848/be/src/exec/hdfs-scan-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node.cc b/be/src/exec/hdfs-scan-node.cc
index 528e290..64eece3 100644
--- a/be/src/exec/hdfs-scan-node.cc
+++ b/be/src/exec/hdfs-scan-node.cc
@@ -242,9 +242,9 @@ void HdfsScanNode::Close(RuntimeState* state) {
 }
 
 void HdfsScanNode::RangeComplete(const THdfsFileFormat::type& file_type,
-    const std::vector<THdfsCompression::type>& compression_type) {
+    const std::vector<THdfsCompression::type>& compression_type, bool skipped) {
   lock_guard<SpinLock> l(file_type_counts_);
-  HdfsScanNodeBase::RangeComplete(file_type, compression_type);
+  HdfsScanNodeBase::RangeComplete(file_type, compression_type, skipped);
 }
 
 void HdfsScanNode::TransferToScanNodePool(MemPool* pool) {

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f87da848/be/src/exec/hdfs-scan-node.h
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scan-node.h b/be/src/exec/hdfs-scan-node.h
index 18a74ad..782f530 100644
--- a/be/src/exec/hdfs-scan-node.h
+++ b/be/src/exec/hdfs-scan-node.h
@@ -93,7 +93,7 @@ class HdfsScanNode : public HdfsScanNodeBase {
   /// batch queue. Otherwise, we may lose the last batch due to racing with shutting down
   /// the RowBatch queue.
   virtual void RangeComplete(const THdfsFileFormat::type& file_type,
-      const std::vector<THdfsCompression::type>& compression_type);
+      const std::vector<THdfsCompression::type>& compression_type, bool skipped = false);
 
   /// Transfers all memory from 'pool' to 'scan_node_pool_'.
   virtual void TransferToScanNodePool(MemPool* pool);

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f87da848/testdata/workloads/functional-query/queries/QueryTest/hdfs_scanner_profile.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/hdfs_scanner_profile.test b/testdata/workloads/functional-query/queries/QueryTest/hdfs_scanner_profile.test
index ea459e4..0fe5fb3 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/hdfs_scanner_profile.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/hdfs_scanner_profile.test
@@ -1,8 +1,34 @@
-====
----- QUERY
 # This query will do a full table scan to count the num of rows
 # read during a scan
 select * from alltypesagg
 ---- RUNTIME_PROFILE
 row_regex: .*RowsRead: 11.00K .
 ====
+---- QUERY
+# This query verifies that a scan range is marked as skipped
+# in the profile if its compression codec cannot be inferred.
+select count(*) from tpcds_parquet.store_sales
+---- RUNTIME_PROFILE
+row_regex: .*File Formats: PARQUET/Unknown\(Skipped\):.*
+====
+---- QUERY
+# This query verifies that when a parquet scan range is runtime
+# filtered, it is marked as skipped and the compression codec is
+# marked as unknown.
+set runtime_filter_wait_time_ms=500000;
+select count(*) from tpcds_parquet.store_sales
+join tpcds_parquet.date_dim on
+ss_sold_date_sk = d_date_sk where d_qoy=1
+---- RUNTIME_PROFILE
+row_regex: .*File Formats: PARQUET/NONE:.* PARQUET/Unknown\(Skipped\).*
+====
+---- QUERY
+# This query verifies that when a text scan range is runtime
+# filtered, it is marked as skipped.
+set runtime_filter_wait_time_ms=100000;
+select count(*) from tpcds.store_sales join tpcds.date_dim on
+ss_sold_date_sk = d_date_sk where d_qoy=1
+---- RUNTIME_PROFILE
+row_regex: .*File Formats: TEXT/NONE:.* TEXT/NONE\(Skipped\):.*
+====

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f87da848/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index d355081..f4f2fd6 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -73,10 +73,10 @@ class TestScannersAllTableFormats(ImpalaTestSuite):
     self.run_test_case('QueryTest/scanners', new_vector)
 
   def test_hdfs_scanner_profile(self, vector):
-    new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['num_nodes'] = 1
-    if new_vector.get_value('table_format').file_format in ('kudu', 'hbase'):
+    if vector.get_value('table_format').file_format in ('kudu', 'hbase'):
       pytest.skip()
+    new_vector = deepcopy(vector)
+    new_vector.get_value('exec_option')['num_nodes'] = 0
     self.run_test_case('QueryTest/hdfs_scanner_profile', new_vector)
 
 # Test all the scanners with a simple limit clause. The limit clause triggers
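
To run just the affected test, something along these lines should work
from a loaded Impala dev shell (the impala-py.test wrapper lives in the
repo's bin/ directory; treat the exact invocation as a sketch):

  "${IMPALA_HOME}/bin/impala-py.test" tests/query_test/test_scanners.py \
      -k test_hdfs_scanner_profile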