Posted to commits@impala.apache.org by ta...@apache.org on 2018/04/19 18:12:06 UTC

[01/15] impala git commit: IMPALA-6483: [DOCS] Document the new EXEC_TIME_LIMIT_S query option

Repository: impala
Updated Branches:
  refs/heads/2.x eb92c1462 -> b6d558a20


IMPALA-6483: [DOCS] Document the new EXEC_TIME_LIMIT_S query option

Change-Id: I7a83aa42e6fffc7cb71112936129a0f1917ec2fd
Reviewed-on: http://gerrit.cloudera.org:8080/10043
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/915ea30c
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/915ea30c
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/915ea30c

Branch: refs/heads/2.x
Commit: 915ea30cd52505077e367eb826bd1a278b52b291
Parents: d9d3ce7
Author: Alex Rodoni <ar...@cloudera.com>
Authored: Wed Apr 11 20:20:00 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:47 2018 +0000

----------------------------------------------------------------------
 docs/impala.ditamap                      |  1 +
 docs/impala_keydefs.ditamap              |  1 +
 docs/shared/impala_common.xml            |  8 +++
 docs/topics/impala_exec_time_limit_s.xml | 93 +++++++++++++++++++++++++++
 4 files changed, 103 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/915ea30c/docs/impala.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index 757b54f..7b92028 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -189,6 +189,7 @@ under the License.
           <topicref rev="2.5.0" href="topics/impala_disable_streaming_preaggregations.xml"/>
           <topicref href="topics/impala_disable_unsafe_spills.xml"/>
           <topicref href="topics/impala_exec_single_node_rows_threshold.xml"/>
+          <topicref href="topics/impala_exec_time_limit_s.xml"/>
           <topicref href="topics/impala_explain_level.xml"/>
           <topicref href="topics/impala_hbase_cache_blocks.xml"/>
           <topicref href="topics/impala_hbase_caching.xml"/>

http://git-wip-us.apache.org/repos/asf/impala/blob/915ea30c/docs/impala_keydefs.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index 4eb4d9b..dea94b9 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -10786,6 +10786,7 @@ under the License.
   <keydef href="topics/impala_disable_streaming_preaggregations.xml" keys="disable_streaming_preaggregations"/>
   <keydef href="topics/impala_disable_unsafe_spills.xml" keys="disable_unsafe_spills"/>
   <keydef href="topics/impala_exec_single_node_rows_threshold.xml" keys="exec_single_node_rows_threshold"/>
+  <keydef href="topics/impala_exec_time_limit_s.xml" keys="exec_time_limit_s"/>
   <keydef href="topics/impala_explain_level.xml" keys="explain_level"/>
   <keydef href="topics/impala_hbase_cache_blocks.xml" keys="hbase_cache_blocks"/>
   <keydef href="topics/impala_hbase_caching.xml" keys="hbase_caching"/>

http://git-wip-us.apache.org/repos/asf/impala/blob/915ea30c/docs/shared/impala_common.xml
----------------------------------------------------------------------
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index 6c25b8e..fec6224 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -2829,6 +2829,14 @@ flight_num:           INT32 SNAPPY DO:83456393 FPO:83488603 SZ:10216514/11474301
         <b>Internal details:</b> Represented in memory as a byte array with the minimum size needed to represent
         each value.
       </p>
+      <p rev="3.0" id="added_in_30">
+        <b>Added in:</b>
+        <keyword keyref="impala30_full"/>
+      </p>
+      <p rev="2.12.0" id="added_in_212">
+        <b>Added in:</b>
+        <keyword keyref="impala212_full"/>
+      </p>
 
       <p rev="2.11.0" id="added_in_2110">
         <b>Added in:</b> <keyword keyref="impala2_11_0"/>

http://git-wip-us.apache.org/repos/asf/impala/blob/915ea30c/docs/topics/impala_exec_time_limit_s.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_exec_time_limit_s.xml b/docs/topics/impala_exec_time_limit_s.xml
new file mode 100644
index 0000000..a0320b8
--- /dev/null
+++ b/docs/topics/impala_exec_time_limit_s.xml
@@ -0,0 +1,93 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept rev="2.12.0" id="exec_time_limit_s">
+
+  <title>EXEC_TIME_LIMIT_S Query Option (<keyword keyref="impala212_full"/> or higher only)</title>
+
+  <titlealts audience="PDF">
+
+    <navtitle>EXEC_TIME_LIMIT_S</navtitle>
+
+  </titlealts>
+
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Querying"/>
+      <data name="Category" value="Developers"/>
+      <data name="Category" value="Data Analysts"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p rev="2.12.0">
+      The <codeph>EXEC_TIME_LIMIT_S</codeph> query option sets a time limit on query execution.
+      If a query is still executing when the time limit expires, it is automatically canceled. The
+      option is intended to prevent runaway queries that execute for much longer than intended.
+    </p>
+
+    <p>
+      For example, an Impala administrator could set a default value of
+      <codeph>EXEC_TIME_LIMIT_S=3600</codeph> for a resource pool to automatically kill queries
+      that execute for longer than one hour (see
+      <xref href="impala_admission.xml#admission_control"/> for information about default query
+      options). Then, if a user accidentally runs a large query that executes for more than one
+      hour, it will be automatically killed after the time limit expires to free up resources.
+      Users can override the default value per query or per session if they do not want the
+      default <codeph>EXEC_TIME_LIMIT_S</codeph> value to apply to a specific query or a
+      session.
+    </p>
+
+    <note>
+      <p>
+        The time limit only starts once the query is executing. Time spent planning the query,
+        scheduling the query, or in admission control is not counted towards the execution time
+        limit. <codeph>SELECT</codeph> statements are eligible for automatic cancellation until
+        the client has fetched all result rows. DML queries are eligible for automatic
+        cancellation until the DML statement has finished.
+      </p>
+    </note>
+
+    <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
+
+<codeblock>SET EXEC_TIME_LIMIT_S=<varname>seconds</varname>;</codeblock>
+
+    <p>
+      <b>Type:</b> numeric
+    </p>
+
+    <p>
+      <b>Default:</b> 0 (no time limit)
+    </p>
+
+    <p conref="../shared/impala_common.xml#common/added_in_212"/>
+
+    <p conref="../shared/impala_common.xml#common/related_info"/>
+
+    <p>
+      <xref href="impala_timeouts.xml#timeouts"/>
+    </p>
+
+  </conbody>
+
+</concept>
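
A quick way to try the new option from a client session (a sketch only; it
assumes the impyla Python client and a made-up table name, neither of which
is part of this commit):

  # Sketch: set EXEC_TIME_LIMIT_S per session through the impyla client.
  # Assumes an impalad reachable on the default HiveServer2 port 21050;
  # 'big_table' is a hypothetical table name.
  from impala.dbapi import connect

  conn = connect(host="impalad-host", port=21050)
  cur = conn.cursor()
  cur.execute("SET EXEC_TIME_LIMIT_S=3600")    # cancel execution after one hour
  cur.execute("SELECT count(*) FROM big_table")
  print(cur.fetchall())
  cur.execute("SET EXEC_TIME_LIMIT_S=0")       # back to the default: no time limit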


[04/15] impala git commit: IMPALA-6809: Allow bootstrap_system.sh in non ~/Impala directory

Posted by ta...@apache.org.
IMPALA-6809: Allow bootstrap_system.sh in non ~/Impala directory

Testing:
- Ran bootstrap_development.sh with IMPALA_HOME set to non ~/Impala
  directory
- Ran bootstrap_development.sh with IMPALA_HOME not set

Change-Id: I3241c180b5fb28f1b5f939200f72461ad6fd7d7a
Reviewed-on: http://gerrit.cloudera.org:8080/9994
Reviewed-by: Fredy Wijaya <fw...@cloudera.com>
Reviewed-by: Philip Zeyliger <ph...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/b173c530
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/b173c530
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/b173c530

Branch: refs/heads/2.x
Commit: b173c530dda06c00bb7a258fa0b0a37d8c103c69
Parents: eb92c14
Author: Fredy Wijaya <fw...@cloudera.com>
Authored: Wed Apr 11 09:02:48 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:47 2018 +0000

----------------------------------------------------------------------
 bin/bootstrap_system.sh | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/b173c530/bin/bootstrap_system.sh
----------------------------------------------------------------------
diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh
index 3f88dd3..bce161b 100755
--- a/bin/bootstrap_system.sh
+++ b/bin/bootstrap_system.sh
@@ -22,6 +22,8 @@
 # configurations, so it is best to run this in a fresh install. It also sets up the
 # ~/.bashrc for the calling user and impala-config-local.sh with some environment
 # variables to make Impala compile and run after this script is complete.
+# When IMPALA_HOME is set, the script will bootstrap Impala development in the
+# location specified.
 #
 # The intended user is a person who wants to start contributing code to Impala. This
 # script serves as an executable reference point for how to get started.
@@ -98,11 +100,13 @@ apt-get --yes install git
 echo ">>> Checking out Impala"
 
 # If there is no Impala git repo, get one now
-if ! [[ -d ~/Impala ]]
+
+: ${IMPALA_HOME:=~/Impala}
+if ! [[ -d "$IMPALA_HOME" ]]
 then
-  time -p git clone https://git-wip-us.apache.org/repos/asf/impala.git ~/Impala
+  time -p git clone https://git-wip-us.apache.org/repos/asf/impala.git "$IMPALA_HOME"
 fi
-cd ~/Impala
+cd "$IMPALA_HOME"
 SET_IMPALA_HOME="export IMPALA_HOME=$(pwd)"
 echo "$SET_IMPALA_HOME" >> ~/.bashrc
 eval "$SET_IMPALA_HOME"
@@ -199,17 +203,19 @@ echo "* - nofile 1048576" | sudo tee -a /etc/security/limits.conf
 
 # LZO is not needed to compile or run Impala, but it is needed for the data load
 echo ">>> Checking out Impala-lzo"
-if ! [[ -d ~/Impala-lzo ]]
+: ${IMPALA_LZO_HOME:="${IMPALA_HOME}/../Impala-lzo"}
+if ! [[ -d "$IMPALA_LZO_HOME" ]]
 then
-  git clone https://github.com/cloudera/impala-lzo.git ~/Impala-lzo
+  git clone https://github.com/cloudera/impala-lzo.git "$IMPALA_LZO_HOME"
 fi
 
 echo ">>> Checking out and building hadoop-lzo"
 
-if ! [[ -d ~/hadoop-lzo ]]
+: ${HADOOP_LZO_HOME:="${IMPALA_HOME}/../hadoop-lzo"}
+if ! [[ -d "$HADOOP_LZO_HOME" ]]
 then
-  git clone https://github.com/cloudera/hadoop-lzo.git ~/hadoop-lzo
+  git clone https://github.com/cloudera/hadoop-lzo.git "$HADOOP_LZO_HOME"
 fi
-cd ~/hadoop-lzo/
+cd "$HADOOP_LZO_HOME"
 time -p ant package
 cd "$IMPALA_HOME"


[09/15] impala git commit: Fix test_query_concurrency exception handling.

Posted by ta...@apache.org.
Fix test_query_concurrency exception handling.

Fixes use of an undefined variable.

I saw the following message in a build failure, which
clearly wasn't intended:

  MainThread: Debug webpage not yet available.
  Exception in thread Thread-862:
  Traceback (most recent call last):
    File "/usr/lib64/python2.7/threading.py", line 811, in __bootstrap_inner
      self.run()
    File "/usr/lib64/python2.7/threading.py", line 764, in run
      self.__target(*self.__args, **self.__kwargs)
    File "/data/jenkins/workspace/impala-asf-2.x-exhaustive-rhel7/repos/Impala/tests/custom_cluster/test_query_concurrency.py", line 58, in poll_query_page
      except e:
  NameError: global name 'e' is not defined

Change-Id: If507409b8945b16a9510bb6195343eed7d8538fc
Reviewed-on: http://gerrit.cloudera.org:8080/10049
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/b7a624a8
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/b7a624a8
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/b7a624a8

Branch: refs/heads/2.x
Commit: b7a624a82b8d366bf03c91873e603141fe74d608
Parents: e63b725
Author: Philip Zeyliger <ph...@cloudera.com>
Authored: Thu Apr 12 13:00:06 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:47 2018 +0000

----------------------------------------------------------------------
 tests/custom_cluster/test_query_concurrency.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/b7a624a8/tests/custom_cluster/test_query_concurrency.py
----------------------------------------------------------------------
diff --git a/tests/custom_cluster/test_query_concurrency.py b/tests/custom_cluster/test_query_concurrency.py
index 53bc72b..63cb173 100644
--- a/tests/custom_cluster/test_query_concurrency.py
+++ b/tests/custom_cluster/test_query_concurrency.py
@@ -16,13 +16,9 @@
 # under the License.
 
 import pytest
-import requests
 import time
-from time import localtime, strftime
 from threading import Thread
-from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
-from tests.common.impala_cluster import ImpalaCluster
 from tests.common.skip import SkipIfBuildType
 
 @SkipIfBuildType.not_dev_build
@@ -55,7 +51,7 @@ class TestQueryConcurrency(CustomClusterTestSuite):
     while time.time() - start < self.POLLING_TIMEOUT_S:
       try:
         impalad.service.read_debug_webpage("query_plan?query_id=" + query_id)
-      except e:
+      except Exception:
         pass
       time.sleep(1)
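
For reference, a standalone sketch (not part of the commit) of why the original
form fails and what the corrected polling loop looks like; read_page stands in
for the debug-webpage fetch used by the test:

  # "except e:" tells Python to catch exceptions of class e, so the undefined
  # name e triggers a NameError as soon as the try block raises.
  # "except Exception:" catches (and here deliberately ignores) any ordinary
  # exception instead.
  import time

  def poll(read_page, query_id, timeout_s=5):
      start = time.time()
      while time.time() - start < timeout_s:
          try:
              read_page("query_plan?query_id=" + query_id)
          except Exception:   # the fix: catch the base class, not an undefined name
              pass
          time.sleep(1)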
 


[11/15] impala git commit: IMPALA-6790: Upgrade sqlparse to 0.1.19

Posted by ta...@apache.org.
IMPALA-6790: Upgrade sqlparse to 0.1.19

Some remote cluster tests have failed to load data
due to sqlparse failing to split SQL statements
appropriately. The SQL file itself is identical
to the one used for our usual dataload, so the
difference must lie in the environment. The current
version of sqlparse is 0.1.15.

This upgrades sqlparse to 0.1.19. When running on
the same environment with the newer version,
the problem does not occur. Note that this
is not the version used for the Impala shell.
Impala shell has sqlparse checked-in under
shell/ext-py.

Change-Id: Ic5289f86b78f1d77d91a8fa47d63b7a7eaa3af38
Reviewed-on: http://gerrit.cloudera.org:8080/10044
Reviewed-by: Joe McDonnell <jo...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/b6d558a2
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/b6d558a2
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/b6d558a2

Branch: refs/heads/2.x
Commit: b6d558a20127d2df39610cdc5889dc07ef70896e
Parents: d8815fb
Author: Joe McDonnell <jo...@cloudera.com>
Authored: Thu Apr 12 10:20:14 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:48 2018 +0000

----------------------------------------------------------------------
 infra/python/deps/requirements.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/b6d558a2/infra/python/deps/requirements.txt
----------------------------------------------------------------------
diff --git a/infra/python/deps/requirements.txt b/infra/python/deps/requirements.txt
index d55bc19..06868f8 100644
--- a/infra/python/deps/requirements.txt
+++ b/infra/python/deps/requirements.txt
@@ -55,7 +55,10 @@ requests == 2.7.0
 setuptools == 36.8.0
 setuptools-scm == 1.15.4
 sh == 1.11
-sqlparse == 0.1.15
+# Note: This version for sqlparse is not what is used for the shell. The shell uses
+# a checked-in version of sqlparse (see shell/ext-py). This version is used primarily
+# for dataload.
+sqlparse == 0.1.19
 texttable == 0.8.3
 
 # For dev purposes, not used in scripting. Version 1.2.1 is the latest that supports 2.6.
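
For context, dataload relies on sqlparse to split a script into individual
statements; a minimal sketch of that usage (the SQL shown is made up):

  # Minimal sketch of the statement splitting dataload depends on.
  import sqlparse

  sql_text = """
  CREATE TABLE t (id INT);
  INSERT INTO t VALUES (1);
  """
  for statement in sqlparse.split(sql_text):
      if statement.strip():
          print(statement)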


[02/15] impala git commit: IMPALA-6120: Add thread timers for reporting codegen time

Posted by ta...@apache.org.
IMPALA-6120: Add thread timers for reporting codegen time

Add thread timers for accurate reporting of codegen time.
Also clean up a few places where elapsed time was being counted twice.

Sample Profile:

Query: SELECT count(*) FROM tpch_parquet.lineitem
WHERE l_partkey in (1,6,11,16,21,26,31,36,41);

CodeGen:(Total: 37.948ms, non-child: 37.948ms, % non-child: 100.00%)
   - CodegenInvoluntaryContextSwitches: 0 (0)
   - CodegenTotalWallClockTime: 37.942ms
     - CodegenSysTime: 0.000ns
     - CodegenUserTime: 36.938ms
   - CodegenVoluntaryContextSwitches: 0 (0)
   - CompileTime: 2.065ms
   - IrGenerationTime: 392.351us
   - LoadTime: 0.000ns
   - ModuleBitcodeSize: 2.26 MB (2373148)
   - NumFunctions: 22 (22)
   - NumInstructions: 381 (381)
   - OptimizationTime: 21.416ms
   - PeakMemoryUsage: 190.50 KB (195072)
   - PrepareTime: 13.496ms

Sample Profile with an added 2 sec sleep time to "OptimizationTime":

CodeGen:(Total: 2s037ms, non-child: 2s037ms, % non-child: 100.00%)
   - CodegenInvoluntaryContextSwitches: 0 (0)
   - CodegenTotalWallClockTime: 2s037ms
     - CodegenSysTime: 0.000ns
     - CodegenUserTime: 37.672ms
   - CodegenVoluntaryContextSwitches: 1 (1)
   - CompileTime: 2.032ms
   - IrGenerationTime: 386.948us
   - LoadTime: 0.000ns
   - ModuleBitcodeSize: 2.26 MB (2373148)
   - NumFunctions: 22 (22)
   - NumInstructions: 381 (381)
   - OptimizationTime: 2s023ms
   - PeakMemoryUsage: 190.50 KB (195072)
   - PrepareTime: 11.598ms
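
One way to read the new counters (a rough sketch, not part of the commit):
compare CodegenTotalWallClockTime against CodegenSysTime + CodegenUserTime.
A large gap, as in the profile above, means the codegen thread spent most of
its time off-CPU (here, the injected sleep).

  # Rough sketch: pull "Name: value" counter pairs out of a CodeGen profile
  # block like the samples above so the timers can be compared.
  import re

  def codegen_counters(profile_text):
      # Matches lines such as "   - CompileTime: 2.065ms"; keeps only the
      # first token of the value (drops trailing "(...)" annotations).
      return dict(re.findall(r"-\s+(\w+):\s+([^\s(]+)", profile_text))

  # Example:
  #   counters = codegen_counters(open("profile.txt").read())
  #   print(counters["CodegenTotalWallClockTime"], counters["CodegenUserTime"])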

Change-Id: I24d5a46b8870bc959b89045432d2e86af72b30e5
Reviewed-on: http://gerrit.cloudera.org:8080/9960
Reviewed-by: Bikramjeet Vig <bi...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/d9d3ce7c
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/d9d3ce7c
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/d9d3ce7c

Branch: refs/heads/2.x
Commit: d9d3ce7c2ec124290d503880f3963fe7c4a78c58
Parents: 857d2b0
Author: Bikramjeet Vig <bi...@cloudera.com>
Authored: Mon Apr 9 14:03:52 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:47 2018 +0000

----------------------------------------------------------------------
 be/src/codegen/llvm-codegen.cc              | 21 +++++++++------------
 be/src/codegen/llvm-codegen.h               | 18 ++++++++++--------
 be/src/exec/hdfs-avro-scanner.cc            |  2 --
 be/src/exec/hdfs-parquet-scanner.cc         |  1 -
 be/src/exec/hdfs-scanner.cc                 |  2 --
 be/src/exec/partitioned-aggregation-node.cc |  5 -----
 be/src/exec/select-node.cc                  |  1 -
 be/src/exec/text-converter.cc               |  1 -
 be/src/runtime/fragment-instance-state.cc   | 16 +++++++++++-----
 be/src/runtime/tuple.cc                     |  1 -
 be/src/util/tuple-row-compare.cc            |  1 -
 11 files changed, 30 insertions(+), 39 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/codegen/llvm-codegen.cc
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.cc b/be/src/codegen/llvm-codegen.cc
index c8fd8eb..5d5ed15 100644
--- a/be/src/codegen/llvm-codegen.cc
+++ b/be/src/codegen/llvm-codegen.cc
@@ -201,11 +201,12 @@ LlvmCodeGen::LlvmCodeGen(RuntimeState* state, ObjectPool* pool,
   load_module_timer_ = ADD_TIMER(profile_, "LoadTime");
   prepare_module_timer_ = ADD_TIMER(profile_, "PrepareTime");
   module_bitcode_size_ = ADD_COUNTER(profile_, "ModuleBitcodeSize", TUnit::BYTES);
-  codegen_timer_ = ADD_TIMER(profile_, "CodegenTime");
+  ir_generation_timer_ = ADD_TIMER(profile_, "IrGenerationTime");
   optimization_timer_ = ADD_TIMER(profile_, "OptimizationTime");
   compile_timer_ = ADD_TIMER(profile_, "CompileTime");
   num_functions_ = ADD_COUNTER(profile_, "NumFunctions", TUnit::UNIT);
   num_instructions_ = ADD_COUNTER(profile_, "NumInstructions", TUnit::UNIT);
+  llvm_thread_counters_ = ADD_THREAD_COUNTERS(profile_, "Codegen");
 }
 
 Status LlvmCodeGen::CreateFromFile(RuntimeState* state, ObjectPool* pool,
@@ -213,6 +214,7 @@ Status LlvmCodeGen::CreateFromFile(RuntimeState* state, ObjectPool* pool,
     scoped_ptr<LlvmCodeGen>* codegen) {
   codegen->reset(new LlvmCodeGen(state, pool, parent_mem_tracker, id));
   SCOPED_TIMER((*codegen)->profile_->total_time_counter());
+  SCOPED_THREAD_COUNTER_MEASUREMENT((*codegen)->llvm_thread_counters());
 
   unique_ptr<llvm::Module> loaded_module;
   Status status = (*codegen)->LoadModuleFromFile(file, &loaded_module);
@@ -229,6 +231,8 @@ Status LlvmCodeGen::CreateFromMemory(RuntimeState* state, ObjectPool* pool,
     MemTracker* parent_mem_tracker, const string& id, scoped_ptr<LlvmCodeGen>* codegen) {
   codegen->reset(new LlvmCodeGen(state, pool, parent_mem_tracker, id));
   SCOPED_TIMER((*codegen)->profile_->total_time_counter());
+  SCOPED_TIMER((*codegen)->prepare_module_timer_);
+  SCOPED_THREAD_COUNTER_MEASUREMENT((*codegen)->llvm_thread_counters());
 
   // Select the appropriate IR version. We cannot use LLVM IR with SSE4.2 instructions on
   // a machine without SSE4.2 support.
@@ -282,7 +286,6 @@ Status LlvmCodeGen::LoadModuleFromFile(
 Status LlvmCodeGen::LoadModuleFromMemory(unique_ptr<llvm::MemoryBuffer> module_ir_buf,
     string module_name, unique_ptr<llvm::Module>* module) {
   DCHECK(!module_name.empty());
-  SCOPED_TIMER(prepare_module_timer_);
   COUNTER_ADD(module_bitcode_size_, module_ir_buf->getMemBufferRef().getBufferSize());
   llvm::Expected<unique_ptr<llvm::Module>> tmp_module =
       getOwningLazyBitcodeModule(move(module_ir_buf), context());
@@ -305,7 +308,6 @@ Status LlvmCodeGen::LoadModuleFromMemory(unique_ptr<llvm::MemoryBuffer> module_i
 
 // TODO: Create separate counters/timers (file size, load time) for each module linked
 Status LlvmCodeGen::LinkModuleFromLocalFs(const string& file) {
-  SCOPED_TIMER(profile_->total_time_counter());
   unique_ptr<llvm::Module> new_module;
   RETURN_IF_ERROR(LoadModuleFromFile(file, &new_module));
 
@@ -366,6 +368,7 @@ Status LlvmCodeGen::CreateImpalaCodegen(RuntimeState* state,
   // Parse module for cross compiled functions and types
   SCOPED_TIMER(codegen->profile_->total_time_counter());
   SCOPED_TIMER(codegen->prepare_module_timer_);
+  SCOPED_THREAD_COUNTER_MEASUREMENT(codegen->llvm_thread_counters_);
 
   // Get type for StringValue
   codegen->string_value_type_ = codegen->GetStructType<StringValue>();
@@ -621,7 +624,7 @@ void LlvmCodeGen::CreateIfElseBlocks(llvm::Function* fn, const string& if_name,
   *else_block = llvm::BasicBlock::Create(context(), else_name, fn, insert_before);
 }
 
-Status LlvmCodeGen::MaterializeFunctionHelper(llvm::Function* fn) {
+Status LlvmCodeGen::MaterializeFunction(llvm::Function* fn) {
   DCHECK(!is_compiled_);
   if (fn->isIntrinsic() || !fn->isMaterializable()) return Status::OK();
 
@@ -642,18 +645,12 @@ Status LlvmCodeGen::MaterializeFunctionHelper(llvm::Function* fn) {
     for (const string& callee : *callees) {
       llvm::Function* callee_fn = module_->getFunction(callee);
       DCHECK(callee_fn != nullptr);
-      RETURN_IF_ERROR(MaterializeFunctionHelper(callee_fn));
+      RETURN_IF_ERROR(MaterializeFunction(callee_fn));
     }
   }
   return Status::OK();
 }
 
-Status LlvmCodeGen::MaterializeFunction(llvm::Function* fn) {
-  SCOPED_TIMER(profile_->total_time_counter());
-  SCOPED_TIMER(prepare_module_timer_);
-  return MaterializeFunctionHelper(fn);
-}
-
 llvm::Function* LlvmCodeGen::GetFunction(const string& symbol, bool clone) {
   llvm::Function* fn = module_->getFunction(symbol.c_str());
   if (fn == NULL) {
@@ -1038,7 +1035,6 @@ Status LlvmCodeGen::MaterializeModule() {
 
 // It's okay to call this function even if the module has been materialized.
 Status LlvmCodeGen::FinalizeLazyMaterialization() {
-  SCOPED_TIMER(prepare_module_timer_);
   for (llvm::Function& fn : module_->functions()) {
     if (fn.isMaterializable()) {
       DCHECK(!module_->isMaterialized());
@@ -1078,6 +1074,7 @@ Status LlvmCodeGen::FinalizeModule() {
 
   if (is_corrupt_) return Status("Module is corrupt.");
   SCOPED_TIMER(profile_->total_time_counter());
+  SCOPED_THREAD_COUNTER_MEASUREMENT(llvm_thread_counters_);
 
   // Clean up handcrafted functions that have not been finalized. Clean up is done by
   // deleting the function from the module. Any reference to deleted functions in the

http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/codegen/llvm-codegen.h
----------------------------------------------------------------------
diff --git a/be/src/codegen/llvm-codegen.h b/be/src/codegen/llvm-codegen.h
index 783269b..53569ca 100644
--- a/be/src/codegen/llvm-codegen.h
+++ b/be/src/codegen/llvm-codegen.h
@@ -169,7 +169,8 @@ class LlvmCodeGen {
   void Close();
 
   RuntimeProfile* runtime_profile() { return profile_; }
-  RuntimeProfile::Counter* codegen_timer() { return codegen_timer_; }
+  RuntimeProfile::Counter* ir_generation_timer() { return ir_generation_timer_; }
+  RuntimeProfile::ThreadCounters* llvm_thread_counters() { return llvm_thread_counters_; }
 
   /// Turns on/off optimization passes
   void EnableOptimizations(bool enable);
@@ -688,10 +689,6 @@ class LlvmCodeGen {
   /// This function parses the bitcode of 'fn' to populate basic blocks, instructions
   /// and other data structures attached to the function object. Return error status
   /// for any error.
-  Status MaterializeFunctionHelper(llvm::Function* fn);
-
-  /// Entry point for materializing function 'fn'. Invokes MaterializeFunctionHelper()
-  /// to do the actual work. Return error status for any error.
   Status MaterializeFunction(llvm::Function* fn);
 
   /// Materialize the module owned by this codegen object. This will materialize all
@@ -754,11 +751,12 @@ class LlvmCodeGen {
   /// Time spent reading the .ir file from the file system.
   RuntimeProfile::Counter* load_module_timer_;
 
-  /// Time spent constructing the in-memory module from the ir.
+  /// Time spent creating the initial module with the cross-compiled Impala IR.
   RuntimeProfile::Counter* prepare_module_timer_;
 
-  /// Time spent doing codegen (adding IR to the module)
-  RuntimeProfile::Counter* codegen_timer_;
+  /// Time spent by ExecNodes while adding IR to the module. Updated by
+  /// FragmentInstanceState during its 'CODEGEN_START' state.
+  RuntimeProfile::Counter* ir_generation_timer_;
 
   /// Time spent optimizing the module.
   RuntimeProfile::Counter* optimization_timer_;
@@ -774,6 +772,10 @@ class LlvmCodeGen {
   RuntimeProfile::Counter* num_functions_;
   RuntimeProfile::Counter* num_instructions_;
 
+  /// Aggregated llvm thread counters. Also includes the phase represented by
+  /// 'ir_generation_timer_' and hence is also updated by FragmentInstanceState.
+  RuntimeProfile::ThreadCounters* llvm_thread_counters_;
+
   /// whether or not optimizations are enabled
   bool optimizations_enabled_;
 

http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/exec/hdfs-avro-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-avro-scanner.cc b/be/src/exec/hdfs-avro-scanner.cc
index fe1bed4..e74b589 100644
--- a/be/src/exec/hdfs-avro-scanner.cc
+++ b/be/src/exec/hdfs-avro-scanner.cc
@@ -1062,8 +1062,6 @@ Status HdfsAvroScanner::CodegenReadScalar(const AvroSchemaElement& element,
 Status HdfsAvroScanner::CodegenDecodeAvroData(const HdfsScanNodeBase* node,
     LlvmCodeGen* codegen, const vector<ScalarExpr*>& conjuncts,
     llvm::Function** decode_avro_data_fn) {
-  SCOPED_TIMER(codegen->codegen_timer());
-
   llvm::Function* materialize_tuple_fn;
   RETURN_IF_ERROR(CodegenMaterializeTuple(node, codegen, &materialize_tuple_fn));
   DCHECK(materialize_tuple_fn != nullptr);

http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/exec/hdfs-parquet-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-parquet-scanner.cc b/be/src/exec/hdfs-parquet-scanner.cc
index ae22149..73dd29b 100644
--- a/be/src/exec/hdfs-parquet-scanner.cc
+++ b/be/src/exec/hdfs-parquet-scanner.cc
@@ -1019,7 +1019,6 @@ Status HdfsParquetScanner::Codegen(HdfsScanNodeBase* node,
   *process_scratch_batch_fn = nullptr;
   LlvmCodeGen* codegen = node->runtime_state()->codegen();
   DCHECK(codegen != nullptr);
-  SCOPED_TIMER(codegen->codegen_timer());
 
   llvm::Function* fn = codegen->GetFunction(IRFunction::PROCESS_SCRATCH_BATCH, true);
   DCHECK(fn != nullptr);

http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/exec/hdfs-scanner.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/hdfs-scanner.cc b/be/src/exec/hdfs-scanner.cc
index a4aee4d..d191d3f 100644
--- a/be/src/exec/hdfs-scanner.cc
+++ b/be/src/exec/hdfs-scanner.cc
@@ -323,7 +323,6 @@ Status HdfsScanner::CodegenWriteCompleteTuple(const HdfsScanNodeBase* node,
     LlvmCodeGen* codegen, const vector<ScalarExpr*>& conjuncts,
     llvm::Function** write_complete_tuple_fn) {
   *write_complete_tuple_fn = NULL;
-  SCOPED_TIMER(codegen->codegen_timer());
   RuntimeState* state = node->runtime_state();
 
   // Cast away const-ness.  The codegen only sets the cached typed llvm struct.
@@ -531,7 +530,6 @@ Status HdfsScanner::CodegenWriteAlignedTuples(const HdfsScanNodeBase* node,
     LlvmCodeGen* codegen, llvm::Function* write_complete_tuple_fn,
     llvm::Function** write_aligned_tuples_fn) {
   *write_aligned_tuples_fn = NULL;
-  SCOPED_TIMER(codegen->codegen_timer());
   DCHECK(write_complete_tuple_fn != NULL);
 
   llvm::Function* write_tuples_fn =

http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/exec/partitioned-aggregation-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/partitioned-aggregation-node.cc b/be/src/exec/partitioned-aggregation-node.cc
index c6c6189..d7b8c0a 100644
--- a/be/src/exec/partitioned-aggregation-node.cc
+++ b/be/src/exec/partitioned-aggregation-node.cc
@@ -1721,8 +1721,6 @@ Status PartitionedAggregationNode::CodegenCallUda(LlvmCodeGen* codegen,
 //
 Status PartitionedAggregationNode::CodegenUpdateTuple(
     LlvmCodeGen* codegen, llvm::Function** fn) {
-  SCOPED_TIMER(codegen->codegen_timer());
-
   for (const SlotDescriptor* slot_desc : intermediate_tuple_desc_->slots()) {
     if (slot_desc->type().type == TYPE_CHAR) {
       return Status::Expected("PartitionedAggregationNode::CodegenUpdateTuple(): cannot "
@@ -1811,8 +1809,6 @@ Status PartitionedAggregationNode::CodegenUpdateTuple(
 
 Status PartitionedAggregationNode::CodegenProcessBatch(LlvmCodeGen* codegen,
     TPrefetchMode::type prefetch_mode) {
-  SCOPED_TIMER(codegen->codegen_timer());
-
   llvm::Function* update_tuple_fn;
   RETURN_IF_ERROR(CodegenUpdateTuple(codegen, &update_tuple_fn));
 
@@ -1884,7 +1880,6 @@ Status PartitionedAggregationNode::CodegenProcessBatch(LlvmCodeGen* codegen,
 Status PartitionedAggregationNode::CodegenProcessBatchStreaming(
     LlvmCodeGen* codegen, TPrefetchMode::type prefetch_mode) {
   DCHECK(is_streaming_preagg_);
-  SCOPED_TIMER(codegen->codegen_timer());
 
   IRFunction::Type ir_fn = IRFunction::PART_AGG_NODE_PROCESS_BATCH_STREAMING;
   llvm::Function* process_batch_streaming_fn = codegen->GetFunction(ir_fn, true);

http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/exec/select-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/select-node.cc b/be/src/exec/select-node.cc
index 0f0683b..df57db0 100644
--- a/be/src/exec/select-node.cc
+++ b/be/src/exec/select-node.cc
@@ -48,7 +48,6 @@ void SelectNode::Codegen(RuntimeState* state) {
   DCHECK(state->ShouldCodegen());
   ExecNode::Codegen(state);
   if (IsNodeCodegenDisabled()) return;
-  SCOPED_TIMER(state->codegen()->codegen_timer());
   Status codegen_status = CodegenCopyRows(state);
   runtime_profile()->AddCodegenMsg(codegen_status.ok(), codegen_status);
 }

http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/exec/text-converter.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/text-converter.cc b/be/src/exec/text-converter.cc
index 9e919a3..783e384 100644
--- a/be/src/exec/text-converter.cc
+++ b/be/src/exec/text-converter.cc
@@ -112,7 +112,6 @@ Status TextConverter::CodegenWriteSlot(LlvmCodeGen* codegen,
     return Status("TextConverter::CodegenWriteSlot(): Char isn't supported for"
         " CodegenWriteSlot");
   }
-  SCOPED_TIMER(codegen->codegen_timer());
 
   // Codegen is_null_string
   bool is_default_null = (len == 2 && null_col_val[0] == '\\' && null_col_val[1] == 'N');

http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/runtime/fragment-instance-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/fragment-instance-state.cc b/be/src/runtime/fragment-instance-state.cc
index 7322519..1a0d452 100644
--- a/be/src/runtime/fragment-instance-state.cc
+++ b/be/src/runtime/fragment-instance-state.cc
@@ -244,11 +244,17 @@ Status FragmentInstanceState::Open() {
   if (runtime_state_->ShouldCodegen()) {
     UpdateState(StateEvent::CODEGEN_START);
     RETURN_IF_ERROR(runtime_state_->CreateCodegen());
-    exec_tree_->Codegen(runtime_state_);
-    // It shouldn't be fatal to fail codegen. However, until IMPALA-4233 is fixed,
-    // ScalarFnCall has no fall back to interpretation when codegen fails so propagates
-    // the error status for now.
-    RETURN_IF_ERROR(runtime_state_->CodegenScalarFns());
+    {
+      SCOPED_TIMER(runtime_state_->codegen()->ir_generation_timer());
+      SCOPED_TIMER(runtime_state_->codegen()->runtime_profile()->total_time_counter());
+      SCOPED_THREAD_COUNTER_MEASUREMENT(
+          runtime_state_->codegen()->llvm_thread_counters());
+      exec_tree_->Codegen(runtime_state_);
+      // It shouldn't be fatal to fail codegen. However, until IMPALA-4233 is fixed,
+      // ScalarFnCall has no fall back to interpretation when codegen fails so propagates
+      // the error status for now.
+      RETURN_IF_ERROR(runtime_state_->CodegenScalarFns());
+    }
 
     LlvmCodeGen* codegen = runtime_state_->codegen();
     DCHECK(codegen != nullptr);

http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/runtime/tuple.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/tuple.cc b/be/src/runtime/tuple.cc
index 627c7a4..0061419 100644
--- a/be/src/runtime/tuple.cc
+++ b/be/src/runtime/tuple.cc
@@ -311,7 +311,6 @@ Status Tuple::CodegenMaterializeExprs(LlvmCodeGen* codegen, bool collect_string_
   if (collect_string_vals) {
     return Status("CodegenMaterializeExprs() collect_string_vals == true NYI");
   }
-  SCOPED_TIMER(codegen->codegen_timer());
   llvm::LLVMContext& context = codegen->context();
 
   // Codegen each compute function from slot_materialize_exprs

http://git-wip-us.apache.org/repos/asf/impala/blob/d9d3ce7c/be/src/util/tuple-row-compare.cc
----------------------------------------------------------------------
diff --git a/be/src/util/tuple-row-compare.cc b/be/src/util/tuple-row-compare.cc
index 5620dae..f05a88e 100644
--- a/be/src/util/tuple-row-compare.cc
+++ b/be/src/util/tuple-row-compare.cc
@@ -203,7 +203,6 @@ Status TupleRowComparator::Codegen(RuntimeState* state) {
 //   ret i32 0
 // }
 Status TupleRowComparator::CodegenCompare(LlvmCodeGen* codegen, llvm::Function** fn) {
-  SCOPED_TIMER(codegen->codegen_timer());
   llvm::LLVMContext& context = codegen->context();
   const vector<ScalarExpr*>& ordering_exprs = ordering_exprs_;
   llvm::Function* key_fns[ordering_exprs.size()];


[07/15] impala git commit: IMPALA-6747: Automate diagnostics collection.

Posted by ta...@apache.org.
IMPALA-6747: Automate diagnostics collection.

This commit adds the necessary tooling to automate diagnostics
collection for Impala daemons. The following diagnostics are supported.

1. Native core dump (+ shared libs)
2. GDB/Java thread dump (pstack + jstack)
3. Java heap dump (jmap)
4. Minidumps (using breakpad) *
5. Profiles

Given the required inputs, the script outputs a zip compressed
impala diagnostic bundle with all the diagnostics collected.

The script can be run manually with the following command.

python collect_diagnostics.py --help

Tested with python 2.6 and later.

* minidumps collected here correspond to the state of the Impala
process at the time this script is triggered. This is different
from collect_minidumps.py which archives the entire minidump
directory.

Change-Id: I166e726f1dd1ce81187616e4f06d2404fa379bf8
Reviewed-on: http://gerrit.cloudera.org:8080/10056
Reviewed-by: Philip Zeyliger <ph...@cloudera.com>
Reviewed-by: Bharath Vissapragada <bh...@cloudera.com>
Tested-by: Bharath Vissapragada <bh...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/e63b725f
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/e63b725f
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/e63b725f

Branch: refs/heads/2.x
Commit: e63b725f50d077dffe552de36597010e3b6ed4cc
Parents: b01f781
Author: Bharath Vissapragada <bh...@cloudera.com>
Authored: Thu Apr 12 21:52:29 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:47 2018 +0000

----------------------------------------------------------------------
 bin/diagnostics/__init__.py            |   0
 bin/diagnostics/collect_diagnostics.py | 545 ++++++++++++++++++++++++++++
 bin/diagnostics/collect_shared_libs.sh |  52 +++
 bin/rat_exclude_files.txt              |   1 +
 tests/unittests/test_command.py        |  49 +++
 5 files changed, 647 insertions(+)
----------------------------------------------------------------------
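
The core building block of the script is the Command wrapper shown in the diff
below: run a subprocess and kill it if it outlives a timeout. A stripped-down
sketch of that pattern (the full version lives in collect_diagnostics.py):

  # Stripped-down subprocess-with-timeout pattern used by the Command class.
  import subprocess
  from threading import Timer

  def run_with_timeout(cmd, timeout_s=30):
      child = subprocess.Popen(cmd, stdout=subprocess.PIPE)
      timer = Timer(timeout_s, child.kill)
      try:
          timer.start()
          stdout, _ = child.communicate()
          return child.returncode, stdout
      finally:
          timer.cancel()

  # Example: rc, out = run_with_timeout(["pstack", "12345"], timeout_s=10)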


http://git-wip-us.apache.org/repos/asf/impala/blob/e63b725f/bin/diagnostics/__init__.py
----------------------------------------------------------------------
diff --git a/bin/diagnostics/__init__.py b/bin/diagnostics/__init__.py
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/impala/blob/e63b725f/bin/diagnostics/collect_diagnostics.py
----------------------------------------------------------------------
diff --git a/bin/diagnostics/collect_diagnostics.py b/bin/diagnostics/collect_diagnostics.py
new file mode 100644
index 0000000..8fe4561
--- /dev/null
+++ b/bin/diagnostics/collect_diagnostics.py
@@ -0,0 +1,545 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import datetime
+import errno
+import getpass
+import glob
+import logging
+import math
+import os
+import shutil
+import subprocess
+import sys
+import tarfile
+import time
+import tempfile
+import traceback
+
+from collections import namedtuple
+from contextlib import closing
+from struct import Struct
+from threading import Timer
+
+# This script is for automating the collection of the following diagnostics from a host
+# running an Impala service daemon (catalogd/statestored/impalad). The following
+# diagnostics are supported.
+#
+# 1. Native core dump (+ shared libs)
+# 2. GDB/Java thread dump (pstack + jstack)
+# 3. Java heap dump (jmap)
+# 4. Minidumps (using breakpad)
+# 5. Profiles
+#
+# Dependencies:
+# 1. gdb package should be installed to collect native thread stacks/coredump. The binary
+#    location is picked up from the system path. In case of pstacks, the script falls back
+#    to the breakpad minidumps if the 'pstack' binary is not in system path.
+# 2. jstack/jmap from a JRE/JDK. Default location is picked up from system path but can be
+#    overridden with --java_home PATH_TO_JAVA_HOME.
+# 3. Minidumps are collected by sending a SIGUSR1 signal to the Impala process. Impala
+#    versions without full breakpad support (<= release 2.6) will reliably crash if
+#    we attempt to do that since those versions do not have the corresponding signal
+#    handler. Hence it is suggested to run this script only on releases 2.7 and later.
+# 4. python >= 2.6
+#
+# Usage: python collect_diagnostics.py --help
+#
+# Few example usages:
+#
+# Collect 3 jstacks, pstacks from an impalad process 3s apart.
+#  python collect_diagnostics.py --pid $(pidof impalad) --stacks 3 3
+#
+# Collect core dump and a Java heapdump from the catalogd process
+#  python collect_diagnostics.py --pid $(pidof impalad) --jmap --gcore
+#
+# Collect 5 breakpad minidumps from a statestored process 5s apart.
+#  python collect_diagnostics.py --pid $(pidof statestored) --minidumps 5 5
+#      --minidumps_dir /var/log/statestored/minidumps
+#
+#
+class Command(object):
+  """Wrapper around subprocess.Popen() that is canceled after a configurable timeout."""
+  def __init__(self, cmd, timeout=30):
+    self.cmd = cmd
+    self.timeout = timeout
+    self.child_killed_by_timeout = False
+
+  def run(self, cmd_stdin=None, cmd_stdout=subprocess.PIPE):
+    """Runs the command 'cmd' by setting the appropriate stdin/out. The command is killed
+    if it hits a timeout (controlled by self.timeout)."""
+    cmd_string = " ".join(self.cmd)
+    logging.info("Starting command %s with a timeout of %s"
+        % (cmd_string, str(self.timeout)))
+    self.child = subprocess.Popen(self.cmd, stdin=cmd_stdin, stdout=cmd_stdout)
+    timer = Timer(self.timeout, self.kill_child)
+    try:
+      timer.start()
+      # self.stdout is set to None if cmd_stdout is anything other than PIPE. The actual
+      # stdout is written to the file corresponding to cmd_stdout.
+      self.stdout = self.child.communicate()[0]
+      if self.child.returncode == 0:
+        logging.info("Command finished successfully: " + cmd_string)
+      else:
+        cmd_status = "timed out" if self.child_killed_by_timeout else "failed"
+        logging.error("Command %s: %s" % (cmd_status, cmd_string))
+      return self.child.returncode
+    finally:
+      timer.cancel()
+    return -1
+
+  def kill_child(self):
+    """Kills the running command (self.child)."""
+    self.child_killed_by_timeout = True
+    self.child.kill()
+
+class ImpalaDiagnosticsHandler(object):
+  IMPALA_PROCESSES = ["impalad", "catalogd", "statestored"]
+  OUTPUT_DIRS_TO_CREATE = ["stacks", "gcores", "jmaps", "profiles",
+      "shared_libs", "minidumps"]
+  MINIDUMP_HEADER = namedtuple("MDRawHeader", "signature version stream_count \
+      stream_directory_rva checksum time_date_stamp flags")
+
+  def __init__(self, args):
+    """Initializes the state by setting the paths of required executables."""
+    self.args = args
+    if args.pid <= 0:
+      return
+
+    self.script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
+    # Name of the Impala process for which diagnostics should be collected.
+    self.target_process_name = self.get_target_process_name()
+
+    self.java_home = self.get_java_home_from_env()
+    if not self.java_home and args.java_home:
+      self.java_home = os.path.abspath(args.java_home)
+    self.jstack_cmd = os.path.join(self.java_home, "bin/jstack")
+    self.java_cmd = os.path.join(self.java_home, "bin/java")
+    self.jmap_cmd = os.path.join(self.java_home, "bin/jmap")
+
+    self.gdb_cmd = self.get_command_from_path("gdb")
+    self.gcore_cmd = self.get_command_from_path("gcore")
+    self.pstack_cmd = self.get_command_from_path("pstack")
+
+  def create_output_dir_structure(self):
+    """Creates the skeleton directory structure for the diagnostics output collection."""
+    self.collection_root_dir = tempfile.mkdtemp(prefix="impala-diagnostics-%s" %
+        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-"),
+        dir=os.path.abspath(self.args.output_dir))
+    for dirname in self.OUTPUT_DIRS_TO_CREATE:
+      os.mkdir(os.path.join(self.collection_root_dir, dirname))
+
+  def get_command_from_path(self, cmd):
+    """Returns the path to a given command executable, if one exists in the
+    system PATH."""
+    for path in os.environ["PATH"].split(os.pathsep):
+      cmd_path = os.path.join(path, cmd)
+      if os.access(cmd_path, os.X_OK):
+        return cmd_path
+    return ""
+
+  def get_target_process_name(self):
+    """Returns the process name of the target process for which diagnostics
+    should be collected."""
+    try:
+      return open("/proc/%s/comm" % self.args.pid).read().strip()
+    except Exception:
+      logging.exception("Failed to get target process name.")
+      return ""
+
+  def get_num_child_proc(self, name):
+    """Returns number of processes with the given name and target Impala pid
+    as parent."""
+    # Not all pgrep versions support -c parameter. So fetch the stdout and
+    # count the number of items in the list.
+    cmd = Command(["pgrep", "-P", str(self.args.pid), name])
+    cmd.run()
+    return len(cmd.stdout.split("\n")) - 1
+
+  def get_java_home_from_env(self):
+    """Returns JAVA_HOME set in the env of the target process."""
+    try:
+      envs = open("/proc/%s/environ" % self.args.pid).read().split("\0")
+      for s in envs:
+        k, v = s.split("=", 1)
+        if k == "JAVA_HOME":
+          return v
+    except Exception:
+      logging.exception("Failed to determine JAVA_HOME from proc env.")
+      return ""
+
+  def get_free_disk_space_gbs(self, path):
+    """Returns free disk space (in GBs) of the partition hosting the given path."""
+    s = os.statvfs(path)
+    return (s.f_bsize * s.f_bavail)/(1024.0 * 1024.0 * 1024.0)
+
+  def get_minidump_create_timestamp(self, minidump_path):
+    """Returns the unix timestamp of the minidump create time. It is extracted from
+    the minidump header."""
+    # Read the minidump's header to extract the create time stamp. More information about
+    # the minidump header format can be found here: https://goo.gl/uxKZVe
+    #
+    # typedef struct {
+    #   uint32_t  signature;
+    #   uint32_t  version;
+    #   uint32_t  stream_count;
+    #   MDRVA     stream_directory_rva;  /* A |stream_count|-sized array of
+    #                                     * MDRawDirectory structures. */
+    #   uint32_t  checksum;              /* Can be 0.  In fact, that's all that's
+    #                                     * been found in minidump files. */
+    #   uint32_t  time_date_stamp;       /* time_t */
+    #   uint64_t  flags;
+    # } MDRawHeader;  /* MINIDUMP_HEADER */
+    s = Struct("IIIiIIQ")
+    data = open(minidump_path, "rb").read(s.size)
+    header = self.MINIDUMP_HEADER(*s.unpack_from(data))
+    return header.time_date_stamp
+
+  def wait_for_minidump(self):
+    """Minidump collection is async after sending the SIGUSR1 signal. So this method
+    waits till it is written to the disk. Since minidump forks off a new process from
+    the parent Impala process we need to wait till the forked process exits.
+    Returns after 30s to prevent infinite waiting. Should be called after sending the
+    SIGUSR1 signal to the Impala process."""
+    MAX_WAIT_TIME_S = 30
+    start_time = time.time()
+    while time.time() < start_time + MAX_WAIT_TIME_S:
+      # Sleep for a bit to ensure that the process fork to write minidump has started.
+      # Otherwise the subsequent check on the process count could pass even when the
+      # fork didn't succeed. This sleep reduces the likelihood of such race.
+      time.sleep(1)
+      if self.get_num_child_proc(self.target_process_name) == 0:
+        break
+    return
+
+  def validate_args(self):
+    """Returns True if self.args are valid, false otherwise"""
+    if self.args.pid <= 0:
+      logging.critical("Invalid PID provided.")
+      return False
+
+    if self.target_process_name not in self.IMPALA_PROCESSES:
+      logging.critical("No valid Impala process with the given PID %s" % str(self.args.pid))
+      return False
+
+    if not self.java_home:
+      logging.critical("JAVA_HOME could not be inferred from process env.\
+          Please specify --java_home.")
+      return False
+
+    if self.args.jmap and not os.path.exists(self.jmap_cmd):
+      logging.critical("jmap binary not found, required to collect a Java heap dump.")
+      return False
+
+    if self.args.gcore and not os.path.exists(self.gcore_cmd):
+      logging.critical("gcore binary not found, required to collect a core dump.")
+      return False
+
+    if self.args.profiles_dir and not os.path.isdir(self.args.profiles_dir):
+      logging.critical("No valid profiles directory at path: %s" % self.args.profiles_dir)
+      return False
+
+    return True
+
+  def collect_thread_stacks(self):
+    """Collects jstack/jstack-m/pstack for the given pid in that order. pstack collection
+    falls back to minidumps if pstack binary is missing from the system path. Minidumps
+    are collected by sending a SIGUSR1 to the Impala process and then archiving the
+    contents of the minidump directory. The number of times stacks are collected and the
+    sleep time between the collections are controlled by --stacks argument."""
+    stacks_count, stacks_interval_secs = self.args.stacks
+    if stacks_count <= 0 or stacks_interval_secs < 0:
+      return
+
+    # Skip jstack collection if the jstack binary does not exist.
+    skip_jstacks = not os.path.exists(self.jstack_cmd)
+    if skip_jstacks:
+      logging.info("Skipping jstack collection since jstack binary couldn't be located.")
+
+    # Fallback to breakpad minidump collection if pstack binaries are missing.
+    fallback_to_minidump = False
+    if not self.pstack_cmd:
+      # Fall back to collecting a minidump if pstack is not installed.
+      if not os.path.exists(self.args.minidumps_dir):
+        logging.info("Skipping pstacks since pstack binary couldn't be located. Provide "
+            + "--minidumps_dir for collecting minidumps instead.")
+        # At this point, we can't proceed since we have nothing to collect.
+        if skip_jstacks:
+          return
+      else:
+        fallback_to_minidump = True;
+        logging.info("Collecting breakpad minidumps since pstack/gdb binaries are " +
+            "missing.")
+
+    stacks_dir = os.path.join(self.collection_root_dir, "stacks")
+    # Populate the commands to run in 'cmds_to_run' depending on what kinds of thread
+    # stacks to collect. Each entry is a tuple of form
+    # (Command, stdout_prefix, is_minidump). 'is_minidump' tells whether the command
+    # is trying to trigger a minidump collection.
+    cmds_to_run = []
+    if not skip_jstacks:
+      cmd_args = [self.jstack_cmd, str(self.args.pid)]
+      cmds_to_run.append((Command(cmd_args, self.args.timeout), "jstack", False))
+      # Collect mixed-mode jstack, contains native stack frames.
+      cmd_args_mixed_mode = [self.jstack_cmd, "-m", str(self.args.pid)]
+      cmds_to_run.append(
+          (Command(cmd_args_mixed_mode, self.args.timeout), "jstack-m", False))
+
+    if fallback_to_minidump:
+      cmd_args = ["kill", "-SIGUSR1", str(self.args.pid)]
+      cmds_to_run.append((Command(cmd_args, self.args.timeout), None, True))
+    elif self.pstack_cmd:
+      cmd_args = [self.pstack_cmd, str(self.args.pid)]
+      cmds_to_run.append((Command(cmd_args, self.args.timeout), "pstack", False))
+
+    collection_start_ts = time.time()
+    for i in xrange(stacks_count):
+      for cmd, file_prefix, is_minidump in cmds_to_run:
+        if file_prefix:
+          stdout_file = os.path.join(stacks_dir, file_prefix + "-" + str(i) + ".txt")
+          with open(stdout_file, "w") as output:
+            cmd.run(cmd_stdout=output)
+        else:
+          cmd.run()
+          # In case of minidump collection, wait for it to be written.
+          if is_minidump:
+            self.wait_for_minidump()
+      time.sleep(stacks_interval_secs)
+
+    # Copy minidumps if required.
+    if fallback_to_minidump:
+      minidump_out_dir =  os.path.join(self.collection_root_dir, "minidumps")
+      self.copy_minidumps(minidump_out_dir, collection_start_ts);
+
+  def collect_minidumps(self):
+    """Collects minidumps on the Impala process based on argument --minidumps. The
+    minidumps are collected by sending a SIGUSR1 signal to the Impala process and then
+    the resulting minidumps are copied to the target directory."""
+    minidump_count, minidump_interval_secs = self.args.minidumps
+    if minidump_count <= 0 or minidump_interval_secs < 0:
+      return
+    # Impala process writes a minidump when it encounters a SIGUSR1.
+    cmd_args = ["kill", "-SIGUSR1", str(self.args.pid)]
+    cmd = Command(cmd_args, self.args.timeout)
+    collection_start_ts = time.time()
+    for i in xrange(minidump_count):
+      cmd.run()
+      self.wait_for_minidump()
+      time.sleep(minidump_interval_secs)
+    out_dir = os.path.join(self.collection_root_dir, "minidumps")
+    self.copy_minidumps(out_dir, collection_start_ts);
+
+  def copy_minidumps(self, target, start_ts):
+    """Copies mindumps with create time >= start_ts to 'target' directory."""
+    logging.info("Copying minidumps from %s to %s with ctime >= %s"
+        % (self.args.minidumps_dir, target, start_ts))
+    for filename in glob.glob(os.path.join(self.args.minidumps_dir, "*.dmp")):
+      try:
+        minidump_ctime = self.get_minidump_create_timestamp(filename)
+        if minidump_ctime >= math.floor(start_ts):
+          shutil.copy2(filename, target)
+        else:
+          logging.info("Ignored mindump: %s ctime: %s" % (filename, minidump_ctime))
+      except Exception:
+        logging.exception("Error processing minidump at path: %s. Skipping it." % filename)
+
+  def collect_java_heapdump(self):
+    """Generates the Java heap dump of the Impala process using the 'jmap' command."""
+    if not self.args.jmap:
+      return
+    jmap_dir = os.path.join(self.collection_root_dir, "jmaps")
+    out_file = os.path.join(jmap_dir, self.target_process_name + "_heap.bin")
+    # jmap command requires it to be run as the process owner.
+    # Command: jmap -dump:format=b,file=<outfile> <pid>
+    cmd_args = [self.jmap_cmd, "-dump:format=b,file=" + out_file, str(self.args.pid)]
+    Command(cmd_args, self.args.timeout).run()
+
+  def collect_native_coredump(self):
+    """Generates the core dump of the Impala process using the 'gcore' command"""
+    if not self.args.gcore:
+      return
+    # Command: gcore -o <outfile> <pid>
+    gcore_dir = os.path.join(self.collection_root_dir, "gcores")
+    out_file_name = self.target_process_name + "-" +\
+        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".core"
+    out_file = os.path.join(gcore_dir, out_file_name)
+    cmd_args = [self.gcore_cmd, "-o", out_file, str(self.args.pid)]
+    Command(cmd_args, self.args.timeout).run()
+
+  def collect_query_profiles(self):
+    """Collects Impala query profiles from --profiles_dir. Enforces an uncompressed limit
+    of --profiles_max_size_limit bytes on the copied profile logs."""
+    if not self.args.profiles_dir:
+      return
+    out_dir = os.path.join(self.collection_root_dir, "profiles")
+    # This file name pattern is hardcoded in Impala.
+    PROFILE_LOG_FILE_PATTERN = "impala_profile_log_1.1-*"
+    logging.info("Collecting profile data, limiting size to %f GB" %
+        (self.args.profiles_max_size_limit/(1024 * 1024 * 1024)))
+
+    profiles_path = os.path.join(self.args.profiles_dir, PROFILE_LOG_FILE_PATTERN)
+    # Sort the profiles by creation time and copy the most recent ones in that order.
+    sorted_profiles =\
+        sorted(glob.iglob(profiles_path), key=os.path.getctime, reverse=True)
+    profile_size_included_so_far = 0
+    for profile_path in sorted_profiles:
+      try:
+        file_size = os.path.getsize(profile_path)
+        if file_size == 0:
+          continue
+        if profile_size_included_so_far + file_size > self.args.profiles_max_size_limit:
+          # Copying the whole file violates profiles_max_size_limit. Copy a part of it.
+          # Profile logs are newline delimited with a single profile per line.
+          num_bytes_to_copy =\
+              self.args.profiles_max_size_limit - profile_size_included_so_far
+          file_name = os.path.basename(profile_path)
+          copied_bytes = 0
+          with open(profile_path, "rb") as in_file:
+            with open(os.path.join(out_dir, file_name), "wb") as out_file:
+              for line in in_file.readlines():
+                if copied_bytes + len(line) > num_bytes_to_copy:
+                  break
+                out_file.write(line)
+                copied_bytes += len(line)
+          return
+        profile_size_included_so_far += file_size
+        shutil.copy2(profile_path, out_dir)
+      except Exception:
+        logging.exception("Encountered an error while collecting profile %s. Skipping it."
+            % profile_path)
+
+  def collect_shared_libs(self):
+    """Collects shared libraries loaded by the target Impala process."""
+    # Shared libs are collected only if core dumps or minidumps are enabled.
+    if not (self.args.gcore or self.args.minidumps_dir):
+      return
+    # If the gdb binary is missing, we cannot extract the shared library list.
+    if not self.gdb_cmd:
+      logging.info("'gdb' executable missing. Skipping shared library collection.")
+      return
+
+    out_dir = os.path.join(self.collection_root_dir, "shared_libs")
+
+    script_path = os.path.join(self.script_dir, "collect_shared_libs.sh")
+    cmd_args = [script_path, self.gdb_cmd, str(self.args.pid), out_dir]
+    Command(cmd_args, self.args.timeout).run()
+
+  def archive_diagnostics(self):
+    """Creates a gztar of the collected diagnostics and cleans up the original
+    directory. Returns True if successful, False otherwise."""
+    try:
+      # tarfile does not support context managers in python 2.6. We use closing() to work
+      # around that.
+      with closing(tarfile.open(self.collection_root_dir + '.tar.gz', mode='w:gz')) as\
+          archive:
+        archive.add(self.collection_root_dir)
+      return True
+    except Exception:
+      logging.exception("Encountered an exception archiving diagnostics, cleaning up.")
+      return False
+    finally:
+      self.cleanup()
+
+  def cleanup(self):
+    """Cleans up the directory to which diagnostics were written."""
+    shutil.rmtree(self.collection_root_dir, ignore_errors=True)
+
+  def get_diagnostics(self):
+    """Calls all collect_*() methods to collect diagnostics. Returns True if no errors
+    were encountered during diagnostics collection, False otherwise."""
+    if not self.validate_args():
+      return False
+    logging.info("Using JAVA_HOME: %s" % self.java_home)
+    self.create_output_dir_structure()
+    logging.info("Free disk space: %.2fGB" %
+        self.get_free_disk_space_gbs(self.collection_root_dir))
+    os.chdir(self.args.output_dir)
+    collection_methods = [self.collect_shared_libs, self.collect_query_profiles,
+        self.collect_native_coredump, self.collect_java_heapdump, self.collect_minidumps,
+        self.collect_thread_stacks]
+    exception_encountered = False
+    for method in collection_methods:
+      try:
+        method()
+      except IOError as e:
+        if e.errno == errno.ENOSPC:
+          # Clean up and abort if we are low on disk space. Other IOErrors are logged and
+          # ignored.
+          logging.exception("Disk space low, aborting.")
+          self.cleanup()
+          return False
+        logging.exception("Encountered an IOError calling: %s" % method.__name__)
+        exception_encountered = True
+      except Exception:
+        exception_encountered = True
+        logging.exception("Encountered an exception calling: %s" % method.__name__)
+    if exception_encountered:
+      logging.error("Encountered an exception collecting diagnostics. Final output " +
+          "could be partial.\n")
+    # Archive the directory, even if it is partial.
+    archive_path = self.collection_root_dir + ".tar.gz"
+    logging.info("Archiving diagnostics to path: %s" % archive_path)
+    if self.archive_diagnostics():
+      logging.info("Diagnostics collected at path: %s" % archive_path)
+    return not exception_encountered
+
+def get_args_parser():
+  """Creates the argument parser and adds the flags"""
+  parser = argparse.ArgumentParser(description="Impala diagnostics collection")
+  parser.add_argument("--pid", action="store", dest="pid", type=int, default=0,
+      help="PID of the Impala process for which diagnostics should be collected.")
+  parser.add_argument("--java_home", action="store", dest="java_home", default="",
+      help="If not set, it is set to the JAVA_HOME from the pid's environment.")
+  parser.add_argument("--timeout", action="store", dest="timeout", default=300,
+      type=int, help="Timeout (in seconds) for each of the diagnostics commands")
+  parser.add_argument("--stacks", action="store", dest="stacks", nargs=2, type=int,
+      default=[0, 0], metavar=("COUNT", "INTERVAL (in seconds)"),
+      help="Collect jstack, mixed-mode jstack and pstacks of the Impala process.\
+      Breakpad minidumps are collected in case of missing pstack binaries.")
+  parser.add_argument("--jmap", action="store_true", dest="jmap", default=False,
+      help="Collect heap dump of the Java process")
+  parser.add_argument("--gcore", action="store_true", dest="gcore", default=False,
+      help="Collect the native core dump using gdb. Requires gdb to be installed.")
+  parser.add_argument("--minidumps", action="store", dest="minidumps", type=int,
+      nargs=2, default=[0, 0], metavar=("COUNT", "INTERVAL (in seconds)"),
+      help="Collect breakpad minidumps for the Impala process. Requires\
+      --minidumps_dir to be set.")
+  parser.add_argument("--minidumps_dir", action="store", dest="minidumps_dir", default="",
+      help="Path of the directory to which Impala process' minidumps are written")
+  parser.add_argument("--profiles_dir", action="store", dest="profiles_dir", default="",
+      help="Path of the profiles directory to be included in the diagnostics output.")
+  parser.add_argument("--profiles_max_size_limit", action="store",
+      dest="profiles_max_size_limit", default=3*1024*1024*1024,
+      type=float, help="Uncompressed limit (in bytes) on profile logs collected from\
+      --profiles_dir. Defaults to 3GB.")
+  parser.add_argument("--output_dir", action="store", dest="output_dir",
+      default=tempfile.gettempdir(), help="Output directory that contains the final "
+      "diagnostics data. Defaults to %s" % tempfile.gettempdir())
+  return parser
+
+if __name__ == "__main__":
+  parser = get_args_parser()
+  if len(sys.argv) == 1:
+    parser.print_usage()
+    sys.exit(1)
+  logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, datefmt="%Y-%m-%d %H:%M:%S",
+      format="%(asctime)s %(levelname)-8s %(message)s")
+  diagnostics_handler = ImpalaDiagnosticsHandler(parser.parse_args())
+  logging.info("Running as user: %s" % getpass.getuser())
+  logging.info("Input args: %s" % " ".join(sys.argv))
+  sys.exit(0 if diagnostics_handler.get_diagnostics() else 1)
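
For reference, the flags defined in get_args_parser() above compose as in the following
sketch, which drives the collector programmatically rather than from the shell. This is
an illustration only, not part of the patch; the PID and directory values are
placeholders.

  # Hypothetical programmatic use of the collector; all arguments are placeholders.
  import sys
  from collect_diagnostics import ImpalaDiagnosticsHandler, get_args_parser

  args = get_args_parser().parse_args([
      "--pid", "12345",
      "--stacks", "3", "2",  # 3 stack samples, 2 seconds apart
      "--jmap",
      "--profiles_dir", "/var/log/impala/profiles",
      "--output_dir", "/tmp"])
  handler = ImpalaDiagnosticsHandler(args)
  sys.exit(0 if handler.get_diagnostics() else 1)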

http://git-wip-us.apache.org/repos/asf/impala/blob/e63b725f/bin/diagnostics/collect_shared_libs.sh
----------------------------------------------------------------------
diff --git a/bin/diagnostics/collect_shared_libs.sh b/bin/diagnostics/collect_shared_libs.sh
new file mode 100755
index 0000000..d5de349
--- /dev/null
+++ b/bin/diagnostics/collect_shared_libs.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# $1 - gdb binary path
+# $2 - pid of the Impala process
+# $3 - Output directory to copy the sharedlibs to.
+
+set -euxo pipefail
+
+if [ "$#" -ne 3 ]; then
+  echo "Incorrect usage. Expected: $0 <gdb executable path> <target PID> <output dir>"
+  exit 1
+fi
+
+if [ ! -d "$3" ]; then
+  echo "Directory $3 does not exist. This script expects the output directory to exist."
+  exit 1
+fi
+
+# Generate the list of shared library paths to copy.
+shared_libs_to_copy=$(mktemp)
+"$1" --pid "$2" --batch -ex 'info shared' 2> /dev/null | sed '1,/Shared Object Library/d' |
+    sed 's/\(.*\s\)\(\/.*\)/\2/' | grep \/ > "$shared_libs_to_copy"
+
+echo "Generated shared library listing for the process."
+
+# Copy the files to the target directory keeping the directory structure intact.
+# We use rsync instead of 'cp --parents' since the latter has permission issues
+# copying from system-level directories. See https://goo.gl/6yYNhw.
+rsync -LR --files-from="$shared_libs_to_copy" / "$3"
+
+echo "Copied the shared libraries to the target directory: $3"
+
+rm -f "$shared_libs_to_copy"
+# Make sure the impala user has write permissions on the copied shared library paths.
+chmod -R 755 "$3"
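
For readers less familiar with rsync's -R/--files-from semantics: the copy step above
recreates each listed absolute path under the output directory while following symlinks.
A rough Python equivalent of that step (illustration only, not part of the patch; the
helper name is hypothetical):

  import os
  import shutil

  def copy_preserving_paths(paths, target_root):
    # Roughly what 'rsync -LR --files-from=<list> / <target_root>' does: recreate
    # each absolute path under target_root, dereferencing symlinks along the way.
    for path in paths:
      dest = os.path.join(target_root, path.lstrip("/"))
      dest_dir = os.path.dirname(dest)
      if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)
      shutil.copy2(os.path.realpath(path), dest)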

http://git-wip-us.apache.org/repos/asf/impala/blob/e63b725f/bin/rat_exclude_files.txt
----------------------------------------------------------------------
diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt
index a2b6267..ebaa4d0 100644
--- a/bin/rat_exclude_files.txt
+++ b/bin/rat_exclude_files.txt
@@ -17,6 +17,7 @@ shell/__init__.py
 ssh_keys/id_rsa_impala
 testdata/__init__.py
 tests/__init__.py
+bin/diagnostics/__init__.py
 www/index.html
 
 # See $IMPALA_HOME/LICENSE.txt

http://git-wip-us.apache.org/repos/asf/impala/blob/e63b725f/tests/unittests/test_command.py
----------------------------------------------------------------------
diff --git a/tests/unittests/test_command.py b/tests/unittests/test_command.py
new file mode 100644
index 0000000..a2a9e4c
--- /dev/null
+++ b/tests/unittests/test_command.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Unit tests for collect_diagnostics.Command
+
+import os
+import pytest
+import sys
+
+# Update the sys.path to include the modules from bin/diagnostics.
+sys.path.insert(0,
+    os.path.abspath(os.path.join(os.path.dirname(__file__), '../../bin/diagnostics')))
+from collect_diagnostics import Command
+
+class TestCommand(object):
+  """ Unit tests for the Command class"""
+
+  def test_simple_commands(self):
+    # Successful command
+    c = Command(["echo", "foo"], 1000)
+    assert c.run() == 0, "Command expected to succeed, but failed"
+    assert c.stdout.strip("\n") == "foo"
+
+    # Failed command, check return code
+    c = Command(["false"], 1000)
+    assert c.run() == 1
+
+  def test_command_timer(self):
+    # Try to run a command that sleeps for 1000s and set a
+    # timer for 1 second. The command should time out.
+    c = Command(["sleep", "1000"], 1)
+    assert c.run() != 0, "Command expected to time out but succeeded."
+    assert c.child_killed_by_timeout, "Command didn't time out as expected."
+
+
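The Command class exercised by these tests is imported from collect_diagnostics.py
earlier in this patch. As a rough illustration of the interface the tests rely on
(run(), stdout, and child_killed_by_timeout), a much-simplified sketch of a command
runner with a kill-on-timeout timer could look like the following; the actual
implementation in the patch may differ.

  import subprocess
  import threading

  class TimedCommand(object):
    """Simplified stand-in for collect_diagnostics.Command (illustration only)."""
    def __init__(self, cmd, timeout_secs):
      self.cmd = cmd
      self.timeout_secs = timeout_secs
      self.stdout = None
      self.child_killed_by_timeout = False

    def run(self):
      child = subprocess.Popen(self.cmd, stdout=subprocess.PIPE)
      def kill_child():
        # Fires only if the child is still running when the timer expires.
        self.child_killed_by_timeout = True
        child.kill()
      timer = threading.Timer(self.timeout_secs, kill_child)
      timer.start()
      try:
        self.stdout, _ = child.communicate()
      finally:
        timer.cancel()
      return child.returncode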


[10/15] impala git commit: IMPALA-6372: Go parallel for Hive dataload

Posted by ta...@apache.org.
IMPALA-6372: Go parallel for Hive dataload

This changes generate-schema-statements.py to produce
separate Hive SQL files for each file format, and changes
load-data.py to run those Hive SQL files in parallel. For
correctness, the text version of all tables must be loaded
before any of the other file formats.

load-data.py runs the DDLs that create the tables in Impala
in parallel. Previously, some minor dependencies required the
text tables to be created before the other table formats. This
changes the definitions of some tables in
testdata/datasets/functional/functional_schema_template.sql
to remove those dependencies, so the DDLs for the text tables
can now run in parallel with the other file formats.

To unify the parallelism for Impala and Hive, load-data.py
now uses a single fixed-size pool of processes to run all
SQL files rather than spawning a thread per SQL file.

This also modifies the places that issue INVALIDATE METADATA
to use REFRESH where possible and to eliminate global
invalidates.

For debuggability, each SQL execution writes its output to
its own log file rather than to standard out. If an error
occurs, the script points to the relevant log file.

This saves about 10-15 minutes on dataload (including
for GVO).

Change-Id: I34b71e6df3c8f23a5a31451280e35f4dc015a2fd
Reviewed-on: http://gerrit.cloudera.org:8080/8894
Reviewed-by: Joe McDonnell <jo...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/0be44ce6
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/0be44ce6
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/0be44ce6

Branch: refs/heads/2.x
Commit: 0be44ce659455eb662fd3f2209147162ab36dba1
Parents: b7a624a
Author: Joe McDonnell <jo...@cloudera.com>
Authored: Wed Dec 20 10:29:10 2017 -0800
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:47 2018 +0000

----------------------------------------------------------------------
 bin/load-data.py                                | 395 +++++++++++++------
 testdata/bin/generate-schema-statements.py      | 140 +++++--
 testdata/bin/load_nested.py                     |  33 +-
 .../functional/functional_schema_template.sql   |  10 +-
 4 files changed, 398 insertions(+), 180 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/0be44ce6/bin/load-data.py
----------------------------------------------------------------------
diff --git a/bin/load-data.py b/bin/load-data.py
index ed51487..28a504f 100755
--- a/bin/load-data.py
+++ b/bin/load-data.py
@@ -23,8 +23,10 @@
 import collections
 import getpass
 import logging
+import multiprocessing
 import os
 import re
+import shutil
 import sqlparse
 import subprocess
 import sys
@@ -32,15 +34,11 @@ import tempfile
 import time
 import traceback
 
-from itertools import product
 from optparse import OptionParser
-from Queue import Queue
 from tests.beeswax.impala_beeswax import *
-from threading import Thread
+from multiprocessing.pool import ThreadPool
 
-logging.basicConfig()
 LOG = logging.getLogger('load-data.py')
-LOG.setLevel(logging.DEBUG)
 
 parser = OptionParser()
 parser.add_option("-e", "--exploration_strategy", dest="exploration_strategy",
@@ -80,6 +78,8 @@ parser.add_option("--use_kerberos", action="store_true", default=False,
                   help="Load data on a kerberized cluster.")
 parser.add_option("--principal", default=None, dest="principal",
                   help="Kerberos service principal, required if --use_kerberos is set")
+parser.add_option("--num_processes", type="int",
+                  default=multiprocessing.cpu_count(), dest="num_processes",
+                  help="Number of parallel processes to use.")
 
 options, args = parser.parse_args()
 
@@ -111,21 +111,6 @@ if options.use_kerberos:
 HIVE_ARGS = '-n %s -u "jdbc:hive2://%s/default;%s" --verbose=true'\
     % (getpass.getuser(), options.hive_hs2_hostport, hive_auth)
 
-# When HiveServer2 is configured to use "local" mode (i.e., MR jobs are run
-# in-process rather than on YARN), Hadoop's LocalDistributedCacheManager has a
-# race, wherein it tires to localize jars into
-# /tmp/hadoop-$USER/mapred/local/<millis>. Two simultaneous Hive queries
-# against HS2 can conflict here. Weirdly LocalJobRunner handles a similar issue
-# (with the staging directory) by appending a random number. To over come this,
-# in the case that HS2 is on the local machine (which we conflate with also
-# running MR jobs locally), we move the temporary directory into a unique
-# directory via configuration. This block can be removed when
-# https://issues.apache.org/jira/browse/MAPREDUCE-6441 is resolved.
-# A similar workaround is used in tests/common/impala_test_suite.py.
-if options.hive_hs2_hostport.startswith("localhost:"):
-  HIVE_ARGS += ' --hiveconf "mapreduce.cluster.local.dir=%s"' % (tempfile.mkdtemp(
-    prefix="impala-data-load-"))
-
 HADOOP_CMD = os.path.join(os.environ['HADOOP_HOME'], 'bin/hadoop')
 
 def available_workloads(workload_dir):
@@ -135,70 +120,112 @@ def available_workloads(workload_dir):
 def validate_workloads(all_workloads, workloads):
   for workload in workloads:
     if workload not in all_workloads:
-      print 'Workload \'%s\' not found in workload directory' % workload
-      print 'Available workloads: ' + ', '.join(all_workloads)
+      LOG.error('Workload \'%s\' not found in workload directory' % workload)
+      LOG.error('Available workloads: ' + ', '.join(all_workloads))
       sys.exit(1)
 
-def exec_cmd(cmd, error_msg, exit_on_error=True):
-  ret_val = -1
-  try:
+def exec_cmd(cmd, error_msg=None, exit_on_error=True, out_file=None):
+  """Run the given command in the shell returning whether the command
+     succeeded. If 'error_msg' is set, log the error message on failure.
+     If 'exit_on_error' is True, exit the program on failure.
+     If 'out_file' is specified, log all output to that file."""
+  success = True
+  if out_file:
+    with open(out_file, 'w') as f:
+      ret_val = subprocess.call(cmd, shell=True, stderr=f, stdout=f)
+  else:
     ret_val = subprocess.call(cmd, shell=True)
-  except Exception as e:
-    error_msg = "%s: %s" % (error_msg, str(e))
-  finally:
-    if ret_val != 0:
-      print error_msg
-      if exit_on_error: sys.exit(ret_val)
-  return ret_val
-
-def exec_hive_query_from_file(file_name):
-  if not os.path.exists(file_name): return
-  hive_cmd = "%s %s -f %s" % (HIVE_CMD, HIVE_ARGS, file_name)
-  print 'Executing Hive Command: %s' % hive_cmd
-  exec_cmd(hive_cmd,  'Error executing file from Hive: ' + file_name)
+  if ret_val != 0:
+    if error_msg: LOG.info(error_msg)
+    if exit_on_error: sys.exit(ret_val)
+    success = False
+  return success
+
+def exec_hive_query_from_file_beeline(file_name):
+  if not os.path.exists(file_name):
+    LOG.info("Error: File {0} not found".format(file_name))
+    return False
+
+  LOG.info("Beginning execution of hive SQL: {0}".format(file_name))
+
+  # When HiveServer2 is configured to use "local" mode (i.e., MR jobs are run
+  # in-process rather than on YARN), Hadoop's LocalDistributedCacheManager has a
+  # race, wherein it tries to localize jars into
+  # /tmp/hadoop-$USER/mapred/local/<millis>. Two simultaneous Hive queries
+  # against HS2 can conflict here. Weirdly LocalJobRunner handles a similar issue
+  # (with the staging directory) by appending a random number. To overcome this,
+  # in the case that HS2 is on the local machine (which we conflate with also
+  # running MR jobs locally), we move the temporary directory into a unique
+  # directory via configuration. This block can be removed when
+  # https://issues.apache.org/jira/browse/MAPREDUCE-6441 is resolved.
+  hive_args = HIVE_ARGS
+  unique_dir = None
+  if options.hive_hs2_hostport.startswith("localhost:"):
+    unique_dir = tempfile.mkdtemp(prefix="hive-data-load-")
+    hive_args += ' --hiveconf "mapreduce.cluster.local.dir=%s"' % unique_dir
+
+  output_file = file_name + ".log"
+  hive_cmd = "{0} {1} -f {2}".format(HIVE_CMD, hive_args, file_name)
+  is_success = exec_cmd(hive_cmd, exit_on_error=False, out_file=output_file)
+  # unique_dir is only created when HS2 runs on localhost.
+  if unique_dir:
+    shutil.rmtree(unique_dir)
+
+  if is_success:
+    LOG.info("Finished execution of hive SQL: {0}".format(file_name))
+  else:
+    LOG.info("Error executing hive SQL: {0} See: {1}".format(file_name, \
+             output_file))
+
+  return is_success
 
 def exec_hbase_query_from_file(file_name):
   if not os.path.exists(file_name): return
   hbase_cmd = "hbase shell %s" % file_name
-  print 'Executing HBase Command: %s' % hbase_cmd
-  exec_cmd(hbase_cmd, 'Error executing hbase create commands')
+  LOG.info('Executing HBase Command: %s' % hbase_cmd)
+  exec_cmd(hbase_cmd, error_msg='Error executing hbase create commands')
 
 # KERBEROS TODO: fails when kerberized and impalad principal isn't "impala"
 def exec_impala_query_from_file(file_name):
   """Execute each query in an Impala query file individually"""
+  if not os.path.exists(file_name):
+    LOG.info("Error: File {0} not found".format(file_name))
+    return False
+
+  LOG.info("Beginning execution of impala SQL: {0}".format(file_name))
   is_success = True
   impala_client = ImpalaBeeswaxClient(options.impalad, use_kerberos=options.use_kerberos)
-  try:
-    impala_client.connect()
-    with open(file_name, 'r+') as query_file:
-      queries = sqlparse.split(query_file.read())
-    for query in queries:
-      query = sqlparse.format(query.rstrip(';'), strip_comments=True)
-      print '(%s):\n%s\n' % (file_name, query.strip())
-      if query.strip() != "":
-        result = impala_client.execute(query)
-  except Exception as e:
-    print "Data Loading from Impala failed with error: %s" % str(e)
-    traceback.print_exc()
-    is_success = False
-  finally:
-    impala_client.close_connection()
-  return is_success
+  output_file = file_name + ".log"
+  with open(output_file, 'w') as out_file:
+    query = ""
+    try:
+      impala_client.connect()
+      with open(file_name, 'r+') as query_file:
+        queries = sqlparse.split(query_file.read())
+        for query in queries:
+          query = sqlparse.format(query.rstrip(';'), strip_comments=True)
+          if query.strip() != "":
+            result = impala_client.execute(query)
+            out_file.write("{0}\n{1}\n".format(query, result))
+    except Exception as e:
+      out_file.write("ERROR: {0}\n".format(query))
+      traceback.print_exc(file=out_file)
+      is_success = False
 
-def exec_bash_script(file_name):
-  bash_cmd = "bash %s" % file_name
-  print 'Executing Bash Command: ' + bash_cmd
-  exec_cmd(bash_cmd, 'Error bash script: ' + file_name)
+  if is_success:
+    LOG.info("Finished execution of impala SQL: {0}".format(file_name))
+  else:
+    LOG.info("Error executing impala SQL: {0} See: {1}".format(file_name, \
+             output_file))
+
+  return is_success
 
 def run_dataset_preload(dataset):
   """Execute a preload script if present in dataset directory. E.g. to generate data
   before loading"""
   dataset_preload_script = os.path.join(DATASET_DIR, dataset, "preload")
   if os.path.exists(dataset_preload_script):
-    print("Running preload script for " + dataset)
+    LOG.info("Running preload script for " + dataset)
     if options.scale_factor > 1:
       dataset_preload_script += " " + str(options.scale_factor)
-    exec_cmd(dataset_preload_script, "Error executing preload script for " + dataset,
+    exec_cmd(dataset_preload_script, error_msg="Error executing preload script for " + dataset,
         exit_on_error=True)
 
 def generate_schema_statements(workload):
@@ -215,29 +242,29 @@ def generate_schema_statements(workload):
   if options.hdfs_namenode is not None:
     generate_cmd += " --hdfs_namenode=%s" % options.hdfs_namenode
   generate_cmd += " --backend=%s" % options.impalad
-  print 'Executing Generate Schema Command: ' + generate_cmd
+  LOG.info('Executing Generate Schema Command: ' + generate_cmd)
   schema_cmd = os.path.join(TESTDATA_BIN_DIR, generate_cmd)
   error_msg = 'Error generating schema statements for workload: ' + workload
-  exec_cmd(schema_cmd, error_msg)
+  exec_cmd(schema_cmd, error_msg=error_msg)
 
 def get_dataset_for_workload(workload):
   dimension_file_name = os.path.join(WORKLOAD_DIR, workload,
                                      '%s_dimensions.csv' % workload)
   if not os.path.isfile(dimension_file_name):
-    print 'Dimension file not found: ' + dimension_file_name
+    LOG.error('Dimension file not found: ' + dimension_file_name)
     sys.exit(1)
   with open(dimension_file_name, 'rb') as input_file:
     match = re.search('dataset:\s*([\w\-\.]+)', input_file.read())
     if match:
       return match.group(1)
     else:
-      print 'Dimension file does not contain dataset for workload \'%s\'' % (workload)
+      LOG.error('Dimension file does not contain dataset for workload \'%s\'' % (workload))
       sys.exit(1)
 
 def copy_avro_schemas_to_hdfs(schemas_dir):
   """Recursively copies all of schemas_dir to the test warehouse."""
   if not os.path.exists(schemas_dir):
-    print 'Avro schema dir (%s) does not exist. Skipping copy to HDFS.' % schemas_dir
+    LOG.info('Avro schema dir (%s) does not exist. Skipping copy to HDFS.' % schemas_dir)
     return
 
   exec_hadoop_fs_cmd("-mkdir -p " + options.hive_warehouse_dir)
@@ -245,41 +272,36 @@ def copy_avro_schemas_to_hdfs(schemas_dir):
 
 def exec_hadoop_fs_cmd(args, exit_on_error=True):
   cmd = "%s fs %s" % (HADOOP_CMD, args)
-  print "Executing Hadoop command: " + cmd
-  exec_cmd(cmd, "Error executing Hadoop command, exiting",
+  LOG.info("Executing Hadoop command: " + cmd)
+  exec_cmd(cmd, error_msg="Error executing Hadoop command, exiting",
       exit_on_error=exit_on_error)
 
-def exec_impala_query_from_file_parallel(query_files):
-  # Get the name of the query file that loads the base tables, if it exists.
-  # TODO: Find a better way to detect the file that loads the base tables.
-  create_base_table_file = next((q for q in query_files if 'text' in q), None)
-  if create_base_table_file:
-    is_success = exec_impala_query_from_file(create_base_table_file)
-    query_files.remove(create_base_table_file)
-    # If loading the base tables failed, exit with a non zero error code.
-    if not is_success: sys.exit(1)
-  if not query_files: return
-  threads = []
-  result_queue = Queue()
-  for query_file in query_files:
-    thread = Thread(target=lambda x: result_queue.put(exec_impala_query_from_file(x)),
-        args=[query_file])
-    thread.daemon = True
-    threads.append(thread)
-    thread.start()
-  # Keep looping until the number of results retrieved is the same as the number of
-  # threads spawned, or until a data loading query fails. result_queue.get() will
-  # block until a result is available in the queue.
-  num_fetched_results = 0
-  while num_fetched_results < len(threads):
-    success = result_queue.get()
-    num_fetched_results += 1
-    if not success: sys.exit(1)
-  # There is a small window where a thread may still be alive even if all the threads have
-  # finished putting their results in the queue.
-  for thread in threads: thread.join()
-
-if __name__ == "__main__":
+def exec_query_files_parallel(thread_pool, query_files, execution_type):
+  """Executes the query files provided using the execution engine specified
+     in parallel using the given thread pool. Aborts immediately if any execution
+     encounters an error."""
+  assert(execution_type == 'impala' or execution_type == 'hive')
+  if len(query_files) == 0: return
+  if execution_type == 'impala':
+    execution_function = exec_impala_query_from_file
+  elif execution_type == 'hive':
+    execution_function = exec_hive_query_from_file_beeline
+
+  for result in thread_pool.imap_unordered(execution_function, query_files):
+    if not result:
+      thread_pool.terminate()
+      sys.exit(1)
+
+def impala_exec_query_files_parallel(thread_pool, query_files):
+  exec_query_files_parallel(thread_pool, query_files, 'impala')
+
+def hive_exec_query_files_parallel(thread_pool, query_files):
+  exec_query_files_parallel(thread_pool, query_files, 'hive')
+
+def main():
+  logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%H:%M:%S')
+  LOG.setLevel(logging.DEBUG)
+
   # Having the actual command line at the top of each data-load-* log can help
   # when debugging dataload issues.
   #
@@ -288,62 +310,185 @@ if __name__ == "__main__":
   all_workloads = available_workloads(WORKLOAD_DIR)
   workloads = []
   if options.workloads is None:
-    print "At least one workload name must be specified."
+    LOG.error("At least one workload name must be specified.")
     parser.print_help()
     sys.exit(1)
   elif options.workloads == 'all':
-    print 'Loading data for all workloads.'
+    LOG.info('Loading data for all workloads.')
     workloads = all_workloads
   else:
     workloads = options.workloads.split(",")
     validate_workloads(all_workloads, workloads)
 
-  print 'Starting data load for the following workloads: ' + ', '.join(workloads)
+  LOG.info('Starting data load for the following workloads: ' + ', '.join(workloads))
+  LOG.info('Running with {0} threads'.format(options.num_processes))
 
+  # Note: The processes are in whatever the caller's directory is, so all paths
+  #       passed to the pool need to be absolute paths. This will allow the pool
+  #       to be used for different workloads (and thus different directories)
+  #       simultaneously.
+  thread_pool = ThreadPool(processes=options.num_processes)
   loading_time_map = collections.defaultdict(float)
   for workload in workloads:
     start_time = time.time()
     dataset = get_dataset_for_workload(workload)
     run_dataset_preload(dataset)
+    # This script is tightly coupled with testdata/bin/generate-schema-statements.py
+    # Specifically, this script is expecting the following:
+    # 1. generate-schema-statements.py generates files and puts them in the
+    #    directory ${IMPALA_DATA_LOADING_SQL_DIR}/${workload}
+    #    (e.g. ${IMPALA_HOME}/logs/data_loading/sql/tpch)
+    # 2. generate-schema-statements.py populates the subdirectory
+    #    avro_schemas/${workload} with JSON files specifying the Avro schema for the
+    #    tables being loaded.
+    # 3. generate-schema-statements.py uses a particular naming scheme to distinguish
+    #    between SQL files of different load phases.
+    #
+    #    Using the following variables:
+    #    workload_exploration = ${workload}-${exploration_strategy} and
+    #    file_format_suffix = ${file_format}-${codec}-${compression_type}
+    #
+    #    A. Impala table creation scripts run in Impala to create tables, partitions,
+    #       and views. There is one for each file format. They take the form:
+    #       create-${workload_exploration}-impala-generated-${file_format_suffix}.sql
+    #
+    #    B. Hive creation/load scripts run in Hive to load data into tables and create
+    #       tables or views that Impala does not support. There is one for each
+    #       file format. They take the form:
+    #       load-${workload_exploration}-hive-generated-${file_format_suffix}.sql
+    #
+    #    C. HBase creation script runs through the hbase commandline to create
+    #       HBase tables. (Only generated if loading HBase table.) It takes the form:
+    #       load-${workload_exploration}-hbase-generated.create
+    #
+    #    D. HBase postload script runs through the hbase commandline to flush
+    #       HBase tables. (Only generated if loading HBase table.) It takes the form:
+    #       post-load-${workload_exploration}-hbase-generated.sql
+    #
+    #    E. Impala load scripts run in Impala to load data. Only Parquet and Kudu
+    #       are loaded through Impala. There is one for each of those formats loaded.
+    #       They take the form:
+    #       load-${workload_exploration}-impala-generated-${file_format_suffix}.sql
+    #
+    #    F. Invalidation script runs through Impala to invalidate/refresh metadata
+    #       for tables. It takes the form:
+    #       invalidate-${workload_exploration}-impala-generated.sql
     generate_schema_statements(workload)
+
+    # Determine the directory from #1
     sql_dir = os.path.join(SQL_OUTPUT_DIR, dataset)
     assert os.path.isdir(sql_dir),\
       ("Could not find the generated SQL files for loading dataset '%s'.\
         \nExpected to find the SQL files in: %s" % (dataset, sql_dir))
-    os.chdir(os.path.join(SQL_OUTPUT_DIR, dataset))
-    copy_avro_schemas_to_hdfs(AVRO_SCHEMA_DIR)
-    dataset_dir_contents = os.listdir(os.getcwd())
-    load_file_substr = "%s-%s" % (workload, options.exploration_strategy)
-    # Data loading with Impala is done in parallel, each file format has a separate query
-    # file.
-    create_filename = 'create-%s-impala-generated' % load_file_substr
-    load_filename = 'load-%s-impala-generated' % load_file_substr
-    impala_create_files = [f for f in dataset_dir_contents if create_filename in f]
-    impala_load_files = [f for f in dataset_dir_contents if load_filename in f]
+
+    # Copy the avro schemas (see #2) into HDFS
+    avro_schemas_path = os.path.join(sql_dir, AVRO_SCHEMA_DIR)
+    copy_avro_schemas_to_hdfs(avro_schemas_path)
+
+    # List all of the files in the sql directory to sort out the various types of
+    # files (see #3).
+    dataset_dir_contents = [os.path.join(sql_dir, f) for f in os.listdir(sql_dir)]
+    workload_exploration = "%s-%s" % (workload, options.exploration_strategy)
+
+    # Remove the AVRO_SCHEMA_DIR from the list of files
+    if os.path.exists(avro_schemas_path):
+      dataset_dir_contents.remove(avro_schemas_path)
+
+    # Match for Impala create files (3.A)
+    impala_create_match = 'create-%s-impala-generated' % workload_exploration
+    # Match for Hive create/load files (3.B)
+    hive_load_match = 'load-%s-hive-generated' % workload_exploration
+    # Match for HBase creation script (3.C)
+    hbase_create_match = 'load-%s-hbase-generated.create' % workload_exploration
+    # Match for HBase post-load script (3.D)
+    hbase_postload_match = 'post-load-%s-hbase-generated.sql' % workload_exploration
+    # Match for Impala load scripts (3.E)
+    impala_load_match = 'load-%s-impala-generated' % workload_exploration
+    # Match for Impala invalidate script (3.F)
+    invalidate_match = 'invalidate-%s-impala-generated' % workload_exploration
+
+    impala_create_files = []
+    hive_load_text_files = []
+    hive_load_nontext_files = []
+    hbase_create_files = []
+    hbase_postload_files = []
+    impala_load_files = []
+    invalidate_files = []
+    for filename in dataset_dir_contents:
+      if impala_create_match in filename:
+        impala_create_files.append(filename)
+      elif hive_load_match in filename:
+        if 'text-none-none' in filename:
+          hive_load_text_files.append(filename)
+        else:
+          hive_load_nontext_files.append(filename)
+      elif hbase_create_match in filename:
+        hbase_create_files.append(filename)
+      elif hbase_postload_match in filename:
+        hbase_postload_files.append(filename)
+      elif impala_load_match in filename:
+        impala_load_files.append(filename)
+      elif invalidate_match in filename:
+        invalidate_files.append(filename)
+      else:
+        assert False, "Unexpected input file {0}".format(filename)
+
+    # Simple helper function to dump a header followed by the filenames
+    def log_file_list(header, file_list):
+      if (len(file_list) == 0): return
+      LOG.debug(header)
+      map(LOG.debug, map(os.path.basename, file_list))
+      LOG.debug("\n")
+
+    log_file_list("Impala Create Files:", impala_create_files)
+    log_file_list("Hive Load Text Files:", hive_load_text_files)
+    log_file_list("Hive Load Non-Text Files:", hive_load_nontext_files)
+    log_file_list("HBase Create Files:", hbase_create_files)
+    log_file_list("HBase Post-Load Files:", hbase_postload_files)
+    log_file_list("Impala Load Files:", impala_load_files)
+    log_file_list("Impala Invalidate Files:", invalidate_files)
 
     # Execute the data loading scripts.
     # Creating tables in Impala has no dependencies, so we execute them first.
     # HBase table inserts are done via hive, so the hbase tables need to be created before
-    # running the hive script. Some of the Impala inserts depend on hive tables,
+    # running the hive scripts. Some of the Impala inserts depend on hive tables,
     # so they're done at the end. Finally, the Hbase Tables that have been filled with data
     # need to be flushed.
-    exec_impala_query_from_file_parallel(impala_create_files)
-    exec_hbase_query_from_file('load-%s-hbase-generated.create' % load_file_substr)
-    exec_hive_query_from_file('load-%s-hive-generated.sql' % load_file_substr)
-    exec_hbase_query_from_file('post-load-%s-hbase-generated.sql' % load_file_substr)
+
+    impala_exec_query_files_parallel(thread_pool, impala_create_files)
+
+    # There should be at most one hbase creation script
+    assert(len(hbase_create_files) <= 1)
+    for hbase_create in hbase_create_files:
+      exec_hbase_query_from_file(hbase_create)
+
+    # If this is loading text tables plus multiple other formats, the text tables
+    # need to be loaded first
+    assert(len(hive_load_text_files) <= 1)
+    hive_exec_query_files_parallel(thread_pool, hive_load_text_files)
+    hive_exec_query_files_parallel(thread_pool, hive_load_nontext_files)
+
+    assert(len(hbase_postload_files) <= 1)
+    for hbase_postload in hbase_postload_files:
+      exec_hbase_query_from_file(hbase_postload)
 
     # Invalidate so that Impala sees the loads done by Hive before loading Parquet/Kudu
     # Note: This only invalidates tables for this workload.
-    invalidate_sql_file = 'invalidate-{0}-impala-generated.sql'.format(load_file_substr)
-    if impala_load_files: exec_impala_query_from_file(invalidate_sql_file)
-    exec_impala_query_from_file_parallel(impala_load_files)
+    assert(len(invalidate_files) <= 1)
+    if impala_load_files:
+      impala_exec_query_files_parallel(thread_pool, invalidate_files)
+      impala_exec_query_files_parallel(thread_pool, impala_load_files)
     # Final invalidate for this workload
-    exec_impala_query_from_file(invalidate_sql_file)
+    impala_exec_query_files_parallel(thread_pool, invalidate_files)
     loading_time_map[workload] = time.time() - start_time
 
   total_time = 0.0
+  thread_pool.close()
+  thread_pool.join()
   for workload, load_time in loading_time_map.iteritems():
     total_time += load_time
-    print 'Data loading for workload \'%s\' completed in: %.2fs'\
-        % (workload, load_time)
-  print 'Total load time: %.2fs\n' % total_time
+    LOG.info('Data loading for workload \'%s\' completed in: %.2fs'\
+        % (workload, load_time))
+  LOG.info('Total load time: %.2fs\n' % total_time)
+
+if __name__ == "__main__": main()
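
The scheduling pattern used in exec_query_files_parallel() above -- a fixed-size
ThreadPool, imap_unordered() to consume results as they complete, and terminate() to
fail fast -- is shown in isolation in the following self-contained sketch. It is an
illustration only; the file names and the run_one() stand-in are placeholders.

  from multiprocessing.pool import ThreadPool

  def run_one(path):
    # Stand-in for the per-file Impala/Hive execution functions; returns True on
    # success, False on failure.
    return path != "bad.sql"

  pool = ThreadPool(processes=4)
  try:
    for ok in pool.imap_unordered(run_one, ["a.sql", "bad.sql", "c.sql"]):
      if not ok:
        pool.terminate()  # fail fast: no more queued files are started
        raise SystemExit(1)
  finally:
    pool.close()
    pool.join()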

http://git-wip-us.apache.org/repos/asf/impala/blob/0be44ce6/testdata/bin/generate-schema-statements.py
----------------------------------------------------------------------
diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index 3f730e6..e039c48 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -16,30 +16,84 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
-
-# This script generates the "CREATE TABLE", "INSERT", and "LOAD" statements for loading
-# test data and writes them to create-*-generated.sql and
-# load-*-generated.sql. These files are then executed by hive or impala, depending
-# on their contents. Additionally, for hbase, the file is of the form
-# create-*hbase*-generated.create.
 #
-# The statements that are generated are based on an input test vector
-# (read from a file) that describes the coverage desired. For example, currently
-# we want to run benchmarks with different data sets, across different file types, and
-# with different compression algorithms set. To improve data loading performance this
-# script will generate an INSERT INTO statement to generate the data if the file does
-# not already exist in HDFS. If the file does already exist in HDFS then we simply issue a
-# LOAD statement which is much faster.
+# This script generates statements to create and populate
+# tables in a variety of formats. The tables and formats are
+# defined through a combination of files:
+# 1. Workload format specifics specify for each workload
+#    which formats are part of core, exhaustive, etc.
+#    This operates via the normal test dimensions.
+#    (see tests/common/test_dimension.py and
+#     testdata/workloads/*/*.csv)
+# 2. Workload table availability constraints specify which
+#    tables exist for which formats.
+#    (see testdata/datasets/*/schema_constraints.csv)
+# The arguments to this script specify the workload and
+# exploration strategy and can optionally restrict it
+# further to individual tables.
+#
+# This script generates several SQL scripts to be
+# executed by bin/load-data.py. The two scripts are tightly
+# coupled and any change in the generated files must be
+# reflected in bin/load-data.py. Currently, this script
+# generates three things:
+# 1. It creates the directory (destroying the existing
+#    directory if necessary)
+#    ${IMPALA_DATA_LOADING_SQL_DIR}/${workload}
+# 2. It creates and populates a subdirectory
+#    avro_schemas/${workload} with JSON files specifying
+#    the Avro schema for each table.
+# 3. It generates SQL files with the following naming schema:
+#
+#    Using the following variables:
+#    workload_exploration = ${workload}-${exploration_strategy} and
+#    file_format_suffix = ${file_format}-${codec}-${compression_type}
+#
+#    A. Impala table creation scripts run in Impala to create tables, partitions,
+#       and views. There is one for each file format. They take the form:
+#       create-${workload_exploration}-impala-generated-${file_format_suffix}.sql
+#
+#    B. Hive creation/load scripts run in Hive to load data into tables and create
+#       tables or views that Impala does not support. There is one for each
+#       file format. They take the form:
+#       load-${workload_exploration}-hive-generated-${file_format_suffix}.sql
+#
+#    C. HBase creation script runs through the hbase commandline to create
+#       HBase tables. (Only generated if loading HBase table.) It takes the form:
+#       load-${workload_exploration}-hbase-generated.create
+#
+#    D. HBase postload script runs through the hbase commandline to flush
+#       HBase tables. (Only generated if loading HBase table.) It takes the form:
+#       post-load-${workload_exploration}-hbase-generated.sql
 #
-# The input test vectors are generated via the generate_test_vectors.py so
-# ensure that script has been run (or the test vector files already exist) before
-# running this script.
+#    E. Impala load scripts run in Impala to load data. Only Parquet and Kudu
+#       are loaded through Impala. There is one for each of those formats loaded.
+#       They take the form:
+#       load-${workload_exploration}-impala-generated-${file_format_suffix}.sql
+#
+#    F. Invalidation script runs through Impala to invalidate/refresh metadata
+#       for tables. It takes the form:
+#       invalidate-${workload_exploration}-impala-generated.sql
+#
+# In summary, table "CREATE" statements are mostly done by Impala. Any "CREATE"
+# statements that Impala does not support are done through Hive. Loading data
+# into tables mostly runs in Hive except for Parquet and Kudu tables.
+# Loading proceeds in two parts: First, data is loaded into text tables.
+# Second, almost all other formats are populated by inserts from the text
+# table. Since data loaded in Hive may not be visible in Impala, all tables
+# need to have metadata refreshed or invalidated before access in Impala.
+# This means that loading Parquet or Kudu requires invalidating source
+# tables. It also means that invalidate needs to happen at the end of dataload.
+#
+# For tables requiring customized actions to create schemas or place data,
+# this script allows the table specification to include commands that
+# it will execute as part of generating the SQL for the table. If the command
+# generates output, that output is used for that section. This is useful
+# for custom tables that rely on loading specific files into HDFS or
+# for tables where specifying the schema is tedious (e.g. wide tables).
+# This should be used sparingly, because these commands are executed
+# serially.
 #
-# Note: This statement generation is assuming the following data loading workflow:
-# 1) Load all the data in the specified source table
-# 2) Create tables for the new file formats and compression types
-# 3) Run INSERT OVERWRITE TABLE SELECT * from the source table into the new tables
-#    or LOAD directly if the file already exists in HDFS.
 import collections
 import csv
 import glob
@@ -171,7 +225,7 @@ KNOWN_EXPLORATION_STRATEGIES = ['core', 'pairwise', 'exhaustive', 'lzo']
 def build_create_statement(table_template, table_name, db_name, db_suffix,
                            file_format, compression, hdfs_location,
                            force_reload):
-  create_stmt = 'CREATE DATABASE IF NOT EXISTS %s%s;\n' % (db_name, db_suffix)
+  create_stmt = ''
   if (force_reload):
     create_stmt += 'DROP TABLE IF EXISTS %s%s.%s;\n' % (db_name, db_suffix, table_name)
   if compression == 'lzo':
@@ -453,13 +507,13 @@ class Statements(object):
 
   def write_to_file(self, filename):
     # If there is no content to write, skip
-    if self.__is_empty(): return
+    if not self: return
     output = self.create + self.load_base + self.load
     with open(filename, 'w') as f:
       f.write('\n\n'.join(output))
 
-  def __is_empty(self):
-    return not (self.create or self.load or self.load_base)
+  def __nonzero__(self):
+    return bool(self.create or self.load or self.load_base)
 
 def eval_section(section_str):
   """section_str should be the contents of a section (i.e. a string). If section_str
@@ -481,7 +535,6 @@ def generate_statements(output_name, test_vectors, sections,
   # TODO: This method has become very unwieldy. It has to be re-factored sooner than
   # later.
   # Parquet statements to be executed separately by Impala
-  hive_output = Statements()
   hbase_output = Statements()
   hbase_post_load = Statements()
   impala_invalidate = Statements()
@@ -492,16 +545,18 @@ def generate_statements(output_name, test_vectors, sections,
   existing_tables = get_hdfs_subdirs_with_data(options.hive_warehouse_dir)
   for row in test_vectors:
     impala_create = Statements()
+    hive_output = Statements()
     impala_load = Statements()
     file_format, data_set, codec, compression_type =\
         [row.file_format, row.dataset, row.compression_codec, row.compression_type]
     table_format = '%s/%s/%s' % (file_format, codec, compression_type)
+    db_suffix = row.db_suffix()
+    db_name = '{0}{1}'.format(data_set, options.scale_factor)
+    db = '{0}{1}'.format(db_name, db_suffix)
+    create_db_stmt = 'CREATE DATABASE IF NOT EXISTS {0};\n'.format(db)
+    impala_create.create.append(create_db_stmt)
     for section in sections:
       table_name = section['BASE_TABLE_NAME'].strip()
-      db_suffix = row.db_suffix()
-      db_name = '{0}{1}'.format(data_set, options.scale_factor)
-      db = '{0}{1}'.format(db_name, db_suffix)
-
 
       if table_names and (table_name.lower() not in table_names):
         print 'Skipping table: %s.%s, table is not in specified table list' % (db, table_name)
@@ -640,8 +695,13 @@ def generate_statements(output_name, test_vectors, sections,
             column_families))
         hbase_post_load.load.append("flush '%s_hbase.%s'\n" % (db_name, table_name))
 
-      # Need to emit an "invalidate metadata" for each individual table
-      invalidate_table_stmt = "INVALIDATE METADATA {0}.{1};\n".format(db, table_name)
+      # Need to make sure that tables created and/or data loaded in Hive is seen
+      # in Impala. We only need to do a full invalidate if the table was created in Hive
+      # and Impala doesn't know about it. Otherwise, do a refresh.
+      if output == hive_output:
+        invalidate_table_stmt = "INVALIDATE METADATA {0}.{1};\n".format(db, table_name)
+      else:
+        invalidate_table_stmt = "REFRESH {0}.{1};\n".format(db, table_name)
       impala_invalidate.create.append(invalidate_table_stmt)
 
       # The ALTER statement in hive does not accept fully qualified table names so
@@ -701,16 +761,18 @@ def generate_statements(output_name, test_vectors, sections,
 
     impala_create.write_to_file("create-%s-impala-generated-%s-%s-%s.sql" %
         (output_name, file_format, codec, compression_type))
+    hive_output.write_to_file("load-%s-hive-generated-%s-%s-%s.sql" %
+        (output_name, file_format, codec, compression_type))
     impala_load.write_to_file("load-%s-impala-generated-%s-%s-%s.sql" %
         (output_name, file_format, codec, compression_type))
 
-
-  hive_output.write_to_file('load-' + output_name + '-hive-generated.sql')
-  hbase_output.create.append("exit")
-  hbase_output.write_to_file('load-' + output_name + '-hbase-generated.create')
-  hbase_post_load.load.append("exit")
-  hbase_post_load.write_to_file('post-load-' + output_name + '-hbase-generated.sql')
-  impala_invalidate.write_to_file('invalidate-' + output_name + '-impala-generated.sql')
+  if hbase_output:
+    hbase_output.create.append("exit")
+    hbase_output.write_to_file('load-' + output_name + '-hbase-generated.create')
+  if hbase_post_load:
+    hbase_post_load.load.append("exit")
+    hbase_post_load.write_to_file('post-load-' + output_name + '-hbase-generated.sql')
+  impala_invalidate.write_to_file("invalidate-" + output_name + "-impala-generated.sql")
 
 def parse_schema_template_file(file_name):
   VALID_SECTION_NAMES = ['DATASET', 'BASE_TABLE_NAME', 'COLUMNS', 'PARTITION_COLUMNS',
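
A side note on the Statements change above (__is_empty replaced by __nonzero__):
defining __nonzero__ (Python 2's truthiness hook; __bool__ in Python 3) lets callers
write "if not statements:" directly, which is what write_to_file() now relies on.
A reduced sketch of the idea, using only the three statement lists the real class keeps:

  class Statements(object):
    # Reduced sketch of the class above: the three statement lists drive truthiness.
    def __init__(self):
      self.create, self.load, self.load_base = [], [], []

    def __nonzero__(self):  # Python 2; define __bool__ for Python 3
      return bool(self.create or self.load or self.load_base)

  stmts = Statements()
  assert not stmts  # empty -> falsy, so writers can skip it
  stmts.create.append("CREATE DATABASE IF NOT EXISTS functional;")
  assert stmts      # non-empty -> truthy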

http://git-wip-us.apache.org/repos/asf/impala/blob/0be44ce6/testdata/bin/load_nested.py
----------------------------------------------------------------------
diff --git a/testdata/bin/load_nested.py b/testdata/bin/load_nested.py
index 146c0ff..d391fdb 100755
--- a/testdata/bin/load_nested.py
+++ b/testdata/bin/load_nested.py
@@ -263,32 +263,43 @@ def load():
         TBLPROPERTIES('parquet.compression'='SNAPPY')
         AS SELECT * FROM tmp_customer;
 
-        DROP TABLE tmp_orders_string;
-        DROP TABLE tmp_customer_string;
-        DROP TABLE tmp_customer;
-
         CREATE TABLE region
         STORED AS PARQUET
         TBLPROPERTIES('parquet.compression'='SNAPPY')
         AS SELECT * FROM tmp_region;
 
-        DROP TABLE tmp_region_string;
-        DROP TABLE tmp_region;
-
         CREATE TABLE supplier
         STORED AS PARQUET
         TBLPROPERTIES('parquet.compression'='SNAPPY')
-        AS SELECT * FROM tmp_supplier;
+        AS SELECT * FROM tmp_supplier;""".split(";"):
+      if not stmt.strip():
+        continue
+      LOG.info("Executing: {0}".format(stmt))
+      hive.execute(stmt)
+
+  with cluster.impala.cursor(db_name=target_db) as impala:
+    # Drop the temporary tables. These temporary tables were created
+    # in Impala, so they exist in Impala's metadata. This drop is executed by
+    # Impala so that the metadata is automatically updated.
+    for stmt in """
+        DROP TABLE tmp_orders_string;
+        DROP TABLE tmp_customer_string;
+        DROP TABLE tmp_customer;
+
+        DROP TABLE tmp_region_string;
+        DROP TABLE tmp_region;
 
         DROP TABLE tmp_supplier;
         DROP TABLE tmp_supplier_string;""".split(";"):
       if not stmt.strip():
         continue
       LOG.info("Executing: {0}".format(stmt))
-      hive.execute(stmt)
+      impala.execute(stmt)
 
-  with cluster.impala.cursor(db_name=target_db) as impala:
-    impala.invalidate_metadata()
+    impala.invalidate_metadata(table_name="customer")
+    impala.invalidate_metadata(table_name="part")
+    impala.invalidate_metadata(table_name="region")
+    impala.invalidate_metadata(table_name="supplier")
     impala.compute_stats()
 
   LOG.info("Done loading nested TPCH data")

http://git-wip-us.apache.org/repos/asf/impala/blob/0be44ce6/testdata/datasets/functional/functional_schema_template.sql
----------------------------------------------------------------------
diff --git a/testdata/datasets/functional/functional_schema_template.sql b/testdata/datasets/functional/functional_schema_template.sql
index a7a5eac..be666ee 100644
--- a/testdata/datasets/functional/functional_schema_template.sql
+++ b/testdata/datasets/functional/functional_schema_template.sql
@@ -242,16 +242,16 @@ functional
 ---- BASE_TABLE_NAME
 alltypesinsert
 ---- CREATE
-CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} LIKE {db_name}.alltypes
-STORED AS {file_format};
+CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
+LIKE {db_name}{db_suffix}.alltypes STORED AS {file_format};
 ====
 ---- DATASET
 functional
 ---- BASE_TABLE_NAME
 alltypesnopart_insert
 ---- CREATE
-CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name} like {db_name}.alltypesnopart
-STORED AS {file_format};
+CREATE TABLE IF NOT EXISTS {db_name}{db_suffix}.{table_name}
+LIKE {db_name}{db_suffix}.alltypesnopart STORED AS {file_format};
 ====
 ---- DATASET
 functional
@@ -2009,7 +2009,7 @@ functional
 ---- BASE_TABLE_NAME
 avro_unicode_nulls
 ---- CREATE_HIVE
-create external table if not exists {db_name}{db_suffix}.{table_name} like {db_name}.liketbl stored as avro LOCATION '/test-warehouse/avro_null_char';
+create external table if not exists {db_name}{db_suffix}.{table_name} like {db_name}{db_suffix}.liketbl stored as avro LOCATION '/test-warehouse/avro_null_char';
 ---- LOAD
 `hdfs dfs -mkdir -p /test-warehouse/avro_null_char && \
 hdfs dfs -put -f ${IMPALA_HOME}/testdata/avro_null_char/000000_0 /test-warehouse/avro_null_char/


[14/15] impala git commit: IMPALA-6723: [DOCS] Hints for CTAS

Posted by ta...@apache.org.
IMPALA-6723: [DOCS] Hints for CTAS

Change-Id: I91d9f4f039a603382ff4415d1dd22a351279cbfa

IMPALA-6723 Hints for CTAS

Change-Id: I201a4e1ddaf62164e1f6b636c4e1e60af60e1af7

IMPALA-6723: [DOCS] Hints for CTAS

Optimizer hints were moved out of the SELECT section.
Hints for CTAS were added to the same section as INSERT.

Change-Id: I91d9f4f039a603382ff4415d1dd22a351279cbfa
Reviewed-on: http://gerrit.cloudera.org:8080/9993
Reviewed-by: Alex Rodoni <ar...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/d8815fb3
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/d8815fb3
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/d8815fb3

Branch: refs/heads/2.x
Commit: d8815fb3802ae0ee20e0f30833b5185c914755e6
Parents: a98ff44
Author: Alex Rodoni <ar...@cloudera.com>
Authored: Fri Apr 6 09:56:48 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:48 2018 +0000

----------------------------------------------------------------------
 docs/impala.ditamap           |   2 +-
 docs/shared/impala_common.xml |   1 +
 docs/topics/impala_hints.xml  | 226 +++++++++++++++++++++++++++++--------
 3 files changed, 181 insertions(+), 48 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/d8815fb3/docs/impala.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index 7b92028..8dd22f7 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -166,7 +166,6 @@ under the License.
         <topicref rev="IMPALA-5309" href="topics/impala_tablesample.xml"/>
         <topicref href="topics/impala_with.xml"/>
         <topicref href="topics/impala_distinct.xml"/>
-        <topicref href="topics/impala_hints.xml"/>
       </topicref>
       <topicref href="topics/impala_set.xml">
         <topicref href="topics/impala_query_options.xml">
@@ -237,6 +236,7 @@ under the License.
       <topicref href="topics/impala_update.xml"/>
       <topicref href="topics/impala_upsert.xml"/>
       <topicref href="topics/impala_use.xml"/>
+      <topicref href="topics/impala_hints.xml"/>
     </topicref>
     <topicref href="topics/impala_functions.xml">
       <topicref href="topics/impala_math_functions.xml"/>

http://git-wip-us.apache.org/repos/asf/impala/blob/d8815fb3/docs/shared/impala_common.xml
----------------------------------------------------------------------
diff --git a/docs/shared/impala_common.xml b/docs/shared/impala_common.xml
index fec6224..4ffd211 100644
--- a/docs/shared/impala_common.xml
+++ b/docs/shared/impala_common.xml
@@ -2991,6 +2991,7 @@ select max(height), avg(height) from census_data where age &gt; 20;
         Another way to define different names for the same tables or columns is to create views. See
         <xref href="../topics/impala_views.xml#views"/> for details.
       </p>
+      <!--Alex R: The insert hints below are being refactored in impala_hints.xml for more general use. Keep this for now for impala_parquet.xml.-->
 
       <p id="insert_hints">
         When inserting into partitioned tables, especially using the Parquet file format, you can include a hint in

http://git-wip-us.apache.org/repos/asf/impala/blob/d8815fb3/docs/topics/impala_hints.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_hints.xml b/docs/topics/impala_hints.xml
index 6cafcfb..b936b6a 100644
--- a/docs/topics/impala_hints.xml
+++ b/docs/topics/impala_hints.xml
@@ -20,8 +20,8 @@ under the License.
 <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
 <concept id="hints">
 
-  <title>Query Hints in Impala SELECT Statements</title>
-  <titlealts audience="PDF"><navtitle>Hints</navtitle></titlealts>
+  <title>Optimizer Hints</title>
+  <titlealts audience="PDF"><navtitle>Optimizer Hints</navtitle></titlealts>
   <prolog>
     <metadata>
       <data name="Category" value="Impala"/>
@@ -37,15 +37,13 @@ under the License.
   <conbody>
 
     <p>
-      <indexterm audience="hidden">hints</indexterm>
-      The Impala SQL dialect supports query hints, for fine-tuning the inner workings of queries. Specify hints as
-      a temporary workaround for expensive queries, where missing statistics or other factors cause inefficient
-      performance.
-    </p>
+      <indexterm audience="hidden">hints</indexterm> The Impala SQL dialect
+      supports query hints for fine-tuning the inner workings of queries.
+      Specify hints as a temporary workaround for expensive queries, where
+      missing statistics or other factors cause inefficient performance. </p>
 
-    <p>
-      Hints are most often used for the most resource-intensive kinds of Impala queries:
-    </p>
+    <p> Hints are most often used for resource-intensive Impala queries,
+      such as: </p>
 
     <ul>
       <li>
@@ -61,41 +59,18 @@ under the License.
 
     <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
 
-    <p>
-      You can also represent the hints as keywords surrounded by <codeph>[]</codeph>
-      square brackets; include the brackets in the text of the SQL statement.
-      <note conref="../shared/impala_common.xml#common/square_bracket_hint_caveat"/>
-    </p>
-
-<codeblock>SELECT STRAIGHT_JOIN <varname>select_list</varname> FROM
-<varname>join_left_hand_table</varname>
-  JOIN [{ /* +BROADCAST */ | /* +SHUFFLE */ }]
-<varname>join_right_hand_table</varname>
-<varname>remainder_of_query</varname>;
-
-INSERT <varname>insert_clauses</varname>
-  [{ /* +SHUFFLE */ | /* +NOSHUFFLE */ }]
-  [<ph rev="IMPALA-2522 2.8.0">/* +CLUSTERED */</ph>]
-  SELECT <varname>remainder_of_query</varname>;
-
-<ph rev="2.12.0 IMPALA-4168">
-UPSERT [{ /* +SHUFFLE */ | /* +NOSHUFFLE */ }]
-  [<ph rev="IMPALA-2522 2.8.0">/* +CLUSTERED */</ph>]
-  <varname>upsert_clauses</varname>
-  SELECT <varname>remainder_of_query</varname>;</ph>
-</codeblock>
-
-    <p rev="2.0.0">
-      In <keyword keyref="impala20_full"/> and higher, you can also specify the hints inside comments that use
-      either the <codeph>/* */</codeph> or <codeph>--</codeph> notation. Specify a <codeph>+</codeph> symbol
-      immediately before the hint name. Recently added hints are only available using the <codeph>/* */</codeph>
-      and <codeph>--</codeph> notation.
-      For clarity, the <codeph>/* */</codeph> and <codeph>--</codeph> styles
-      are used in the syntax and examples throughout this section.
-      With the <codeph>/* */</codeph> or <codeph>--</codeph> notation for
-      hints, specify a <codeph>+</codeph> symbol immediately before the first hint name.
-      Multiple hints can be specified separated by commas, for example
-      <codeph>/* +clustered,shuffle */</codeph>
+    <p rev="2.0.0"> In <keyword keyref="impala20_full"/> and higher, you can
+      specify the hints inside comments that use either the <codeph>/*
+        */</codeph> or <codeph>--</codeph> notation. Specify a
+        <codeph>+</codeph> symbol immediately before the hint name. Recently
+      added hints are only available using the <codeph>/* */</codeph> and
+        <codeph>--</codeph> notation. For clarity, the <codeph>/* */</codeph>
+      and <codeph>--</codeph> styles are used in the syntax and examples
+      throughout this section. With the <codeph>/* */</codeph> or
+        <codeph>--</codeph> notation for hints, specify a <codeph>+</codeph>
+      symbol immediately before the first hint name. Multiple hints can be
+      specified separated by commas, for example <codeph>/* +clustered,shuffle
+        */</codeph>
     </p>
 
 <codeblock rev="2.0.0">SELECT STRAIGHT_JOIN <varname>select_list</varname> FROM
@@ -167,6 +142,43 @@ UPSERT -- +CLUSTERED
 UPSERT /* +CLUSTERED */
   <varname>upsert_clauses</varname>
   SELECT <varname>remainder_of_query</varname>;</ph>
+
+CREATE /* +SHUFFLE|NOSHUFFLE */
+  <varname>table_clauses</varname>
+  AS SELECT <varname>remainder_of_query</varname>;
+
+CREATE -- +SHUFFLE|NOSHUFFLE
+  <varname>table_clauses</varname>
+  AS SELECT <varname>remainder_of_query</varname>;
+
+CREATE /* +CLUSTER|NOCLUSTER */
+  <varname>table_clauses</varname>
+  AS SELECT <varname>remainder_of_query</varname>;
+
+CREATE -- +CLUSTER|NOCLUSTER
+  <varname>table_clauses</varname>
+  AS SELECT <varname>remainder_of_query</varname>;
+</codeblock>
+    <p>The square bracket style hints are supported for backward compatibility,
+      but the syntax is deprecated and will be removed in a future release. For
+      that reason, any newly added hints are not available with the square
+      bracket syntax.</p>
+    <codeblock>SELECT STRAIGHT_JOIN <varname>select_list</varname> FROM
+<varname>join_left_hand_table</varname>
+  JOIN [{ /* +BROADCAST */ | /* +SHUFFLE */ }]
+<varname>join_right_hand_table</varname>
+<varname>remainder_of_query</varname>;
+
+INSERT <varname>insert_clauses</varname>
+  [{ /* +SHUFFLE */ | /* +NOSHUFFLE */ }]
+  [<ph rev="IMPALA-2522 2.8.0">/* +CLUSTERED */</ph>]
+  SELECT <varname>remainder_of_query</varname>;
+
+<ph rev="2.12.0 IMPALA-4168">
+UPSERT [{ /* +SHUFFLE */ | /* +NOSHUFFLE */ }]
+  [<ph rev="IMPALA-2522 2.8.0">/* +CLUSTERED */</ph>]
+  <varname>upsert_clauses</varname>
+  SELECT <varname>remainder_of_query</varname>;</ph>
 </codeblock>
 
     <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
@@ -226,10 +238,130 @@ UPSERT /* +CLUSTERED */
     </ul>
 
     <p>
-      <b>Hints for INSERT ... SELECT queries:</b>
+      <b>Hints for INSERT ... SELECT and CREATE TABLE AS SELECT (CTAS):</b>
+    </p>
+    <p id="insert_hints">
+      When inserting into partitioned tables, especially when using the Parquet
+      file format, you can include a hint in the <codeph>INSERT</codeph> or <codeph>CREATE TABLE AS SELECT (CTAS)</codeph>
+      statement to fine-tune the overall performance of the operation and its
+      resource usage.</p>
+    <p>
+      You would only use hints if an <codeph>INSERT</codeph> or
+        <codeph>CTAS</codeph> into a partitioned table was failing due to
+      capacity limits, or if such an operation was succeeding but with
+      less-than-optimal performance.
     </p>
 
-    <p conref="../shared/impala_common.xml#common/insert_hints"/>
+    <ul>
+      <li>
+        <codeph>/* +SHUFFLE */</codeph> and <codeph>/* +NOSHUFFLE */</codeph> Hints
+        <ul>
+          <li>
+            <codeph>/* +SHUFFLE */</codeph> adds an exchange node, before
+            writing the data, which re-partitions the result of the
+              <codeph>SELECT</codeph> based on the partitioning columns of the
+            target table. With this hint, only one node writes to a partition at
+            a time, minimizing the global number of simultaneous writes and the
+            number of memory buffers holding data for individual partitions.
+            This also reduces fragmentation, resulting in fewer files. Thus it
+            reduces overall resource usage of the <codeph>INSERT</codeph> or
+              <codeph>CTAS</codeph> operation and allows some operations to
+            succeed that otherwise would fail. It does involve some data
+            transfer between the nodes so that the data files for a particular
+            partition are all written on the same node.
+
+            <p>
+              Use <codeph>/* +SHUFFLE */</codeph> in cases where an <codeph>INSERT</codeph>
+              or <codeph>CTAS</codeph> statement fails or runs inefficiently due
+              to all nodes attempting to write data for all partitions.
+            </p>
+
+            <p> If the table is unpartitioned or every partitioning expression
+              is constant, then <codeph>/* +SHUFFLE */</codeph> will cause every
+              write to happen on the coordinator node.
+            </p>
+          </li>
+
+          <li>
+            <codeph>/* +NOSHUFFLE */</codeph> does not add an exchange node before
+            inserting into partitioned tables and disables re-partitioning. So the
+            selected execution plan might be faster overall, but might also
+            produce a larger number of small data files or exceed capacity
+            limits, causing the <codeph>INSERT</codeph> or <codeph>CTAS</codeph>
+            operation to fail.
+
+            <p> Impala automatically uses the <codeph>/*
+                +SHUFFLE */</codeph> method if any partition key column in the
+              source table, mentioned in the <codeph>SELECT</codeph> clause,
+              does not have column statistics. In this case, use the <codeph>/*
+                +NOSHUFFLE */</codeph> hint if you want to override this default
+              behavior.
+            </p>
+          </li>
+
+          <li>
+            If column statistics are available for all partition key columns
+            in the source table mentioned in the <codeph>INSERT ...
+              SELECT</codeph> or <codeph>CTAS</codeph> query, Impala chooses
+            whether to use the <codeph>/* +SHUFFLE */</codeph> or <codeph>/*
+              +NOSHUFFLE */</codeph> technique based on the estimated number of
+            distinct values in those columns and the number of nodes involved in
+            the operation. In this case, you might need the <codeph>/* +SHUFFLE
+              */</codeph> or the <codeph>/* +NOSHUFFLE */</codeph> hint to
+            override the execution plan selected by Impala.
+          </li>
+        </ul>
+      </li>
+
+      <li>
+        <codeph>/* +CLUSTERED */</codeph> and <codeph>/* +NOCLUSTERED
+          */</codeph> Hints
+        <ul>
+          <li>
+            <codeph>/* +CLUSTERED */</codeph> sorts data by the partition
+            columns before inserting to ensure that only one partition is
+            written at a time per node. Use this hint to reduce the number of
+            files kept open and the number of buffers kept in memory
+            simultaneously. This technique is primarily useful for inserts into
+            Parquet tables, where the large block size requires substantial
+            memory to buffer data for multiple output files at once. This hint
+            is available in <keyword keyref="impala28_full"/> or higher.
+
+            <p>
+              Starting in <keyword keyref="impala30_full"/>, <codeph>/*
+                +CLUSTERED */</codeph> is the default behavior for HDFS tables.
+            </p>
+          </li>
+
+          <li>
+            <codeph>/* +NOCLUSTERED */</codeph> does not sort by primary key
+            before insert. This hint is available in <keyword
+              keyref="impala28_full"/> or higher.
+
+            <p>
+              Use this hint when inserting to Kudu tables.
+            </p>
+
+            <p>
+              In versions lower than <keyword keyref="impala30_full"/>,
+                <codeph>/* +NOCLUSTERED */</codeph> is the default behavior for
+              HDFS tables.
+            </p>
+          </li>
+        </ul>
+      </li>
+    </ul>
+
+    <p>
+      Starting from <keyword keyref="impala29_full"/>, <codeph>INSERT</codeph>
+      or <codeph>UPSERT</codeph> operations into Kudu tables automatically have
+      an exchange and sort node added to the plan that partitions and sorts the
+      rows according to the partitioning/primary key scheme of the target table
+      (unless the number of rows to be inserted is small enough to trigger
+      single node execution). Use the <codeph>/* +NOCLUSTERED */</codeph> and
+        <codeph>/* +NOSHUFFLE */</codeph> hints together to disable partitioning
+      and sorting before the rows are sent to Kudu.
+    </p>
 
     <p rev="IMPALA-2924">
       <b>Hints for scheduling of HDFS blocks:</b>
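
A minimal sketch of the hint placement documented in the refactored
impala_hints.xml above, issued through a DB-API cursor. The connection endpoint
and the table names (sales_staging, sales_parquet, sales_copy) are illustrative
assumptions, not part of the commit; the hint positions follow the syntax blocks
added above.

    from impala.dbapi import connect  # Impyla client; any Impala DB-API driver works similarly

    # Assumed endpoint; adjust host and port for your cluster.
    cur = connect(host='impalad-1.example.com', port=21050).cursor()

    # INSERT ... SELECT: the comment-style hint goes after the insert clauses
    # and before SELECT.
    cur.execute("""
        INSERT INTO sales_parquet PARTITION (year, month) /* +SHUFFLE */
        SELECT id, amount, year, month FROM sales_staging""")

    # CTAS form covered by this change: the hint goes right after CREATE.
    cur.execute("""
        CREATE /* +NOSHUFFLE */ TABLE sales_copy STORED AS PARQUET
        AS SELECT * FROM sales_staging""")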


[15/15] impala git commit: IMPALA-6732: [DOCS] 2.12 Release Notes

Posted by ta...@apache.org.
IMPALA-6732: [DOCS] 2.12 Release Notes

Change-Id: Ia0eee2a5bd7d31afaff048f0ac6e46123eb56e47
Reviewed-on: http://gerrit.cloudera.org:8080/10071
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Sailesh Mukil <sa...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/ca70cd91
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/ca70cd91
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/ca70cd91

Branch: refs/heads/2.x
Commit: ca70cd9160e4f7dcae92d1a719bdc67886033372
Parents: 58c8126
Author: Alex Rodoni <ar...@cloudera.com>
Authored: Sat Apr 14 09:19:16 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:48 2018 +0000

----------------------------------------------------------------------
 docs/impala_keydefs.ditamap                 |  2 ++
 docs/impala_release_notes.ditamap           |  2 +-
 docs/topics/impala_fixed_issues.xml         | 20 ++++++++++++++++++--
 docs/topics/impala_incompatible_changes.xml | 16 ++++++++++++++++
 docs/topics/impala_new_features.xml         | 17 +++++++++++++++++
 5 files changed, 54 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/ca70cd91/docs/impala_keydefs.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_keydefs.ditamap b/docs/impala_keydefs.ditamap
index dea94b9..ae6b2f8 100644
--- a/docs/impala_keydefs.ditamap
+++ b/docs/impala_keydefs.ditamap
@@ -10601,6 +10601,8 @@ under the License.
   <keydef keys="impala13_full"><topicmeta><keywords><keyword>Impala 1.3</keyword></keywords></topicmeta></keydef>
 
 <!-- Pointers to changelog pages -->
+  <keydef keys="changelog_300" href="https://impala.apache.org/docs/changelog-3.0.html" scope="external" format="html"/>
+  <keydef keys="changelog_212" href="https://impala.apache.org/docs/changelog-2.12.html" scope="external" format="html"/>
   <keydef keys="changelog_211" href="https://impala.apache.org/docs/changelog-2.11.html" scope="external" format="html"/>
   <keydef keys="changelog_210" href="https://impala.apache.org/docs/changelog-2.10.html" scope="external" format="html"/>
   <keydef keys="changelog_29" href="https://impala.apache.org/docs/changelog-2.9.html" scope="external" format="html"/>

http://git-wip-us.apache.org/repos/asf/impala/blob/ca70cd91/docs/impala_release_notes.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala_release_notes.ditamap b/docs/impala_release_notes.ditamap
index 58ad5ab..554b2ee 100644
--- a/docs/impala_release_notes.ditamap
+++ b/docs/impala_release_notes.ditamap
@@ -20,7 +20,7 @@ under the License.
 <!DOCTYPE map PUBLIC "-//OASIS//DTD DITA Map//EN" "map.dtd">
 <map audience="standalone">
   <title>Apache Impala Release Notes</title>
-  <topicref href="topics/impala_relnotes.xml" audience="HTML standalone"/>
+<!--  <topicref href="topics/impala_relnotes.xml" audience="HTML standalone"/>-->
   <topicref href="topics/impala_new_features.xml"/>
   <topicref href="topics/impala_incompatible_changes.xml"/>
   <topicref href="topics/impala_known_issues.xml"/>

http://git-wip-us.apache.org/repos/asf/impala/blob/ca70cd91/docs/topics/impala_fixed_issues.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_fixed_issues.xml b/docs/topics/impala_fixed_issues.xml
index 6f2b789..9d2ca97 100644
--- a/docs/topics/impala_fixed_issues.xml
+++ b/docs/topics/impala_fixed_issues.xml
@@ -46,11 +46,27 @@ under the License.
     <p outputclass="toc inpage"/>
   </conbody>
 
+<!-- All 2.12.x subsections go under here -->
+
+  <concept rev="2.12.0" id="fixed_issues_2_12_0">
+
+    <title>Issues Fixed in <keyword keyref="impala212"/></title>
+
+    <conbody>
+
+      <p>
+        For the full list of issues closed in this release, including bug fixes,
+        see the <xref keyref="changelog_212">changelog for <keyword keyref="impala212"/></xref>.
+      </p>
+
+    </conbody>
+  </concept>
+
 <!-- All 2.11.x subsections go under here -->
 
   <concept rev="2.11.0" id="fixed_issues_2_11_0">
 
-    <title>Issues Fixed in <keyword keyref="impala2110"/></title>
+    <title>Issues Fixed in <keyword keyref="impala211"/></title>
 
     <conbody>
 
@@ -66,7 +82,7 @@ under the License.
 
   <concept rev="2.10.0" id="fixed_issues_2100">
 
-    <title>Issues Fixed in <keyword keyref="impala2100"/></title>
+    <title>Issues Fixed in <keyword keyref="impala210"/></title>
 
     <conbody>
 

http://git-wip-us.apache.org/repos/asf/impala/blob/ca70cd91/docs/topics/impala_incompatible_changes.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_incompatible_changes.xml b/docs/topics/impala_incompatible_changes.xml
index 9d8d711..f26c2f2 100644
--- a/docs/topics/impala_incompatible_changes.xml
+++ b/docs/topics/impala_incompatible_changes.xml
@@ -53,6 +53,22 @@ under the License.
     <p outputclass="toc inpage"/>
   </conbody>
 
+  <concept rev="2.12.0" id="incompatible_changes_212x">
+
+    <title>Incompatible Changes Introduced in Impala 2.12.x</title>
+
+    <conbody>
+
+      <p>
+        For the full list of issues closed in this release, including any that introduce
+        behavior changes or incompatibilities, see the
+        <xref keyref="changelog_212">changelog for <keyword keyref="impala212"/></xref>.
+      </p>
+
+    </conbody>
+
+  </concept>
+
   <concept rev="2.11.0" id="incompatible_changes_211x">
 
     <title>Incompatible Changes Introduced in Impala 2.11.x</title>

http://git-wip-us.apache.org/repos/asf/impala/blob/ca70cd91/docs/topics/impala_new_features.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_new_features.xml b/docs/topics/impala_new_features.xml
index 0deb311..deb15e0 100644
--- a/docs/topics/impala_new_features.xml
+++ b/docs/topics/impala_new_features.xml
@@ -46,6 +46,23 @@ under the License.
 
   </conbody>
 
+<!-- All 2.12.x new features go under here -->
+
+  <concept rev="2.12.0" id="new_features_2120">
+
+    <title>New Features in <keyword keyref="impala212_full"/></title>
+
+    <conbody>
+
+      <p>
+        For the full list of issues closed in this release, including the issues
+        marked as <q>new features</q> or <q>improvements</q>, see the
+        <xref keyref="changelog_212">changelog for <keyword keyref="impala212"/></xref>.
+      </p>
+
+    </conbody>
+  </concept>
+
 <!-- All 2.11.x new features go under here -->
 
   <concept rev="2.11.0" id="new_features_2110">


[13/15] impala git commit: IMPALA-6514: [DOCS] impala-shell option for load balancer and Kerberos

Posted by ta...@apache.org.
IMPALA-6514: [DOCS] impala-shell option for load balancer and Kerberos

Change-Id: I50d2063bfbe4838692777e2019ee3f3a991dfc21
Reviewed-on: http://gerrit.cloudera.org:8080/10047
Reviewed-by: Vincent Tran <vt...@cloudera.com>
Reviewed-by: Alex Rodoni <ar...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/58c81267
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/58c81267
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/58c81267

Branch: refs/heads/2.x
Commit: 58c8126715edb05f94cf6735c65e2e5917a47857
Parents: 915ea30
Author: Alex Rodoni <ar...@cloudera.com>
Authored: Thu Apr 12 11:55:18 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:48 2018 +0000

----------------------------------------------------------------------
 docs/topics/impala_proxy.xml         | 40 +++++++++++++++++++++++++++----
 docs/topics/impala_shell_options.xml | 29 ++++++++++++++++++++++
 2 files changed, 64 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/58c81267/docs/topics/impala_proxy.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_proxy.xml b/docs/topics/impala_proxy.xml
index 1f5bb4b..588fada 100644
--- a/docs/topics/impala_proxy.xml
+++ b/docs/topics/impala_proxy.xml
@@ -238,11 +238,41 @@ under the License.
         verify that the host they are connecting to is the same one that is
         actually processing the request, to prevent man-in-the-middle attacks.
       </p>
-      <note>
-          Once you enable a proxy server in a Kerberized cluster, users will not
-          be able to connect to individual impala daemons directly from impala
-          shell.
-      </note>
+      <p>
+        In <keyword keyref="impala211_full">Impala 2.11</keyword> and lower
+        versions, once you enable a proxy server in a Kerberized cluster, users
+        will not be able to connect to individual impala daemons directly from
+        impala-shell.
+      </p>
+
+      <p>
+        In <keyword keyref="impala212_full">Impala 2.12</keyword> and higher,
+        if you enable a proxy server in a Kerberized cluster, users have an
+        option to connect to Impala daemons directly from
+          <cmdname>impala-shell</cmdname> using the <codeph>-b</codeph> /
+          <codeph>--kerberos_host_fqdn</codeph> option when you start
+          <cmdname>impala-shell</cmdname>. This option can be used for testing or
+        troubleshooting purposes, but is not recommended for live production
+        environments as it defeats the purpose of a load balancer/proxy.
+      </p>
+
+      <p>
+        Example:
+<codeblock>
+impala-shell -i impalad-1.mydomain.com -k -b loadbalancer-1.mydomain.com
+</codeblock>
+      </p>
+
+      <p>
+        Alternatively, use the fully spelled-out
+        option names:
+<codeblock>impala-shell --impalad=impalad-1.mydomain.com:21000 --kerberos --kerberos_host_fqdn=loadbalancer-1.mydomain.com</codeblock>
+      </p>
+      <p>
+        See <xref href="impala_shell_options.xml#shell_options"/> for
+        information about the option.
+      </p>
+
       <p>
         To clarify that the load-balancing proxy server is legitimate, perform
         these extra Kerberos setup steps:

http://git-wip-us.apache.org/repos/asf/impala/blob/58c81267/docs/topics/impala_shell_options.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_shell_options.xml b/docs/topics/impala_shell_options.xml
index 43e8162..8f59c94 100644
--- a/docs/topics/impala_shell_options.xml
+++ b/docs/topics/impala_shell_options.xml
@@ -106,6 +106,35 @@ under the License.
             <row>
               <entry>
                 <p>
+                  -b or
+                </p>
+                <p>
+                  --kerberos_host_fqdn
+                </p>
+              </entry>
+              <entry>
+                <p>
+                  kerberos_host_fqdn=
+                </p>
+                <p>
+                  <varname>load-balancer-hostname</varname>
+                </p>
+              </entry>
+              <entry>
+                <p>
+                  If set, the setting overrides the expected hostname of the
+                  Impala daemon's Kerberos service principal.
+                    <cmdname>impala-shell</cmdname> will check that the server's
+                  principal matches this hostname. This may be used when
+                    <codeph>impalad</codeph> is configured to be accessed via a
+                  load-balancer, but it is desired for impala-shell to talk to a
+                  specific <codeph>impalad</codeph> directly.
+                </p>
+              </entry>
+            </row>
+            <row>
+              <entry>
+                <p>
                   --print_header
                 </p>
               </entry>
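
For completeness, a minimal sketch of driving the new option from a script. The
host names are placeholders carried over from the examples above; wrapping
impala-shell in subprocess and passing a -q query are assumptions of this
sketch, not part of the commit.

    import subprocess

    # Connect to one specific impalad while validating the Kerberos principal
    # of the load balancer in front of it (the scenario described above).
    subprocess.run(
        ["impala-shell",
         "--impalad=impalad-1.mydomain.com:21000",
         "--kerberos",
         "--kerberos_host_fqdn=loadbalancer-1.mydomain.com",
         "-q", "select 1"],
        check=True)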


[06/15] impala git commit: IMPALA-6713: Fix request for unneeded memory in partial sort

Posted by ta...@apache.org.
IMPALA-6713: Fix request for unneeded memory in partial sort

When a Sorter::Run is initialized, if it is an initial run and has
varlen data, it requests an extra buffer to have space to sort the
varlen data should it need to spill to disk.

This extra buffer is not needed in the case of partial sorts, which
do not spill, and because this extra buffer was not included in the
calculation of the minimum required reservation, requesting it caused
the partial sort to fail in cases where the partial sort only had its
minimum reservation available to use.

The solution is to not request the extra memory for partial sorts.

Testing:
- Added a test to test_sort.py that ensures the partial sort can
  complete successfully even if additional memory requests beyond its
  minimum reservation are denied.

Change-Id: I2d9c0863009021340d8b684669b371a2cfb1ecad
Reviewed-on: http://gerrit.cloudera.org:8080/10031
Reviewed-by: Thomas Tauber-Marshall <tm...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/1f37ca2b
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/1f37ca2b
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/1f37ca2b

Branch: refs/heads/2.x
Commit: 1f37ca2b4e6a164b59a518e2005b76cbd7b5c209
Parents: b2316be
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
Authored: Thu Apr 12 03:36:06 2018 +0000
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:47 2018 +0000

----------------------------------------------------------------------
 be/src/runtime/sorter.cc      |  5 +++--
 tests/query_test/test_sort.py | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/1f37ca2b/be/src/runtime/sorter.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/sorter.cc b/be/src/runtime/sorter.cc
index 5ef321b..ead4065 100644
--- a/be/src/runtime/sorter.cc
+++ b/be/src/runtime/sorter.cc
@@ -630,7 +630,8 @@ Sorter::Run::Run(Sorter* parent, TupleDescriptor* sort_tuple_desc, bool initial_
     num_tuples_(0) {}
 
 Status Sorter::Run::Init() {
-  int num_to_create = 1 + has_var_len_slots_ + (has_var_len_slots_ && initial_run_);
+  int num_to_create = 1 + has_var_len_slots_
+      + (has_var_len_slots_ && initial_run_ && sorter_->enable_spilling_);
   int64_t required_mem = num_to_create * sorter_->page_len_;
   if (!sorter_->buffer_pool_client_->IncreaseReservationToFit(required_mem)) {
     return Status(Substitute(
@@ -641,7 +642,7 @@ Status Sorter::Run::Init() {
   RETURN_IF_ERROR(AddPage(&fixed_len_pages_));
   if (has_var_len_slots_) {
     RETURN_IF_ERROR(AddPage(&var_len_pages_));
-    if (initial_run_) {
+    if (initial_run_ && sorter_->enable_spilling_) {
       // Need additional var len page to reorder var len data in UnpinAllPages().
       RETURN_IF_ERROR(var_len_copy_page_.Init(sorter_));
     }
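
A small worked sketch of the reservation arithmetic behind this change. The
helper and the 2 MB page size below are assumptions for illustration, not code
from the patch: with spilling disabled, an initial run with var-len slots now
reserves two pages instead of three, so a partial sort can proceed on its
minimum reservation alone.

    PAGE_LEN = 2 * 1024 * 1024  # assumed default spillable buffer size

    def pages_needed(has_var_len_slots, initial_run, enable_spilling):
        # Mirrors the patched formula in Sorter::Run::Init(): the extra page for
        # reordering var-len data on spill is only requested when spilling is enabled.
        extra_spill_page = has_var_len_slots and initial_run and enable_spilling
        return 1 + int(has_var_len_slots) + int(extra_spill_page)

    # Spillable (full) sort: 3 pages; partial sort, which never spills: 2 pages.
    assert pages_needed(True, True, True) * PAGE_LEN == 3 * PAGE_LEN
    assert pages_needed(True, True, False) * PAGE_LEN == 2 * PAGE_LEN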

http://git-wip-us.apache.org/repos/asf/impala/blob/1f37ca2b/tests/query_test/test_sort.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_sort.py b/tests/query_test/test_sort.py
index 23de9f2..e44d086 100644
--- a/tests/query_test/test_sort.py
+++ b/tests/query_test/test_sort.py
@@ -203,3 +203,19 @@ class TestRandomSort(ImpalaTestSuite):
       functional.alltypestiny"""
     results = transpose_results(self.execute_query(query).data, lambda x: float(x))
     assert (results == sorted(results))
+
+
+class TestPartialSort(ImpalaTestSuite):
+  """Test class to do functional validation of partial sorts."""
+
+  def test_partial_sort_min_reservation(self, unique_database):
+    """Test that the partial sort node can operate if it only gets its minimum
+    memory reservation."""
+    table_name = "%s.kudu_test" % unique_database
+    self.client.set_configuration_option(
+        "debug_action", "-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@1.0")
+    self.execute_query("""create table %s (col0 string primary key)
+        partition by hash(col0) partitions 8 stored as kudu""" % table_name)
+    result = self.execute_query(
+        "insert into %s select string_col from functional.alltypessmall" % table_name)
+    assert "PARTIAL SORT" in result.runtime_profile, result.runtime_profile


[08/15] impala git commit: IMPALA-6850: Print actual error message on Sentry error

Posted by ta...@apache.org.
IMPALA-6850: Print actual error message on Sentry error

The patch redirects the Sentry output to
$IMPALA_CLUSTER_LOGS_DIR/sentry/sentry.out to follow the
same convention as other service output logs.

Testing:
- Injected a failure into the run-sentry-service.sh script to verify that
  the error message was captured

Change-Id: I76627bb5b986a548ec6e4f12b555bd6fc8c4dab8
Reviewed-on: http://gerrit.cloudera.org:8080/10064
Reviewed-by: Vuk Ercegovac <ve...@cloudera.com>
Reviewed-by: Philip Zeyliger <ph...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/857d2b0e
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/857d2b0e
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/857d2b0e

Branch: refs/heads/2.x
Commit: 857d2b0e2451c82682f765e4dd45f6001bdfb043
Parents: 0be44ce
Author: Fredy Wijaya <fw...@cloudera.com>
Authored: Fri Apr 13 13:46:56 2018 -0500
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:47 2018 +0000

----------------------------------------------------------------------
 testdata/bin/run-all.sh            | 8 ++++----
 testdata/bin/run-sentry-service.sh | 5 ++++-
 2 files changed, 8 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/857d2b0e/testdata/bin/run-all.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-all.sh b/testdata/bin/run-all.sh
index f722b89..6820e5d 100755
--- a/testdata/bin/run-all.sh
+++ b/testdata/bin/run-all.sh
@@ -57,8 +57,8 @@ if [[ ${DEFAULT_FS} == "hdfs://localhost:20500" ]]; then
       tee ${IMPALA_CLUSTER_LOGS_DIR}/run-hive-server.log
 
   echo " --> Starting the Sentry Policy Server"
-  $IMPALA_HOME/testdata/bin/run-sentry-service.sh > \
-      ${IMPALA_CLUSTER_LOGS_DIR}/run-sentry-service.log 2>&1
+  $IMPALA_HOME/testdata/bin/run-sentry-service.sh 2>&1 | \
+      tee ${IMPALA_CLUSTER_LOGS_DIR}/run-sentry-service.log
 
 elif [[ ${DEFAULT_FS} == "${LOCAL_FS}" ]]; then
   # When the local file system is used as default, we only start the Hive metastore.
@@ -80,6 +80,6 @@ else
       tee ${IMPALA_CLUSTER_LOGS_DIR}/run-hive-server.log
 
   echo " --> Starting the Sentry Policy Server"
-  $IMPALA_HOME/testdata/bin/run-sentry-service.sh > \
-      ${IMPALA_CLUSTER_LOGS_DIR}/run-sentry-service.log 2>&1
+  $IMPALA_HOME/testdata/bin/run-sentry-service.sh 2>&1 | \
+      tee ${IMPALA_CLUSTER_LOGS_DIR}/run-sentry-service.log
 fi

http://git-wip-us.apache.org/repos/asf/impala/blob/857d2b0e/testdata/bin/run-sentry-service.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-sentry-service.sh b/testdata/bin/run-sentry-service.sh
index cb6de28..755c382 100755
--- a/testdata/bin/run-sentry-service.sh
+++ b/testdata/bin/run-sentry-service.sh
@@ -23,6 +23,9 @@ trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)
 . ${IMPALA_HOME}/bin/set-classpath.sh
 
 SENTRY_SERVICE_CONFIG=${SENTRY_CONF_DIR}/sentry-site.xml
+LOGDIR="${IMPALA_CLUSTER_LOGS_DIR}"/sentry
+
+mkdir -p "${LOGDIR}" || true
 
 # First kill any running instances of the service.
 $IMPALA_HOME/testdata/bin/kill-sentry-service.sh
@@ -30,7 +33,7 @@ $IMPALA_HOME/testdata/bin/kill-sentry-service.sh
 # Sentry picks up JARs from the HADOOP_CLASSPATH and not the CLASSPATH.
 export HADOOP_CLASSPATH=${POSTGRES_JDBC_DRIVER}
 # Start the service.
-${SENTRY_HOME}/bin/sentry --command service -c ${SENTRY_SERVICE_CONFIG} &
+${SENTRY_HOME}/bin/sentry --command service -c ${SENTRY_SERVICE_CONFIG} > "${LOGDIR}"/sentry.out 2>&1 &
 
 # Wait for the service to come online
 "$JAVA" -cp $CLASSPATH org.apache.impala.testutil.SentryServicePinger \


[03/15] impala git commit: IMPALA-6451: AuthorizationException in CTAS for Kudu tables

Posted by ta...@apache.org.
IMPALA-6451: AuthorizationException in CTAS for Kudu tables

CREATE TABLE on EXTERNAL Kudu tables or with TBLPROPERTIES
('kudu.master_addresses') requires ALL privileges on SERVER.
See IMPALA-4000.

The patch fixes a bug where, when the CTAS statement is rewritten,
the initial analysis phase adds the 'kudu.master_addresses'
property to TBLPROPERTIES. This causes the next analysis phase to
assume the statement was executed with TBLPROPERTIES
('kudu.master_addresses') and therefore to register the ALL
privilege request on SERVER.

This patch also moves the generated Kudu table name into a
single map that stores all generated Kudu properties and is
cleared when the statement is reset.

Testing:
- Added a new test
- Ran all front-end tests
- Ran Kudu end-to-end query tests

Change-Id: Iac3bbe4dceb80dfefd9756bc322f8c10ceab9f49
Reviewed-on: http://gerrit.cloudera.org:8080/10030
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/b2316be6
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/b2316be6
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/b2316be6

Branch: refs/heads/2.x
Commit: b2316be6c8e672aaa15c51896e5ede053dc94d0e
Parents: b173c53
Author: Fredy Wijaya <fw...@cloudera.com>
Authored: Wed Apr 11 21:00:50 2018 -0500
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:47 2018 +0000

----------------------------------------------------------------------
 .../apache/impala/analysis/CreateTableStmt.java | 21 ++++++++++----------
 .../org/apache/impala/analysis/TableDef.java    | 15 +++++++-------
 .../org/apache/impala/analysis/ToSqlUtils.java  | 19 ++++++++++++++----
 .../impala/analysis/AuthorizationTest.java      | 11 ++++++++++
 4 files changed, 44 insertions(+), 22 deletions(-)
----------------------------------------------------------------------
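
The user-facing scenario can be sketched as follows; the connection is an
assumption (an Impyla-style cursor for a user holding ALL on the tpch database
and SELECT on functional.alltypesagg), while the statement is the one exercised
by the AuthzOk case added to AuthorizationTest.java below.

    from impala.dbapi import connect  # assumed client; any DB-API driver works similarly

    cur = connect(host='impalad-1.example.com', port=21050).cursor()

    # Managed Kudu CTAS without TBLPROPERTIES ('kudu.master_addresses'). Before
    # this fix the rewrite added the generated Kudu properties to TBLPROPERTIES,
    # so the second analysis pass registered an ALL-on-SERVER privilege request
    # and the statement failed with an AuthorizationException; it now succeeds
    # with database- and table-level privileges only.
    cur.execute("""
        CREATE TABLE tpch.kudu_tbl PRIMARY KEY (bigint_col) STORED AS KUDU AS
        SELECT bigint_col, string_col, current_timestamp() AS ins_date
        FROM functional.alltypesagg
        WHERE EXISTS (SELECT 1 FROM functional.alltypesagg)""")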


http://git-wip-us.apache.org/repos/asf/impala/blob/b2316be6/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
index ee020df..ec689b6 100644
--- a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
@@ -112,11 +112,11 @@ public class CreateTableStmt extends StatementBase {
   Map<String, String> getSerdeProperties() { return tableDef_.getSerdeProperties(); }
   public THdfsFileFormat getFileFormat() { return tableDef_.getFileFormat(); }
   RowFormat getRowFormat() { return tableDef_.getRowFormat(); }
-  private String getGeneratedKuduTableName() {
-    return tableDef_.getGeneratedKuduTableName();
+  private void putGeneratedKuduProperty(String key, String value) {
+    tableDef_.putGeneratedKuduProperty(key, value);
   }
-  private void setGeneratedKuduTableName(String tableName) {
-    tableDef_.setGeneratedKuduTableName(tableName);
+  public Map<String, String> getGeneratedKuduProperties() {
+    return tableDef_.getGeneratedKuduProperties();
   }
 
   // Only exposed for ToSqlUtils. Returns the list of primary keys declared by the user
@@ -166,10 +166,7 @@ public class CreateTableStmt extends StatementBase {
     params.setIf_not_exists(getIfNotExists());
     params.setSort_columns(getSortColumns());
     params.setTable_properties(Maps.newHashMap(getTblProperties()));
-    if (!getGeneratedKuduTableName().isEmpty()) {
-      params.getTable_properties().put(KuduTable.KEY_TABLE_NAME,
-          getGeneratedKuduTableName());
-    }
+    params.getTable_properties().putAll(Maps.newHashMap(getGeneratedKuduProperties()));
     params.setSerde_properties(getSerdeProperties());
     for (KuduPartitionParam d: getKuduPartitionParams()) {
       params.addToPartition_by(d.toThrift());
@@ -261,7 +258,8 @@ public class CreateTableStmt extends StatementBase {
       throw new AnalysisException("Invalid storage handler specified for Kudu table: " +
           handler);
     }
-    getTblProperties().put(KuduTable.KEY_STORAGE_HANDLER, KuduTable.KUDU_STORAGE_HANDLER);
+    putGeneratedKuduProperty(KuduTable.KEY_STORAGE_HANDLER,
+        KuduTable.KUDU_STORAGE_HANDLER);
 
     String masterHosts = getTblProperties().get(KuduTable.KEY_MASTER_HOSTS);
     if (Strings.isNullOrEmpty(masterHosts)) {
@@ -271,7 +269,7 @@ public class CreateTableStmt extends StatementBase {
             "Table property '%s' is required when the impalad startup flag " +
             "-kudu_master_hosts is not used.", KuduTable.KEY_MASTER_HOSTS));
       }
-      getTblProperties().put(KuduTable.KEY_MASTER_HOSTS, masterHosts);
+      putGeneratedKuduProperty(KuduTable.KEY_MASTER_HOSTS, masterHosts);
     }
 
     // TODO: Find out what is creating a directory in HDFS and stop doing that. Kudu
@@ -357,7 +355,8 @@ public class CreateTableStmt extends StatementBase {
     AnalysisUtils.throwIfNotNull(getTblProperties().get(KuduTable.KEY_TABLE_NAME),
         String.format("Not allowed to set '%s' manually for managed Kudu tables .",
             KuduTable.KEY_TABLE_NAME));
-    setGeneratedKuduTableName(KuduUtil.getDefaultCreateKuduTableName(getDb(), getTbl()));
+    putGeneratedKuduProperty(KuduTable.KEY_TABLE_NAME,
+        KuduUtil.getDefaultCreateKuduTableName(getDb(), getTbl()));
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/impala/blob/b2316be6/fe/src/main/java/org/apache/impala/analysis/TableDef.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/TableDef.java b/fe/src/main/java/org/apache/impala/analysis/TableDef.java
index 915bbba..948515b 100644
--- a/fe/src/main/java/org/apache/impala/analysis/TableDef.java
+++ b/fe/src/main/java/org/apache/impala/analysis/TableDef.java
@@ -17,6 +17,7 @@
 
 package org.apache.impala.analysis;
 
+import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
@@ -88,8 +89,8 @@ class TableDef {
   // True if analyze() has been called.
   private boolean isAnalyzed_ = false;
 
-  //Kudu table name generated during analysis for managed Kudu tables
-  private String generatedKuduTableName_ = "";
+  // Generated Kudu properties set during analysis.
+  private Map<String, String> generatedKuduProperties_ = new HashMap<>();
 
   // END: Members that need to be reset()
   /////////////////////////////////////////
@@ -163,7 +164,7 @@ class TableDef {
     dataLayout_.reset();
     columnDefs_.clear();
     isAnalyzed_ = false;
-    generatedKuduTableName_ = "";
+    generatedKuduProperties_.clear();
   }
 
   public TableName getTblName() {
@@ -187,10 +188,10 @@ class TableDef {
   List<ColumnDef> getPrimaryKeyColumnDefs() { return primaryKeyColDefs_; }
   boolean isExternal() { return isExternal_; }
   boolean getIfNotExists() { return ifNotExists_; }
-  String getGeneratedKuduTableName() { return generatedKuduTableName_; }
-  void setGeneratedKuduTableName(String tableName) {
-    Preconditions.checkNotNull(tableName);
-    generatedKuduTableName_ = tableName;
+  Map<String, String> getGeneratedKuduProperties() { return generatedKuduProperties_; }
+  void putGeneratedKuduProperty(String key, String value) {
+    Preconditions.checkNotNull(key);
+    generatedKuduProperties_.put(key, value);
   }
   List<KuduPartitionParam> getKuduPartitionParams() {
     return dataLayout_.getKuduPartitionParams();

http://git-wip-us.apache.org/repos/asf/impala/blob/b2316be6/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java b/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java
index 9a164f0..a3c084d 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java
@@ -67,8 +67,10 @@ public class ToSqlUtils {
   /**
    * Removes all hidden properties from the given 'tblProperties' map.
    */
-  private static void removeHiddenTableProperties(Map<String, String> tblProperties) {
+  private static void removeHiddenTableProperties(Map<String, String> tblProperties,
+      Map<String, String> generatedTblProperties) {
     for (String key: HIDDEN_TABLE_PROPERTIES) tblProperties.remove(key);
+    generatedTblProperties.remove(KuduTable.KEY_TABLE_NAME);
   }
 
   /**
@@ -165,11 +167,17 @@ public class ToSqlUtils {
     for (ColumnDef col: stmt.getPartitionColumnDefs()) {
       partitionColsSql.add(col.toString());
     }
+    LinkedHashMap<String, String> properties = Maps.newLinkedHashMap(
+        stmt.getTblProperties());
+    LinkedHashMap<String, String> generatedProperties = Maps.newLinkedHashMap(
+        stmt.getGeneratedKuduProperties());
+    removeHiddenTableProperties(properties, generatedProperties);
+    properties.putAll(generatedProperties);
     String kuduParamsSql = getKuduPartitionByParams(stmt);
     // TODO: Pass the correct compression, if applicable.
     return getCreateTableSql(stmt.getDb(), stmt.getTbl(), stmt.getComment(), colsSql,
         partitionColsSql, stmt.getTblPrimaryKeyColumnNames(), kuduParamsSql,
-        stmt.getSortColumns(), stmt.getTblProperties(), stmt.getSerdeProperties(),
+        stmt.getSortColumns(), properties, stmt.getSerdeProperties(),
         stmt.isExternal(), stmt.getIfNotExists(), stmt.getRowFormat(),
         HdfsFileFormat.fromThrift(stmt.getFileFormat()), HdfsCompression.NONE, null,
         stmt.getLocation());
@@ -190,7 +198,10 @@ public class ToSqlUtils {
     // Use a LinkedHashMap to preserve the ordering of the table properties.
     LinkedHashMap<String, String> properties =
         Maps.newLinkedHashMap(innerStmt.getTblProperties());
-    removeHiddenTableProperties(properties);
+    LinkedHashMap<String, String> generatedProperties = Maps.newLinkedHashMap(
+        stmt.getCreateStmt().getGeneratedKuduProperties());
+    removeHiddenTableProperties(properties, generatedProperties);
+    properties.putAll(generatedProperties);
     String kuduParamsSql = getKuduPartitionByParams(innerStmt);
     // TODO: Pass the correct compression, if applicable.
     String createTableSql = getCreateTableSql(innerStmt.getDb(), innerStmt.getTbl(),
@@ -220,7 +231,7 @@ public class ToSqlUtils {
         msTable.getTableType().equals(TableType.EXTERNAL_TABLE.toString());
     List<String> sortColsSql = getSortColumns(properties);
     String comment = properties.get("comment");
-    removeHiddenTableProperties(properties);
+    removeHiddenTableProperties(properties, Maps.<String, String>newHashMap());
     ArrayList<String> colsSql = Lists.newArrayList();
     ArrayList<String> partitionColsSql = Lists.newArrayList();
     boolean isHbaseTable = table instanceof HBaseTable;

http://git-wip-us.apache.org/repos/asf/impala/blob/b2316be6/fe/src/test/java/org/apache/impala/analysis/AuthorizationTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/analysis/AuthorizationTest.java b/fe/src/test/java/org/apache/impala/analysis/AuthorizationTest.java
index bcea229..db31eaf 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AuthorizationTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AuthorizationTest.java
@@ -923,6 +923,17 @@ public class AuthorizationTest extends FrontendTestBase {
     AuthzOk("create table tpch.kudu_tbl (i int, j int, primary key (i))" +
         " PARTITION BY HASH (i) PARTITIONS 9 stored as kudu");
 
+    // IMPALA-6451: CTAS for Kudu tables on non-external tables and without
+    // TBLPROPERTIES ('kudu.master_addresses') should not require ALL privileges
+    // on SERVER.
+    // User has ALL privilege on tpch database and SELECT privilege on
+    // functional.alltypesagg table.
+    // The statement below causes the SQL statement to be rewritten.
+    AuthzOk("create table tpch.kudu_tbl primary key (bigint_col) stored as kudu as " +
+        "select bigint_col, string_col, current_timestamp() as ins_date " +
+        "from functional.alltypesagg " +
+        "where exists (select 1 from functional.alltypesagg)");
+
     // User does not have permission to create table at the specified location..
     AuthzError("create table tpch.new_table (i int) location " +
         "'hdfs://localhost:20500/test-warehouse/alltypes'",


[12/15] impala git commit: IMPALA-6464: [DOCS] COMPUTE STATS supports a list of columns

Posted by ta...@apache.org.
IMPALA-6464: [DOCS] COMPUTE STATS supports a list of columns

Change-Id: I609c38eac29e36eca008bfb66f5e78f5491e719a
Reviewed-on: http://gerrit.cloudera.org:8080/10070
Reviewed-by: Vuk Ercegovac <ve...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/a98ff448
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/a98ff448
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/a98ff448

Branch: refs/heads/2.x
Commit: a98ff448f7fd5429ff8e7f873236ae2f09f15664
Parents: ca70cd9
Author: Alex Rodoni <ar...@cloudera.com>
Authored: Fri Apr 13 18:14:57 2018 -0700
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:48 2018 +0000

----------------------------------------------------------------------
 docs/topics/impala_compute_stats.xml | 116 ++++++++++++++++++++----------
 1 file changed, 77 insertions(+), 39 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/a98ff448/docs/topics/impala_compute_stats.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_compute_stats.xml b/docs/topics/impala_compute_stats.xml
index 98694f8..b62972c 100644
--- a/docs/topics/impala_compute_stats.xml
+++ b/docs/topics/impala_compute_stats.xml
@@ -49,7 +49,11 @@ under the License.
 
     <p conref="../shared/impala_common.xml#common/syntax_blurb"/>
 
-<codeblock rev="2.1.0">COMPUTE STATS [<varname>db_name</varname>.]<varname>table_name</varname>
+<codeblock rev="impala-3562">COMPUTE STATS
+  [<varname>db_name</varname>.]<varname>table_name</varname> [ ( <varname>column_list</varname> ) ]
+
+<varname>column_list</varname> ::= <varname>column_name</varname> [ , <varname>column_name</varname>, ... ]
+
 COMPUTE INCREMENTAL STATS [<varname>db_name</varname>.]<varname>table_name</varname> [PARTITION (<varname>partition_spec</varname>)]
 
 <varname>partition_spec</varname> ::= <varname>simple_partition_spec</varname> | <ph rev="IMPALA-1654"><varname>complex_partition_spec</varname></ph>
@@ -64,12 +68,40 @@ COMPUTE INCREMENTAL STATS [<varname>db_name</varname>.]<varname>table_name</varn
     <p conref="../shared/impala_common.xml#common/usage_notes_blurb"/>
 
     <p>
-      Originally, Impala relied on users to run the Hive <codeph>ANALYZE TABLE</codeph> statement, but that method
-      of gathering statistics proved unreliable and difficult to use. The Impala <codeph>COMPUTE STATS</codeph>
-      statement is built from the ground up to improve the reliability and user-friendliness of this operation.
-      <codeph>COMPUTE STATS</codeph> does not require any setup steps or special configuration. You only run a
-      single Impala <codeph>COMPUTE STATS</codeph> statement to gather both table and column statistics, rather
-      than separate Hive <codeph>ANALYZE TABLE</codeph> statements for each kind of statistics.
+      Originally, Impala relied on users to run the Hive <codeph>ANALYZE
+        TABLE</codeph> statement, but that method of gathering statistics proved
+      unreliable and difficult to use. The Impala <codeph>COMPUTE STATS</codeph>
+      statement was built to improve the reliability and user-friendliness of
+      this operation. <codeph>COMPUTE STATS</codeph> does not require any setup
+      steps or special configuration. You only run a single Impala
+        <codeph>COMPUTE STATS</codeph> statement to gather both table and column
+      statistics, rather than separate Hive <codeph>ANALYZE TABLE</codeph>
+      statements for each kind of statistics.
+    </p>
+
+    <p rev="impala-3562">
+      For the non-incremental <codeph>COMPUTE STATS</codeph>
+      statement, the columns for which statistics are computed can be specified
+      with an optional comma-separated list of columns.
+    </p>
+
+    <p rev="impala-3562">
+      If no column list is given, the <codeph>COMPUTE STATS</codeph> statement
+      computes column-level statistics for all columns of the table. This adds
+      potentially unneeded work for columns whose stats are not needed by
+      queries. It can be especially costly for very wide tables and unneeded
+      large string fields.
+    </p>
+    <p rev="impala-3562">
+      <codeph>COMPUTE STATS</codeph> returns an error when a specified column
+      cannot be analyzed, such as when the column does not exist, the column is
+      of an unsupported type for COMPUTE STATS, e.g. columns of complex types,
+      or the column is a partitioning column.
+
+    </p>
+    <p rev="impala-3562">
+      If an empty column list is given, no column is analyzed by <codeph>COMPUTE
+        STATS</codeph>.
     </p>
 
     <p rev="2.1.0">
@@ -92,39 +124,45 @@ COMPUTE INCREMENTAL STATS [<varname>db_name</varname>.]<varname>table_name</varn
       <codeph>COMPUTE STATS</codeph> statement. Such tables display <codeph>false</codeph> under the
       <codeph>Incremental stats</codeph> column of the <codeph>SHOW TABLE STATS</codeph> output.
     </p>
-
     <note>
-      Because many of the most performance-critical and resource-intensive operations rely on table and column
-      statistics to construct accurate and efficient plans, <codeph>COMPUTE STATS</codeph> is an important step at
-      the end of your ETL process. Run <codeph>COMPUTE STATS</codeph> on all tables as your first step during
-      performance tuning for slow queries, or troubleshooting for out-of-memory conditions:
-      <ul>
-        <li>
-          Accurate statistics help Impala construct an efficient query plan for join queries, improving performance
-          and reducing memory usage.
-        </li>
-
-        <li>
-          Accurate statistics help Impala distribute the work effectively for insert operations into Parquet
-          tables, improving performance and reducing memory usage.
-        </li>
-
-        <li rev="1.3.0">
-          Accurate statistics help Impala estimate the memory required for each query, which is important when you
-          use resource management features, such as admission control and the YARN resource management framework.
-          The statistics help Impala to achieve high concurrency, full utilization of available memory, and avoid
-          contention with workloads from other Hadoop components.
-        </li>
-        <li rev="IMPALA-4572">
-          In <keyword keyref="impala28_full"/> and higher, when you run the
-          <codeph>COMPUTE STATS</codeph> or <codeph>COMPUTE INCREMENTAL STATS</codeph>
-          statement against a Parquet table, Impala automatically applies the query
-          option setting <codeph>MT_DOP=4</codeph> to increase the amount of intra-node
-          parallelism during this CPU-intensive operation. See <xref keyref="mt_dop"/>
-          for details about what this query option does and how to use it with
-          CPU-intensive <codeph>SELECT</codeph> statements.
-        </li>
-      </ul>
+      <p>
+        Because many of the most performance-critical and resource-intensive
+        operations rely on table and column statistics to construct accurate and
+        efficient plans, <codeph>COMPUTE STATS</codeph> is an important step at
+        the end of your ETL process. Run <codeph>COMPUTE STATS</codeph> on all
+        tables as your first step during performance tuning for slow queries, or
+        troubleshooting for out-of-memory conditions:
+        <ul>
+          <li>
+            Accurate statistics help Impala construct an efficient query plan
+            for join queries, improving performance and reducing memory usage.
+          </li>
+          <li>
+            Accurate statistics help Impala distribute the work effectively
+            for insert operations into Parquet tables, improving performance and
+            reducing memory usage.
+          </li>
+          <li rev="1.3.0">
+            Accurate statistics help Impala estimate the memory
+            required for each query, which is important when you use resource
+            management features, such as admission control and the YARN resource
+            management framework. The statistics help Impala to achieve high
+            concurrency, full utilization of available memory, and avoid
+            contention with workloads from other Hadoop components.
+          </li>
+          <li rev="IMPALA-4572">
+            In <keyword keyref="impala28_full"/> and
+            higher, when you run the <codeph>COMPUTE STATS</codeph> or
+              <codeph>COMPUTE INCREMENTAL STATS</codeph> statement against a
+            Parquet table, Impala automatically applies the query option setting
+              <codeph>MT_DOP=4</codeph> to increase the amount of intra-node
+            parallelism during this CPU-intensive operation. See <xref
+              keyref="mt_dop"/> for details about what this query option does
+            and how to use it with CPU-intensive <codeph>SELECT</codeph>
+            statements.
+          </li>
+        </ul>
+      </p>
     </note>
 
     <p rev="IMPALA-1654">


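A brief aside on the documentation change above: the note amounts to running
COMPUTE STATS on every table once the ETL load finishes. The sketch below is
only a minimal illustration, assuming an impalad reachable on localhost:21050
and a hypothetical Parquet table etl_db.sales; it uses the impyla Python
client, which is not part of this patch:

  from impala.dbapi import connect  # impyla client (assumed installed)

  # Hypothetical coordinator host/port and table name.
  conn = connect(host='localhost', port=21050)
  cur = conn.cursor()

  # Run after the ETL load completes. On a Parquet table, Impala 2.8 and
  # higher applies MT_DOP=4 to this statement automatically, so no explicit
  # SET is needed here.
  cur.execute('COMPUTE STATS etl_db.sales')

  # Confirm the statistics are populated (see the Incremental stats column).
  cur.execute('SHOW TABLE STATS etl_db.sales')
  for row in cur.fetchall():
      print(row)
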
[05/15] impala git commit: IMPALA-6845: TestHdfsQueries causes some tests to be run twice

Posted by ta...@apache.org.
IMPALA-6845: TestHdfsQueries causes some tests to be run twice

TestHdfsQueries is a subclass of TestQueries and inherits all of its
'test_*' methods, causing these tests to be run twice any time
test_queries.py is run. This was not intentional (it was subclassed
just to inherit 'add_test_dimensions') and causes test runs to take
longer than necessary.

This patch removes the subclass relationship and copies the logic of
add_test_dimensions() from TestQueries into TestHdfsQueries, with a
convenience function added to minimize code duplication.
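
As an aside, the duplicate runs are a general consequence of how pytest
collects tests rather than anything Impala-specific: a subclass re-collects
every inherited 'test_*' method. The minimal standalone sketch below (toy
classes, not the real test suite) shows the effect; running it with
"pytest -v toy_collection.py" lists the shared test twice:

  # toy_collection.py -- illustrates duplicate collection via inheritance.
  class TestBase(object):
      def test_shared(self):
          # Collected once as TestBase::test_shared ...
          assert 1 + 1 == 2

  class TestChild(TestBase):
      # ... and again as TestChild::test_shared, because the subclass
      # inherits the method. Removing the inheritance, as this patch does
      # for TestHdfsQueries, leaves a single collected copy per test.
      def test_child_only(self):
          assert True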

Testing:
- Ran test_queries.py under both 'core' and 'exhaustive' and checked
  that the same tests are run, but each one now runs only a single time.

Change-Id: Ida659aa7b5131a6a7469baa93a41f7581bd0659a
Reviewed-on: http://gerrit.cloudera.org:8080/10053
Reviewed-by: Michael Brown <mi...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/b01f7812
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/b01f7812
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/b01f7812

Branch: refs/heads/2.x
Commit: b01f78125ab622c406e4554c0a4353e9c60bed7c
Parents: 1f37ca2
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
Authored: Thu Apr 12 23:11:05 2018 +0000
Committer: Impala Public Jenkins <im...@gerrit.cloudera.org>
Committed: Wed Apr 18 21:17:47 2018 +0000

----------------------------------------------------------------------
 tests/common/test_dimensions.py  | 17 ++++++++++++++++-
 tests/query_test/test_queries.py | 27 ++++++++++++---------------
 2 files changed, 28 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/b01f7812/tests/common/test_dimensions.py
----------------------------------------------------------------------
diff --git a/tests/common/test_dimensions.py b/tests/common/test_dimensions.py
index df3f8c2..434b884 100644
--- a/tests/common/test_dimensions.py
+++ b/tests/common/test_dimensions.py
@@ -17,10 +17,11 @@
 
 # Common test dimensions and associated utility functions.
 
+import copy
 import os
 from itertools import product
 
-from tests.common.test_vector import ImpalaTestDimension
+from tests.common.test_vector import ImpalaTestDimension, ImpalaTestVector
 
 WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
 
@@ -179,6 +180,20 @@ def create_exec_option_dimension_from_dict(exec_option_dimensions):
   # Build a test vector out of it
   return ImpalaTestDimension('exec_option', *exec_option_dimension_values)
 
+def extend_exec_option_dimension(test_suite, key, value):
+  """
+  Takes an ImpalaTestSuite object 'test_suite' and extends the exec option test dimension
+  by creating a copy of each existing exec option value that has 'key' set to 'value',
+  doubling the number of tests that will be run.
+  """
+  dim = test_suite.ImpalaTestMatrix.dimensions["exec_option"]
+  new_value = []
+  for v in dim:
+    new_value.append(ImpalaTestVector.Value(v.name, copy.copy(v.value)))
+    new_value[-1].value[key] = value
+  dim.extend(new_value)
+  test_suite.ImpalaTestMatrix.add_dimension(dim)
+
 def get_dataset_from_workload(workload):
   # TODO: We need a better way to define the workload -> dataset mapping so we can
   # extract it without reading the actual test vector file
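
The "doubling" described in the new helper's docstring can be pictured with
plain dictionaries; the snippet below is a rough standalone illustration
(hypothetical option values, not the real ImpalaTestVector machinery):

  # Each existing exec-option value is copied and the copy gains the new key,
  # so a dimension with N option sets grows to 2N.
  existing = [{'batch_size': 0}, {'batch_size': 16}]
  extended = existing + [dict(opts, exec_single_node_rows_threshold=100)
                         for opts in existing]
  assert len(extended) == 2 * len(existing)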

http://git-wip-us.apache.org/repos/asf/impala/blob/b01f7812/tests/query_test/test_queries.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_queries.py b/tests/query_test/test_queries.py
index 997b59a..2d6914b 100644
--- a/tests/query_test/test_queries.py
+++ b/tests/query_test/test_queries.py
@@ -22,7 +22,7 @@ import pytest
 import re
 
 from tests.common.impala_test_suite import ImpalaTestSuite
-from tests.common.test_dimensions import create_uncompressed_text_dimension
+from tests.common.test_dimensions import create_uncompressed_text_dimension, extend_exec_option_dimension
 from tests.common.test_vector import ImpalaTestVector
 
 class TestQueries(ImpalaTestSuite):
@@ -33,18 +33,9 @@ class TestQueries(ImpalaTestSuite):
       cls.ImpalaTestMatrix.add_constraint(lambda v:\
           v.get_value('table_format').file_format == 'parquet')
 
-    # Manually adding a test dimension here to test the small query opt
-    # in exhaustive.
-    # TODO Cleanup required, allow adding values to dimensions without having to
-    # manually explode them
+    # Adding a test dimension here to test the small query opt in exhaustive.
     if cls.exploration_strategy() == 'exhaustive':
-      dim = cls.ImpalaTestMatrix.dimensions["exec_option"]
-      new_value = []
-      for v in dim:
-        new_value.append(ImpalaTestVector.Value(v.name, copy.copy(v.value)))
-        new_value[-1].value["exec_single_node_rows_threshold"] = 100
-      dim.extend(new_value)
-      cls.ImpalaTestMatrix.add_dimension(dim)
+      extend_exec_option_dimension(cls, "exec_single_node_rows_threshold", "100")
 
   @classmethod
   def get_workload(cls):
@@ -212,9 +203,7 @@ class TestQueriesParquetTables(ImpalaTestSuite):
     self.run_test_case('QueryTest/single-node-large-sorts', vector)
 
 # Tests for queries in HDFS-specific tables, e.g. AllTypesAggMultiFilesNoPart.
-# This is a subclass of TestQueries to get the extra test dimension for
-# exec_single_node_rows_threshold in exhaustive.
-class TestHdfsQueries(TestQueries):
+class TestHdfsQueries(ImpalaTestSuite):
   @classmethod
   def add_test_dimensions(cls):
     super(TestHdfsQueries, cls).add_test_dimensions()
@@ -222,6 +211,14 @@ class TestHdfsQueries(TestQueries):
     cls.ImpalaTestMatrix.add_constraint(lambda v:\
         v.get_value('table_format').file_format != 'kudu')
 
+    # Adding a test dimension here to test the small query opt in exhaustive.
+    if cls.exploration_strategy() == 'exhaustive':
+      extend_exec_option_dimension(cls, "exec_single_node_rows_threshold", "100")
+
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
   def test_hdfs_scan_node(self, vector):
     self.run_test_case('QueryTest/hdfs-scan-node', vector)