Posted to commits@impala.apache.org by bh...@apache.org on 2018/04/13 23:00:11 UTC

[1/5] impala git commit: IMPALA-6809: Allow bootstrap_system.sh in non ~/Impala directory

Repository: impala
Updated Branches:
  refs/heads/master 5417e712f -> ce09269fd


IMPALA-6809: Allow bootstrap_system.sh in non ~/Impala directory

Testing:
- Ran bootstrap_development.sh with IMPALA_HOME set to a directory
  other than ~/Impala
- Ran bootstrap_development.sh with IMPALA_HOME not set

Change-Id: I3241c180b5fb28f1b5f939200f72461ad6fd7d7a
Reviewed-on: http://gerrit.cloudera.org:8080/9994
Reviewed-by: Fredy Wijaya <fw...@cloudera.com>
Reviewed-by: Philip Zeyliger <ph...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/2dfa2611
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/2dfa2611
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/2dfa2611

Branch: refs/heads/master
Commit: 2dfa2611b759dc9bae6368d4a611c8ccf1daf763
Parents: 5417e71
Author: Fredy Wijaya <fw...@cloudera.com>
Authored: Wed Apr 11 09:02:48 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Fri Apr 13 00:44:04 2018 +0000

----------------------------------------------------------------------
 bin/bootstrap_system.sh | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/2dfa2611/bin/bootstrap_system.sh
----------------------------------------------------------------------
diff --git a/bin/bootstrap_system.sh b/bin/bootstrap_system.sh
index 3f88dd3..bce161b 100755
--- a/bin/bootstrap_system.sh
+++ b/bin/bootstrap_system.sh
@@ -22,6 +22,8 @@
 # configurations, so it is best to run this in a fresh install. It also sets up the
 # ~/.bashrc for the calling user and impala-config-local.sh with some environment
 # variables to make Impala compile and run after this script is complete.
+# When IMPALA_HOME is set, the script will bootstrap Impala development in the
+# location specified.
 #
 # The intended user is a person who wants to start contributing code to Impala. This
 # script serves as an executable reference point for how to get started.
@@ -98,11 +100,13 @@ apt-get --yes install git
 echo ">>> Checking out Impala"
 
 # If there is no Impala git repo, get one now
-if ! [[ -d ~/Impala ]]
+
+: ${IMPALA_HOME:=~/Impala}
+if ! [[ -d "$IMPALA_HOME" ]]
 then
-  time -p git clone https://git-wip-us.apache.org/repos/asf/impala.git ~/Impala
+  time -p git clone https://git-wip-us.apache.org/repos/asf/impala.git "$IMPALA_HOME"
 fi
-cd ~/Impala
+cd "$IMPALA_HOME"
 SET_IMPALA_HOME="export IMPALA_HOME=$(pwd)"
 echo "$SET_IMPALA_HOME" >> ~/.bashrc
 eval "$SET_IMPALA_HOME"
@@ -199,17 +203,19 @@ echo "* - nofile 1048576" | sudo tee -a /etc/security/limits.conf
 
 # LZO is not needed to compile or run Impala, but it is needed for the data load
 echo ">>> Checking out Impala-lzo"
-if ! [[ -d ~/Impala-lzo ]]
+: ${IMPALA_LZO_HOME:="${IMPALA_HOME}/../Impala-lzo"}
+if ! [[ -d "$IMPALA_LZO_HOME" ]]
 then
-  git clone https://github.com/cloudera/impala-lzo.git ~/Impala-lzo
+  git clone https://github.com/cloudera/impala-lzo.git "$IMPALA_LZO_HOME"
 fi
 
 echo ">>> Checking out and building hadoop-lzo"
 
-if ! [[ -d ~/hadoop-lzo ]]
+: ${HADOOP_LZO_HOME:="${IMPALA_HOME}/../hadoop-lzo"}
+if ! [[ -d "$HADOOP_LZO_HOME" ]]
 then
-  git clone https://github.com/cloudera/hadoop-lzo.git ~/hadoop-lzo
+  git clone https://github.com/cloudera/hadoop-lzo.git "$HADOOP_LZO_HOME"
 fi
-cd ~/hadoop-lzo/
+cd "$HADOOP_LZO_HOME"
 time -p ant package
 cd "$IMPALA_HOME"
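
A note on the shell idiom introduced above: in bash, : ${VAR:=default}
assigns the default to VAR only when VAR is unset or empty, and the
leading ":" (the null command) discards the value of the expansion. A
minimal standalone sketch of the idiom (the script name is illustrative):

  #!/usr/bin/env bash
  # Assign a fallback only when the caller did not provide IMPALA_HOME.
  : ${IMPALA_HOME:=~/Impala}
  echo "Bootstrapping into: $IMPALA_HOME"

Invoked as IMPALA_HOME=/opt/impala ./sketch.sh it prints /opt/impala;
invoked with IMPALA_HOME unset it falls back to ~/Impala (printed with
the tilde expanded).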


[2/5] impala git commit: IMPALA-6451: AuthorizationException in CTAS for Kudu tables

Posted by bh...@apache.org.
IMPALA-6451: AuthorizationException in CTAS for Kudu tables

CREATE TABLE on EXTERNAL Kudu tables or with TBLPROPERTIES
('kudu.master_addresses') requires ALL privileges on SERVER.
See IMPALA-4000.

The patch fixes a bug where the CTAS statement is rewritten and, during
the initial analysis phase, the 'kudu.master_addresses' property is
added into TBLPROPERTIES. This causes the next analysis phase to assume
the statement was executed with TBLPROPERTIES
('kudu.master_addresses') and hence to register the ALL privilege
request on SERVER.

This patch also refactors the generated Kudu table name into a single
map that stores all generated Kudu properties and is cleared when the
statement is reset.

Testing:
- Added a new test
- Ran all front-end tests
- Ran Kudu end-to-end query tests

Change-Id: Iac3bbe4dceb80dfefd9756bc322f8c10ceab9f49
Reviewed-on: http://gerrit.cloudera.org:8080/10030
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/6f8f7574
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/6f8f7574
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/6f8f7574

Branch: refs/heads/master
Commit: 6f8f757446597ffde4d4d77ce51543f82fed0c0b
Parents: 2dfa261
Author: Fredy Wijaya <fw...@cloudera.com>
Authored: Wed Apr 11 21:00:50 2018 -0500
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Fri Apr 13 20:55:01 2018 +0000

----------------------------------------------------------------------
 .../apache/impala/analysis/CreateTableStmt.java | 21 ++++++++++----------
 .../org/apache/impala/analysis/TableDef.java    | 15 +++++++-------
 .../org/apache/impala/analysis/ToSqlUtils.java  | 19 ++++++++++++++----
 .../impala/analysis/AuthorizationTest.java      | 11 ++++++++++
 4 files changed, 44 insertions(+), 22 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/6f8f7574/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
index ee020df..ec689b6 100644
--- a/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
+++ b/fe/src/main/java/org/apache/impala/analysis/CreateTableStmt.java
@@ -112,11 +112,11 @@ public class CreateTableStmt extends StatementBase {
   Map<String, String> getSerdeProperties() { return tableDef_.getSerdeProperties(); }
   public THdfsFileFormat getFileFormat() { return tableDef_.getFileFormat(); }
   RowFormat getRowFormat() { return tableDef_.getRowFormat(); }
-  private String getGeneratedKuduTableName() {
-    return tableDef_.getGeneratedKuduTableName();
+  private void putGeneratedKuduProperty(String key, String value) {
+    tableDef_.putGeneratedKuduProperty(key, value);
   }
-  private void setGeneratedKuduTableName(String tableName) {
-    tableDef_.setGeneratedKuduTableName(tableName);
+  public Map<String, String> getGeneratedKuduProperties() {
+    return tableDef_.getGeneratedKuduProperties();
   }
 
   // Only exposed for ToSqlUtils. Returns the list of primary keys declared by the user
@@ -166,10 +166,7 @@ public class CreateTableStmt extends StatementBase {
     params.setIf_not_exists(getIfNotExists());
     params.setSort_columns(getSortColumns());
     params.setTable_properties(Maps.newHashMap(getTblProperties()));
-    if (!getGeneratedKuduTableName().isEmpty()) {
-      params.getTable_properties().put(KuduTable.KEY_TABLE_NAME,
-          getGeneratedKuduTableName());
-    }
+    params.getTable_properties().putAll(Maps.newHashMap(getGeneratedKuduProperties()));
     params.setSerde_properties(getSerdeProperties());
     for (KuduPartitionParam d: getKuduPartitionParams()) {
       params.addToPartition_by(d.toThrift());
@@ -261,7 +258,8 @@ public class CreateTableStmt extends StatementBase {
       throw new AnalysisException("Invalid storage handler specified for Kudu table: " +
           handler);
     }
-    getTblProperties().put(KuduTable.KEY_STORAGE_HANDLER, KuduTable.KUDU_STORAGE_HANDLER);
+    putGeneratedKuduProperty(KuduTable.KEY_STORAGE_HANDLER,
+        KuduTable.KUDU_STORAGE_HANDLER);
 
     String masterHosts = getTblProperties().get(KuduTable.KEY_MASTER_HOSTS);
     if (Strings.isNullOrEmpty(masterHosts)) {
@@ -271,7 +269,7 @@ public class CreateTableStmt extends StatementBase {
             "Table property '%s' is required when the impalad startup flag " +
             "-kudu_master_hosts is not used.", KuduTable.KEY_MASTER_HOSTS));
       }
-      getTblProperties().put(KuduTable.KEY_MASTER_HOSTS, masterHosts);
+      putGeneratedKuduProperty(KuduTable.KEY_MASTER_HOSTS, masterHosts);
     }
 
     // TODO: Find out what is creating a directory in HDFS and stop doing that. Kudu
@@ -357,7 +355,8 @@ public class CreateTableStmt extends StatementBase {
     AnalysisUtils.throwIfNotNull(getTblProperties().get(KuduTable.KEY_TABLE_NAME),
         String.format("Not allowed to set '%s' manually for managed Kudu tables .",
             KuduTable.KEY_TABLE_NAME));
-    setGeneratedKuduTableName(KuduUtil.getDefaultCreateKuduTableName(getDb(), getTbl()));
+    putGeneratedKuduProperty(KuduTable.KEY_TABLE_NAME,
+        KuduUtil.getDefaultCreateKuduTableName(getDb(), getTbl()));
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/impala/blob/6f8f7574/fe/src/main/java/org/apache/impala/analysis/TableDef.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/TableDef.java b/fe/src/main/java/org/apache/impala/analysis/TableDef.java
index 915bbba..948515b 100644
--- a/fe/src/main/java/org/apache/impala/analysis/TableDef.java
+++ b/fe/src/main/java/org/apache/impala/analysis/TableDef.java
@@ -17,6 +17,7 @@
 
 package org.apache.impala.analysis;
 
+import java.util.HashMap;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
@@ -88,8 +89,8 @@ class TableDef {
   // True if analyze() has been called.
   private boolean isAnalyzed_ = false;
 
-  //Kudu table name generated during analysis for managed Kudu tables
-  private String generatedKuduTableName_ = "";
+  // Generated Kudu properties set during analysis.
+  private Map<String, String> generatedKuduProperties_ = new HashMap<>();
 
   // END: Members that need to be reset()
   /////////////////////////////////////////
@@ -163,7 +164,7 @@ class TableDef {
     dataLayout_.reset();
     columnDefs_.clear();
     isAnalyzed_ = false;
-    generatedKuduTableName_ = "";
+    generatedKuduProperties_.clear();
   }
 
   public TableName getTblName() {
@@ -187,10 +188,10 @@ class TableDef {
   List<ColumnDef> getPrimaryKeyColumnDefs() { return primaryKeyColDefs_; }
   boolean isExternal() { return isExternal_; }
   boolean getIfNotExists() { return ifNotExists_; }
-  String getGeneratedKuduTableName() { return generatedKuduTableName_; }
-  void setGeneratedKuduTableName(String tableName) {
-    Preconditions.checkNotNull(tableName);
-    generatedKuduTableName_ = tableName;
+  Map<String, String> getGeneratedKuduProperties() { return generatedKuduProperties_; }
+  void putGeneratedKuduProperty(String key, String value) {
+    Preconditions.checkNotNull(key);
+    generatedKuduProperties_.put(key, value);
   }
   List<KuduPartitionParam> getKuduPartitionParams() {
     return dataLayout_.getKuduPartitionParams();

http://git-wip-us.apache.org/repos/asf/impala/blob/6f8f7574/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java
----------------------------------------------------------------------
diff --git a/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java b/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java
index 9a164f0..a3c084d 100644
--- a/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java
+++ b/fe/src/main/java/org/apache/impala/analysis/ToSqlUtils.java
@@ -67,8 +67,10 @@ public class ToSqlUtils {
   /**
    * Removes all hidden properties from the given 'tblProperties' map.
    */
-  private static void removeHiddenTableProperties(Map<String, String> tblProperties) {
+  private static void removeHiddenTableProperties(Map<String, String> tblProperties,
+      Map<String, String> generatedTblProperties) {
     for (String key: HIDDEN_TABLE_PROPERTIES) tblProperties.remove(key);
+    generatedTblProperties.remove(KuduTable.KEY_TABLE_NAME);
   }
 
   /**
@@ -165,11 +167,17 @@ public class ToSqlUtils {
     for (ColumnDef col: stmt.getPartitionColumnDefs()) {
       partitionColsSql.add(col.toString());
     }
+    LinkedHashMap<String, String> properties = Maps.newLinkedHashMap(
+        stmt.getTblProperties());
+    LinkedHashMap<String, String> generatedProperties = Maps.newLinkedHashMap(
+        stmt.getGeneratedKuduProperties());
+    removeHiddenTableProperties(properties, generatedProperties);
+    properties.putAll(generatedProperties);
     String kuduParamsSql = getKuduPartitionByParams(stmt);
     // TODO: Pass the correct compression, if applicable.
     return getCreateTableSql(stmt.getDb(), stmt.getTbl(), stmt.getComment(), colsSql,
         partitionColsSql, stmt.getTblPrimaryKeyColumnNames(), kuduParamsSql,
-        stmt.getSortColumns(), stmt.getTblProperties(), stmt.getSerdeProperties(),
+        stmt.getSortColumns(), properties, stmt.getSerdeProperties(),
         stmt.isExternal(), stmt.getIfNotExists(), stmt.getRowFormat(),
         HdfsFileFormat.fromThrift(stmt.getFileFormat()), HdfsCompression.NONE, null,
         stmt.getLocation());
@@ -190,7 +198,10 @@ public class ToSqlUtils {
     // Use a LinkedHashMap to preserve the ordering of the table properties.
     LinkedHashMap<String, String> properties =
         Maps.newLinkedHashMap(innerStmt.getTblProperties());
-    removeHiddenTableProperties(properties);
+    LinkedHashMap<String, String> generatedProperties = Maps.newLinkedHashMap(
+        stmt.getCreateStmt().getGeneratedKuduProperties());
+    removeHiddenTableProperties(properties, generatedProperties);
+    properties.putAll(generatedProperties);
     String kuduParamsSql = getKuduPartitionByParams(innerStmt);
     // TODO: Pass the correct compression, if applicable.
     String createTableSql = getCreateTableSql(innerStmt.getDb(), innerStmt.getTbl(),
@@ -220,7 +231,7 @@ public class ToSqlUtils {
         msTable.getTableType().equals(TableType.EXTERNAL_TABLE.toString());
     List<String> sortColsSql = getSortColumns(properties);
     String comment = properties.get("comment");
-    removeHiddenTableProperties(properties);
+    removeHiddenTableProperties(properties, Maps.<String, String>newHashMap());
     ArrayList<String> colsSql = Lists.newArrayList();
     ArrayList<String> partitionColsSql = Lists.newArrayList();
     boolean isHbaseTable = table instanceof HBaseTable;

http://git-wip-us.apache.org/repos/asf/impala/blob/6f8f7574/fe/src/test/java/org/apache/impala/analysis/AuthorizationTest.java
----------------------------------------------------------------------
diff --git a/fe/src/test/java/org/apache/impala/analysis/AuthorizationTest.java b/fe/src/test/java/org/apache/impala/analysis/AuthorizationTest.java
index 9a62e93..f0b7332 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AuthorizationTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AuthorizationTest.java
@@ -1126,6 +1126,17 @@ public class AuthorizationTest extends FrontendTestBase {
     AuthzOk("create table tpch.kudu_tbl (i int, j int, primary key (i))" +
         " PARTITION BY HASH (i) PARTITIONS 9 stored as kudu");
 
+    // IMPALA-6451: CTAS for non-external Kudu tables without
+    // TBLPROPERTIES ('kudu.master_addresses') should not require ALL
+    // privileges on SERVER.
+    // User has ALL privilege on tpch database and SELECT privilege on
+    // functional.alltypesagg table.
+    // The statement below causes the SQL statement to be rewritten.
+    AuthzOk("create table tpch.kudu_tbl primary key (bigint_col) stored as kudu as " +
+        "select bigint_col, string_col, current_timestamp() as ins_date " +
+        "from functional.alltypesagg " +
+        "where exists (select 1 from functional.alltypesagg)");
+
    // User does not have permission to create a table at the specified location.
     AuthzError("create table tpch.new_table (i int) location " +
         "'hdfs://localhost:20500/test-warehouse/alltypes'",

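The essence of the IMPALA-6451 fix, restated as a minimal sketch (in
Python rather than the Java of the actual analyzer, with hypothetical
names): analysis-generated properties live in their own map, separate
from the user's TBLPROPERTIES, are merged in only when output is
produced, and are cleared by reset() so that re-analysis after a
statement rewrite sees the original input.

  # Hypothetical sketch of the IMPALA-6451 pattern; not the actual Impala code.
  class TableDefSketch(object):
    def __init__(self, user_props):
      self.user_props = dict(user_props)  # Exactly what the user typed.
      self.generated_props = {}           # Filled in during analysis only.

    def analyze(self, default_masters="kudu-master:7051"):
      # Generated values never leak into user_props, so a later analysis
      # pass cannot mistake them for user-specified TBLPROPERTIES.
      if "kudu.master_addresses" not in self.user_props:
        self.generated_props["kudu.master_addresses"] = default_masters

    def reset(self):
      # Restore the pre-analysis state before the rewritten statement is
      # re-analyzed.
      self.generated_props.clear()

    def effective_props(self):
      merged = dict(self.user_props)
      merged.update(self.generated_props)
      return merged

  d = TableDefSketch({})
  d.analyze()
  d.reset()  # A CTAS rewrite triggers re-analysis...
  assert "kudu.master_addresses" not in d.user_props  # ...with clean input.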

[5/5] impala git commit: IMPALA-6747: Automate diagnostics collection.

Posted by bh...@apache.org.
IMPALA-6747: Automate diagnostics collection.

This commit adds the necessary tooling to automate diagnostics
collection for Impala daemons. The following diagnostics are supported.

1. Native core dump (+ shared libs)
2. GDB/Java thread dump (pstack + jstack)
3. Java heap dump (jmap)
4. Minidumps (using breakpad) *
5. Profiles

Given the required inputs, the script outputs a gzip-compressed tarball
(an Impala diagnostic bundle) with all the diagnostics collected.

The script can be run manually with the following command.

python collect_diagnostics.py --help

Tested with python 2.6 and later.

* Minidumps collected here correspond to the state of the Impala
process at the time this script is triggered. This is different
from collect_minidumps.py, which archives the entire minidump
directory.

Change-Id: I166e726f1dd1ce81187616e4f06d2404fa379bf8
Reviewed-on: http://gerrit.cloudera.org:8080/10056
Reviewed-by: Philip Zeyliger <ph...@cloudera.com>
Reviewed-by: Bharath Vissapragada <bh...@cloudera.com>
Tested-by: Bharath Vissapragada <bh...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/ce09269f
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/ce09269f
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/ce09269f

Branch: refs/heads/master
Commit: ce09269fde901a262b9e1ce0c7259d8864c1bb3b
Parents: 16bed5c
Author: Bharath Vissapragada <bh...@cloudera.com>
Authored: Thu Apr 12 21:52:29 2018 -0700
Committer: Bharath Vissapragada <bh...@cloudera.com>
Committed: Fri Apr 13 22:01:33 2018 +0000

----------------------------------------------------------------------
 bin/diagnostics/__init__.py            |   0
 bin/diagnostics/collect_diagnostics.py | 545 ++++++++++++++++++++++++++++
 bin/diagnostics/collect_shared_libs.sh |  52 +++
 bin/rat_exclude_files.txt              |   1 +
 tests/unittests/test_command.py        |  49 +++
 5 files changed, 647 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/ce09269f/bin/diagnostics/__init__.py
----------------------------------------------------------------------
diff --git a/bin/diagnostics/__init__.py b/bin/diagnostics/__init__.py
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/impala/blob/ce09269f/bin/diagnostics/collect_diagnostics.py
----------------------------------------------------------------------
diff --git a/bin/diagnostics/collect_diagnostics.py b/bin/diagnostics/collect_diagnostics.py
new file mode 100644
index 0000000..8fe4561
--- /dev/null
+++ b/bin/diagnostics/collect_diagnostics.py
@@ -0,0 +1,545 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import datetime
+import errno
+import getpass
+import glob
+import logging
+import math
+import os
+import shutil
+import subprocess
+import sys
+import tarfile
+import time
+import tempfile
+import traceback
+
+from collections import namedtuple
+from contextlib import closing
+from struct import Struct
+from threading import Timer
+
+# This script automates the collection of diagnostics from a host running an
+# Impala service daemon (catalogd/statestored/impalad). The following
+# diagnostics are supported.
+#
+# 1. Native core dump (+ shared libs)
+# 2. GDB/Java thread dump (pstack + jstack)
+# 3. Java heap dump (jmap)
+# 4. Minidumps (using breakpad)
+# 5. Profiles
+#
+# Dependencies:
+# 1. The gdb package should be installed to collect native thread stacks/coredumps.
+#    The binary location is picked up from the system path. For pstacks, the script
+#    falls back to breakpad minidumps if the 'pstack' binary is not in the system
+#    path.
+# 2. jstack/jmap from a JRE/JDK. The default location is picked up from the system
+#    path but can be overridden with --java_home PATH_TO_JAVA_HOME.
+# 3. Minidumps are collected by sending a SIGUSR1 signal to the Impala process. Impala
+#    versions without full breakpad support (<= release 2.6) will reliably crash if
+#    we attempt to do that since those versions do not have the corresponding signal
+#    handler. Hence it is suggested to run this script only on releases 2.7 and later.
+# 4. python >= 2.6
+#
+# Usage: python collect_diagnostics.py --help
+#
+# A few example usages:
+#
+# Collect 3 jstacks, pstacks from an impalad process 3s apart.
+#  python collect_diagnostics.py --pid $(pidof impalad) --stacks 3 3
+#
+# Collect a core dump and a Java heap dump from the catalogd process
+#  python collect_diagnostics.py --pid $(pidof catalogd) --jmap --gcore
+#
+# Collect 5 breakpad minidumps from a statestored process 5s apart.
+#  python collect_diagnostics.py --pid $(pidof statestored) --minidumps 5 5
+#      --minidumps_dir /var/log/statestored/minidumps
+#
+#
+class Command(object):
+  """Wrapper around subprocess.Popen() that is canceled after a configurable timeout."""
+  def __init__(self, cmd, timeout=30):
+    self.cmd = cmd
+    self.timeout = timeout
+    self.child_killed_by_timeout = False
+
+  def run(self, cmd_stdin=None, cmd_stdout=subprocess.PIPE):
+    """Runs the command 'cmd' by setting the appropriate stdin/out. The command is killed
+    if it hits a timeout (controlled by self.timeout)."""
+    cmd_string = " ".join(self.cmd)
+    logging.info("Starting command %s with a timeout of %s"
+        % (cmd_string, str(self.timeout)))
+    self.child = subprocess.Popen(self.cmd, stdin=cmd_stdin, stdout=cmd_stdout)
+    timer = Timer(self.timeout, self.kill_child)
+    try:
+      timer.start()
+      # self.stdout is set to None if cmd_stdout is anything other than PIPE. The actual
+      # stdout is written to the file corresponding to cmd_stdout.
+      self.stdout = self.child.communicate()[0]
+      if self.child.returncode == 0:
+        logging.info("Command finished successfully: " + cmd_string)
+      else:
+        cmd_status = "timed out" if self.child_killed_by_timeout else "failed"
+        logging.error("Command %s: %s" % (cmd_status, cmd_string))
+      return self.child.returncode
+    finally:
+      timer.cancel()
+
+  def kill_child(self):
+    """Kills the running command (self.child)."""
+    self.child_killed_by_timeout = True
+    self.child.kill()
+
+class ImpalaDiagnosticsHandler(object):
+  IMPALA_PROCESSES = ["impalad", "catalogd", "statestored"]
+  OUTPUT_DIRS_TO_CREATE = ["stacks", "gcores", "jmaps", "profiles",
+      "shared_libs", "minidumps"]
+  MINIDUMP_HEADER = namedtuple("MDRawHeader", "signature version stream_count \
+      stream_directory_rva checksum time_date_stamp flags")
+
+  def __init__(self, args):
+    """Initializes the state by setting the paths of required executables."""
+    self.args = args
+    if args.pid <= 0:
+      return
+
+    self.script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
+    # Name of the Impala process for which diagnostics should be collected.
+    self.target_process_name = self.get_target_process_name()
+
+    self.java_home = self.get_java_home_from_env()
+    if not self.java_home and args.java_home:
+      self.java_home = os.path.abspath(args.java_home)
+    self.jstack_cmd = os.path.join(self.java_home, "bin/jstack")
+    self.java_cmd = os.path.join(self.java_home, "bin/java")
+    self.jmap_cmd = os.path.join(self.java_home, "bin/jmap")
+
+    self.gdb_cmd = self.get_command_from_path("gdb")
+    self.gcore_cmd = self.get_command_from_path("gcore")
+    self.pstack_cmd = self.get_command_from_path("pstack")
+
+  def create_output_dir_structure(self):
+    """Creates the skeleton directory structure for the diagnostics output collection."""
+    self.collection_root_dir = tempfile.mkdtemp(prefix="impala-diagnostics-%s" %
+        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-"),
+        dir=os.path.abspath(self.args.output_dir))
+    for dirname in self.OUTPUT_DIRS_TO_CREATE:
+      os.mkdir(os.path.join(self.collection_root_dir, dirname))
+
+  def get_command_from_path(self, cmd):
+    """Returns the path to a given command executable, if one exists in the
+    system PATH."""
+    for path in os.environ["PATH"].split(os.pathsep):
+      cmd_path = os.path.join(path, cmd)
+      if os.access(cmd_path, os.X_OK):
+        return cmd_path
+    return ""
+
+  def get_target_process_name(self):
+    """Returns the process name of the target process for which diagnostics
+    should be collected."""
+    try:
+      return open("/proc/%s/comm" % self.args.pid).read().strip()
+    except Exception:
+      logging.exception("Failed to get target process name.")
+      return ""
+
+  def get_num_child_proc(self, name):
+    """Returns number of processes with the given name and target Impala pid
+    as parent."""
+    # Not all pgrep versions support the -c parameter. So fetch the stdout and
+    # count the number of items in the list.
+    cmd = Command(["pgrep", "-P", str(self.args.pid), name])
+    cmd.run()
+    return len(cmd.stdout.split("\n")) - 1
+
+  def get_java_home_from_env(self):
+    """Returns JAVA_HOME set in the env of the target process."""
+    try:
+      envs = open("/proc/%s/environ" % self.args.pid).read().split("\0")
+      for s in envs:
+        k, v = s.split("=", 1)
+        if k == "JAVA_HOME":
+          return v
+    except Exception:
+      logging.exception("Failed to determine JAVA_HOME from proc env.")
+      return ""
+
+  def get_free_disk_space_gbs(self, path):
+    """Returns free disk space (in GBs) of the partition hosting the given path."""
+    s = os.statvfs(path)
+    return (s.f_bsize * s.f_bavail)/(1024.0 * 1024.0 * 1024.0)
+
+  def get_minidump_create_timestamp(self, minidump_path):
+    """Returns the unix timestamp of the minidump create time. It is extracted from
+    the minidump header."""
+    # Read the minidump's header to extract the create time stamp. More information about
+    # the minidump header format can be found here: https://goo.gl/uxKZVe
+    #
+    # typedef struct {
+    #   uint32_t  signature;
+    #   uint32_t  version;
+    #   uint32_t  stream_count;
+    #   MDRVA     stream_directory_rva;  /* A |stream_count|-sized array of
+    #                                     * MDRawDirectory structures. */
+    #   uint32_t  checksum;              /* Can be 0.  In fact, that's all that's
+    #                                     * been found in minidump files. */
+    #   uint32_t  time_date_stamp;       /* time_t */
+    #   uint64_t  flags;
+    # } MDRawHeader;  /* MINIDUMP_HEADER */
+    s = Struct("IIIiIIQ")
+    data = open(minidump_path, "rb").read(s.size)
+    header = self.MINIDUMP_HEADER(*s.unpack_from(data))
+    return header.time_date_stamp
+
+  def wait_for_minidump(self):
+    """Minidump collection is async after sending the SIGUSR1 signal. So this method
+    waits till it is written to the disk. Since minidump forks off a new process from
+    the parent Impala process we need to wait till the forked process exits.
+    Returns after 30s to prevent infinite waiting. Should be called after sending the
+    SIGUSR1 signal to the Impala process."""
+    MAX_WAIT_TIME_S = 30
+    start_time = time.time()
+    while time.time() < start_time + MAX_WAIT_TIME_S:
+      # Sleep for a bit to ensure that the process fork to write minidump has started.
+      # Otherwise the subsequent check on the process count could pass even when the
+      # fork didn't succeed. This sleep reduces the likelihood of such race.
+      time.sleep(1)
+      if self.get_num_child_proc(self.target_process_name) == 0:
+        break
+    return
+
+  def validate_args(self):
+    """Returns True if self.args are valid, False otherwise."""
+    if self.args.pid <= 0:
+      logging.critical("Invalid PID provided.")
+      return False
+
+    if self.target_process_name not in self.IMPALA_PROCESSES:
+      logging.critical("No valid Impala process with the given PID %s" % str(self.args.pid))
+      return False
+
+    if not self.java_home:
+      logging.critical("JAVA_HOME could not be inferred from process env.\
+          Please specify --java_home.")
+      return False
+
+    if self.args.jmap and not os.path.exists(self.jmap_cmd):
+      logging.critical("jmap binary not found, required to collect a Java heap dump.")
+      return False
+
+    if self.args.gcore and not os.path.exists(self.gcore_cmd):
+      logging.critical("gcore binary not found, required to collect a core dump.")
+      return False
+
+    if self.args.profiles_dir and not os.path.isdir(self.args.profiles_dir):
+      logging.critical("No valid profiles directory at path: %s" % self.args.profiles_dir)
+      return False
+
+    return True
+
+  def collect_thread_stacks(self):
+    """Collects jstack/jstack-m/pstack for the given pid in that order. pstack collection
+    falls back to minidumps if the pstack binary is missing from the system path. Minidumps
+    are collected by sending a SIGUSR1 to the Impala process and then archiving the
+    contents of the minidump directory. The number of times stacks are collected and the
+    sleep time between the collections are controlled by the --stacks argument."""
+    stacks_count, stacks_interval_secs = self.args.stacks
+    if stacks_count <= 0 or stacks_interval_secs < 0:
+      return
+
+    # Skip jstack collection if the jstack binary does not exist.
+    skip_jstacks = not os.path.exists(self.jstack_cmd)
+    if skip_jstacks:
+      logging.info("Skipping jstack collection since jstack binary couldn't be located.")
+
+    # Fall back to breakpad minidump collection if pstack binaries are missing.
+    fallback_to_minidump = False
+    if not self.pstack_cmd:
+      # Fall back to collecting a minidump if pstack is not installed.
+      if not os.path.exists(self.args.minidumps_dir):
+        logging.info("Skipping pstacks since pstack binary couldn't be located. Provide "
+            + "--minidumps_dir for collecting minidumps instead.")
+        # At this point, we can't proceed since we have nothing to collect.
+        if skip_jstacks:
+          return
+      else:
+        fallback_to_minidump = True
+        logging.info("Collecting breakpad minidumps since pstack/gdb binaries are " +
+            "missing.")
+
+    stacks_dir = os.path.join(self.collection_root_dir, "stacks")
+    # Populate the commands to run in 'cmds_to_run' depending on what kinds of thread
+    # stacks to collect. Each entry is a tuple of form
+    # (Command, stdout_prefix, is_minidump). 'is_minidump' tells whether the command
+    # is trying to trigger a minidump collection.
+    cmds_to_run = []
+    if not skip_jstacks:
+      cmd_args = [self.jstack_cmd, str(self.args.pid)]
+      cmds_to_run.append((Command(cmd_args, self.args.timeout), "jstack", False))
+      # Collect mixed-mode jstack, contains native stack frames.
+      cmd_args_mixed_mode = [self.jstack_cmd, "-m", str(self.args.pid)]
+      cmds_to_run.append(
+          (Command(cmd_args_mixed_mode, self.args.timeout), "jstack-m", False))
+
+    if fallback_to_minidump:
+      cmd_args = ["kill", "-SIGUSR1", str(self.args.pid)]
+      cmds_to_run.append((Command(cmd_args, self.args.timeout), None, True))
+    elif self.pstack_cmd:
+      cmd_args = [self.pstack_cmd, str(self.args.pid)]
+      cmds_to_run.append((Command(cmd_args, self.args.timeout), "pstack", False))
+
+    collection_start_ts = time.time()
+    for i in xrange(stacks_count):
+      for cmd, file_prefix, is_minidump in cmds_to_run:
+        if file_prefix:
+          stdout_file = os.path.join(stacks_dir, file_prefix + "-" + str(i) + ".txt")
+          with open(stdout_file, "w") as output:
+            cmd.run(cmd_stdout=output)
+        else:
+          cmd.run()
+          # In case of minidump collection, wait for it to be written.
+          if is_minidump:
+            self.wait_for_minidump()
+      time.sleep(stacks_interval_secs)
+
+    # Copy minidumps if required.
+    if fallback_to_minidump:
+      minidump_out_dir = os.path.join(self.collection_root_dir, "minidumps")
+      self.copy_minidumps(minidump_out_dir, collection_start_ts)
+
+  def collect_minidumps(self):
+    """Collects minidumps on the Impala process based on argument --minidumps. The
+    minidumps are collected by sending a SIGUSR1 signal to the Impala process and then
+    the resulting minidumps are copied to the target directory."""
+    minidump_count, minidump_interval_secs = self.args.minidumps
+    if minidump_count <= 0 or minidump_interval_secs < 0:
+      return
+    # Impala process writes a minidump when it encounters a SIGUSR1.
+    cmd_args = ["kill", "-SIGUSR1", str(self.args.pid)]
+    cmd = Command(cmd_args, self.args.timeout)
+    collection_start_ts = time.time()
+    for i in xrange(minidump_count):
+      cmd.run()
+      self.wait_for_minidump()
+      time.sleep(minidump_interval_secs)
+    out_dir = os.path.join(self.collection_root_dir, "minidumps")
+    self.copy_minidumps(out_dir, collection_start_ts)
+
+  def copy_minidumps(self, target, start_ts):
+    """Copies minidumps with create time >= start_ts to the 'target' directory."""
+    logging.info("Copying minidumps from %s to %s with ctime >= %s"
+        % (self.args.minidumps_dir, target, start_ts))
+    for filename in glob.glob(os.path.join(self.args.minidumps_dir, "*.dmp")):
+      try:
+        minidump_ctime = self.get_minidump_create_timestamp(filename)
+        if minidump_ctime >= math.floor(start_ts):
+          shutil.copy2(filename, target)
+        else:
+          logging.info("Ignored minidump: %s ctime: %s" % (filename, minidump_ctime))
+      except Exception:
+        logging.exception("Error processing minidump at path: %s. Skipping it." % filename)
+
+  def collect_java_heapdump(self):
+    """Generates the Java heap dump of the Impala process using the 'jmap' command."""
+    if not self.args.jmap:
+      return
+    jmap_dir = os.path.join(self.collection_root_dir, "jmaps")
+    out_file = os.path.join(jmap_dir, self.target_process_name + "_heap.bin")
+    # The jmap command must be run as the process owner.
+    # Command: jmap -dump:format=b,file=<outfile> <pid>
+    cmd_args = [self.jmap_cmd, "-dump:format=b,file=" + out_file, str(self.args.pid)]
+    Command(cmd_args, self.args.timeout).run()
+
+  def collect_native_coredump(self):
+    """Generates the core dump of the Impala process using the 'gcore' command"""
+    if not self.args.gcore:
+      return
+    # Command: gcore -o <outfile> <pid>
+    gcore_dir = os.path.join(self.collection_root_dir, "gcores")
+    out_file_name = self.target_process_name + "-" +\
+        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + ".core"
+    out_file = os.path.join(gcore_dir, out_file_name)
+    cmd_args = [self.gcore_cmd, "-o", out_file, str(self.args.pid)]
+    Command(cmd_args, self.args.timeout).run()
+
+  def collect_query_profiles(self):
+    """Collects Impala query profiles from --profiles_dir. Enforces an uncompressed limit
+    of --profiles_max_size_limit bytes on the copied profile logs."""
+    if not self.args.profiles_dir:
+      return
+    out_dir = os.path.join(self.collection_root_dir, "profiles")
+    # Hardcoded in Impala
+    PROFILE_LOG_FILE_PATTERN = "impala_profile_log_1.1-*"
+    logging.info("Collecting profile data, limiting size to %f GB" %
+        (self.args.profiles_max_size_limit/(1024 * 1024 * 1024)))
+
+    profiles_path = os.path.join(self.args.profiles_dir, PROFILE_LOG_FILE_PATTERN)
+    # Sort the profiles by creation time and copy the most recent ones in that order.
+    sorted_profiles =\
+        sorted(glob.iglob(profiles_path), key=os.path.getctime, reverse=True)
+    profile_size_included_so_far = 0
+    for profile_path in sorted_profiles:
+      try:
+        file_size = os.path.getsize(profile_path)
+        if file_size == 0:
+          continue
+        if profile_size_included_so_far + file_size > self.args.profiles_max_size_limit:
+          # Copying the whole file violates profiles_max_size_limit. Copy a part of it.
+          # Profile logs are newline delimited with a single profile per line.
+          num_bytes_to_copy =\
+              self.args.profiles_max_size_limit - profile_size_included_so_far
+          file_name = os.path.basename(profile_path)
+          copied_bytes = 0
+          with open(profile_path, "rb") as in_file:
+            with open(os.path.join(out_dir, file_name), "wb") as out_file:
+              for line in in_file.readlines():
+                if copied_bytes + len(line) > num_bytes_to_copy:
+                  break
+                out_file.write(line)
+                copied_bytes += len(line)
+          return
+        profile_size_included_so_far += file_size
+        shutil.copy2(profile_path, out_dir)
+      except Exception:
+        logging.exception("Encountered an error while collecting profile %s. Skipping it."
+            % profile_path)
+
+  def collect_shared_libs(self):
+    """Collects shared libraries loaded by the target Impala process."""
+    # Shared libs are collected if either core dumps or minidumps are enabled.
+    if not (self.args.gcore or self.args.minidumps_dir):
+      return
+    # If the gdb binary is missing, we cannot extract the shared library list.
+    if not self.gdb_cmd:
+      logging.info("'gdb' executable missing. Skipping shared library collection.")
+      return
+
+    out_dir = os.path.join(self.collection_root_dir, "shared_libs")
+
+    script_path = os.path.join(self.script_dir, "collect_shared_libs.sh")
+    cmd_args = [script_path, self.gdb_cmd, str(self.args.pid), out_dir]
+    Command(cmd_args, self.args.timeout).run()
+
+  def archive_diagnostics(self):
+    """Creates a gztar of the collected diagnostics and cleans up the original
+    directory. Returns True if successful, False otherwise."""
+    try:
+      # tarfile does not support context managers in python 2.6. We use closing() to work
+      # around that.
+      with closing(tarfile.open(self.collection_root_dir + '.tar.gz', mode='w:gz')) as\
+          archive:
+        archive.add(self.collection_root_dir)
+      return True
+    except Exception:
+      logging.exception("Encountered an exception archiving diagnostics, cleaning up.")
+      return False
+    finally:
+      self.cleanup()
+
+  def cleanup(self):
+    """Cleans up the directory to which diagnostics were written."""
+    shutil.rmtree(self.collection_root_dir, ignore_errors=True)
+
+  def get_diagnostics(self):
+    """Calls all collect_*() methods to collect diagnostics. Returns True if no errors
+    were encountered during diagnostics collection, False otherwise."""
+    if not self.validate_args():
+      return False
+    logging.info("Using JAVA_HOME: %s" % self.java_home)
+    self.create_output_dir_structure()
+    logging.info("Free disk space: %.2fGB" %
+        self.get_free_disk_space_gbs(self.collection_root_dir))
+    os.chdir(self.args.output_dir)
+    collection_methods = [self.collect_shared_libs, self.collect_query_profiles,
+        self.collect_native_coredump, self.collect_java_heapdump, self.collect_minidumps,
+        self.collect_thread_stacks]
+    exception_encountered = False
+    for method in collection_methods:
+      try:
+        method()
+      except IOError as e:
+        if e.errno == errno.ENOSPC:
+          # Clean up and abort if we are low on disk space. Other IOErrors are logged and
+          # ignored.
+          logging.exception("Disk space low, aborting.")
+          self.cleanup()
+          return False
+        logging.exception("Encountered an IOError calling: %s" % method.__name__)
+        exception_encountered = True
+      except Exception:
+        exception_encountered = True
+        logging.exception("Encountered an exception calling: %s" % method.__name__)
+    if exception_encountered:
+      logging.error("Encountered an exception collecting diagnostics. Final output " +
+          "could be partial.\n")
+    # Archive the directory, even if it is partial.
+    archive_path = self.collection_root_dir + ".tar.gz"
+    logging.info("Archiving diagnostics to path: %s" % archive_path)
+    if self.archive_diagnostics():
+      logging.info("Diagnostics collected at path: %s" % archive_path)
+    return not exception_encountered
+
+def get_args_parser():
+  """Creates the argument parser and adds the flags"""
+  parser = argparse.ArgumentParser(description="Impala diagnostics collection")
+  parser.add_argument("--pid", action="store", dest="pid", type=int, default=0,
+      help="PID of the Impala process for which diagnostics should be collected.")
+  parser.add_argument("--java_home", action="store", dest="java_home", default="",
+      help="If not set, it is set to the JAVA_HOME from the pid's environment.")
+  parser.add_argument("--timeout", action="store", dest="timeout", default=300,
+      type=int, help="Timeout (in seconds) for each of the diagnostics commands")
+  parser.add_argument("--stacks", action="store", dest="stacks", nargs=2, type=int,
+      default=[0, 0], metavar=("COUNT", "INTERVAL (in seconds)"),
+      help="Collect jstack, mixed-mode jstack and pstacks of the Impala process.\
+      Breakpad minidumps are collected in case of missing pstack binaries.")
+  parser.add_argument("--jmap", action="store_true", dest="jmap", default=False,
+      help="Collect heap dump of the Java process")
+  parser.add_argument("--gcore", action="store_true", dest="gcore", default=False,
+      help="Collect the native core dump using gdb. Requires gdb to be installed.")
+  parser.add_argument("--minidumps", action="store", dest="minidumps", type=int,
+      nargs=2, default=[0, 0], metavar=("COUNT", "INTERVAL (in seconds)"),
+      help="Collect breakpad minidumps for the Impala process. Requires --minidumps_dir\
+      to be set.")
+  parser.add_argument("--minidumps_dir", action="store", dest="minidumps_dir", default="",
+      help="Path of the directory to which Impala process' minidumps are written")
+  parser.add_argument("--profiles_dir", action="store", dest="profiles_dir", default="",
+      help="Path of the profiles directory to be included in the diagnostics output.")
+  parser.add_argument("--profiles_max_size_limit", action="store",
+      dest="profiles_max_size_limit", default=3*1024*1024*1024,
+      type=float, help="Uncompressed limit (in Bytes) on profile logs collected from\
+      --profiles_dir. Defaults to 3GB.")
+  parser.add_argument("--output_dir", action="store", dest="output_dir",
+      default=tempfile.gettempdir(), help="Output directory that contains the final "
+      "diagnostics data. Defaults to %s" % tempfile.gettempdir())
+  return parser
+
+if __name__ == "__main__":
+  parser = get_args_parser()
+  if len(sys.argv) == 1:
+    parser.print_usage()
+    sys.exit(1)
+  logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, datefmt="%Y-%m-%d %H:%M:%S",
+      format="%(asctime)s %(levelname)-8s %(message)s")
+  diagnostics_handler = ImpalaDiagnosticsHandler(parser.parse_args())
+  logging.info("Running as user: %s" % getpass.getuser())
+  logging.info("Input args: %s" % " ".join(sys.argv))
+  sys.exit(0 if diagnostics_handler.get_diagnostics() else 1)
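
The minidump timestamp extraction above can be exercised on its own. A
minimal standalone sketch of the same header read, assuming a breakpad
minidump exists at the illustrative path some.dmp:

  # Standalone sketch of the read done by get_minidump_create_timestamp().
  import datetime
  from struct import Struct

  # MINIDUMP_HEADER fields: signature, version, stream_count,
  # stream_directory_rva, checksum, time_date_stamp, flags.
  header = Struct("IIIiIIQ")
  with open("some.dmp", "rb") as f:
    fields = header.unpack_from(f.read(header.size))
  create_time = fields[5]  # time_date_stamp, a unix timestamp (time_t).
  print(datetime.datetime.utcfromtimestamp(create_time))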

http://git-wip-us.apache.org/repos/asf/impala/blob/ce09269f/bin/diagnostics/collect_shared_libs.sh
----------------------------------------------------------------------
diff --git a/bin/diagnostics/collect_shared_libs.sh b/bin/diagnostics/collect_shared_libs.sh
new file mode 100755
index 0000000..d5de349
--- /dev/null
+++ b/bin/diagnostics/collect_shared_libs.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# $1 - gdb binary path
+# $2 - pid of the Impala process
+# $3 - Output directory to copy the sharedlibs to.
+
+set -euxo pipefail
+
+if [ "$#" -ne 3 ]; then
+  echo "Incorrect usage. Expected: $0 <gdb executable path> <target PID> <output dir>"
+  exit 1
+fi
+
+if [ ! -d $3 ]; then
+  echo "Directory $3 does not exist. This script expects the output directory to exist."
+  exit 1
+fi
+
+# Generate the list of shared lib paths to copy.
+shared_libs_to_copy=$(mktemp)
+$1 --pid $2 --batch -ex 'info shared' 2> /dev/null | sed '1,/Shared Object Library/d' |
+    sed 's/\(.*\s\)\(\/.*\)/\2/' | grep \/ > $shared_libs_to_copy
+
+echo "Generated shared library listing for the process."
+
+# Copy the files to the target directory keeping the directory structure intact.
+# We use rsync instead of 'cp --parents' since the latter has permission issues
+# copying from system level directories. https://goo.gl/6yYNhw
+rsync -LR --files-from=$shared_libs_to_copy / $3
+
+echo "Copied the shared libraries to the target directory: $3"
+
+rm -f $shared_libs_to_copy
+# Make sure the impala user has write permissions on all the copied sharedlib paths.
+chmod 755 -R $3

http://git-wip-us.apache.org/repos/asf/impala/blob/ce09269f/bin/rat_exclude_files.txt
----------------------------------------------------------------------
diff --git a/bin/rat_exclude_files.txt b/bin/rat_exclude_files.txt
index 8a40ef2..aba4a94 100644
--- a/bin/rat_exclude_files.txt
+++ b/bin/rat_exclude_files.txt
@@ -17,6 +17,7 @@ shell/__init__.py
 ssh_keys/id_rsa_impala
 testdata/__init__.py
 tests/__init__.py
+bin/diagnostics/__init__.py
 www/index.html
 
 # See $IMPALA_HOME/LICENSE.txt

http://git-wip-us.apache.org/repos/asf/impala/blob/ce09269f/tests/unittests/test_command.py
----------------------------------------------------------------------
diff --git a/tests/unittests/test_command.py b/tests/unittests/test_command.py
new file mode 100644
index 0000000..a2a9e4c
--- /dev/null
+++ b/tests/unittests/test_command.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Unit tests for collect_diagnostics.Command
+
+import os
+import pytest
+import sys
+
+# Update the sys.path to include the modules from bin/diagnostics.
+sys.path.insert(0,
+    os.path.abspath(os.path.join(os.path.dirname(__file__), '../../bin/diagnostics')))
+from collect_diagnostics import Command
+
+class TestCommand(object):
+  """ Unit tests for the Command class"""
+
+  def test_simple_commands(self):
+    # Successful command
+    c = Command(["echo", "foo"], 1000)
+    assert c.run() == 0, "Command expected to succeed, but failed"
+    assert c.stdout.strip("\n") == "foo"
+
+    # Failed command, check return code
+    c = Command(["false"], 1000)
+    assert c.run() == 1
+
+  def test_command_timer(self):
+    # Try to run a command that sleeps for 1000s and set a
+    # timer for 1 second. The command should time out.
+    c = Command(["sleep", "1000"], 1)
+    assert c.run() != 0, "Command expected to time out but succeeded."
+    assert c.child_killed_by_timeout, "Command didn't time out as expected."
+
+


[4/5] impala git commit: IMPALA-6845: TestHdfsQueries causes some tests to be run twice

Posted by bh...@apache.org.
IMPALA-6845: TestHdfsQueries causes some tests to be run twice

TestHdfsQueries is a subclass of TestQueries and inherits all of its
'test_*' methods, causing these tests to be run twice any time
test_queries.py is run. This was not intentional (it was subclassed
just to inherit 'add_test_dimensions') and causes test runs to take
longer than necessary.

This patch removes the subclass relationship and copies the logic in
add_test_dimensions() from TestQueries into TestHdfsQueries, with a
convenience function added to minimize code duplication.

Testing:
- Ran test_queries.py under both 'core' and 'exhaustive' and checked
  that the same tests are run, now only a single time each.

Change-Id: Ida659aa7b5131a6a7469baa93a41f7581bd0659a
Reviewed-on: http://gerrit.cloudera.org:8080/10053
Reviewed-by: Michael Brown <mi...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/16bed5c3
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/16bed5c3
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/16bed5c3

Branch: refs/heads/master
Commit: 16bed5c3a91d2575e1a0d3327735df15d51b5bf6
Parents: ffb74e7
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
Authored: Thu Apr 12 23:11:05 2018 +0000
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Fri Apr 13 21:46:17 2018 +0000

----------------------------------------------------------------------
 tests/common/test_dimensions.py  | 17 ++++++++++++++++-
 tests/query_test/test_queries.py | 27 ++++++++++++---------------
 2 files changed, 28 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/16bed5c3/tests/common/test_dimensions.py
----------------------------------------------------------------------
diff --git a/tests/common/test_dimensions.py b/tests/common/test_dimensions.py
index df3f8c2..434b884 100644
--- a/tests/common/test_dimensions.py
+++ b/tests/common/test_dimensions.py
@@ -17,10 +17,11 @@
 
 # Common test dimensions and associated utility functions.
 
+import copy
 import os
 from itertools import product
 
-from tests.common.test_vector import ImpalaTestDimension
+from tests.common.test_vector import ImpalaTestDimension, ImpalaTestVector
 
 WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']
 
@@ -179,6 +180,20 @@ def create_exec_option_dimension_from_dict(exec_option_dimensions):
   # Build a test vector out of it
   return ImpalaTestDimension('exec_option', *exec_option_dimension_values)
 
+def extend_exec_option_dimension(test_suite, key, value):
+  """
+  Takes an ImpalaTestSuite object 'test_suite' and extends the exec option test dimension
+  by creating a copy of each existing exec option value that has 'key' set to 'value',
+  doubling the number of tests that will be run.
+  """
+  dim = test_suite.ImpalaTestMatrix.dimensions["exec_option"]
+  new_value = []
+  for v in dim:
+    new_value.append(ImpalaTestVector.Value(v.name, copy.copy(v.value)))
+    new_value[-1].value[key] = value
+  dim.extend(new_value)
+  test_suite.ImpalaTestMatrix.add_dimension(dim)
+
 def get_dataset_from_workload(workload):
   # TODO: We need a better way to define the workload -> dataset mapping so we can
   # extract it without reading the actual test vector file

http://git-wip-us.apache.org/repos/asf/impala/blob/16bed5c3/tests/query_test/test_queries.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_queries.py b/tests/query_test/test_queries.py
index 4667d96..14ecefe 100644
--- a/tests/query_test/test_queries.py
+++ b/tests/query_test/test_queries.py
@@ -22,7 +22,7 @@ import pytest
 import re
 
 from tests.common.impala_test_suite import ImpalaTestSuite
-from tests.common.test_dimensions import create_uncompressed_text_dimension
+from tests.common.test_dimensions import create_uncompressed_text_dimension, extend_exec_option_dimension
 from tests.common.test_vector import ImpalaTestVector
 
 class TestQueries(ImpalaTestSuite):
@@ -33,18 +33,9 @@ class TestQueries(ImpalaTestSuite):
       cls.ImpalaTestMatrix.add_constraint(lambda v:\
           v.get_value('table_format').file_format == 'parquet')
 
-    # Manually adding a test dimension here to test the small query opt
-    # in exhaustive.
-    # TODO Cleanup required, allow adding values to dimensions without having to
-    # manually explode them
+    # Adding a test dimension here to test the small query opt in exhaustive.
     if cls.exploration_strategy() == 'exhaustive':
-      dim = cls.ImpalaTestMatrix.dimensions["exec_option"]
-      new_value = []
-      for v in dim:
-        new_value.append(ImpalaTestVector.Value(v.name, copy.copy(v.value)))
-        new_value[-1].value["exec_single_node_rows_threshold"] = 100
-      dim.extend(new_value)
-      cls.ImpalaTestMatrix.add_dimension(dim)
+      extend_exec_option_dimension(cls, "exec_single_node_rows_threshold", "100")
 
   @classmethod
   def get_workload(cls):
@@ -215,9 +206,7 @@ class TestQueriesParquetTables(ImpalaTestSuite):
     self.run_test_case('QueryTest/single-node-large-sorts', vector)
 
 # Tests for queries in HDFS-specific tables, e.g. AllTypesAggMultiFilesNoPart.
-# This is a subclass of TestQueries to get the extra test dimension for
-# exec_single_node_rows_threshold in exhaustive.
-class TestHdfsQueries(TestQueries):
+class TestHdfsQueries(ImpalaTestSuite):
   @classmethod
   def add_test_dimensions(cls):
     super(TestHdfsQueries, cls).add_test_dimensions()
@@ -225,6 +214,14 @@ class TestHdfsQueries(TestQueries):
     cls.ImpalaTestMatrix.add_constraint(lambda v:\
         v.get_value('table_format').file_format != 'kudu')
 
+    # Adding a test dimension here to test the small query opt in exhaustive.
+    if cls.exploration_strategy() == 'exhaustive':
+      extend_exec_option_dimension(cls, "exec_single_node_rows_threshold", "100")
+
+  @classmethod
+  def get_workload(cls):
+    return 'functional-query'
+
   def test_hdfs_scan_node(self, vector):
     self.run_test_case('QueryTest/hdfs-scan-node', vector)
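
The behavior of extend_exec_option_dimension() can be modeled with
plain dicts. The sketch below is illustrative and stands outside the
real test framework (dicts stand in for ImpalaTestVector.Value
objects):

  # Illustrative model of extend_exec_option_dimension(); not the framework code.
  import copy

  def extend_dimension(values, key, value):
    extended = []
    for v in values:
      clone = copy.copy(v)  # Copy each existing option combination...
      clone[key] = value    # ...and pin the new option on the copy.
      extended.append(clone)
    return values + extended  # Originals plus the pinned copies.

  dim = [{"batch_size": 0}, {"batch_size": 1}]
  dim = extend_dimension(dim, "exec_single_node_rows_threshold", "100")
  assert len(dim) == 4  # The dimension, and hence the test count, doubles.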
 


[3/5] impala git commit: IMPALA-6713: Fix request for unneeded memory in partial sort

Posted by bh...@apache.org.
IMPALA-6713: Fix request for unneeded memory in partial sort

When a Sorter::Run is initialized, if it is an initial run and has
varlen data, it requests an extra buffer to have space to sort the
varlen data should it need to spill to disk.

This extra buffer is not needed in the case of partial sorts, which
do not spill, and because this extra buffer was not included in the
calculation of the minimum required reservation, requesting it caused
the partial sort to fail in cases where the partial sort only had its
minimum reservation available to use.

The solution is to not request the extra memory for partial sorts.

Testing:
- Added a test to test_sort.py that ensures the partial sort can
  complete successfully even if additional memory requests beyond its
  minimum reservation are denied.

Change-Id: I2d9c0863009021340d8b684669b371a2cfb1ecad
Reviewed-on: http://gerrit.cloudera.org:8080/10031
Reviewed-by: Thomas Tauber-Marshall <tm...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/ffb74e7c
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/ffb74e7c
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/ffb74e7c

Branch: refs/heads/master
Commit: ffb74e7c0fb75e3062ad08451e474be59264f6a4
Parents: 6f8f757
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
Authored: Thu Apr 12 03:36:06 2018 +0000
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Fri Apr 13 21:20:47 2018 +0000

----------------------------------------------------------------------
 be/src/runtime/sorter.cc      |  5 +++--
 tests/query_test/test_sort.py | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/ffb74e7c/be/src/runtime/sorter.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/sorter.cc b/be/src/runtime/sorter.cc
index 5ef321b..ead4065 100644
--- a/be/src/runtime/sorter.cc
+++ b/be/src/runtime/sorter.cc
@@ -630,7 +630,8 @@ Sorter::Run::Run(Sorter* parent, TupleDescriptor* sort_tuple_desc, bool initial_
     num_tuples_(0) {}
 
 Status Sorter::Run::Init() {
-  int num_to_create = 1 + has_var_len_slots_ + (has_var_len_slots_ && initial_run_);
+  int num_to_create = 1 + has_var_len_slots_
+      + (has_var_len_slots_ && initial_run_ && sorter_->enable_spilling_);
   int64_t required_mem = num_to_create * sorter_->page_len_;
   if (!sorter_->buffer_pool_client_->IncreaseReservationToFit(required_mem)) {
     return Status(Substitute(
@@ -641,7 +642,7 @@ Status Sorter::Run::Init() {
   RETURN_IF_ERROR(AddPage(&fixed_len_pages_));
   if (has_var_len_slots_) {
     RETURN_IF_ERROR(AddPage(&var_len_pages_));
-    if (initial_run_) {
+    if (initial_run_ && sorter_->enable_spilling_) {
       // Need additional var len page to reorder var len data in UnpinAllPages().
       RETURN_IF_ERROR(var_len_copy_page_.Init(sorter_));
     }

http://git-wip-us.apache.org/repos/asf/impala/blob/ffb74e7c/tests/query_test/test_sort.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_sort.py b/tests/query_test/test_sort.py
index 23de9f2..e44d086 100644
--- a/tests/query_test/test_sort.py
+++ b/tests/query_test/test_sort.py
@@ -203,3 +203,19 @@ class TestRandomSort(ImpalaTestSuite):
       functional.alltypestiny"""
     results = transpose_results(self.execute_query(query).data, lambda x: float(x))
     assert (results == sorted(results))
+
+
+class TestPartialSort(ImpalaTestSuite):
+  """Test class to do functional validation of partial sorts."""
+
+  def test_partial_sort_min_reservation(self, unique_database):
+    """Test that the partial sort node can operate if it only gets its minimum
+    memory reservation."""
+    table_name = "%s.kudu_test" % unique_database
+    self.client.set_configuration_option(
+        "debug_action", "-1:OPEN:SET_DENY_RESERVATION_PROBABILITY@1.0")
+    self.execute_query("""create table %s (col0 string primary key)
+        partition by hash(col0) partitions 8 stored as kudu""" % table_name)
+    result = self.execute_query(
+        "insert into %s select string_col from functional.alltypessmall" % table_name)
+    assert "PARTIAL SORT" in result.runtime_profile, result.runtime_profile
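
The buffer-count condition behind the fix, modeled in Python rather
than the C++ of sorter.cc (a sketch of the logic, not the actual
Impala code):

  # Sketch of the buffer-count logic in Sorter::Run::Init() after the fix.
  def pages_needed(has_var_len_slots, initial_run, enable_spilling):
    # One fixed-len page, plus one var-len page when there is var-len data,
    # plus one var-len copy page, but only when the run may spill.
    return (1 + int(has_var_len_slots)
            + int(has_var_len_slots and initial_run and enable_spilling))

  # A spilling initial run with var-len data still reserves the copy page;
  # a partial (non-spilling) sort no longer requests it, so its request
  # fits within a minimum reservation that never included the copy page.
  assert pages_needed(True, True, True) == 3
  assert pages_needed(True, True, False) == 2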