Posted to commits@impala.apache.org by ta...@apache.org on 2018/08/23 20:58:56 UTC

[1/6] impala git commit: IMPALA-6373: Allow primitive type widening on parquet tables

Repository: impala
Updated Branches:
  refs/heads/master 971cf179f -> 6ce7ba295


IMPALA-6373: Allow primitive type widening on parquet tables

This patch implements support for primitive type widening on parquet
tables. Only conversions that cannot lose precision are supported:
- tinyint (INT32) -> smallint (INT32), int (INT32), bigint (INT64),
                     double (DOUBLE)
- smallint (INT32) -> int (INT32), bigint (INT64), double (DOUBLE)
- int (INT32) -> bigint (INT64), double (DOUBLE)
- float (FLOAT) -> double (DOUBLE)
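
The decode path implements this by reading the file's narrower physical
type and assigning it to the wider in-memory type. A minimal sketch of that
idea (illustrative names only; the committed helper is the
DecodeWithConversion template in be/src/exec/parquet-common.h below):

  // Sketch: plain-decode an INT32 value from a Parquet page buffer into a
  // wider int64_t slot. Returns the number of bytes consumed, or -1 if the
  // buffer is too short. Assigning int32_t to int64_t (or float to double)
  // cannot lose precision.
  #include <cstdint>
  #include <cstring>

  inline int DecodeInt32IntoInt64(const uint8_t* buffer, const uint8_t* buffer_end,
      int64_t* v) {
    constexpr int byte_size = sizeof(int32_t);
    if (buffer_end - buffer < byte_size) return -1;
    int32_t narrow;
    memcpy(&narrow, buffer, byte_size);
    *v = narrow;  // implicit widening conversion
    return byte_size;
  }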

Testing:
- Added BE test
- Added E2E test
- Ran core tests

Change-Id: If93394b035c64cf6fc5f37b54d29c034cc1f86e4
Reviewed-on: http://gerrit.cloudera.org:8080/11268
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/9934b473
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/9934b473
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/9934b473

Branch: refs/heads/master
Commit: 9934b473b7239b1077dad1f0d308e168b803db6d
Parents: 971cf17
Author: Fredy Wijaya <fw...@cloudera.com>
Authored: Fri Aug 17 16:24:03 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Thu Aug 23 15:55:53 2018 +0000

----------------------------------------------------------------------
 be/src/exec/parquet-column-readers.cc           |  28 ++++++++++--
 be/src/exec/parquet-column-readers.h            |   7 +++
 be/src/exec/parquet-common.h                    |  28 ++++++++++++
 be/src/exec/parquet-metadata-utils.cc           |   5 ++-
 be/src/exec/parquet-plain-test.cc               |  43 +++++++++++++++++++
 testdata/data/README                            |  14 ++++++
 testdata/data/primitive_type_widening.parquet   | Bin 0 -> 2711 bytes
 .../QueryTest/parquet-type-widening.test        |   9 ++++
 tests/query_test/test_scanners.py               |  11 +++++
 9 files changed, 139 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/9934b473/be/src/exec/parquet-column-readers.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-readers.cc b/be/src/exec/parquet-column-readers.cc
index 2cb483e..6d8eddd 100644
--- a/be/src/exec/parquet-column-readers.cc
+++ b/be/src/exec/parquet-column-readers.cc
@@ -1573,16 +1573,36 @@ ParquetColumnReader* ParquetColumnReader::Create(const SchemaNode& node,
             slot_desc);
         break;
       case TYPE_BIGINT:
-        reader = new ScalarColumnReader<int64_t, parquet::Type::INT64, true>(parent, node,
-            slot_desc);
+        switch (node.element->type) {
+          case parquet::Type::INT32:
+            reader = new ScalarColumnReader<int64_t, parquet::Type::INT32, true>(parent,
+                node, slot_desc);
+            break;
+          default:
+            reader = new ScalarColumnReader<int64_t, parquet::Type::INT64, true>(parent,
+                node, slot_desc);
+            break;
+        }
         break;
       case TYPE_FLOAT:
         reader = new ScalarColumnReader<float, parquet::Type::FLOAT, true>(parent, node,
             slot_desc);
         break;
       case TYPE_DOUBLE:
-        reader = new ScalarColumnReader<double, parquet::Type::DOUBLE, true>(parent, node,
-            slot_desc);
+        switch (node.element->type) {
+          case parquet::Type::INT32:
+            reader = new ScalarColumnReader<double , parquet::Type::INT32, true>(parent,
+                node, slot_desc);
+            break;
+          case parquet::Type::FLOAT:
+            reader = new ScalarColumnReader<double, parquet::Type::FLOAT, true>(parent,
+                node, slot_desc);
+            break;
+          default:
+            reader = new ScalarColumnReader<double, parquet::Type::DOUBLE, true>(parent,
+                node, slot_desc);
+            break;
+        }
         break;
       case TYPE_TIMESTAMP:
         reader = new ScalarColumnReader<TimestampValue, parquet::Type::INT96, true>(

http://git-wip-us.apache.org/repos/asf/impala/blob/9934b473/be/src/exec/parquet-column-readers.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-column-readers.h b/be/src/exec/parquet-column-readers.h
index 022a868..790bde4 100644
--- a/be/src/exec/parquet-column-readers.h
+++ b/be/src/exec/parquet-column-readers.h
@@ -145,6 +145,13 @@ class ParquetColumnReader {
   /// false if it reads one value per item).  The reader is added to the runtime state's
   /// object pool. Does not create child readers for collection readers; these must be
   /// added by the caller.
+  ///
+  /// It supports the following primitive type widenings, which do not lose any
+  /// precision.
+  /// - tinyint (INT32) -> smallint (INT32), int (INT32), bigint (INT64), double (DOUBLE)
+  /// - smallint (INT32) -> int (INT32), bigint (INT64), double (DOUBLE)
+  /// - int (INT32) -> bigint (INT64), double (DOUBLE)
+  /// - float (FLOAT) -> double (DOUBLE)
   static ParquetColumnReader* Create(const SchemaNode& node, bool is_collection_field,
       const SlotDescriptor* slot_desc, HdfsParquetScanner* parent);
 

http://git-wip-us.apache.org/repos/asf/impala/blob/9934b473/be/src/exec/parquet-common.h
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-common.h b/be/src/exec/parquet-common.h
index f3add14..24aafae 100644
--- a/be/src/exec/parquet-common.h
+++ b/be/src/exec/parquet-common.h
@@ -249,6 +249,34 @@ inline int ParquetPlainEncoder::ByteSize(const TimestampValue& v) {
   return 12;
 }
 
+template <typename From, typename To>
+inline int DecodeWithConversion(const uint8_t* buffer, const uint8_t* buffer_end, To* v) {
+  int byte_size = sizeof(From);
+  if (UNLIKELY(buffer_end - buffer < byte_size)) return -1;
+  From dest;
+  memcpy(&dest, buffer, byte_size);
+  *v = dest;
+  return byte_size;
+}
+
+template <>
+inline int ParquetPlainEncoder::Decode<int64_t, parquet::Type::INT32>(
+    const uint8_t* buffer, const uint8_t* buffer_end, int fixed_len_size, int64_t* v) {
+  return DecodeWithConversion<int32_t, int64_t>(buffer, buffer_end, v);
+}
+
+template <>
+inline int ParquetPlainEncoder::Decode<double, parquet::Type::INT32>(
+    const uint8_t* buffer, const uint8_t* buffer_end, int fixed_len_size, double* v) {
+  return DecodeWithConversion<int32_t, double>(buffer, buffer_end, v);
+}
+
+template <>
+inline int ParquetPlainEncoder::Decode<double, parquet::Type::FLOAT>(
+    const uint8_t* buffer, const uint8_t* buffer_end, int fixed_len_size, double* v) {
+  return DecodeWithConversion<float, double>(buffer, buffer_end, v);
+}
+
 template <>
 inline int ParquetPlainEncoder::Decode<int8_t, parquet::Type::INT32>(
     const uint8_t* buffer, const uint8_t* buffer_end, int fixed_len_size, int8_t* v) {

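A caller selects the widening specialization purely through the template
arguments, as the new BE test below does. A hedged usage sketch (variable
names here are illustrative):

  // Decode a plain-encoded INT32 value from 'buffer' into an int64_t slot.
  int64_t slot_val;
  int consumed = ParquetPlainEncoder::Decode<int64_t, parquet::Type::INT32>(
      buffer, buffer + sizeof(int32_t), sizeof(int32_t), &slot_val);
  // 'consumed' is sizeof(int32_t) on success, or -1 if the buffer was truncated.
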
http://git-wip-us.apache.org/repos/asf/impala/blob/9934b473/be/src/exec/parquet-metadata-utils.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-metadata-utils.cc b/be/src/exec/parquet-metadata-utils.cc
index d199c6e..26dea5f 100644
--- a/be/src/exec/parquet-metadata-utils.cc
+++ b/be/src/exec/parquet-metadata-utils.cc
@@ -49,9 +49,10 @@ const map<PrimitiveType, set<parquet::Type::type>> SUPPORTED_PHYSICAL_TYPES = {
     {PrimitiveType::TYPE_TINYINT, {parquet::Type::INT32}},
     {PrimitiveType::TYPE_SMALLINT, {parquet::Type::INT32}},
     {PrimitiveType::TYPE_INT, {parquet::Type::INT32}},
-    {PrimitiveType::TYPE_BIGINT, {parquet::Type::INT64}},
+    {PrimitiveType::TYPE_BIGINT, {parquet::Type::INT32, parquet::Type::INT64}},
     {PrimitiveType::TYPE_FLOAT, {parquet::Type::FLOAT}},
-    {PrimitiveType::TYPE_DOUBLE, {parquet::Type::DOUBLE}},
+    {PrimitiveType::TYPE_DOUBLE, {parquet::Type::INT32, parquet::Type::FLOAT,
+        parquet::Type::DOUBLE}},
     {PrimitiveType::TYPE_TIMESTAMP, {parquet::Type::INT96}},
     {PrimitiveType::TYPE_STRING, {parquet::Type::BYTE_ARRAY}},
     {PrimitiveType::TYPE_DATE, {parquet::Type::BYTE_ARRAY}},

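The widened map entries above define which physical types are acceptable for
each table column type. A minimal sketch of how such a map can be consulted,
assuming a hypothetical lookup helper (the actual validation code in
parquet-metadata-utils.cc is outside this hunk):

  // Hypothetical helper: returns true if a column declared as 'col_type' may
  // be read from a Parquet column stored with physical type 'file_type'.
  bool IsSupportedPhysicalType(PrimitiveType col_type, parquet::Type::type file_type) {
    auto it = SUPPORTED_PHYSICAL_TYPES.find(col_type);
    return it != SUPPORTED_PHYSICAL_TYPES.end() && it->second.count(file_type) > 0;
  }
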
http://git-wip-us.apache.org/repos/asf/impala/blob/9934b473/be/src/exec/parquet-plain-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/parquet-plain-test.cc b/be/src/exec/parquet-plain-test.cc
index 2bcfa1d..9e42058 100644
--- a/be/src/exec/parquet-plain-test.cc
+++ b/be/src/exec/parquet-plain-test.cc
@@ -97,6 +97,27 @@ void TestTruncate(const InternalType& v, int expected_byte_size) {
   }
 }
 
+template <typename InternalType, typename WidenInternalType,
+    parquet::Type::type PARQUET_TYPE>
+void TestTruncate(const InternalType& v, int expected_byte_size) {
+  uint8_t buffer[expected_byte_size];
+  int encoded_size = Encode(v, expected_byte_size, buffer, PARQUET_TYPE);
+  EXPECT_EQ(encoded_size, expected_byte_size);
+
+  // Check all possible truncations of the buffer.
+  for (int truncated_size = encoded_size - 1; truncated_size >= 0; --truncated_size) {
+    WidenInternalType result;
+    /// Copy to heap-allocated buffer so that ASAN can detect buffer overruns.
+    uint8_t* truncated_buffer = new uint8_t[truncated_size];
+    memcpy(truncated_buffer, buffer, truncated_size);
+    int decoded_size = ParquetPlainEncoder::Decode<WidenInternalType, PARQUET_TYPE>(
+        truncated_buffer, truncated_buffer + truncated_size, expected_byte_size,
+        &result);
+    EXPECT_EQ(-1, decoded_size);
+    delete[] truncated_buffer;
+  }
+}
+
 template <typename InternalType, parquet::Type::type PARQUET_TYPE>
 void TestType(const InternalType& v, int expected_byte_size) {
   uint8_t buffer[expected_byte_size];
@@ -112,6 +133,23 @@ void TestType(const InternalType& v, int expected_byte_size) {
   TestTruncate<InternalType, PARQUET_TYPE>(v, expected_byte_size);
 }
 
+template <typename InternalType, typename WidenInternalType,
+    parquet::Type::type PARQUET_TYPE>
+void TestTypeWidening(const InternalType& v, int expected_byte_size) {
+  uint8_t buffer[expected_byte_size];
+  int encoded_size = Encode(v, expected_byte_size, buffer, PARQUET_TYPE);
+  EXPECT_EQ(encoded_size, expected_byte_size);
+
+  WidenInternalType result;
+  int decoded_size = ParquetPlainEncoder::Decode<WidenInternalType, PARQUET_TYPE>(
+      buffer, buffer + expected_byte_size, expected_byte_size, &result);
+  EXPECT_EQ(decoded_size, expected_byte_size);
+  EXPECT_EQ(v, result);
+
+  TestTruncate<InternalType, WidenInternalType, PARQUET_TYPE>(
+      v, expected_byte_size);
+}
+
 TEST(PlainEncoding, Basic) {
   int8_t i8 = 12;
   int16_t i16 = 123;
@@ -131,6 +169,11 @@ TEST(PlainEncoding, Basic) {
   TestType<StringValue, parquet::Type::BYTE_ARRAY>(sv, sizeof(int32_t) + sv.len);
   TestType<TimestampValue, parquet::Type::INT96>(tv, 12);
 
+  // Test type widening.
+  TestTypeWidening<int32_t, int64_t, parquet::Type::INT32>(i32, sizeof(int32_t));
+  TestTypeWidening<int32_t, double, parquet::Type::INT32>(i32, sizeof(int32_t));
+  TestTypeWidening<float, double, parquet::Type::FLOAT>(f, sizeof(float));
+
   int test_val = 1234;
   int var_len_decimal_size = sizeof(int32_t)
       + 2 /*min bytes required for storing test_val*/;

http://git-wip-us.apache.org/repos/asf/impala/blob/9934b473/testdata/data/README
----------------------------------------------------------------------
diff --git a/testdata/data/README b/testdata/data/README
index ee29090..1d4c14b 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -187,3 +187,17 @@ Impala needs to be able to read such values (IMPALA-5542)
 decimal_stored_as_int64.parquet:
 Parquet file generated by Spark 2.3.1 that contains decimals stored as int64.
 Impala needs to be able to read such values (IMPALA-5542)
+
+primitive_type_widening.parquet:
+Parquet file that contains two rows with the following schema:
+- int32 tinyint_col1
+- int32 tinyint_col2
+- int32 tinyint_col3
+- int32 tinyint_col4
+- int32 smallint_col1
+- int32 smallint_col2
+- int32 smallint_col3
+- int32 int_col1
+- int32 int_col2
+- float float_col
+It is used to test primitive type widening (IMPALA-6373).

http://git-wip-us.apache.org/repos/asf/impala/blob/9934b473/testdata/data/primitive_type_widening.parquet
----------------------------------------------------------------------
diff --git a/testdata/data/primitive_type_widening.parquet b/testdata/data/primitive_type_widening.parquet
new file mode 100644
index 0000000..57027de
Binary files /dev/null and b/testdata/data/primitive_type_widening.parquet differ

http://git-wip-us.apache.org/repos/asf/impala/blob/9934b473/testdata/workloads/functional-query/queries/QueryTest/parquet-type-widening.test
----------------------------------------------------------------------
diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-type-widening.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-type-widening.test
new file mode 100644
index 0000000..f0f11c7
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-type-widening.test
@@ -0,0 +1,9 @@
+====
+---- QUERY
+select * from primitive_type_widening;
+---- RESULTS
+1,2,3,4,5,6,7,8,9,123.4560012817383
+10,20,30,40,50,60,70,80,90,1230.4560546875
+---- TYPES
+SMALLINT,INT,BIGINT,DOUBLE,INT,BIGINT,DOUBLE,INT,DOUBLE,DOUBLE
+====
\ No newline at end of file

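The DOUBLE values in the expected RESULTS above fall out of exact float ->
double widening: 123.456 stored as a 32-bit float is exactly
123.45600128173828125, and converting that float to double preserves the
value bit for bit. A small standalone check (illustrative, not part of the
patch):

  #include <cstdio>

  int main() {
    float f = 123.456f;    // nearest float to 123.456
    double d = f;          // float -> double widening is exact
    printf("%.13f\n", d);  // prints 123.4560012817383, matching the RESULTS row
    return 0;
  }
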
http://git-wip-us.apache.org/repos/asf/impala/blob/9934b473/tests/query_test/test_scanners.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 1cd883e..c9ad888 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -663,6 +663,17 @@ class TestParquet(ImpalaTestSuite):
         "select * from {0}.{1}".format(unique_database, TABLE_NAME))
     assert(len(result.data) == 33)
 
+  def test_type_widening(self, vector, unique_database):
+    """IMPALA-6373: Test that Impala can read parquet file with column types smaller than
+       the schema with larger types"""
+    TABLE_NAME = "primitive_type_widening"
+    create_table_and_copy_files(self.client, """CREATE TABLE {db}.{tbl} (
+        a smallint, b int, c bigint, d double, e int, f bigint, g double, h int,
+        i double, j double) STORED AS PARQUET""", unique_database, TABLE_NAME,
+        ["/testdata/data/{0}.parquet".format(TABLE_NAME)])
+
+    self.run_test_case("QueryTest/parquet-type-widening", vector, unique_database)
+
 # We use various scan range lengths to exercise corner cases in the HDFS scanner more
 # thoroughly. In particular, it will exercise:
 # 1. default scan range


[6/6] impala git commit: docs: typo fix in PARQUET_ARRAY_RESOLUTION

Posted by ta...@apache.org.
docs: typo fix in PARQUET_ARRAY_RESOLUTION

Change-Id: I84fcc3f13215879ea4c5bc9737f5188baeaa5749
Reviewed-on: http://gerrit.cloudera.org:8080/11284
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Alex Rodoni <ar...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/6ce7ba29
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/6ce7ba29
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/6ce7ba29

Branch: refs/heads/master
Commit: 6ce7ba29586547515221040b5b19b35e9e36ed16
Parents: d1d07f7
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Tue Aug 21 09:37:26 2018 -0700
Committer: Alex Rodoni <ar...@cloudera.com>
Committed: Thu Aug 23 20:16:30 2018 +0000

----------------------------------------------------------------------
 docs/topics/impala_parquet_array_resolution.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/6ce7ba29/docs/topics/impala_parquet_array_resolution.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_parquet_array_resolution.xml b/docs/topics/impala_parquet_array_resolution.xml
index e48555e..7694877 100644
--- a/docs/topics/impala_parquet_array_resolution.xml
+++ b/docs/topics/impala_parquet_array_resolution.xml
@@ -54,7 +54,7 @@ under the License.
       However, there is no reliable metadata within Parquet files to indicate
       which encoding was used. It is even possible to have mixed encodings within
       the same file if there are multiple arrays. The
-      <codeph>PARQUET_ARRAY_RESOLTUTION</codeph> option controls the process of
+      <codeph>PARQUET_ARRAY_RESOLUTION</codeph> option controls the process of
       resolution that is to match every column/field reference from a query to a
       column in the Parquet file.</p>
 


[2/6] impala git commit: IMPALA-7479: Harmonize parquet versions.

Posted by ta...@apache.org.
IMPALA-7479: Harmonize parquet versions.

We have a copy of parquet-avro in testdata/ that wasn't using the same
version of parquet as everywhere else; fixing that.

I ran core tests.

Change-Id: Ia47b0871f25171510d7cb39593f3e94aadb9adeb
Reviewed-on: http://gerrit.cloudera.org:8080/11299
Reviewed-by: Michael Brown <mi...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/bb9454fc
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/bb9454fc
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/bb9454fc

Branch: refs/heads/master
Commit: bb9454fcef73d6686d3e0c03c6c69652909a8b37
Parents: 9934b47
Author: Philip Zeyliger <ph...@cloudera.com>
Authored: Wed Aug 22 14:32:59 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Thu Aug 23 18:14:41 2018 +0000

----------------------------------------------------------------------
 testdata/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/bb9454fc/testdata/pom.xml
----------------------------------------------------------------------
diff --git a/testdata/pom.xml b/testdata/pom.xml
index 22bf270..863704e 100644
--- a/testdata/pom.xml
+++ b/testdata/pom.xml
@@ -153,7 +153,7 @@ under the License.
     <dependency>
       <groupId>org.apache.parquet</groupId>
       <artifactId>parquet-avro</artifactId>
-      <version>1.8.0</version>
+      <version>${parquet.version}</version>
     </dependency>
 
     <dependency>


[5/6] impala git commit: IMPALA-5937: [DOCS] Documented ENABLE_EXPR_REWRITES query option

Posted by ta...@apache.org.
IMPALA-5937: [DOCS] Documented ENABLE_EXPR_REWRITES query option

Change-Id: I82a27172a6a6570f9c3cebe1a516a29c755e6d58
Reviewed-on: http://gerrit.cloudera.org:8080/11206
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Thomas Marshall <th...@cmu.edu>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/d1d07f72
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/d1d07f72
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/d1d07f72

Branch: refs/heads/master
Commit: d1d07f7295ae4d6876097f0f86cb722da63f4caa
Parents: df18658
Author: Alex Rodoni <ar...@cloudera.com>
Authored: Mon Aug 13 17:27:26 2018 -0700
Committer: Alex Rodoni <ar...@cloudera.com>
Committed: Thu Aug 23 20:11:31 2018 +0000

----------------------------------------------------------------------
 docs/impala.ditamap                         |  1 +
 docs/topics/impala_enable_expr_rewrites.xml | 86 ++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/d1d07f72/docs/impala.ditamap
----------------------------------------------------------------------
diff --git a/docs/impala.ditamap b/docs/impala.ditamap
index 9260a9b..73c5d23 100644
--- a/docs/impala.ditamap
+++ b/docs/impala.ditamap
@@ -187,6 +187,7 @@ under the License.
           <topicref rev="2.5.0" href="topics/impala_disable_row_runtime_filtering.xml"/>
           <topicref rev="2.5.0" href="topics/impala_disable_streaming_preaggregations.xml"/>
           <topicref href="topics/impala_disable_unsafe_spills.xml"/>
+          <topicref href="topics/impala_enable_expr_rewrites.xml"/>
           <topicref href="topics/impala_exec_single_node_rows_threshold.xml"/>
           <topicref href="topics/impala_exec_time_limit_s.xml"/>
           <topicref href="topics/impala_explain_level.xml"/>

http://git-wip-us.apache.org/repos/asf/impala/blob/d1d07f72/docs/topics/impala_enable_expr_rewrites.xml
----------------------------------------------------------------------
diff --git a/docs/topics/impala_enable_expr_rewrites.xml b/docs/topics/impala_enable_expr_rewrites.xml
new file mode 100644
index 0000000..b52914a
--- /dev/null
+++ b/docs/topics/impala_enable_expr_rewrites.xml
@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="enable_expr_rewrites">
+
+  <title>ENABLE_EXPR_REWRITES Query Option</title>
+
+  <titlealts audience="PDF">
+
+    <navtitle>ENABLE_EXPR_REWRITES</navtitle>
+
+  </titlealts>
+
+  <prolog>
+    <metadata>
+      <data name="Category" value="Impala"/>
+      <data name="Category" value="Impala Query Options"/>
+      <data name="Category" value="Troubleshooting"/>
+      <data name="Category" value="Performance"/>
+    </metadata>
+  </prolog>
+
+  <conbody>
+
+    <p>
+      The <codeph>ENABLE_EXPR_REWRITES</codeph> query option controls whether to enable or
+      disable the query compile time optimizations that rewrite the expression trees to a more
+      compact and optimized form that helps avoid redundant expression evaluation at run time.
+      Performance optimizations controlled by this query option include:
+    </p>
+
+    <ul>
+      <li>
+        Constant folding (added in <keyword keyref="impala28">)</keyword>
+      </li>
+
+      <li>
+        Extracting common conjuncts from disjunctions (added in
+        <keyword
+          keyref="impala28">)</keyword>
+      </li>
+
+      <li>
+        Simplify conditionals with constant conditions (added in
+        <keyword
+          keyref="impala29">)</keyword>
+      </li>
+    </ul>
+
+    <p>
+      Set the option to <codeph>false</codeph> or <codeph>0</codeph> to disable the performance
+      optimizations.
+    </p>
+
+    <p>
+      <b>Type: </b><codeph>boolean</codeph>
+    </p>
+
+    <p>
+      <b>Default:</b> <codeph>true</codeph> (<codeph>1</codeph>)
+    </p>
+
+    <p>
+      <b>Added in:</b> <keyword keyref="impala28"/>
+    </p>
+
+  </conbody>
+
+</concept>


[4/6] impala git commit: IMPALA-7433: reduce logging on executors

Posted by ta...@apache.org.
IMPALA-7433: reduce logging on executors

Moved logs to -v=2 for the reasons described in the JIRA. Added
more detail to some existing log messages, and added new, less
frequent log messages, so that useful information is not lost.

Sample logging for an executor after the change:

  I0813 12:10:50.249850 31250 impala-internal-service.cc:49] ExecQueryFInstances(): query_id=fd4ae28bc993236e:27343be100000000 coord=tarmstrong-box:22000 #instances=2
  I0813 12:10:50.250722 31256 query-state.cc:477] Executing instance. instance_id=fd4ae28bc993236e:27343be100000006 fragment_idx=1 per_fragment_instance_idx=2 coord_state_idx=1 #in-flight=1
  I0813 12:10:50.250804 31259 query-state.cc:477] Executing instance. instance_id=fd4ae28bc993236e:27343be100000003 fragment_idx=2 per_fragment_instance_idx=2 coord_state_idx=1 #in-flight=2
  I0813 12:10:50.374167 31259 query-state.cc:485] Instance completed. instance_id=fd4ae28bc993236e:27343be100000003 #in-flight=1 status=OK
  I0813 12:10:50.375370 31269 krpc-data-stream-mgr.cc:294] DeregisterRecvr(): fragment_instance_id=fd4ae28bc993236e:27343be100000006, node=3
  I0813 12:10:50.417552 31256 query-state.cc:485] Instance completed. instance_id=fd4ae28bc993236e:27343be100000006 #in-flight=0 status=OK
  I0813 12:10:50.418007 31256 query-exec-mgr.cc:179] ReleaseQueryState(): deleted query_id=fd4ae28bc993236e:27343be100000000
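
The change relies on glog's runtime verbosity gating: a VLOG(n) statement is
compiled in but only emitted when the effective verbosity is at least n. A
representative before/after, taken from the query-state.cc hunk below
(VLOG_QUERY is Impala's existing query-level verbosity macro):

  // Before: emitted at query-level verbosity via Impala's VLOG_QUERY macro.
  VLOG_QUERY << "Buffer pool limit for " << PrintId(query_id()) << ": " << max_reservation;
  // After: only emitted when the verbosity level is at least 2 (e.g. -v=2 or
  // GLOG_v=2), keeping routine per-query detail out of executor logs.
  VLOG(2) << "Buffer pool limit for " << PrintId(query_id()) << ": " << max_reservation;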

Change-Id: I6c1db44acc6def2b05a4fd032c63716e08cdf5ff
Reviewed-on: http://gerrit.cloudera.org:8080/11202
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/df186585
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/df186585
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/df186585

Branch: refs/heads/master
Commit: df1865856a7a9988f946c38deeb39cbb3fff80da
Parents: 6e5ec22
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Mon Aug 13 10:13:00 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Thu Aug 23 19:55:10 2018 +0000

----------------------------------------------------------------------
 be/src/exec/scan-node.cc                  | 10 +++++-----
 be/src/runtime/initial-reservations.cc    |  6 +++---
 be/src/runtime/krpc-data-stream-recvr.cc  |  6 +++---
 be/src/runtime/mem-tracker.cc             |  4 ++--
 be/src/runtime/query-exec-mgr.cc          |  9 +++++----
 be/src/runtime/query-state.cc             | 14 +++++++-------
 be/src/runtime/runtime-filter-bank.cc     |  2 +-
 be/src/service/impala-internal-service.cc | 13 ++++++++++---
 8 files changed, 36 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/df186585/be/src/exec/scan-node.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/scan-node.cc b/be/src/exec/scan-node.cc
index c764b5a..46f843a 100644
--- a/be/src/exec/scan-node.cc
+++ b/be/src/exec/scan-node.cc
@@ -182,7 +182,7 @@ bool ScanNode::WaitForRuntimeFilters() {
   if (arrived_filter_ids.size() == filter_ctxs_.size()) {
     runtime_profile()->AddInfoString("Runtime filters",
         Substitute("All filters arrived. Waited $0", wait_time));
-    VLOG_QUERY << "Filters arrived. Waited " << wait_time;
+    VLOG(2) << "Filters arrived. Waited " << wait_time;
     return true;
   }
 
@@ -190,7 +190,7 @@ bool ScanNode::WaitForRuntimeFilters() {
       "Not all filters arrived (arrived: [$0], missing [$1]), waited for $2",
       join(arrived_filter_ids, ", "), join(missing_filter_ids, ", "), wait_time);
   runtime_profile()->AddInfoString("Runtime filters", filter_str);
-  VLOG_QUERY << filter_str;
+  VLOG(2) << filter_str;
   return false;
 }
 
@@ -248,9 +248,9 @@ void ScanNode::ScannerThreadState::Open(
     // the producer/consumer.
     max_row_batches = max(2, max_row_batches / state->query_options().mt_dop);
   }
-  VLOG_QUERY << "Max row batch queue size for scan node '" << parent->id()
-      << "' in fragment instance '" << PrintId(state->fragment_instance_id())
-      << "': " << max_row_batches;
+  VLOG(2) << "Max row batch queue size for scan node '" << parent->id()
+          << "' in fragment instance '" << PrintId(state->fragment_instance_id())
+          << "': " << max_row_batches;
   batch_queue_.reset(
       new RowBatchQueue(max_row_batches, FLAGS_max_queued_row_batch_bytes));
 

http://git-wip-us.apache.org/repos/asf/impala/blob/df186585/be/src/runtime/initial-reservations.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/initial-reservations.cc b/be/src/runtime/initial-reservations.cc
index e5fcd32..48a911d 100644
--- a/be/src/runtime/initial-reservations.cc
+++ b/be/src/runtime/initial-reservations.cc
@@ -60,9 +60,9 @@ Status InitialReservations::Init(
         PrettyPrinter::Print(query_min_reservation, TUnit::BYTES), FLAGS_hostname,
         FLAGS_be_port, PrintId(query_id), reservation_status.GetDetail());
   }
-  VLOG_QUERY << "Successfully claimed initial reservations ("
-            << PrettyPrinter::Print(query_min_reservation, TUnit::BYTES) << ") for"
-            << " query " << PrintId(query_id);
+  VLOG(2) << "Successfully claimed initial reservations ("
+          << PrettyPrinter::Print(query_min_reservation, TUnit::BYTES) << ") for"
+          << " query " << PrintId(query_id);
   return Status::OK();
 }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/df186585/be/src/runtime/krpc-data-stream-recvr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/krpc-data-stream-recvr.cc b/be/src/runtime/krpc-data-stream-recvr.cc
index 96cc25f..3f1bc9f 100644
--- a/be/src/runtime/krpc-data-stream-recvr.cc
+++ b/be/src/runtime/krpc-data-stream-recvr.cc
@@ -555,9 +555,9 @@ void KrpcDataStreamRecvr::SenderQueue::Cancel() {
       DequeueDeferredRpc();
     }
   }
-  VLOG_QUERY << "cancelled stream: fragment_instance_id="
-             << PrintId(recvr_->fragment_instance_id())
-             << " node_id=" << recvr_->dest_node_id();
+  VLOG(2) << "cancelled stream: fragment_instance_id="
+          << PrintId(recvr_->fragment_instance_id())
+          << " node_id=" << recvr_->dest_node_id();
   // Wake up all threads waiting to produce/consume batches. They will all
   // notice that the stream is cancelled and handle it.
   data_arrival_cv_.notify_all();

http://git-wip-us.apache.org/repos/asf/impala/blob/df186585/be/src/runtime/mem-tracker.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/mem-tracker.cc b/be/src/runtime/mem-tracker.cc
index cb615a1..d204ce8 100644
--- a/be/src/runtime/mem-tracker.cc
+++ b/be/src/runtime/mem-tracker.cc
@@ -220,8 +220,8 @@ MemTracker* MemTracker::CreateQueryMemTracker(const TUniqueId& id,
                    << " exceeds physical memory of "
                    << PrettyPrinter::Print(MemInfo::physical_mem(), TUnit::BYTES);
     }
-    VLOG_QUERY << "Using query memory limit: "
-               << PrettyPrinter::Print(byte_limit, TUnit::BYTES);
+    VLOG(2) << "Using query memory limit: "
+            << PrettyPrinter::Print(byte_limit, TUnit::BYTES);
   }
 
   MemTracker* pool_tracker =

http://git-wip-us.apache.org/repos/asf/impala/blob/df186585/be/src/runtime/query-exec-mgr.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/query-exec-mgr.cc b/be/src/runtime/query-exec-mgr.cc
index 2d66f57..26ed811 100644
--- a/be/src/runtime/query-exec-mgr.cc
+++ b/be/src/runtime/query-exec-mgr.cc
@@ -43,8 +43,8 @@ DEFINE_int32(log_mem_usage_interval, 0, "If non-zero, impalad will output memory
 
 Status QueryExecMgr::StartQuery(const TExecQueryFInstancesParams& params) {
   TUniqueId query_id = params.query_ctx.query_id;
-  VLOG_QUERY << "StartQueryFInstances() query_id=" << PrintId(query_id)
-             << " coord=" << TNetworkAddressToString(params.query_ctx.coord_address);
+  VLOG(2) << "StartQueryFInstances() query_id=" << PrintId(query_id)
+          << " coord=" << TNetworkAddressToString(params.query_ctx.coord_address);
 
   bool dummy;
   QueryState* qs = GetOrCreateQueryState(params.query_ctx, &dummy);
@@ -152,8 +152,8 @@ void QueryExecMgr::ReleaseQueryState(QueryState* qs) {
   // don't reference anything from 'qs' beyond this point, 'qs' might get
   // gc'd out from under us
   qs = nullptr;
-  VLOG_QUERY << "ReleaseQueryState(): query_id=" << PrintId(query_id)
-             << " refcnt=" << cnt + 1;
+  VLOG(2) << "ReleaseQueryState(): query_id=" << PrintId(query_id)
+          << " refcnt=" << cnt + 1;
   DCHECK_GE(cnt, 0);
   if (cnt > 0) return;
 
@@ -176,4 +176,5 @@ void QueryExecMgr::ReleaseQueryState(QueryState* qs) {
   }
   // TODO: send final status report during gc, but do this from a different thread
   delete qs_from_map;
+  VLOG(1) << "ReleaseQueryState(): deleted query_id=" << PrintId(query_id);
 }

http://git-wip-us.apache.org/repos/asf/impala/blob/df186585/be/src/runtime/query-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/query-state.cc b/be/src/runtime/query-state.cc
index 329f757..a020596 100644
--- a/be/src/runtime/query-state.cc
+++ b/be/src/runtime/query-state.cc
@@ -160,8 +160,8 @@ void QueryState::InitMemTrackers() {
   int64_t bytes_limit = -1;
   if (query_options().__isset.mem_limit && query_options().mem_limit > 0) {
     bytes_limit = query_options().mem_limit;
-    VLOG_QUERY << "Using query memory limit from query options: "
-               << PrettyPrinter::Print(bytes_limit, TUnit::BYTES);
+    VLOG(2) << "Using query memory limit from query options: "
+            << PrettyPrinter::Print(bytes_limit, TUnit::BYTES);
   }
   query_mem_tracker_ =
       MemTracker::CreateQueryMemTracker(query_id(), query_options(), pool, &obj_pool_);
@@ -182,7 +182,7 @@ Status QueryState::InitBufferPoolState() {
     DCHECK_GE(mem_limit, 0);
     max_reservation = ReservationUtil::GetReservationLimitFromMemLimit(mem_limit);
   }
-  VLOG_QUERY << "Buffer pool limit for " << PrintId(query_id()) << ": " << max_reservation;
+  VLOG(2) << "Buffer pool limit for " << PrintId(query_id()) << ": " << max_reservation;
 
   buffer_reservation_ = obj_pool_.Add(new ReservationTracker);
   buffer_reservation_->InitChildTracker(
@@ -353,8 +353,8 @@ Status QueryState::WaitForFinish() {
 }
 
 void QueryState::StartFInstances() {
-  VLOG_QUERY << "StartFInstances(): query_id=" << PrintId(query_id())
-      << " #instances=" << rpc_params_.fragment_instance_ctxs.size();
+  VLOG(2) << "StartFInstances(): query_id=" << PrintId(query_id())
+          << " #instances=" << rpc_params_.fragment_instance_ctxs.size();
   DCHECK_GT(refcnt_.Load(), 0);
   DCHECK_GT(exec_resource_refcnt_.Load(), 0) << "Should have been taken in Init()";
 
@@ -371,8 +371,8 @@ void QueryState::StartFInstances() {
     ReportExecStatusAux(true, status, nullptr, false);
     return;
   }
-  VLOG_QUERY << "descriptor table for query=" << PrintId(query_id())
-             << "\n" << desc_tbl_->DebugString();
+  VLOG(2) << "descriptor table for query=" << PrintId(query_id())
+          << "\n" << desc_tbl_->DebugString();
 
   Status thread_create_status;
   DCHECK_GT(rpc_params_.fragment_ctxs.size(), 0);

http://git-wip-us.apache.org/repos/asf/impala/blob/df186585/be/src/runtime/runtime-filter-bank.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/runtime-filter-bank.cc b/be/src/runtime/runtime-filter-bank.cc
index e1a2512..ffb0a22 100644
--- a/be/src/runtime/runtime-filter-bank.cc
+++ b/be/src/runtime/runtime-filter-bank.cc
@@ -84,7 +84,7 @@ RuntimeFilter* RuntimeFilterBank::RegisterFilter(const TRuntimeFilterDesc& filte
     if (consumed_filters_.find(filter_desc.filter_id) == consumed_filters_.end()) {
       ret = obj_pool_.Add(new RuntimeFilter(filter_desc, filter_desc.filter_size_bytes));
       consumed_filters_[filter_desc.filter_id] = ret;
-      VLOG_QUERY << "registered consumer filter " << filter_desc.filter_id;
+      VLOG(2) << "registered consumer filter " << filter_desc.filter_id;
     } else {
       // The filter has already been registered in this filter bank by another
       // target node.

http://git-wip-us.apache.org/repos/asf/impala/blob/df186585/be/src/service/impala-internal-service.cc
----------------------------------------------------------------------
diff --git a/be/src/service/impala-internal-service.cc b/be/src/service/impala-internal-service.cc
index c479a7f..864a1da 100644
--- a/be/src/service/impala-internal-service.cc
+++ b/be/src/service/impala-internal-service.cc
@@ -41,14 +41,21 @@ ImpalaInternalService::ImpalaInternalService() {
 
 void ImpalaInternalService::ExecQueryFInstances(TExecQueryFInstancesResult& return_val,
     const TExecQueryFInstancesParams& params) {
-  VLOG_QUERY << "ExecQueryFInstances():" << " query_id=" <<
-      PrintId(params.query_ctx.query_id);
   FAULT_INJECTION_RPC_DELAY(RPC_EXECQUERYFINSTANCES);
   DCHECK(params.__isset.coord_state_idx);
   DCHECK(params.__isset.query_ctx);
   DCHECK(params.__isset.fragment_ctxs);
   DCHECK(params.__isset.fragment_instance_ctxs);
-  query_exec_mgr_->StartQuery(params).SetTStatus(&return_val);
+  VLOG_QUERY << "ExecQueryFInstances():" << " query_id="
+             << PrintId(params.query_ctx.query_id)
+             << " coord=" << TNetworkAddressToString(params.query_ctx.coord_address)
+             << " #instances=" << params.fragment_instance_ctxs.size();
+  Status status = query_exec_mgr_->StartQuery(params);
+  status.SetTStatus(&return_val);
+  if (!status.ok()) {
+    LOG(INFO) << "ExecQueryFInstances() failed: query_id="
+              << PrintId(params.query_ctx.query_id) << ": " << status.GetDetail();
+  }
 }
 
 template <typename T> void SetUnknownIdError(


[3/6] impala git commit: IMPALA-7399: Emit a junit xml report when trapping errors

Posted by ta...@apache.org.
IMPALA-7399: Emit a junit xml report when trapping errors

This patch will cause a junitxml file to be emitted in the case of
errors in build scripts. Instead of simply echoing a message to the
console, we set up a trap function that also writes out to a
junit xml report that can be consumed by jenkins.impala.io.

Main things to pay attention to:

- New file that gets sourced by all bash scripts when trapping
  within bash scripts:

  https://gerrit.cloudera.org/c/11257/1/bin/report_build_error.sh

- Installation of the python lib into impala-python venv for use
  from within python files:

  https://gerrit.cloudera.org/c/11257/1/bin/impala-python-common.sh

- Change to the generate_junitxml.py file itself, for ease of use from
  within python scripts:

  https://gerrit.cloudera.org/c/11257/1/lib/python/impala_py_lib/jenkins/generate_junitxml.py

Most of the other changes are to source the new report_build_error.sh
script to set up the trap function.

Change-Id: Idd62045bb43357abc2b89a78afff499149d3c3fc
Reviewed-on: http://gerrit.cloudera.org:8080/11257
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/6e5ec22b
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/6e5ec22b
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/6e5ec22b

Branch: refs/heads/master
Commit: 6e5ec22b1237f1d466c095c96a2fc1cb71ccb2d9
Parents: bb9454f
Author: David Knupp <dk...@cloudera.com>
Authored: Thu Aug 16 17:06:04 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Thu Aug 23 18:33:58 2018 +0000

----------------------------------------------------------------------
 bin/clean-cmake.sh                              |  4 +-
 bin/clean.sh                                    |  4 +-
 bin/create-test-configuration.sh                |  3 +-
 bin/create_testdata.sh                          |  3 +-
 bin/distcc/distcc_server_setup.sh               |  3 +-
 bin/impala-python-common.sh                     |  3 +-
 bin/jenkins/all-tests.sh                        |  3 +-
 bin/jenkins/build-all-flag-combinations.sh      |  3 +-
 bin/jenkins/build-only.sh                       |  3 +-
 bin/make_impala.sh                              |  3 +-
 bin/report_build_error.sh                       | 30 +++++++++++++++
 bin/run-all-tests.sh                            |  3 +-
 bin/run-backend-tests.sh                        |  3 +-
 bin/start-catalogd.sh                           |  3 +-
 bin/start-impalad.sh                            |  3 +-
 bin/start-statestored.sh                        |  3 +-
 buildall.sh                                     |  4 +-
 infra/python/bootstrap_virtualenv.py            | 24 ++++++++----
 .../impala_py_lib/jenkins/generate_junitxml.py  | 40 ++++++++++++--------
 shell/make_shell_tarball.sh                     |  3 +-
 testdata/bin/check-schema-diff.sh               |  4 ++
 testdata/bin/compute-table-stats.sh             |  3 +-
 testdata/bin/copy-data-sources.sh               |  3 +-
 testdata/bin/copy-udfs-udas.sh                  |  3 +-
 testdata/bin/create-load-data.sh                |  3 +-
 testdata/bin/create-table-many-blocks.sh        |  3 +-
 testdata/bin/generate-load-nested.sh            |  3 +-
 testdata/bin/kill-all.sh                        |  3 +-
 testdata/bin/kill-hbase.sh                      |  3 +-
 testdata/bin/kill-hive-server.sh                |  3 +-
 testdata/bin/kill-java-service.sh               |  3 +-
 testdata/bin/kill-sentry-service.sh             |  3 +-
 testdata/bin/load-hive-builtins.sh              |  3 +-
 testdata/bin/load-metastore-snapshot.sh         |  3 +-
 testdata/bin/load-test-warehouse-snapshot.sh    |  3 +-
 testdata/bin/run-all.sh                         |  3 +-
 testdata/bin/run-hbase.sh                       |  3 +-
 testdata/bin/run-hive-server.sh                 |  3 +-
 testdata/bin/run-mini-dfs.sh                    |  3 +-
 testdata/bin/run-sentry-service.sh              |  3 +-
 testdata/bin/setup-hdfs-env.sh                  |  3 +-
 testdata/cluster/admin                          |  3 +-
 testdata/datasets/tpcds/preload                 |  3 +-
 testdata/datasets/tpch/preload                  |  3 +-
 tests/run-custom-cluster-tests.sh               |  3 +-
 tests/run-process-failure-tests.sh              |  3 +-
 46 files changed, 159 insertions(+), 68 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/clean-cmake.sh
----------------------------------------------------------------------
diff --git a/bin/clean-cmake.sh b/bin/clean-cmake.sh
index 92415fc..aa1ea96 100755
--- a/bin/clean-cmake.sh
+++ b/bin/clean-cmake.sh
@@ -20,8 +20,8 @@
 # Removes artifacts generated by cmake.
 
 set -euo pipefail
-trap 'echo Error in ${0} at line ${LINENO}: $(cd "'${PWD}'" && awk "NR == ${LINENO}" \
-  ${0})' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 if [[ -z "${IMPALA_HOME}" || ! -d "${IMPALA_HOME}" ]]; then
   echo IMPALA_HOME=${IMPALA_HOME} is not valid. 1>&2

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/clean.sh
----------------------------------------------------------------------
diff --git a/bin/clean.sh b/bin/clean.sh
index 89a991c..d0b7c3b 100755
--- a/bin/clean.sh
+++ b/bin/clean.sh
@@ -22,8 +22,8 @@
 # branch to a non-toolchain branch due to caching in CMake generated files.
 
 set -euo pipefail
-trap 'echo Error in ${0} at line ${LINENO}: $(cd "'${PWD}'" && awk "NR == ${LINENO}" \
-  ${0})' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 # If the project was never build, no Makefile will exist and thus make clean will fail.
 # Combine the make command with the bash noop to always return true.

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/create-test-configuration.sh
----------------------------------------------------------------------
diff --git a/bin/create-test-configuration.sh b/bin/create-test-configuration.sh
index e68af9c..6e799fb 100755
--- a/bin/create-test-configuration.sh
+++ b/bin/create-test-configuration.sh
@@ -22,7 +22,8 @@
 # as creation of the Hive metastore.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 CREATE_METASTORE=0
 CREATE_SENTRY_POLICY_DB=0

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/create_testdata.sh
----------------------------------------------------------------------
diff --git a/bin/create_testdata.sh b/bin/create_testdata.sh
index cc296bf..5b81d9e 100755
--- a/bin/create_testdata.sh
+++ b/bin/create_testdata.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 bin=`dirname "$0"`
 bin=`cd "$bin"; pwd`

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/distcc/distcc_server_setup.sh
----------------------------------------------------------------------
diff --git a/bin/distcc/distcc_server_setup.sh b/bin/distcc/distcc_server_setup.sh
index 8b5e6a9..6b514e7 100755
--- a/bin/distcc/distcc_server_setup.sh
+++ b/bin/distcc/distcc_server_setup.sh
@@ -39,7 +39,8 @@
 # CCACHE_DIR: directory to use for distccd's ccache.
 # CCACHE_SIZE: size of ccache, passed to ccache's -M option
 set -eu -o pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 if [[ $# != 1 ]]; then
   echo "Usage: $0 <allowed IP address range>"

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/impala-python-common.sh
----------------------------------------------------------------------
diff --git a/bin/impala-python-common.sh b/bin/impala-python-common.sh
index 29a36a2..501d487 100644
--- a/bin/impala-python-common.sh
+++ b/bin/impala-python-common.sh
@@ -19,7 +19,8 @@
 # $IMPALA_HOME/bin/impala-py* executables.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 LD_LIBRARY_PATH+=":$(python "$IMPALA_HOME/infra/python/bootstrap_virtualenv.py" \
   --print-ld-library-path)"

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/jenkins/all-tests.sh
----------------------------------------------------------------------
diff --git a/bin/jenkins/all-tests.sh b/bin/jenkins/all-tests.sh
index 7917358..1e73722 100644
--- a/bin/jenkins/all-tests.sh
+++ b/bin/jenkins/all-tests.sh
@@ -19,7 +19,8 @@
 # Run all Impala tests.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 cd "${IMPALA_HOME}"
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/jenkins/build-all-flag-combinations.sh
----------------------------------------------------------------------
diff --git a/bin/jenkins/build-all-flag-combinations.sh b/bin/jenkins/build-all-flag-combinations.sh
index 8dce06d..200729e 100755
--- a/bin/jenkins/build-all-flag-combinations.sh
+++ b/bin/jenkins/build-all-flag-combinations.sh
@@ -25,7 +25,8 @@
 # Usage: build-all-flag-combinations.sh [--dryrun]
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 export IMPALA_MAVEN_OPTIONS="-U"
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/jenkins/build-only.sh
----------------------------------------------------------------------
diff --git a/bin/jenkins/build-only.sh b/bin/jenkins/build-only.sh
index d14ad6e..e7bdb3f 100644
--- a/bin/jenkins/build-only.sh
+++ b/bin/jenkins/build-only.sh
@@ -19,7 +19,8 @@
 # Only run an Impala build.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 cd "${IMPALA_HOME}"
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/make_impala.sh
----------------------------------------------------------------------
diff --git a/bin/make_impala.sh b/bin/make_impala.sh
index 8e1807b..f33bd74 100755
--- a/bin/make_impala.sh
+++ b/bin/make_impala.sh
@@ -20,7 +20,8 @@
 # Incrementally compiles the frontend and backend.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 : ${IMPALA_TOOLCHAIN=}
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/report_build_error.sh
----------------------------------------------------------------------
diff --git a/bin/report_build_error.sh b/bin/report_build_error.sh
new file mode 100644
index 0000000..295cce0
--- /dev/null
+++ b/bin/report_build_error.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+report_build_error() {
+  ERROR_MSG=$(cd "$PWD" && awk "NR == $1" $0)
+  FILENAME=$(basename -- "$0")
+  echo ERROR in $0 at line $1: $ERROR_MSG
+  $IMPALA_HOME/bin/generate_junitxml.py --step "${FILENAME%.*}" \
+    --error "Error in $0 at line $1: $ERROR_MSG"
+}
+
+setup_report_build_error() {
+  trap 'report_build_error $LINENO' ERR
+}

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/run-all-tests.sh
----------------------------------------------------------------------
diff --git a/bin/run-all-tests.sh b/bin/run-all-tests.sh
index 5f6831e..da7090e 100755
--- a/bin/run-all-tests.sh
+++ b/bin/run-all-tests.sh
@@ -22,7 +22,8 @@
 
 # Exit on reference to uninitialized variables and non-zero exit codes
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 . "$IMPALA_HOME/bin/set-pythonpath.sh"
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/run-backend-tests.sh
----------------------------------------------------------------------
diff --git a/bin/run-backend-tests.sh b/bin/run-backend-tests.sh
index 3bc84c2..16a432f 100755
--- a/bin/run-backend-tests.sh
+++ b/bin/run-backend-tests.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 export GTEST_OUTPUT="xml:$IMPALA_BE_TEST_LOGS_DIR/"
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/start-catalogd.sh
----------------------------------------------------------------------
diff --git a/bin/start-catalogd.sh b/bin/start-catalogd.sh
index a8b7e28..35cf592 100755
--- a/bin/start-catalogd.sh
+++ b/bin/start-catalogd.sh
@@ -21,7 +21,8 @@
 # -build_type parameter can be passed to determine the build type to use.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 BUILD_TYPE=latest
 CATALOGD_ARGS=""

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/start-impalad.sh
----------------------------------------------------------------------
diff --git a/bin/start-impalad.sh b/bin/start-impalad.sh
index f052fa2..dacd44c 100755
--- a/bin/start-impalad.sh
+++ b/bin/start-impalad.sh
@@ -21,7 +21,8 @@
 # parameter can be passed to determine the build type to use for the impalad instance.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 BUILD_TYPE=latest
 IMPALAD_ARGS=""

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/bin/start-statestored.sh
----------------------------------------------------------------------
diff --git a/bin/start-statestored.sh b/bin/start-statestored.sh
index f023810..eb94910 100755
--- a/bin/start-statestored.sh
+++ b/bin/start-statestored.sh
@@ -20,7 +20,8 @@
 # Starts up the StateStored with the specified command line arguments.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 BUILD_TYPE=latest
 STATESTORED_ARGS=""

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/buildall.sh
----------------------------------------------------------------------
diff --git a/buildall.sh b/buildall.sh
index 59e27ee..339168e 100755
--- a/buildall.sh
+++ b/buildall.sh
@@ -18,6 +18,8 @@
 # under the License.
 
 set -euo pipefail
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 # run buildall.sh -help to see options
 ROOT=`dirname "$0"`
@@ -29,8 +31,6 @@ then
    exit 1
 fi
 
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
-
 # Grab this *before* we source impala-config.sh to see if the caller has
 # kerberized environment variables already or not.
 NEEDS_RE_SOURCE_NOTE=1

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/infra/python/bootstrap_virtualenv.py
----------------------------------------------------------------------
diff --git a/infra/python/bootstrap_virtualenv.py b/infra/python/bootstrap_virtualenv.py
index 1953935..f2ce839 100644
--- a/infra/python/bootstrap_virtualenv.py
+++ b/infra/python/bootstrap_virtualenv.py
@@ -140,29 +140,37 @@ def exec_pip_install(args, cc="no-cc-available", env=None):
   # Don't call the virtualenv pip directly, it uses a hashbang to to call the python
   # virtualenv using an absolute path. If the path to the virtualenv is very long, the
   # hashbang won't work.
-  #
+  impala_pip_base_cmd = [os.path.join(ENV_DIR, "bin", "python"),
+                         os.path.join(ENV_DIR, "bin", "pip"), "install", "-v"]
+
   # Passes --no-binary for IMPALA-3767: without this, Cython (and
   # several other packages) fail download.
   #
   # --no-cache-dir is used to prevent caching of compiled artifacts, which may be built
   # with different compilers or settings.
-  cmd = [os.path.join(ENV_DIR, "bin", "python"), os.path.join(ENV_DIR, "bin", "pip"),
-      "install", "-v", "--no-binary", ":all:", "--no-cache-dir"]
+  third_party_pkg_install_cmd = \
+      impala_pip_base_cmd[:] + ["--no-binary", ":all:", "--no-cache-dir"]
 
   # When using a custom mirror, we also must use the index of that mirror.
   if "PYPI_MIRROR" in os.environ:
-    cmd.extend(["--index-url", "%s/simple" % os.environ["PYPI_MIRROR"]])
+    third_party_pkg_install_cmd.extend(["--index-url",
+                                        "%s/simple" % os.environ["PYPI_MIRROR"]])
   else:
     # Prevent fetching additional packages from the index. If we forget to add a package
     # to one of the requirements.txt files, this should trigger an error. However, we will
     # still access the index for version/dependency resolution, hence we need to change it
     # when using a private mirror.
-    cmd.append("--no-index")
+    third_party_pkg_install_cmd.append("--no-index")
 
-  cmd.extend(["--find-links",
+  third_party_pkg_install_cmd.extend(["--find-links",
       "file://%s" % urllib.pathname2url(os.path.abspath(DEPS_DIR))])
-  cmd.extend(args)
-  exec_cmd(cmd, env=env)
+  third_party_pkg_install_cmd.extend(args)
+  exec_cmd(third_party_pkg_install_cmd, env=env)
+
+  # Finally, we want to install the packages from our own internal python lib
+  local_package_install_cmd = impala_pip_base_cmd + \
+      ['-e', os.path.join(os.getenv('IMPALA_HOME'), 'lib', 'python')]
+  exec_cmd(local_package_install_cmd)
 
 
 def find_file(*paths):
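
Concretely, splitting the command into impala_pip_base_cmd plus two derived lists
means a single exec_pip_install() call now boils down to roughly these two pip
invocations in the default no-PYPI_MIRROR case (written out as shell commands;
ENV_DIR, the deps path and the requirements file name are illustrative placeholders,
not taken from this diff):

    # Third-party packages: resolved only from the local deps directory, built from
    # source (--no-binary :all:) and never cached.
    "$ENV_DIR"/bin/python "$ENV_DIR"/bin/pip install -v \
        --no-binary :all: --no-cache-dir --no-index \
        --find-links "file://$IMPALA_HOME/infra/python/deps" \
        -r "$IMPALA_HOME/infra/python/deps/requirements.txt"

    # Followed by the new editable install of Impala's own python lib.
    "$ENV_DIR"/bin/python "$ENV_DIR"/bin/pip install -v \
        -e "$IMPALA_HOME/lib/python"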

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/lib/python/impala_py_lib/jenkins/generate_junitxml.py
----------------------------------------------------------------------
diff --git a/lib/python/impala_py_lib/jenkins/generate_junitxml.py b/lib/python/impala_py_lib/jenkins/generate_junitxml.py
index 68bedd1..97dcbdd 100755
--- a/lib/python/impala_py_lib/jenkins/generate_junitxml.py
+++ b/lib/python/impala_py_lib/jenkins/generate_junitxml.py
@@ -32,18 +32,30 @@ from datetime import datetime as dt
 
 IMPALA_HOME = os.getenv('IMPALA_HOME', '.')
 SCRIPT_NAME, _ = os.path.splitext(os.path.basename(__file__))
+JUNITXML_LOGDIR = os.path.join(os.getenv("IMPALA_LOGS_DIR", "."), 'extra_junit_xml_logs')
 
 
 class JunitReport(object):
   """A Junit XML style report parseable by Jenkins for reporting build status.
 
-  Generally, a caller who invokes this script doesn't need to do anything
+  Generally, a caller who invokes this script from bash doesn't need to do
   more than supply the necessary command line parameters. The JunitReport
   class is instantiated using those initial inputs, and a timestamped XML
   file is output to the $IMPALA_HOME/logs/extra_junit_xml_logs/.
 
   Log files are timestamped, so they will not overwrite previous files containing
   output of the same step.
+
+  For use from within a python script (must be invoked with impala-python), an
+  example might look like:
+
+  >>> from impala_py_lib.jenkins.generate_junitxml import JunitReport
+  >>> report = JunitReport(phase='load_data', step='load_hbase', error_msg='oops')
+  >>> report.to_file()
+
+  For now, the class does not support adding more than one step (analogous to a
+  test case) to the same phase (analogous to a test suite). Each report should
+  be unique for a given junit XML file. This may be enhanced at some point.
   """
 
   def __init__(self, phase, step, error_msg=None, stdout=None, stderr=None,
@@ -124,7 +136,7 @@ class JunitReport(object):
     output = ET.SubElement(self.testcase_element, "system-{}".format(output_type))
     output.text = JunitReport.get_xml_content(file_or_string)
 
-  def to_file(self, junitxml_logdir='.'):
+  def to_file(self, junitxml_logdir=JUNITXML_LOGDIR):
     """
     Create a timestamped XML report file.
 
@@ -134,6 +146,15 @@ class JunitReport(object):
     Return:
       junit_log_file: path to the generated file
     """
+    # The equivalent of mkdir -p
+    try:
+      os.makedirs(junitxml_logdir)
+    except OSError as e:
+      if e.errno == errno.EEXIST and os.path.isdir(junitxml_logdir):
+        pass
+      else:
+        raise
+
     filename = '{}.{}.xml'.format(
         self.testsuite_element.attrib['name'],
         self.utc_time.strftime('%Y%m%d_%H_%M_%S')
@@ -226,17 +247,6 @@ def main():
   Phase can be repeated in a given test run, but the step leaf node, which is
   equivalent to a "test case", must be unique within each phase.
   """
-  junitxml_logdir = os.path.join(IMPALA_HOME, 'logs', 'extra_junit_xml_logs')
-
-  # The equivalent of mkdir -p
-  try:
-    os.makedirs(junitxml_logdir)
-  except OSError as e:
-    if e.errno == errno.EEXIST and os.path.isdir(junitxml_logdir):
-      pass
-    else:
-      raise
-
   options = get_options()
 
   junit_report = JunitReport(phase=options.phase,
@@ -246,8 +256,8 @@ def main():
                              stderr=options.stderr,
                              elapsed_time=options.time)
 
-  xml_report = junit_report.to_file(junitxml_logdir)
-  print("Generated: {}".format(xml_report))
+  junit_log_file = junit_report.to_file()
+  print("Generated: {0}".format(junit_log_file))
 
 
 if "__main__" == __name__:

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/shell/make_shell_tarball.sh
----------------------------------------------------------------------
diff --git a/shell/make_shell_tarball.sh b/shell/make_shell_tarball.sh
index 637fa7d..dd6a648 100755
--- a/shell/make_shell_tarball.sh
+++ b/shell/make_shell_tarball.sh
@@ -23,7 +23,8 @@
 # ${IMPALA_HOME}/shell/build.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 if [ "x${IMPALA_HOME}" == "x" ]; then
   echo "\$IMPALA_HOME must be set"

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/check-schema-diff.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/check-schema-diff.sh b/testdata/bin/check-schema-diff.sh
index 4eb8172..dd36b85 100755
--- a/testdata/bin/check-schema-diff.sh
+++ b/testdata/bin/check-schema-diff.sh
@@ -24,6 +24,10 @@
 #  - 1 implies that the schemas have changed.
 
 set -euo pipefail
+
+# We don't want to generate a junit xml report for errors from this script,
+# since exit code 1 here denotes something useful: the schemas differ. So in
+# this one case we keep the plain trap and don't call setup_report_build_error.
 trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
 
 . ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
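
A hypothetical caller (not shown in this patch) that relies on that exit code instead
of treating it as a build error might look like:

    # Under set -e, wrapping the call in "if !" keeps the non-zero exit from aborting
    # the caller; an exit code of 1 simply means the schemas have changed.
    if ! "${IMPALA_HOME}"/testdata/bin/check-schema-diff.sh; then
      echo "Schemas differ from the snapshot; a full data load is needed."
    fi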

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/compute-table-stats.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/compute-table-stats.sh b/testdata/bin/compute-table-stats.sh
index 63eb0da..08c7595 100755
--- a/testdata/bin/compute-table-stats.sh
+++ b/testdata/bin/compute-table-stats.sh
@@ -20,7 +20,8 @@
 # Runs compute table stats over a curated set of Impala test tables.
 #
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 . ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/copy-data-sources.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/copy-data-sources.sh b/testdata/bin/copy-data-sources.sh
index a1838ce..1782aca 100755
--- a/testdata/bin/copy-data-sources.sh
+++ b/testdata/bin/copy-data-sources.sh
@@ -20,7 +20,8 @@
 # This script copies the test data source library into hdfs.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 . ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/copy-udfs-udas.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/copy-udfs-udas.sh b/testdata/bin/copy-udfs-udas.sh
index c1a0454..def7812 100755
--- a/testdata/bin/copy-udfs-udas.sh
+++ b/testdata/bin/copy-udfs-udas.sh
@@ -20,7 +20,8 @@
 # This script copies udf/uda binaries into hdfs.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 if [ x${JAVA_HOME} == x ]; then
   echo JAVA_HOME not set

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/create-load-data.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/create-load-data.sh b/testdata/bin/create-load-data.sh
index 1953daf..74ae248 100755
--- a/testdata/bin/create-load-data.sh
+++ b/testdata/bin/create-load-data.sh
@@ -29,7 +29,8 @@
 # bin/load-data.py
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 . ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
 . ${IMPALA_HOME}/testdata/bin/run-step.sh

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/create-table-many-blocks.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/create-table-many-blocks.sh b/testdata/bin/create-table-many-blocks.sh
index 4c0a57d..2db9bee 100755
--- a/testdata/bin/create-table-many-blocks.sh
+++ b/testdata/bin/create-table-many-blocks.sh
@@ -25,7 +25,8 @@
 # blocks/files.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 . ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/generate-load-nested.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/generate-load-nested.sh b/testdata/bin/generate-load-nested.sh
index 4986418..cceea1a 100755
--- a/testdata/bin/generate-load-nested.sh
+++ b/testdata/bin/generate-load-nested.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 SHELL_CMD=${IMPALA_HOME}/bin/impala-shell.sh
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/kill-all.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/kill-all.sh b/testdata/bin/kill-all.sh
index 0e8d201..b6c13a6 100755
--- a/testdata/bin/kill-all.sh
+++ b/testdata/bin/kill-all.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 # Shutdown Impala if it is alive
 ${IMPALA_HOME}/bin/start-impala-cluster.py --kill

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/kill-hbase.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/kill-hbase.sh b/testdata/bin/kill-hbase.sh
index 4e5a42f..0a5fcda 100755
--- a/testdata/bin/kill-hbase.sh
+++ b/testdata/bin/kill-hbase.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 DIR=$(dirname "$0")
 echo Stopping Hbase

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/kill-hive-server.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/kill-hive-server.sh b/testdata/bin/kill-hive-server.sh
index 0ebd18c..59c44e5 100755
--- a/testdata/bin/kill-hive-server.sh
+++ b/testdata/bin/kill-hive-server.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 DIR=$(dirname "$0")
 echo Stopping Hive

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/kill-java-service.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/kill-java-service.sh b/testdata/bin/kill-java-service.sh
index 4cdc1bd..8ae449d 100755
--- a/testdata/bin/kill-java-service.sh
+++ b/testdata/bin/kill-java-service.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 CLASSES=()
 EXTRA_SHUTDOWN_TIME_SECS=1

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/kill-sentry-service.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/kill-sentry-service.sh b/testdata/bin/kill-sentry-service.sh
index e9ed7aa..ceb8958 100755
--- a/testdata/bin/kill-sentry-service.sh
+++ b/testdata/bin/kill-sentry-service.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 DIR=$(dirname "$0")
 echo Stopping Sentry

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/load-hive-builtins.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/load-hive-builtins.sh b/testdata/bin/load-hive-builtins.sh
index 061d42b..55cc845 100755
--- a/testdata/bin/load-hive-builtins.sh
+++ b/testdata/bin/load-hive-builtins.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 . ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/load-metastore-snapshot.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/load-metastore-snapshot.sh b/testdata/bin/load-metastore-snapshot.sh
index 1760cdd..dd4e136 100755
--- a/testdata/bin/load-metastore-snapshot.sh
+++ b/testdata/bin/load-metastore-snapshot.sh
@@ -22,7 +22,8 @@
 # full data load build.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 . ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/load-test-warehouse-snapshot.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/load-test-warehouse-snapshot.sh b/testdata/bin/load-test-warehouse-snapshot.sh
index fe5dd2a..da5fb07 100755
--- a/testdata/bin/load-test-warehouse-snapshot.sh
+++ b/testdata/bin/load-test-warehouse-snapshot.sh
@@ -25,7 +25,8 @@
 # to backup any data you need before running this script.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 . ${IMPALA_HOME}/bin/impala-config.sh > /dev/null 2>&1
 : ${REMOTE_LOAD:=}

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/run-all.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-all.sh b/testdata/bin/run-all.sh
index 6820e5d..1e14315 100755
--- a/testdata/bin/run-all.sh
+++ b/testdata/bin/run-all.sh
@@ -20,7 +20,8 @@
 # Starts up a mini-dfs test cluster and related services
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 # If -format is passed, format the mini-dfs cluster.
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/run-hbase.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-hbase.sh b/testdata/bin/run-hbase.sh
index 1433073..e7d67c9 100755
--- a/testdata/bin/run-hbase.sh
+++ b/testdata/bin/run-hbase.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 CLUSTER_BIN=${IMPALA_HOME}/testdata/bin
 HBASE_JAAS_CLIENT=${HBASE_CONF_DIR}/hbase-jaas-client.conf

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/run-hive-server.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-hive-server.sh b/testdata/bin/run-hive-server.sh
index 2b5a486..fbeba72 100755
--- a/testdata/bin/run-hive-server.sh
+++ b/testdata/bin/run-hive-server.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 . ${IMPALA_HOME}/bin/set-pythonpath.sh
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/run-mini-dfs.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-mini-dfs.sh b/testdata/bin/run-mini-dfs.sh
index ea6c519..be63715 100755
--- a/testdata/bin/run-mini-dfs.sh
+++ b/testdata/bin/run-mini-dfs.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 if [[ $# -eq 1 && "$1" == -format ]]; then
   SHOULD_FORMAT=true

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/run-sentry-service.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/run-sentry-service.sh b/testdata/bin/run-sentry-service.sh
index 755c382..f49f88b 100755
--- a/testdata/bin/run-sentry-service.sh
+++ b/testdata/bin/run-sentry-service.sh
@@ -18,7 +18,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 . ${IMPALA_HOME}/bin/set-classpath.sh
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/bin/setup-hdfs-env.sh
----------------------------------------------------------------------
diff --git a/testdata/bin/setup-hdfs-env.sh b/testdata/bin/setup-hdfs-env.sh
index 552c48b..4308757 100755
--- a/testdata/bin/setup-hdfs-env.sh
+++ b/testdata/bin/setup-hdfs-env.sh
@@ -18,7 +18,8 @@
 # under the License.
 #
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 : ${REMOTE_LOAD:=}
 

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/cluster/admin
----------------------------------------------------------------------
diff --git a/testdata/cluster/admin b/testdata/cluster/admin
index 534bc45..ca438db 100755
--- a/testdata/cluster/admin
+++ b/testdata/cluster/admin
@@ -28,7 +28,8 @@
 # TODO: Run each node on its own IP address, e.g. 127.0.0.1, 127.0.0.2, and so on.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 : ${IMPALA_KERBERIZE=}
 : ${INCLUDE_YARN=}

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/datasets/tpcds/preload
----------------------------------------------------------------------
diff --git a/testdata/datasets/tpcds/preload b/testdata/datasets/tpcds/preload
index 631a1c2..423b3c8 100755
--- a/testdata/datasets/tpcds/preload
+++ b/testdata/datasets/tpcds/preload
@@ -17,7 +17,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 IMPALA_DATA=${IMPALA_HOME}/testdata/impala-data
 TPC_DS_DATA=${IMPALA_DATA}/tpcds

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/testdata/datasets/tpch/preload
----------------------------------------------------------------------
diff --git a/testdata/datasets/tpch/preload b/testdata/datasets/tpch/preload
index 2b0cbb6..619a4d2 100755
--- a/testdata/datasets/tpch/preload
+++ b/testdata/datasets/tpch/preload
@@ -17,7 +17,8 @@
 # under the License.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 IMPALA_DATA=${IMPALA_HOME}/testdata/impala-data
 TPC_H_DATA=${IMPALA_DATA}/tpch

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/tests/run-custom-cluster-tests.sh
----------------------------------------------------------------------
diff --git a/tests/run-custom-cluster-tests.sh b/tests/run-custom-cluster-tests.sh
index ce161b1..6b77e26 100755
--- a/tests/run-custom-cluster-tests.sh
+++ b/tests/run-custom-cluster-tests.sh
@@ -21,7 +21,8 @@
 # clusters will be restarted.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 # Disable HEAPCHECK for the process failure tests because they can cause false positives.
 # TODO: Combine with run-process-failure-tests.sh

http://git-wip-us.apache.org/repos/asf/impala/blob/6e5ec22b/tests/run-process-failure-tests.sh
----------------------------------------------------------------------
diff --git a/tests/run-process-failure-tests.sh b/tests/run-process-failure-tests.sh
index 9ab531c..db47571 100755
--- a/tests/run-process-failure-tests.sh
+++ b/tests/run-process-failure-tests.sh
@@ -20,7 +20,8 @@
 # Runs the Impala process failure tests.
 
 set -euo pipefail
-trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)' ERR
+. $IMPALA_HOME/bin/report_build_error.sh
+setup_report_build_error
 
 # Disable HEAPCHECK for the process failure tests because they can cause false positives.
 export HEAPCHECK=