You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/11 02:18:01 UTC

[impala] branch master updated (6cfa65e -> 2066b72)

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.


    from 6cfa65e  IMPALA-8334: [DOCS] Document the keyval option for impala-shell config file
     new 06cb9b7  IMPALA-8796: Add unit tests to UnpackAndDecodeValues
     new f34a243  IMPALA-8850: is_percent not initialized in TmpFileMgr parser
     new 2066b72  IMPALA-8848: fix UNION missing input cardinality bug

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 be/src/util/bit-packing-test.cc                    | 214 ++++++++++++++++++---
 be/src/util/bit-packing.cc                         |   6 +
 be/src/util/parse-util-test.cc                     |  23 +++
 be/src/util/parse-util.cc                          |   2 +-
 .../java/org/apache/impala/planner/UnionNode.java  |  15 +-
 .../org/apache/impala/planner/CardinalityTest.java |  16 +-
 6 files changed, 246 insertions(+), 30 deletions(-)


[impala] 02/03: IMPALA-8850: is_percent not initialized in TmpFileMgr parser

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit f34a243a956dacd52f8afe79a122410ccf45ae26
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Thu Aug 8 17:34:03 2019 -0700

    IMPALA-8850: is_percent not initialized in TmpFileMgr parser
    
    This bug was only detected on ASAN. The is_percent variable
    was not initialised and is not set in ParseMemSpec() when it
    does an early exit for an empty string. This caused
    TmpFileMgr to consider the empty memory spec after the colon
    to be invalid.
    
    Testing:
    This reliably reproduced in TmpFileMgrTest under ASAN.
    Initialising is_percent fixed the issue.
    
    Added more detailed tests for ParseMemSpec() that initialise
    is_percent to avoid similar bugs.
    
    Change-Id: I4465944c7043c1fc6b33db9693ec290c24e3ed19
    Reviewed-on: http://gerrit.cloudera.org:8080/14040
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/util/parse-util-test.cc | 23 +++++++++++++++++++++++
 be/src/util/parse-util.cc      |  2 +-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/be/src/util/parse-util-test.cc b/be/src/util/parse-util-test.cc
index c8fb4a0..4d2a5ac 100644
--- a/be/src/util/parse-util-test.cc
+++ b/be/src/util/parse-util-test.cc
@@ -37,54 +37,69 @@ TEST(ParseMemSpecs, Basic) {
   int64_t gigabytes = 1024 * megabytes;
   int64_t terabytes = 1024 * gigabytes;
 
+  // Initialize 'is_percent' to the opposite of the expected value, to confirm that
+  // ParseMemSpec() is actually setting the output parameter.
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("1", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(1, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("100b", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(100, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("100kb", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(100 * 1024, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("5KB", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(5 * 1024, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("4MB", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(4 * megabytes, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("4m", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(4 * megabytes, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("8gb", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(8 * gigabytes, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("8G", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(8 * gigabytes, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("12Gb", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(12 * gigabytes, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("8T", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(8 * terabytes, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("12tb", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(12 * terabytes, bytes);
   ASSERT_FALSE(is_percent);
 
+  is_percent = false;
   bytes = ParseUtil::ParseMemSpec("13%", &is_percent, MemInfo::physical_mem());
   ASSERT_GT(bytes, 0);
   ASSERT_TRUE(is_percent);
 
+  is_percent = false;
   ASSERT_GT(ParseUtil::ParseMemSpec("17%", &is_percent, MemInfo::physical_mem()), bytes);
   ASSERT_EQ(ParseUtil::ParseMemSpec("17%", &is_percent, 100), 17);
   ASSERT_TRUE(is_percent);
@@ -111,17 +126,25 @@ TEST(ParseMemSpecs, Basic) {
     ASSERT_EQ(-1, bytes);
   }
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(0, bytes);
+  ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("-1", &is_percent, MemInfo::physical_mem());
   ASSERT_EQ(0, bytes);
+  ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("-2", &is_percent, MemInfo::physical_mem());
   ASSERT_LT(bytes, 0);
+  ASSERT_FALSE(is_percent);
 
+  is_percent = true;
   bytes = ParseUtil::ParseMemSpec("-2%", &is_percent, MemInfo::physical_mem());
   ASSERT_LT(bytes, 0);
+  ASSERT_TRUE(is_percent);
 }
 
 }
diff --git a/be/src/util/parse-util.cc b/be/src/util/parse-util.cc
index 75253d7..a70db99 100644
--- a/be/src/util/parse-util.cc
+++ b/be/src/util/parse-util.cc
@@ -25,9 +25,9 @@ namespace impala {
 
 int64_t ParseUtil::ParseMemSpec(const string& mem_spec_str, bool* is_percent,
     int64_t relative_reference) {
+  *is_percent = false;
   if (mem_spec_str.empty()) return 0;
 
-  *is_percent = false;
   int64_t multiplier = -1;
   int32_t number_str_len = mem_spec_str.size();
 


[impala] 01/03: IMPALA-8796: Add unit tests to UnpackAndDecodeValues

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 06cb9b7832c20e211696899d7cf7a87244917250
Author: Daniel Becker <da...@cloudera.com>
AuthorDate: Mon Aug 5 10:28:53 2019 +0200

    IMPALA-8796: Add unit tests to UnpackAndDecodeValues
    
    Adding unit tests to test BitPacking::UnpackAndDecodeValues. Also
    changing the existing BitPacking tests so that they are parametrised
    by the output type of the unpacking so we can test different output
    types.
    
    Change-Id: I363ff9f5d4750231fdc1aa60cf40d90ebb9ba305
    Reviewed-on: http://gerrit.cloudera.org:8080/14004
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/util/bit-packing-test.cc | 214 +++++++++++++++++++++++++++++++++++-----
 be/src/util/bit-packing.cc      |   6 ++
 2 files changed, 195 insertions(+), 25 deletions(-)

diff --git a/be/src/util/bit-packing-test.cc b/be/src/util/bit-packing-test.cc
index 8010fe3..197ba3f 100644
--- a/be/src/util/bit-packing-test.cc
+++ b/be/src/util/bit-packing-test.cc
@@ -15,11 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <algorithm>
 #include <cstdio>
 #include <cstdlib>
 #include <random>
+#include <unordered_map>
 
 #include "testutil/gtest-util.h"
+#include "testutil/rand-util.h"
 #include "testutil/mem-util.h"
 #include "util/bit-packing.h"
 #include "util/bit-stream-utils.inline.h"
@@ -32,9 +35,15 @@ using std::mt19937;
 namespace impala {
 
 namespace {
-uint64_t ComputeMask(int bit_width) {
-  return (bit_width < 64) ? ((1UL << bit_width) - 1) : ~0UL;
+
+constexpr int NUM_IN_VALUES = 64 * 1024;
+
+template <typename UINT_T>
+constexpr UINT_T ComputeMask(int bit_width) {
+  return (bit_width < sizeof(UINT_T) * CHAR_BIT) ?
+      ((1UL << bit_width) - 1) : static_cast<UINT_T>(~0UL);
 }
+
 }
 
 /// Test unpacking a subarray of values to/from smaller buffers that are sized to exactly
@@ -45,19 +54,21 @@ uint64_t ComputeMask(int bit_width) {
 ///
 /// This is to test that we do not overrun either the input or output buffer for smaller
 /// batch sizes.
-void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
+template <typename UINT_T>
+void UnpackSubset(const UINT_T* in, const uint8_t* packed, int num_in_values,
     int bit_width, bool aligned);
 
 /// Test a packing/unpacking round-trip of the 'num_in_values' values in 'in',
 /// packed with 'bit_width'. If 'aligned' is true, buffers for packed and unpacked data
 /// are allocated at a 64-byte aligned address. Otherwise the buffers are misaligned
 /// by 1 byte from a 64-byte aligned address.
-void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool aligned) {
+template <typename UINT_T>
+void PackUnpack(const UINT_T* in, int num_in_values, int bit_width, bool aligned) {
   LOG(INFO) << "num_in_values = " << num_in_values << " bit_width = " << bit_width
             << " aligned = " << aligned;
 
   // Mask out higher bits so that the values to pack are in range.
-  const uint64_t mask = ComputeMask(bit_width);
+  const UINT_T mask = ComputeMask<UINT_T>(bit_width);
   const int misalignment = aligned ? 0 : 1;
 
   const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * num_in_values);
@@ -79,9 +90,9 @@ void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool align
   for (const int num_to_unpack : {num_in_values, num_in_values + 1, num_in_values + 77}) {
     LOG(INFO) << "Unpacking " << num_to_unpack;
     // Size buffer exactly so that ASAN can detect reads/writes that overrun the buffer.
-    AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + misalignment);
-    uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + misalignment);
-    const auto result = BitPacking::UnpackValues(
+    AlignedAllocation out_storage(num_to_unpack * sizeof(UINT_T) + misalignment);
+    UINT_T* out = reinterpret_cast<UINT_T*>(out_storage.data() + misalignment);
+    const auto result = BitPacking::UnpackValues<UINT_T>(
         bit_width, packed, writer.bytes_written(), num_to_unpack, out);
     ASSERT_EQ(packed + writer.bytes_written(), result.first)
         << "Unpacked different # of bytes from the # written";
@@ -99,13 +110,15 @@ void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool align
     }
 
     for (int i = 0; i < num_in_values; ++i) {
-      EXPECT_EQ(in[i] & mask, out[i]) << "Didn't get back input value " << i;
+      EXPECT_EQ(in[i] & mask, out[i]) << "Didn't get back input value " << i << "."
+          << " Bit width: " << bit_width << ".";
     }
   }
-  UnpackSubset(in, packed, num_in_values, bit_width, aligned);
+  UnpackSubset<UINT_T>(in, packed, num_in_values, bit_width, aligned);
 }
 
-void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
+template <typename UINT_T>
+void UnpackSubset(const UINT_T* in, const uint8_t* packed, int num_in_values,
     int bit_width, bool aligned) {
   const int misalignment = aligned ? 0 : 1;
   for (int num_to_unpack : {1, 10, 77, num_in_values - 7}) {
@@ -116,43 +129,194 @@ void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
     AlignedAllocation packed_copy_storage(bytes_to_read + misalignment);
     uint8_t* packed_copy = packed_copy_storage.data() + misalignment;
     memcpy(packed_copy, packed, bytes_to_read);
-    AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + misalignment);
-    uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + misalignment);
-    const auto result = BitPacking::UnpackValues(
+    AlignedAllocation out_storage(num_to_unpack * sizeof(UINT_T) + misalignment);
+    UINT_T* out = reinterpret_cast<UINT_T*>(out_storage.data() + misalignment);
+    const auto result = BitPacking::UnpackValues<UINT_T>(
         bit_width, packed_copy, bytes_to_read, num_to_unpack, out);
     ASSERT_EQ(packed_copy + bytes_to_read, result.first) << "Read wrong # of bytes";
     ASSERT_EQ(num_to_unpack, result.second) << "Unpacked wrong # of values";
 
     for (int i = 0; i < num_to_unpack; ++i) {
-      ASSERT_EQ(in[i] & ComputeMask(bit_width), out[i]) << "Didn't get back input value "
-                                                         << i;
+      ASSERT_EQ(in[i] & ComputeMask<UINT_T>(bit_width), out[i])
+          << "Didn't get back input value " << i;
     }
   }
 }
 
-TEST(BitPackingTest, RandomUnpack) {
-  constexpr int NUM_IN_VALUES = 64 * 1024;
-  uint64_t in[NUM_IN_VALUES];
+template <typename INT_T>
+std::vector<INT_T> GenerateRandomInput(int length, INT_T lower, INT_T higher) {
+  std::vector<INT_T> in(length);
   mt19937 rng;
-  uniform_int_distribution<uint64_t> dist;
-  std::generate(std::begin(in), std::end(in), [&rng, &dist] { return dist(rng); });
+  RandTestUtil::SeedRng("BIT_PACKING_TEST_RANDOM_SEED", &rng);
+  uniform_int_distribution<INT_T> dist(lower, higher);
+  std::generate(in.begin(), in.end(), [&rng, &dist] { return dist(rng); });
+
+  return in;
+}
 
-  // Test various odd input lengths to exercise boundary cases for full and partial
-  // batches of 32.
+// Test various odd input lengths to exercise boundary cases for full and partial
+// batches of 32.
+std::vector<int> GetLengths() {
   vector<int> lengths{NUM_IN_VALUES, NUM_IN_VALUES - 1, NUM_IN_VALUES - 16,
       NUM_IN_VALUES - 19, NUM_IN_VALUES - 31};
   for (int i = 0; i < 32; ++i) {
     lengths.push_back(i);
   }
 
-  for (int bit_width = 0; bit_width <= 64; ++bit_width) {
+  return lengths;
+}
+
+template <typename UINT_T>
+void RandomUnpackTest() {
+  const std::vector<UINT_T> in = GenerateRandomInput<UINT_T>(NUM_IN_VALUES,
+      std::numeric_limits<UINT_T>::min(), std::numeric_limits<UINT_T>::max());
+
+  const std::vector<int> lengths = GetLengths();
+
+  constexpr int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH;
+  const int max_bit_width = std::min<int>(MAX_BITWIDTH, sizeof(UINT_T) * CHAR_BIT);
+  for (int bit_width = 0; bit_width <= max_bit_width; ++bit_width) {
     for (const int length : lengths) {
       // Test that unpacking to/from aligned and unaligned memory works.
       for (const bool aligned : {true, false}) {
-        PackUnpack(in, length, bit_width, aligned);
+        PackUnpack<UINT_T>(in.data(), length, bit_width, aligned);
       }
     }
   }
 }
+
+TEST(BitPackingTest, RandomUnpack8) {
+  RandomUnpackTest<uint8_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack16) {
+  RandomUnpackTest<uint16_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack32) {
+  RandomUnpackTest<uint32_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack64) {
+  RandomUnpackTest<uint64_t>();
+}
+
+// This is not the full dictionary encoding, only a big bit-packed literal run, no RLE is
+// used.
+template <typename T>
+std::pair<std::vector<T>, std::vector<uint8_t>>
+DictEncode(const std::vector<T>& input, int bit_width) {
+  // Create dictionary and indices.
+  std::unordered_map<T, std::size_t> index_map;
+
+  std::vector<T> dict;
+  std::vector<uint64_t> indices;
+
+  for (const T value : input) {
+    auto it = index_map.find(value);
+    if (it != index_map.end()) {
+      indices.push_back(it->second);
+    } else {
+      const std::size_t next_index = dict.size();
+      index_map[value] = next_index;
+      dict.push_back(value);
+      indices.push_back(next_index);
+    }
+  }
+
+  // Bit pack the indices.
+  const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * input.size());
+  std::vector<uint8_t> out_data(bytes_required);
+
+  // We do not write data if we do not have any. Doing it could also lead to undefined
+  // behaviour as out_data.data() may be nullptr and BitWriter uses memcpy internally, and
+  // passing a null pointer to memcpy is undefined behaviour.
+  if (bytes_required > 0) {
+    BitWriter writer(out_data.data(), bytes_required);
+    if (bit_width > 0) {
+      for (const uint64_t index : indices) {
+        EXPECT_TRUE(writer.PutValue(index, bit_width));
+      }
+    }
+    writer.Flush();
+  }
+
+  return std::make_pair(dict, out_data);
 }
 
+template <typename T>
+void ExpectEqualsWithStride(const T* expected, int num_values,
+    const uint8_t* actual, int actual_length, int stride) {
+  for (std::size_t i = 0; i < num_values; i++) {
+    const T expected_value = expected[i];
+
+    DCHECK_LE(i * stride + sizeof(T), actual_length);
+    const T actual_value = *reinterpret_cast<const T*>(&actual[i * stride]);
+
+    EXPECT_EQ(expected_value, actual_value);
+  }
+}
+
+// This does not test the full dictionary decoding, only the unpacking of a single
+// bit-packed literal run (no RLE).
+template <typename UINT_T>
+void RandomUnpackAndDecodeTest() {
+  constexpr int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH;
+  const int max_bit_width = std::min<int>({MAX_BITWIDTH, sizeof(UINT_T) * 8,
+      sizeof(uint32_t) * 8 /* dictionary indices are 32 bit values */});
+
+  const std::vector<int> lengths = GetLengths();
+
+  for (int bit_width = 0; bit_width <= max_bit_width; ++bit_width) {
+    const UINT_T max_dict_size = bit_width == sizeof(UINT_T) * 8
+        ? std::numeric_limits<UINT_T>::max() : (1UL << bit_width);
+
+    const std::vector<UINT_T> input = GenerateRandomInput<UINT_T>(NUM_IN_VALUES, 0,
+        max_dict_size - 1 /* inclusive range */);
+
+    for (int length : lengths) {
+      const std::vector<UINT_T> in_data(input.begin(), input.begin() + length);
+
+      std::pair<std::vector<UINT_T>, std::vector<uint8_t>> dict_and_data
+          = DictEncode(in_data, bit_width);
+      std::vector<UINT_T>& dict = dict_and_data.first;
+      const std::vector<uint8_t>& data = dict_and_data.second;
+
+      const std::vector<int> strides = {sizeof(UINT_T), sizeof(UINT_T) + 5,
+        2 * sizeof(UINT_T) + 5};
+
+      for (int stride : strides) {
+        bool decode_error = false;
+        std::vector<uint8_t> out(in_data.size() * stride);
+
+        std::pair<const uint8_t*, int64_t> res
+            = BitPacking::UnpackAndDecodeValues<UINT_T>(bit_width, data.data(),
+                data.size(), dict.data(), dict.size(), in_data.size(),
+            reinterpret_cast<UINT_T*>(out.data()), stride, &decode_error);
+
+        EXPECT_FALSE(decode_error);
+        EXPECT_EQ(in_data.size(), res.second);
+        ExpectEqualsWithStride<UINT_T>(in_data.data(), in_data.size(), out.data(),
+            out.size(), stride);
+      }
+    }
+  }
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode8) {
+  RandomUnpackAndDecodeTest<uint8_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode16) {
+  RandomUnpackAndDecodeTest<uint16_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode32) {
+  RandomUnpackAndDecodeTest<uint32_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode64) {
+  RandomUnpackAndDecodeTest<uint64_t>();
+}
+
+}
diff --git a/be/src/util/bit-packing.cc b/be/src/util/bit-packing.cc
index 52f86fd..851bc0b 100644
--- a/be/src/util/bit-packing.cc
+++ b/be/src/util/bit-packing.cc
@@ -32,7 +32,9 @@ namespace impala {
 
 INSTANTIATE_UNPACK_VALUES(bool);
 INSTANTIATE_UNPACK_VALUES(uint8_t);
+INSTANTIATE_UNPACK_VALUES(uint16_t);
 INSTANTIATE_UNPACK_VALUES(uint32_t);
+INSTANTIATE_UNPACK_VALUES(uint64_t);
 
 #define INSTANTIATE_UNPACK_AND_DECODE(OUT_TYPE)                                         \
   template std::pair<const uint8_t*, int64_t>                                           \
@@ -48,6 +50,10 @@ INSTANTIATE_UNPACK_AND_DECODE(int8_t);
 INSTANTIATE_UNPACK_AND_DECODE(int16_t);
 INSTANTIATE_UNPACK_AND_DECODE(int32_t);
 INSTANTIATE_UNPACK_AND_DECODE(int64_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint8_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint16_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint32_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint64_t);
 INSTANTIATE_UNPACK_AND_DECODE(Decimal4Value);
 INSTANTIATE_UNPACK_AND_DECODE(Decimal8Value);
 INSTANTIATE_UNPACK_AND_DECODE(Decimal16Value);


[impala] 03/03: IMPALA-8848: fix UNION missing input cardinality bug

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 2066b72bc4b43fd57679c37145eecebaa6be8b27
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Thu Aug 8 13:52:00 2019 -0700

    IMPALA-8848: fix UNION missing input cardinality bug
    
    If a UNION has children, and none of those children has
    a known cardinality, then we can make no reasonable
    estimate of the output cardinality, so the planner
    should consider the output cardinality to be unknown.
    
    The previous behaviour was to report a cardinality of
    0, which is unsafe because the planner may make further
    decisions under the incorrect assumption that the output
    of the UNION is tiny.
    
    Testing:
    An existing CardinalityTest already tested this but had
    the wrong estimate.
    
    Change-Id: Ic3ed670ffb685d8ff24824933ca303f3219737bb
    Reviewed-on: http://gerrit.cloudera.org:8080/14036
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 .../main/java/org/apache/impala/planner/UnionNode.java   | 15 ++++++++++++---
 .../java/org/apache/impala/planner/CardinalityTest.java  | 16 +++++++++++++++-
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/fe/src/main/java/org/apache/impala/planner/UnionNode.java b/fe/src/main/java/org/apache/impala/planner/UnionNode.java
index d119ac7..cfdd076 100644
--- a/fe/src/main/java/org/apache/impala/planner/UnionNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/UnionNode.java
@@ -110,15 +110,24 @@ public class UnionNode extends PlanNode {
   @Override
   public void computeStats(Analyzer analyzer) {
     super.computeStats(analyzer);
-    cardinality_ = constExprLists_.size();
+    long totalChildCardinality = 0;
+    boolean haveChildWithCardinality = false;
     for (PlanNode child: children_) {
       // ignore missing child cardinality info in the hope it won't matter enough
       // to change the planning outcome
-      if (child.cardinality_ > 0) {
-        cardinality_ = checkedAdd(cardinality_, child.cardinality_);
+      if (child.cardinality_ >= 0) {
+        totalChildCardinality = checkedAdd(totalChildCardinality, child.cardinality_);
+        haveChildWithCardinality = true;
       }
       numNodes_ = Math.max(child.getNumNodes(), numNodes_);
     }
+    // Consider estimate valid if we have at least one child with known cardinality, or
+    // only constant values.
+    if (haveChildWithCardinality || children_.size() == 0) {
+      cardinality_ = checkedAdd(totalChildCardinality, constExprLists_.size());
+    } else {
+      cardinality_ = -1;
+    }
     // The number of nodes of a union node is -1 (invalid) if all the referenced tables
     // are inline views (e.g. select 1 FROM (VALUES(1 x, 1 y)) a FULL OUTER JOIN
     // (VALUES(1 x, 1 y)) b ON (a.x = b.y)). We need to set the correct value.
diff --git a/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java b/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
index 441c010..2b71ee3 100644
--- a/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
@@ -712,9 +712,23 @@ public class CardinalityTest extends PlannerTestBase {
     // There is no available statistics in functional.tinytable.
     // True cardinality of functional.tinytable is 3.
     String subQuery = "SELECT * FROM functional.tinytable";
-    verifyApproxCardinality(subQuery + " UNION ALL " + subQuery, 0, true,
+    verifyApproxCardinality(subQuery + " UNION ALL " + subQuery, -1, true,
         ImmutableSet.of(PlannerTestOption.DISABLE_HDFS_NUM_ROWS_ESTIMATE),
         path, UnionNode.class);
+
+    // Same test, except with some constant expressions added to the union.
+    // The output cardinality still cannot be estimated.
+    verifyApproxCardinality(
+        subQuery + " UNION ALL " + subQuery + " UNION ALL SELECT 'a', 'b'", -1, true,
+        ImmutableSet.of(PlannerTestOption.DISABLE_HDFS_NUM_ROWS_ESTIMATE),
+        path, UnionNode.class);
+
+    // If one branch of the union has known cardinality, that value is used.
+    // alltypestiny has 8 rows and computed stats.
+    verifyApproxCardinality(subQuery + " UNION ALL " +
+        "SELECT string_col, date_string_col from functional.alltypestiny",
+        8, true, ImmutableSet.of(PlannerTestOption.DISABLE_HDFS_NUM_ROWS_ESTIMATE),
+        path, UnionNode.class);
   }
 
   @Test