You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/11 02:18:01 UTC
[impala] branch master updated (6cfa65e -> 2066b72)
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git.
from 6cfa65e IMPALA-8334: [DOCS] Document the keyval option for impala-shell config file
new 06cb9b7 IMPALA-8796: Add unit tests to UnpackAndDecodeValues
new f34a243 IMPALA-8850: is_percent not initialized in TmpFileMgr parser
new 2066b72 IMPALA-8848: fix UNION missing input cardinality bug
The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
be/src/util/bit-packing-test.cc | 214 ++++++++++++++++++---
be/src/util/bit-packing.cc | 6 +
be/src/util/parse-util-test.cc | 23 +++
be/src/util/parse-util.cc | 2 +-
.../java/org/apache/impala/planner/UnionNode.java | 15 +-
.../org/apache/impala/planner/CardinalityTest.java | 16 +-
6 files changed, 246 insertions(+), 30 deletions(-)
[impala] 02/03: IMPALA-8850: is_percent not initialized in
TmpFileMgr parser
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit f34a243a956dacd52f8afe79a122410ccf45ae26
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Thu Aug 8 17:34:03 2019 -0700
IMPALA-8850: is_percent not initialized in TmpFileMgr parser
This bug was only detected on ASAN. The is_percent variable
was not initialised and is not set in ParseMemSpec() when it
does an early exit for an empty string. This caused
TmpFileMgr to consider the empty memory spec after the colon
to be invalid.
Testing:
This reliably reproduced in TmpFileMgrTest under ASAN.
Initialising is_percent fixed the issue.
Added more detailed tests for ParseMemSpec() that initialise
is_percent to avoid similar bugs.
Change-Id: I4465944c7043c1fc6b33db9693ec290c24e3ed19
Reviewed-on: http://gerrit.cloudera.org:8080/14040
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/util/parse-util-test.cc | 23 +++++++++++++++++++++++
be/src/util/parse-util.cc | 2 +-
2 files changed, 24 insertions(+), 1 deletion(-)
diff --git a/be/src/util/parse-util-test.cc b/be/src/util/parse-util-test.cc
index c8fb4a0..4d2a5ac 100644
--- a/be/src/util/parse-util-test.cc
+++ b/be/src/util/parse-util-test.cc
@@ -37,54 +37,69 @@ TEST(ParseMemSpecs, Basic) {
int64_t gigabytes = 1024 * megabytes;
int64_t terabytes = 1024 * gigabytes;
+ // Initialize 'is_percent' to the opposite of the expected value, to confirm that
+ // ParseMemSpec() is actually setting the output parameter.
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("1", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(1, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("100b", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(100, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("100kb", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(100 * 1024, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("5KB", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(5 * 1024, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("4MB", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(4 * megabytes, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("4m", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(4 * megabytes, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("8gb", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(8 * gigabytes, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("8G", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(8 * gigabytes, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("12Gb", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(12 * gigabytes, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("8T", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(8 * terabytes, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("12tb", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(12 * terabytes, bytes);
ASSERT_FALSE(is_percent);
+ is_percent = false;
bytes = ParseUtil::ParseMemSpec("13%", &is_percent, MemInfo::physical_mem());
ASSERT_GT(bytes, 0);
ASSERT_TRUE(is_percent);
+ is_percent = false;
ASSERT_GT(ParseUtil::ParseMemSpec("17%", &is_percent, MemInfo::physical_mem()), bytes);
ASSERT_EQ(ParseUtil::ParseMemSpec("17%", &is_percent, 100), 17);
ASSERT_TRUE(is_percent);
@@ -111,17 +126,25 @@ TEST(ParseMemSpecs, Basic) {
ASSERT_EQ(-1, bytes);
}
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(0, bytes);
+ ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("-1", &is_percent, MemInfo::physical_mem());
ASSERT_EQ(0, bytes);
+ ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("-2", &is_percent, MemInfo::physical_mem());
ASSERT_LT(bytes, 0);
+ ASSERT_FALSE(is_percent);
+ is_percent = true;
bytes = ParseUtil::ParseMemSpec("-2%", &is_percent, MemInfo::physical_mem());
ASSERT_LT(bytes, 0);
+ ASSERT_TRUE(is_percent);
}
}
diff --git a/be/src/util/parse-util.cc b/be/src/util/parse-util.cc
index 75253d7..a70db99 100644
--- a/be/src/util/parse-util.cc
+++ b/be/src/util/parse-util.cc
@@ -25,9 +25,9 @@ namespace impala {
int64_t ParseUtil::ParseMemSpec(const string& mem_spec_str, bool* is_percent,
int64_t relative_reference) {
+ *is_percent = false;
if (mem_spec_str.empty()) return 0;
- *is_percent = false;
int64_t multiplier = -1;
int32_t number_str_len = mem_spec_str.size();
[impala] 01/03: IMPALA-8796: Add unit tests to UnpackAndDecodeValues
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 06cb9b7832c20e211696899d7cf7a87244917250
Author: Daniel Becker <da...@cloudera.com>
AuthorDate: Mon Aug 5 10:28:53 2019 +0200
IMPALA-8796: Add unit tests to UnpackAndDecodeValues
Adding unit tests to test BitPacking::UnpackAndDecodeValues. Also
changing the existing BitPacking tests so that they are parametrised
by the output type of the unpacking so we can test different output
types.
Change-Id: I363ff9f5d4750231fdc1aa60cf40d90ebb9ba305
Reviewed-on: http://gerrit.cloudera.org:8080/14004
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/util/bit-packing-test.cc | 214 +++++++++++++++++++++++++++++++++++-----
be/src/util/bit-packing.cc | 6 ++
2 files changed, 195 insertions(+), 25 deletions(-)
diff --git a/be/src/util/bit-packing-test.cc b/be/src/util/bit-packing-test.cc
index 8010fe3..197ba3f 100644
--- a/be/src/util/bit-packing-test.cc
+++ b/be/src/util/bit-packing-test.cc
@@ -15,11 +15,14 @@
// specific language governing permissions and limitations
// under the License.
+#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <random>
+#include <unordered_map>
#include "testutil/gtest-util.h"
+#include "testutil/rand-util.h"
#include "testutil/mem-util.h"
#include "util/bit-packing.h"
#include "util/bit-stream-utils.inline.h"
@@ -32,9 +35,15 @@ using std::mt19937;
namespace impala {
namespace {
-uint64_t ComputeMask(int bit_width) {
- return (bit_width < 64) ? ((1UL << bit_width) - 1) : ~0UL;
+
+constexpr int NUM_IN_VALUES = 64 * 1024;
+
+template <typename UINT_T>
+constexpr UINT_T ComputeMask(int bit_width) {
+ return (bit_width < sizeof(UINT_T) * CHAR_BIT) ?
+ ((1UL << bit_width) - 1) : static_cast<UINT_T>(~0UL);
}
+
}
/// Test unpacking a subarray of values to/from smaller buffers that are sized to exactly
@@ -45,19 +54,21 @@ uint64_t ComputeMask(int bit_width) {
///
/// This is to test that we do not overrun either the input or output buffer for smaller
/// batch sizes.
-void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
+template <typename UINT_T>
+void UnpackSubset(const UINT_T* in, const uint8_t* packed, int num_in_values,
int bit_width, bool aligned);
/// Test a packing/unpacking round-trip of the 'num_in_values' values in 'in',
/// packed with 'bit_width'. If 'aligned' is true, buffers for packed and unpacked data
/// are allocated at a 64-byte aligned address. Otherwise the buffers are misaligned
/// by 1 byte from a 64-byte aligned address.
-void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool aligned) {
+template <typename UINT_T>
+void PackUnpack(const UINT_T* in, int num_in_values, int bit_width, bool aligned) {
LOG(INFO) << "num_in_values = " << num_in_values << " bit_width = " << bit_width
<< " aligned = " << aligned;
// Mask out higher bits so that the values to pack are in range.
- const uint64_t mask = ComputeMask(bit_width);
+ const UINT_T mask = ComputeMask<UINT_T>(bit_width);
const int misalignment = aligned ? 0 : 1;
const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * num_in_values);
@@ -79,9 +90,9 @@ void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool align
for (const int num_to_unpack : {num_in_values, num_in_values + 1, num_in_values + 77}) {
LOG(INFO) << "Unpacking " << num_to_unpack;
// Size buffer exactly so that ASAN can detect reads/writes that overrun the buffer.
- AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + misalignment);
- uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + misalignment);
- const auto result = BitPacking::UnpackValues(
+ AlignedAllocation out_storage(num_to_unpack * sizeof(UINT_T) + misalignment);
+ UINT_T* out = reinterpret_cast<UINT_T*>(out_storage.data() + misalignment);
+ const auto result = BitPacking::UnpackValues<UINT_T>(
bit_width, packed, writer.bytes_written(), num_to_unpack, out);
ASSERT_EQ(packed + writer.bytes_written(), result.first)
<< "Unpacked different # of bytes from the # written";
@@ -99,13 +110,15 @@ void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool align
}
for (int i = 0; i < num_in_values; ++i) {
- EXPECT_EQ(in[i] & mask, out[i]) << "Didn't get back input value " << i;
+ EXPECT_EQ(in[i] & mask, out[i]) << "Didn't get back input value " << i << "."
+ << " Bit width: " << bit_width << ".";
}
}
- UnpackSubset(in, packed, num_in_values, bit_width, aligned);
+ UnpackSubset<UINT_T>(in, packed, num_in_values, bit_width, aligned);
}
-void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
+template <typename UINT_T>
+void UnpackSubset(const UINT_T* in, const uint8_t* packed, int num_in_values,
int bit_width, bool aligned) {
const int misalignment = aligned ? 0 : 1;
for (int num_to_unpack : {1, 10, 77, num_in_values - 7}) {
@@ -116,43 +129,194 @@ void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
AlignedAllocation packed_copy_storage(bytes_to_read + misalignment);
uint8_t* packed_copy = packed_copy_storage.data() + misalignment;
memcpy(packed_copy, packed, bytes_to_read);
- AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + misalignment);
- uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + misalignment);
- const auto result = BitPacking::UnpackValues(
+ AlignedAllocation out_storage(num_to_unpack * sizeof(UINT_T) + misalignment);
+ UINT_T* out = reinterpret_cast<UINT_T*>(out_storage.data() + misalignment);
+ const auto result = BitPacking::UnpackValues<UINT_T>(
bit_width, packed_copy, bytes_to_read, num_to_unpack, out);
ASSERT_EQ(packed_copy + bytes_to_read, result.first) << "Read wrong # of bytes";
ASSERT_EQ(num_to_unpack, result.second) << "Unpacked wrong # of values";
for (int i = 0; i < num_to_unpack; ++i) {
- ASSERT_EQ(in[i] & ComputeMask(bit_width), out[i]) << "Didn't get back input value "
- << i;
+ ASSERT_EQ(in[i] & ComputeMask<UINT_T>(bit_width), out[i])
+ << "Didn't get back input value " << i;
}
}
}
-TEST(BitPackingTest, RandomUnpack) {
- constexpr int NUM_IN_VALUES = 64 * 1024;
- uint64_t in[NUM_IN_VALUES];
+template <typename INT_T>
+std::vector<INT_T> GenerateRandomInput(int length, INT_T lower, INT_T higher) {
+ std::vector<INT_T> in(length);
mt19937 rng;
- uniform_int_distribution<uint64_t> dist;
- std::generate(std::begin(in), std::end(in), [&rng, &dist] { return dist(rng); });
+ RandTestUtil::SeedRng("BIT_PACKING_TEST_RANDOM_SEED", &rng);
+ uniform_int_distribution<INT_T> dist(lower, higher);
+ std::generate(in.begin(), in.end(), [&rng, &dist] { return dist(rng); });
+
+ return in;
+}
- // Test various odd input lengths to exercise boundary cases for full and partial
- // batches of 32.
+// Test various odd input lengths to exercise boundary cases for full and partial
+// batches of 32.
+std::vector<int> GetLengths() {
vector<int> lengths{NUM_IN_VALUES, NUM_IN_VALUES - 1, NUM_IN_VALUES - 16,
NUM_IN_VALUES - 19, NUM_IN_VALUES - 31};
for (int i = 0; i < 32; ++i) {
lengths.push_back(i);
}
- for (int bit_width = 0; bit_width <= 64; ++bit_width) {
+ return lengths;
+}
+
+template <typename UINT_T>
+void RandomUnpackTest() {
+ const std::vector<UINT_T> in = GenerateRandomInput<UINT_T>(NUM_IN_VALUES,
+ std::numeric_limits<UINT_T>::min(), std::numeric_limits<UINT_T>::max());
+
+ const std::vector<int> lengths = GetLengths();
+
+ constexpr int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH;
+ const int max_bit_width = std::min<int>(MAX_BITWIDTH, sizeof(UINT_T) * CHAR_BIT);
+ for (int bit_width = 0; bit_width <= max_bit_width; ++bit_width) {
for (const int length : lengths) {
// Test that unpacking to/from aligned and unaligned memory works.
for (const bool aligned : {true, false}) {
- PackUnpack(in, length, bit_width, aligned);
+ PackUnpack<UINT_T>(in.data(), length, bit_width, aligned);
}
}
}
}
+
+TEST(BitPackingTest, RandomUnpack8) {
+ RandomUnpackTest<uint8_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack16) {
+ RandomUnpackTest<uint16_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack32) {
+ RandomUnpackTest<uint32_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack64) {
+ RandomUnpackTest<uint64_t>();
+}
+
+// This is not the full dictionary encoding, only a big bit-packed literal run, no RLE is
+// used.
+template <typename T>
+std::pair<std::vector<T>, std::vector<uint8_t>>
+DictEncode(const std::vector<T>& input, int bit_width) {
+ // Create dictionary and indices.
+ std::unordered_map<T, std::size_t> index_map;
+
+ std::vector<T> dict;
+ std::vector<uint64_t> indices;
+
+ for (const T value : input) {
+ auto it = index_map.find(value);
+ if (it != index_map.end()) {
+ indices.push_back(it->second);
+ } else {
+ const std::size_t next_index = dict.size();
+ index_map[value] = next_index;
+ dict.push_back(value);
+ indices.push_back(next_index);
+ }
+ }
+
+ // Bit pack the indices.
+ const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * input.size());
+ std::vector<uint8_t> out_data(bytes_required);
+
+ // We do not write data if we do not have any. Doing it could also lead to undefined
+ // behaviour as out_data.data() may be nullptr and BitWriter uses memcpy internally, and
+ // passing a null pointer to memcpy is undefined behaviour.
+ if (bytes_required > 0) {
+ BitWriter writer(out_data.data(), bytes_required);
+ if (bit_width > 0) {
+ for (const uint64_t index : indices) {
+ EXPECT_TRUE(writer.PutValue(index, bit_width));
+ }
+ }
+ writer.Flush();
+ }
+
+ return std::make_pair(dict, out_data);
}
+template <typename T>
+void ExpectEqualsWithStride(const T* expected, int num_values,
+ const uint8_t* actual, int actual_length, int stride) {
+ for (std::size_t i = 0; i < num_values; i++) {
+ const T expected_value = expected[i];
+
+ DCHECK_LE(i * stride + sizeof(T), actual_length);
+ const T actual_value = *reinterpret_cast<const T*>(&actual[i * stride]);
+
+ EXPECT_EQ(expected_value, actual_value);
+ }
+}
+
+// This does not test the full dictionary decoding, only the unpacking of a single
+// bit-packed literal run (no RLE).
+template <typename UINT_T>
+void RandomUnpackAndDecodeTest() {
+ constexpr int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH;
+ const int max_bit_width = std::min<int>({MAX_BITWIDTH, sizeof(UINT_T) * 8,
+ sizeof(uint32_t) * 8 /* dictionary indices are 32 bit values */});
+
+ const std::vector<int> lengths = GetLengths();
+
+ for (int bit_width = 0; bit_width <= max_bit_width; ++bit_width) {
+ const UINT_T max_dict_size = bit_width == sizeof(UINT_T) * 8
+ ? std::numeric_limits<UINT_T>::max() : (1UL << bit_width);
+
+ const std::vector<UINT_T> input = GenerateRandomInput<UINT_T>(NUM_IN_VALUES, 0,
+ max_dict_size - 1 /* inclusive range */);
+
+ for (int length : lengths) {
+ const std::vector<UINT_T> in_data(input.begin(), input.begin() + length);
+
+ std::pair<std::vector<UINT_T>, std::vector<uint8_t>> dict_and_data
+ = DictEncode(in_data, bit_width);
+ std::vector<UINT_T>& dict = dict_and_data.first;
+ const std::vector<uint8_t>& data = dict_and_data.second;
+
+ const std::vector<int> strides = {sizeof(UINT_T), sizeof(UINT_T) + 5,
+ 2 * sizeof(UINT_T) + 5};
+
+ for (int stride : strides) {
+ bool decode_error = false;
+ std::vector<uint8_t> out(in_data.size() * stride);
+
+ std::pair<const uint8_t*, int64_t> res
+ = BitPacking::UnpackAndDecodeValues<UINT_T>(bit_width, data.data(),
+ data.size(), dict.data(), dict.size(), in_data.size(),
+ reinterpret_cast<UINT_T*>(out.data()), stride, &decode_error);
+
+ EXPECT_FALSE(decode_error);
+ EXPECT_EQ(in_data.size(), res.second);
+ ExpectEqualsWithStride<UINT_T>(in_data.data(), in_data.size(), out.data(),
+ out.size(), stride);
+ }
+ }
+ }
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode8) {
+ RandomUnpackAndDecodeTest<uint8_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode16) {
+ RandomUnpackAndDecodeTest<uint16_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode32) {
+ RandomUnpackAndDecodeTest<uint32_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode64) {
+ RandomUnpackAndDecodeTest<uint64_t>();
+}
+
+}
diff --git a/be/src/util/bit-packing.cc b/be/src/util/bit-packing.cc
index 52f86fd..851bc0b 100644
--- a/be/src/util/bit-packing.cc
+++ b/be/src/util/bit-packing.cc
@@ -32,7 +32,9 @@ namespace impala {
INSTANTIATE_UNPACK_VALUES(bool);
INSTANTIATE_UNPACK_VALUES(uint8_t);
+INSTANTIATE_UNPACK_VALUES(uint16_t);
INSTANTIATE_UNPACK_VALUES(uint32_t);
+INSTANTIATE_UNPACK_VALUES(uint64_t);
#define INSTANTIATE_UNPACK_AND_DECODE(OUT_TYPE) \
template std::pair<const uint8_t*, int64_t> \
@@ -48,6 +50,10 @@ INSTANTIATE_UNPACK_AND_DECODE(int8_t);
INSTANTIATE_UNPACK_AND_DECODE(int16_t);
INSTANTIATE_UNPACK_AND_DECODE(int32_t);
INSTANTIATE_UNPACK_AND_DECODE(int64_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint8_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint16_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint32_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint64_t);
INSTANTIATE_UNPACK_AND_DECODE(Decimal4Value);
INSTANTIATE_UNPACK_AND_DECODE(Decimal8Value);
INSTANTIATE_UNPACK_AND_DECODE(Decimal16Value);
[impala] 03/03: IMPALA-8848: fix UNION missing input cardinality bug
Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 2066b72bc4b43fd57679c37145eecebaa6be8b27
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Thu Aug 8 13:52:00 2019 -0700
IMPALA-8848: fix UNION missing input cardinality bug
If a UNION has children, and none of those children has
a known cardinality, then we can make no reasonable
estimate of the output cardinality, so the planner
should consider the output cardinality to be unknown.
The previous behaviour was to report a cardinality of
0, which is unsafe because the planner may make further
decisions under the incorrect assumption that the output
of the UNION is tiny.
Testing:
An existing CardinalityTest already tested this but had
the wrong estimate.
Change-Id: Ic3ed670ffb685d8ff24824933ca303f3219737bb
Reviewed-on: http://gerrit.cloudera.org:8080/14036
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
.../main/java/org/apache/impala/planner/UnionNode.java | 15 ++++++++++++---
.../java/org/apache/impala/planner/CardinalityTest.java | 16 +++++++++++++++-
2 files changed, 27 insertions(+), 4 deletions(-)
diff --git a/fe/src/main/java/org/apache/impala/planner/UnionNode.java b/fe/src/main/java/org/apache/impala/planner/UnionNode.java
index d119ac7..cfdd076 100644
--- a/fe/src/main/java/org/apache/impala/planner/UnionNode.java
+++ b/fe/src/main/java/org/apache/impala/planner/UnionNode.java
@@ -110,15 +110,24 @@ public class UnionNode extends PlanNode {
@Override
public void computeStats(Analyzer analyzer) {
super.computeStats(analyzer);
- cardinality_ = constExprLists_.size();
+ long totalChildCardinality = 0;
+ boolean haveChildWithCardinality = false;
for (PlanNode child: children_) {
// ignore missing child cardinality info in the hope it won't matter enough
// to change the planning outcome
- if (child.cardinality_ > 0) {
- cardinality_ = checkedAdd(cardinality_, child.cardinality_);
+ if (child.cardinality_ >= 0) {
+ totalChildCardinality = checkedAdd(totalChildCardinality, child.cardinality_);
+ haveChildWithCardinality = true;
}
numNodes_ = Math.max(child.getNumNodes(), numNodes_);
}
+ // Consider estimate valid if we have at least one child with known cardinality, or
+ // only constant values.
+ if (haveChildWithCardinality || children_.size() == 0) {
+ cardinality_ = checkedAdd(totalChildCardinality, constExprLists_.size());
+ } else {
+ cardinality_ = -1;
+ }
// The number of nodes of a union node is -1 (invalid) if all the referenced tables
// are inline views (e.g. select 1 FROM (VALUES(1 x, 1 y)) a FULL OUTER JOIN
// (VALUES(1 x, 1 y)) b ON (a.x = b.y)). We need to set the correct value.
diff --git a/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java b/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
index 441c010..2b71ee3 100644
--- a/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
+++ b/fe/src/test/java/org/apache/impala/planner/CardinalityTest.java
@@ -712,9 +712,23 @@ public class CardinalityTest extends PlannerTestBase {
// There is no available statistics in functional.tinytable.
// True cardinality of functional.tinytable is 3.
String subQuery = "SELECT * FROM functional.tinytable";
- verifyApproxCardinality(subQuery + " UNION ALL " + subQuery, 0, true,
+ verifyApproxCardinality(subQuery + " UNION ALL " + subQuery, -1, true,
ImmutableSet.of(PlannerTestOption.DISABLE_HDFS_NUM_ROWS_ESTIMATE),
path, UnionNode.class);
+
+ // Same test, except with some constant expressions added to the union.
+ // The output cardinality still cannot be estimated.
+ verifyApproxCardinality(
+ subQuery + " UNION ALL " + subQuery + " UNION ALL SELECT 'a', 'b'", -1, true,
+ ImmutableSet.of(PlannerTestOption.DISABLE_HDFS_NUM_ROWS_ESTIMATE),
+ path, UnionNode.class);
+
+ // If one branch of the union has known cardinality, that value is used.
+ // alltypestiny has 8 rows and computed stats.
+ verifyApproxCardinality(subQuery + " UNION ALL " +
+ "SELECT string_col, date_string_col from functional.alltypestiny",
+ 8, true, ImmutableSet.of(PlannerTestOption.DISABLE_HDFS_NUM_ROWS_ESTIMATE),
+ path, UnionNode.class);
}
@Test