You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/11 02:18:02 UTC
[impala] 01/03: IMPALA-8796: Add unit tests to UnpackAndDecodeValues
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 06cb9b7832c20e211696899d7cf7a87244917250
Author: Daniel Becker <da...@cloudera.com>
AuthorDate: Mon Aug 5 10:28:53 2019 +0200
IMPALA-8796: Add unit tests to UnpackAndDecodeValues
Adding unit tests to test BitPacking::UnpackAndDecodeValues. Also
changing the existing BitPacking tests so that they are parametrised
by the output type of the unpacking so we can test different output
types.
Change-Id: I363ff9f5d4750231fdc1aa60cf40d90ebb9ba305
Reviewed-on: http://gerrit.cloudera.org:8080/14004
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/util/bit-packing-test.cc | 214 +++++++++++++++++++++++++++++++++++-----
be/src/util/bit-packing.cc | 6 ++
2 files changed, 195 insertions(+), 25 deletions(-)
diff --git a/be/src/util/bit-packing-test.cc b/be/src/util/bit-packing-test.cc
index 8010fe3..197ba3f 100644
--- a/be/src/util/bit-packing-test.cc
+++ b/be/src/util/bit-packing-test.cc
@@ -15,11 +15,14 @@
// specific language governing permissions and limitations
// under the License.
+#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <random>
+#include <unordered_map>
#include "testutil/gtest-util.h"
+#include "testutil/rand-util.h"
#include "testutil/mem-util.h"
#include "util/bit-packing.h"
#include "util/bit-stream-utils.inline.h"
@@ -32,9 +35,15 @@ using std::mt19937;
namespace impala {
namespace {
-uint64_t ComputeMask(int bit_width) {
- return (bit_width < 64) ? ((1UL << bit_width) - 1) : ~0UL;
+
+constexpr int NUM_IN_VALUES = 64 * 1024;
+
+template <typename UINT_T>
+constexpr UINT_T ComputeMask(int bit_width) {
+ return (bit_width < sizeof(UINT_T) * CHAR_BIT) ?
+ ((1UL << bit_width) - 1) : static_cast<UINT_T>(~0UL);
}
+
}
/// Test unpacking a subarray of values to/from smaller buffers that are sized to exactly
@@ -45,19 +54,21 @@ uint64_t ComputeMask(int bit_width) {
///
/// This is to test that we do not overrun either the input or output buffer for smaller
/// batch sizes.
-void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
+template <typename UINT_T>
+void UnpackSubset(const UINT_T* in, const uint8_t* packed, int num_in_values,
int bit_width, bool aligned);
/// Test a packing/unpacking round-trip of the 'num_in_values' values in 'in',
/// packed with 'bit_width'. If 'aligned' is true, buffers for packed and unpacked data
/// are allocated at a 64-byte aligned address. Otherwise the buffers are misaligned
/// by 1 byte from a 64-byte aligned address.
-void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool aligned) {
+template <typename UINT_T>
+void PackUnpack(const UINT_T* in, int num_in_values, int bit_width, bool aligned) {
LOG(INFO) << "num_in_values = " << num_in_values << " bit_width = " << bit_width
<< " aligned = " << aligned;
// Mask out higher bits so that the values to pack are in range.
- const uint64_t mask = ComputeMask(bit_width);
+ const UINT_T mask = ComputeMask<UINT_T>(bit_width);
const int misalignment = aligned ? 0 : 1;
const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * num_in_values);
@@ -79,9 +90,9 @@ void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool align
for (const int num_to_unpack : {num_in_values, num_in_values + 1, num_in_values + 77}) {
LOG(INFO) << "Unpacking " << num_to_unpack;
// Size buffer exactly so that ASAN can detect reads/writes that overrun the buffer.
- AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + misalignment);
- uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + misalignment);
- const auto result = BitPacking::UnpackValues(
+ AlignedAllocation out_storage(num_to_unpack * sizeof(UINT_T) + misalignment);
+ UINT_T* out = reinterpret_cast<UINT_T*>(out_storage.data() + misalignment);
+ const auto result = BitPacking::UnpackValues<UINT_T>(
bit_width, packed, writer.bytes_written(), num_to_unpack, out);
ASSERT_EQ(packed + writer.bytes_written(), result.first)
<< "Unpacked different # of bytes from the # written";
@@ -99,13 +110,15 @@ void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool align
}
for (int i = 0; i < num_in_values; ++i) {
- EXPECT_EQ(in[i] & mask, out[i]) << "Didn't get back input value " << i;
+ EXPECT_EQ(in[i] & mask, out[i]) << "Didn't get back input value " << i << "."
+ << " Bit width: " << bit_width << ".";
}
}
- UnpackSubset(in, packed, num_in_values, bit_width, aligned);
+ UnpackSubset<UINT_T>(in, packed, num_in_values, bit_width, aligned);
}
-void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
+template <typename UINT_T>
+void UnpackSubset(const UINT_T* in, const uint8_t* packed, int num_in_values,
int bit_width, bool aligned) {
const int misalignment = aligned ? 0 : 1;
for (int num_to_unpack : {1, 10, 77, num_in_values - 7}) {
@@ -116,43 +129,194 @@ void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
AlignedAllocation packed_copy_storage(bytes_to_read + misalignment);
uint8_t* packed_copy = packed_copy_storage.data() + misalignment;
memcpy(packed_copy, packed, bytes_to_read);
- AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + misalignment);
- uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + misalignment);
- const auto result = BitPacking::UnpackValues(
+ AlignedAllocation out_storage(num_to_unpack * sizeof(UINT_T) + misalignment);
+ UINT_T* out = reinterpret_cast<UINT_T*>(out_storage.data() + misalignment);
+ const auto result = BitPacking::UnpackValues<UINT_T>(
bit_width, packed_copy, bytes_to_read, num_to_unpack, out);
ASSERT_EQ(packed_copy + bytes_to_read, result.first) << "Read wrong # of bytes";
ASSERT_EQ(num_to_unpack, result.second) << "Unpacked wrong # of values";
for (int i = 0; i < num_to_unpack; ++i) {
- ASSERT_EQ(in[i] & ComputeMask(bit_width), out[i]) << "Didn't get back input value "
- << i;
+ ASSERT_EQ(in[i] & ComputeMask<UINT_T>(bit_width), out[i])
+ << "Didn't get back input value " << i;
}
}
}
-TEST(BitPackingTest, RandomUnpack) {
- constexpr int NUM_IN_VALUES = 64 * 1024;
- uint64_t in[NUM_IN_VALUES];
+template <typename INT_T>
+std::vector<INT_T> GenerateRandomInput(int length, INT_T lower, INT_T higher) {
+ std::vector<INT_T> in(length);
mt19937 rng;
- uniform_int_distribution<uint64_t> dist;
- std::generate(std::begin(in), std::end(in), [&rng, &dist] { return dist(rng); });
+ RandTestUtil::SeedRng("BIT_PACKING_TEST_RANDOM_SEED", &rng);
+ uniform_int_distribution<INT_T> dist(lower, higher);
+ std::generate(in.begin(), in.end(), [&rng, &dist] { return dist(rng); });
+
+ return in;
+}
- // Test various odd input lengths to exercise boundary cases for full and partial
- // batches of 32.
+// Test various odd input lengths to exercise boundary cases for full and partial
+// batches of 32.
+std::vector<int> GetLengths() {
vector<int> lengths{NUM_IN_VALUES, NUM_IN_VALUES - 1, NUM_IN_VALUES - 16,
NUM_IN_VALUES - 19, NUM_IN_VALUES - 31};
for (int i = 0; i < 32; ++i) {
lengths.push_back(i);
}
- for (int bit_width = 0; bit_width <= 64; ++bit_width) {
+ return lengths;
+}
+
+template <typename UINT_T>
+void RandomUnpackTest() {
+ const std::vector<UINT_T> in = GenerateRandomInput<UINT_T>(NUM_IN_VALUES,
+ std::numeric_limits<UINT_T>::min(), std::numeric_limits<UINT_T>::max());
+
+ const std::vector<int> lengths = GetLengths();
+
+ constexpr int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH;
+ const int max_bit_width = std::min<int>(MAX_BITWIDTH, sizeof(UINT_T) * CHAR_BIT);
+ for (int bit_width = 0; bit_width <= max_bit_width; ++bit_width) {
for (const int length : lengths) {
// Test that unpacking to/from aligned and unaligned memory works.
for (const bool aligned : {true, false}) {
- PackUnpack(in, length, bit_width, aligned);
+ PackUnpack<UINT_T>(in.data(), length, bit_width, aligned);
}
}
}
}
+
+TEST(BitPackingTest, RandomUnpack8) {
+ RandomUnpackTest<uint8_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack16) {
+ RandomUnpackTest<uint16_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack32) {
+ RandomUnpackTest<uint32_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack64) {
+ RandomUnpackTest<uint64_t>();
+}
+
+// This is not the full dictionary encoding, only a big bit-packed literal run, no RLE is
+// used.
+template <typename T>
+std::pair<std::vector<T>, std::vector<uint8_t>>
+DictEncode(const std::vector<T>& input, int bit_width) {
+ // Create dictionary and indices.
+ std::unordered_map<T, std::size_t> index_map;
+
+ std::vector<T> dict;
+ std::vector<uint64_t> indices;
+
+ for (const T value : input) {
+ auto it = index_map.find(value);
+ if (it != index_map.end()) {
+ indices.push_back(it->second);
+ } else {
+ const std::size_t next_index = dict.size();
+ index_map[value] = next_index;
+ dict.push_back(value);
+ indices.push_back(next_index);
+ }
+ }
+
+ // Bit pack the indices.
+ const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * input.size());
+ std::vector<uint8_t> out_data(bytes_required);
+
+ // We do not write data if we do not have any. Doing it could also lead to undefined
+ // behaviour as out_data.data() may be nullptr and BitWriter uses memcpy internally, and
+ // passing a null pointer to memcpy is undefined behaviour.
+ if (bytes_required > 0) {
+ BitWriter writer(out_data.data(), bytes_required);
+ if (bit_width > 0) {
+ for (const uint64_t index : indices) {
+ EXPECT_TRUE(writer.PutValue(index, bit_width));
+ }
+ }
+ writer.Flush();
+ }
+
+ return std::make_pair(dict, out_data);
}
+template <typename T>
+void ExpectEqualsWithStride(const T* expected, int num_values,
+ const uint8_t* actual, int actual_length, int stride) {
+ for (std::size_t i = 0; i < num_values; i++) {
+ const T expected_value = expected[i];
+
+ DCHECK_LE(i * stride + sizeof(T), actual_length);
+ const T actual_value = *reinterpret_cast<const T*>(&actual[i * stride]);
+
+ EXPECT_EQ(expected_value, actual_value);
+ }
+}
+
+// This does not test the full dictionary decoding, only the unpacking of a single
+// bit-packed literal run (no RLE).
+template <typename UINT_T>
+void RandomUnpackAndDecodeTest() {
+ constexpr int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH;
+ const int max_bit_width = std::min<int>({MAX_BITWIDTH, sizeof(UINT_T) * 8,
+ sizeof(uint32_t) * 8 /* dictionary indices are 32 bit values */});
+
+ const std::vector<int> lengths = GetLengths();
+
+ for (int bit_width = 0; bit_width <= max_bit_width; ++bit_width) {
+ const UINT_T max_dict_size = bit_width == sizeof(UINT_T) * 8
+ ? std::numeric_limits<UINT_T>::max() : (1UL << bit_width);
+
+ const std::vector<UINT_T> input = GenerateRandomInput<UINT_T>(NUM_IN_VALUES, 0,
+ max_dict_size - 1 /* inclusive range */);
+
+ for (int length : lengths) {
+ const std::vector<UINT_T> in_data(input.begin(), input.begin() + length);
+
+ std::pair<std::vector<UINT_T>, std::vector<uint8_t>> dict_and_data
+ = DictEncode(in_data, bit_width);
+ std::vector<UINT_T>& dict = dict_and_data.first;
+ const std::vector<uint8_t>& data = dict_and_data.second;
+
+ const std::vector<int> strides = {sizeof(UINT_T), sizeof(UINT_T) + 5,
+ 2 * sizeof(UINT_T) + 5};
+
+ for (int stride : strides) {
+ bool decode_error = false;
+ std::vector<uint8_t> out(in_data.size() * stride);
+
+ std::pair<const uint8_t*, int64_t> res
+ = BitPacking::UnpackAndDecodeValues<UINT_T>(bit_width, data.data(),
+ data.size(), dict.data(), dict.size(), in_data.size(),
+ reinterpret_cast<UINT_T*>(out.data()), stride, &decode_error);
+
+ EXPECT_FALSE(decode_error);
+ EXPECT_EQ(in_data.size(), res.second);
+ ExpectEqualsWithStride<UINT_T>(in_data.data(), in_data.size(), out.data(),
+ out.size(), stride);
+ }
+ }
+ }
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode8) {
+ RandomUnpackAndDecodeTest<uint8_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode16) {
+ RandomUnpackAndDecodeTest<uint16_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode32) {
+ RandomUnpackAndDecodeTest<uint32_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode64) {
+ RandomUnpackAndDecodeTest<uint64_t>();
+}
+
+}
diff --git a/be/src/util/bit-packing.cc b/be/src/util/bit-packing.cc
index 52f86fd..851bc0b 100644
--- a/be/src/util/bit-packing.cc
+++ b/be/src/util/bit-packing.cc
@@ -32,7 +32,9 @@ namespace impala {
INSTANTIATE_UNPACK_VALUES(bool);
INSTANTIATE_UNPACK_VALUES(uint8_t);
+INSTANTIATE_UNPACK_VALUES(uint16_t);
INSTANTIATE_UNPACK_VALUES(uint32_t);
+INSTANTIATE_UNPACK_VALUES(uint64_t);
#define INSTANTIATE_UNPACK_AND_DECODE(OUT_TYPE) \
template std::pair<const uint8_t*, int64_t> \
@@ -48,6 +50,10 @@ INSTANTIATE_UNPACK_AND_DECODE(int8_t);
INSTANTIATE_UNPACK_AND_DECODE(int16_t);
INSTANTIATE_UNPACK_AND_DECODE(int32_t);
INSTANTIATE_UNPACK_AND_DECODE(int64_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint8_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint16_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint32_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint64_t);
INSTANTIATE_UNPACK_AND_DECODE(Decimal4Value);
INSTANTIATE_UNPACK_AND_DECODE(Decimal8Value);
INSTANTIATE_UNPACK_AND_DECODE(Decimal16Value);