You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/11 02:18:02 UTC

[impala] 01/03: IMPALA-8796: Add unit tests to UnpackAndDecodeValues

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 06cb9b7832c20e211696899d7cf7a87244917250
Author: Daniel Becker <da...@cloudera.com>
AuthorDate: Mon Aug 5 10:28:53 2019 +0200

    IMPALA-8796: Add unit tests to UnpackAndDecodeValues
    
    Adding unit tests to test BitPacking::UnpackAndDecodeValues. Also
    changing the existing BitPacking tests so that they are parametrised
    by the output type of the unpacking so we can test different output
    types.
    
    Change-Id: I363ff9f5d4750231fdc1aa60cf40d90ebb9ba305
    Reviewed-on: http://gerrit.cloudera.org:8080/14004
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/util/bit-packing-test.cc | 214 +++++++++++++++++++++++++++++++++++-----
 be/src/util/bit-packing.cc      |   6 ++
 2 files changed, 195 insertions(+), 25 deletions(-)

diff --git a/be/src/util/bit-packing-test.cc b/be/src/util/bit-packing-test.cc
index 8010fe3..197ba3f 100644
--- a/be/src/util/bit-packing-test.cc
+++ b/be/src/util/bit-packing-test.cc
@@ -15,11 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <algorithm>
 #include <cstdio>
 #include <cstdlib>
 #include <random>
+#include <unordered_map>
 
 #include "testutil/gtest-util.h"
+#include "testutil/rand-util.h"
 #include "testutil/mem-util.h"
 #include "util/bit-packing.h"
 #include "util/bit-stream-utils.inline.h"
@@ -32,9 +35,15 @@ using std::mt19937;
 namespace impala {
 
 namespace {
-uint64_t ComputeMask(int bit_width) {
-  return (bit_width < 64) ? ((1UL << bit_width) - 1) : ~0UL;
+
+constexpr int NUM_IN_VALUES = 64 * 1024;
+
+template <typename UINT_T>
+constexpr UINT_T ComputeMask(int bit_width) {
+  return (bit_width < sizeof(UINT_T) * CHAR_BIT) ?
+      ((1UL << bit_width) - 1) : static_cast<UINT_T>(~0UL);
 }
+
 }
 
 /// Test unpacking a subarray of values to/from smaller buffers that are sized to exactly
@@ -45,19 +54,21 @@ uint64_t ComputeMask(int bit_width) {
 ///
 /// This is to test that we do not overrun either the input or output buffer for smaller
 /// batch sizes.
-void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
+template <typename UINT_T>
+void UnpackSubset(const UINT_T* in, const uint8_t* packed, int num_in_values,
     int bit_width, bool aligned);
 
 /// Test a packing/unpacking round-trip of the 'num_in_values' values in 'in',
 /// packed with 'bit_width'. If 'aligned' is true, buffers for packed and unpacked data
 /// are allocated at a 64-byte aligned address. Otherwise the buffers are misaligned
 /// by 1 byte from a 64-byte aligned address.
-void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool aligned) {
+template <typename UINT_T>
+void PackUnpack(const UINT_T* in, int num_in_values, int bit_width, bool aligned) {
   LOG(INFO) << "num_in_values = " << num_in_values << " bit_width = " << bit_width
             << " aligned = " << aligned;
 
   // Mask out higher bits so that the values to pack are in range.
-  const uint64_t mask = ComputeMask(bit_width);
+  const UINT_T mask = ComputeMask<UINT_T>(bit_width);
   const int misalignment = aligned ? 0 : 1;
 
   const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * num_in_values);
@@ -79,9 +90,9 @@ void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool align
   for (const int num_to_unpack : {num_in_values, num_in_values + 1, num_in_values + 77}) {
     LOG(INFO) << "Unpacking " << num_to_unpack;
     // Size buffer exactly so that ASAN can detect reads/writes that overrun the buffer.
-    AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + misalignment);
-    uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + misalignment);
-    const auto result = BitPacking::UnpackValues(
+    AlignedAllocation out_storage(num_to_unpack * sizeof(UINT_T) + misalignment);
+    UINT_T* out = reinterpret_cast<UINT_T*>(out_storage.data() + misalignment);
+    const auto result = BitPacking::UnpackValues<UINT_T>(
         bit_width, packed, writer.bytes_written(), num_to_unpack, out);
     ASSERT_EQ(packed + writer.bytes_written(), result.first)
         << "Unpacked different # of bytes from the # written";
@@ -99,13 +110,15 @@ void PackUnpack(const uint64_t* in, int num_in_values, int bit_width, bool align
     }
 
     for (int i = 0; i < num_in_values; ++i) {
-      EXPECT_EQ(in[i] & mask, out[i]) << "Didn't get back input value " << i;
+      EXPECT_EQ(in[i] & mask, out[i]) << "Didn't get back input value " << i << "."
+          << " Bit width: " << bit_width << ".";
     }
   }
-  UnpackSubset(in, packed, num_in_values, bit_width, aligned);
+  UnpackSubset<UINT_T>(in, packed, num_in_values, bit_width, aligned);
 }
 
-void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
+template <typename UINT_T>
+void UnpackSubset(const UINT_T* in, const uint8_t* packed, int num_in_values,
     int bit_width, bool aligned) {
   const int misalignment = aligned ? 0 : 1;
   for (int num_to_unpack : {1, 10, 77, num_in_values - 7}) {
@@ -116,43 +129,194 @@ void UnpackSubset(const uint64_t* in, const uint8_t* packed, int num_in_values,
     AlignedAllocation packed_copy_storage(bytes_to_read + misalignment);
     uint8_t* packed_copy = packed_copy_storage.data() + misalignment;
     memcpy(packed_copy, packed, bytes_to_read);
-    AlignedAllocation out_storage(num_to_unpack * sizeof(uint64_t) + misalignment);
-    uint64_t* out = reinterpret_cast<uint64_t*>(out_storage.data() + misalignment);
-    const auto result = BitPacking::UnpackValues(
+    AlignedAllocation out_storage(num_to_unpack * sizeof(UINT_T) + misalignment);
+    UINT_T* out = reinterpret_cast<UINT_T*>(out_storage.data() + misalignment);
+    const auto result = BitPacking::UnpackValues<UINT_T>(
         bit_width, packed_copy, bytes_to_read, num_to_unpack, out);
     ASSERT_EQ(packed_copy + bytes_to_read, result.first) << "Read wrong # of bytes";
     ASSERT_EQ(num_to_unpack, result.second) << "Unpacked wrong # of values";
 
     for (int i = 0; i < num_to_unpack; ++i) {
-      ASSERT_EQ(in[i] & ComputeMask(bit_width), out[i]) << "Didn't get back input value "
-                                                         << i;
+      ASSERT_EQ(in[i] & ComputeMask<UINT_T>(bit_width), out[i])
+          << "Didn't get back input value " << i;
     }
   }
 }
 
-TEST(BitPackingTest, RandomUnpack) {
-  constexpr int NUM_IN_VALUES = 64 * 1024;
-  uint64_t in[NUM_IN_VALUES];
+template <typename INT_T>
+std::vector<INT_T> GenerateRandomInput(int length, INT_T lower, INT_T higher) {
+  std::vector<INT_T> in(length);
   mt19937 rng;
-  uniform_int_distribution<uint64_t> dist;
-  std::generate(std::begin(in), std::end(in), [&rng, &dist] { return dist(rng); });
+  RandTestUtil::SeedRng("BIT_PACKING_TEST_RANDOM_SEED", &rng);
+  uniform_int_distribution<INT_T> dist(lower, higher);
+  std::generate(in.begin(), in.end(), [&rng, &dist] { return dist(rng); });
+
+  return in;
+}
 
-  // Test various odd input lengths to exercise boundary cases for full and partial
-  // batches of 32.
+// Test various odd input lengths to exercise boundary cases for full and partial
+// batches of 32.
+std::vector<int> GetLengths() {
   vector<int> lengths{NUM_IN_VALUES, NUM_IN_VALUES - 1, NUM_IN_VALUES - 16,
       NUM_IN_VALUES - 19, NUM_IN_VALUES - 31};
   for (int i = 0; i < 32; ++i) {
     lengths.push_back(i);
   }
 
-  for (int bit_width = 0; bit_width <= 64; ++bit_width) {
+  return lengths;
+}
+
+template <typename UINT_T>
+void RandomUnpackTest() {
+  const std::vector<UINT_T> in = GenerateRandomInput<UINT_T>(NUM_IN_VALUES,
+      std::numeric_limits<UINT_T>::min(), std::numeric_limits<UINT_T>::max());
+
+  const std::vector<int> lengths = GetLengths();
+
+  constexpr int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH;
+  const int max_bit_width = std::min<int>(MAX_BITWIDTH, sizeof(UINT_T) * CHAR_BIT);
+  for (int bit_width = 0; bit_width <= max_bit_width; ++bit_width) {
     for (const int length : lengths) {
       // Test that unpacking to/from aligned and unaligned memory works.
       for (const bool aligned : {true, false}) {
-        PackUnpack(in, length, bit_width, aligned);
+        PackUnpack<UINT_T>(in.data(), length, bit_width, aligned);
       }
     }
   }
 }
+
+TEST(BitPackingTest, RandomUnpack8) {
+  RandomUnpackTest<uint8_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack16) {
+  RandomUnpackTest<uint16_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack32) {
+  RandomUnpackTest<uint32_t>();
+}
+
+TEST(BitPackingTest, RandomUnpack64) {
+  RandomUnpackTest<uint64_t>();
+}
+
+// This is not the full dictionary encoding, only a big bit-packed literal run, no RLE is
+// used.
+template <typename T>
+std::pair<std::vector<T>, std::vector<uint8_t>>
+DictEncode(const std::vector<T>& input, int bit_width) {
+  // Create dictionary and indices.
+  std::unordered_map<T, std::size_t> index_map;
+
+  std::vector<T> dict;
+  std::vector<uint64_t> indices;
+
+  for (const T value : input) {
+    auto it = index_map.find(value);
+    if (it != index_map.end()) {
+      indices.push_back(it->second);
+    } else {
+      const std::size_t next_index = dict.size();
+      index_map[value] = next_index;
+      dict.push_back(value);
+      indices.push_back(next_index);
+    }
+  }
+
+  // Bit pack the indices.
+  const int bytes_required = BitUtil::RoundUpNumBytes(bit_width * input.size());
+  std::vector<uint8_t> out_data(bytes_required);
+
+  // We do not write data if we do not have any. Doing it could also lead to undefined
+  // behaviour as out_data.data() may be nullptr and BitWriter uses memcpy internally, and
+  // passing a null pointer to memcpy is undefined behaviour.
+  if (bytes_required > 0) {
+    BitWriter writer(out_data.data(), bytes_required);
+    if (bit_width > 0) {
+      for (const uint64_t index : indices) {
+        EXPECT_TRUE(writer.PutValue(index, bit_width));
+      }
+    }
+    writer.Flush();
+  }
+
+  return std::make_pair(dict, out_data);
 }
 
+template <typename T>
+void ExpectEqualsWithStride(const T* expected, int num_values,
+    const uint8_t* actual, int actual_length, int stride) {
+  for (std::size_t i = 0; i < num_values; i++) {
+    const T expected_value = expected[i];
+
+    DCHECK_LE(i * stride + sizeof(T), actual_length);
+    const T actual_value = *reinterpret_cast<const T*>(&actual[i * stride]);
+
+    EXPECT_EQ(expected_value, actual_value);
+  }
+}
+
+// This does not test the full dictionary decoding, only the unpacking of a single
+// bit-packed literal run (no RLE).
+template <typename UINT_T>
+void RandomUnpackAndDecodeTest() {
+  constexpr int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH;
+  const int max_bit_width = std::min<int>({MAX_BITWIDTH, sizeof(UINT_T) * 8,
+      sizeof(uint32_t) * 8 /* dictionary indices are 32 bit values */});
+
+  const std::vector<int> lengths = GetLengths();
+
+  for (int bit_width = 0; bit_width <= max_bit_width; ++bit_width) {
+    const UINT_T max_dict_size = bit_width == sizeof(UINT_T) * 8
+        ? std::numeric_limits<UINT_T>::max() : (1UL << bit_width);
+
+    const std::vector<UINT_T> input = GenerateRandomInput<UINT_T>(NUM_IN_VALUES, 0,
+        max_dict_size - 1 /* inclusive range */);
+
+    for (int length : lengths) {
+      const std::vector<UINT_T> in_data(input.begin(), input.begin() + length);
+
+      std::pair<std::vector<UINT_T>, std::vector<uint8_t>> dict_and_data
+          = DictEncode(in_data, bit_width);
+      std::vector<UINT_T>& dict = dict_and_data.first;
+      const std::vector<uint8_t>& data = dict_and_data.second;
+
+      const std::vector<int> strides = {sizeof(UINT_T), sizeof(UINT_T) + 5,
+        2 * sizeof(UINT_T) + 5};
+
+      for (int stride : strides) {
+        bool decode_error = false;
+        std::vector<uint8_t> out(in_data.size() * stride);
+
+        std::pair<const uint8_t*, int64_t> res
+            = BitPacking::UnpackAndDecodeValues<UINT_T>(bit_width, data.data(),
+                data.size(), dict.data(), dict.size(), in_data.size(),
+            reinterpret_cast<UINT_T*>(out.data()), stride, &decode_error);
+
+        EXPECT_FALSE(decode_error);
+        EXPECT_EQ(in_data.size(), res.second);
+        ExpectEqualsWithStride<UINT_T>(in_data.data(), in_data.size(), out.data(),
+            out.size(), stride);
+      }
+    }
+  }
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode8) {
+  RandomUnpackAndDecodeTest<uint8_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode16) {
+  RandomUnpackAndDecodeTest<uint16_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode32) {
+  RandomUnpackAndDecodeTest<uint32_t>();
+}
+
+TEST(BitPackingTest, RandomUnpackAndDecode64) {
+  RandomUnpackAndDecodeTest<uint64_t>();
+}
+
+}
diff --git a/be/src/util/bit-packing.cc b/be/src/util/bit-packing.cc
index 52f86fd..851bc0b 100644
--- a/be/src/util/bit-packing.cc
+++ b/be/src/util/bit-packing.cc
@@ -32,7 +32,9 @@ namespace impala {
 
 INSTANTIATE_UNPACK_VALUES(bool);
 INSTANTIATE_UNPACK_VALUES(uint8_t);
+INSTANTIATE_UNPACK_VALUES(uint16_t);
 INSTANTIATE_UNPACK_VALUES(uint32_t);
+INSTANTIATE_UNPACK_VALUES(uint64_t);
 
 #define INSTANTIATE_UNPACK_AND_DECODE(OUT_TYPE)                                         \
   template std::pair<const uint8_t*, int64_t>                                           \
@@ -48,6 +50,10 @@ INSTANTIATE_UNPACK_AND_DECODE(int8_t);
 INSTANTIATE_UNPACK_AND_DECODE(int16_t);
 INSTANTIATE_UNPACK_AND_DECODE(int32_t);
 INSTANTIATE_UNPACK_AND_DECODE(int64_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint8_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint16_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint32_t);
+INSTANTIATE_UNPACK_AND_DECODE(uint64_t);
 INSTANTIATE_UNPACK_AND_DECODE(Decimal4Value);
 INSTANTIATE_UNPACK_AND_DECODE(Decimal8Value);
 INSTANTIATE_UNPACK_AND_DECODE(Decimal16Value);