You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/16 16:03:26 UTC

[impala] 02/04: IMPALA-8796: Restrict bit unpacking to unsigned integer types

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit d596f1c01aca2128bc15ec704f44dfe58867a19d
Author: Daniel Becker <da...@cloudera.com>
AuthorDate: Wed Aug 7 17:50:54 2019 +0200

    IMPALA-8796: Restrict bit unpacking to unsigned integer types
    
    Restrict bit unpacking to the unsigned integer types uint8_t, uint16_t,
    uint32_t and uint64_t to simplify the interface of bit unpacking.
    
    Testing:
      Modified some tests that used bool or signed types for unpacking.
    
    Change-Id: I36cb57f43cc93e5dda6b6b2ff1617f1631f8cbe1
    Reviewed-on: http://gerrit.cloudera.org:8080/14031
    Reviewed-by: Csaba Ringhofer <cs...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exec/parquet/parquet-bool-decoder.h |  8 +++++---
 be/src/util/bit-packing.cc                 |  1 -
 be/src/util/bit-packing.inline.h           | 14 ++++++++++++++
 be/src/util/bit-stream-utils-test.cc       | 30 +++++++++++++++++-------------
 be/src/util/bit-stream-utils.h             |  6 ++++--
 5 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/be/src/exec/parquet/parquet-bool-decoder.h b/be/src/exec/parquet/parquet-bool-decoder.h
index 1498ca6..3e89853 100644
--- a/be/src/exec/parquet/parquet-bool-decoder.h
+++ b/be/src/exec/parquet/parquet-bool-decoder.h
@@ -52,9 +52,11 @@ class ParquetBoolDecoder {
   parquet::Encoding::type encoding_;
 
   /// A buffer to store unpacked values. Must be a multiple of 32 size to use the
-  /// batch-oriented interface of BatchedBitReader.
+  /// batch-oriented interface of BatchedBitReader. We use uint8_t instead of bool because
+  /// bit unpacking is only supported for unsigned integers. The values are converted to
+  /// bool when returned to the user.
   static const int UNPACKED_BUFFER_LEN = 128;
-  bool unpacked_values_[UNPACKED_BUFFER_LEN];
+  uint8_t unpacked_values_[UNPACKED_BUFFER_LEN];
 
   /// The number of valid values in 'unpacked_values_'.
   int num_unpacked_values_ = 0;
@@ -66,7 +68,7 @@ class ParquetBoolDecoder {
   BatchedBitReader bool_values_;
 
   /// RLE decoder, used if 'encoding_' is RLE.
-  RleBatchDecoder<bool> rle_decoder_;
+  RleBatchDecoder<uint8_t> rle_decoder_;
 };
 
 template <parquet::Encoding::type ENCODING>
diff --git a/be/src/util/bit-packing.cc b/be/src/util/bit-packing.cc
index 851bc0b..5cdcee9 100644
--- a/be/src/util/bit-packing.cc
+++ b/be/src/util/bit-packing.cc
@@ -30,7 +30,6 @@ namespace impala {
       int bit_width, const uint8_t* __restrict__ in, int64_t in_bytes,            \
       int64_t num_values, OUT_TYPE* __restrict__ out);
 
-INSTANTIATE_UNPACK_VALUES(bool);
 INSTANTIATE_UNPACK_VALUES(uint8_t);
 INSTANTIATE_UNPACK_VALUES(uint16_t);
 INSTANTIATE_UNPACK_VALUES(uint32_t);
diff --git a/be/src/util/bit-packing.inline.h b/be/src/util/bit-packing.inline.h
index a279fa7..b2d57eb 100644
--- a/be/src/util/bit-packing.inline.h
+++ b/be/src/util/bit-packing.inline.h
@@ -48,10 +48,21 @@ inline int64_t BitPacking::NumValuesToUnpack(
   }
 }
 
+template <typename T>
+constexpr bool IsSupportedUnpackingType () {
+  return std::is_same<T, uint8_t>::value
+      || std::is_same<T, uint16_t>::value
+      || std::is_same<T, uint32_t>::value
+      || std::is_same<T, uint64_t>::value;
+}
+
 template <typename OutType>
 std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(int bit_width,
     const uint8_t* __restrict__ in, int64_t in_bytes, int64_t num_values,
     OutType* __restrict__ out) {
+  static_assert(IsSupportedUnpackingType<OutType>(),
+      "Only unsigned integers are supported.");
+
 #pragma push_macro("UNPACK_VALUES_CASE")
 #define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
   case i:                                       \
@@ -71,6 +82,9 @@ template <typename OutType, int BIT_WIDTH>
 std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(
     const uint8_t* __restrict__ in, int64_t in_bytes, int64_t num_values,
     OutType* __restrict__ out) {
+  static_assert(IsSupportedUnpackingType<OutType>(),
+      "Only unsigned integers are supported.");
+
   constexpr int BATCH_SIZE = 32;
   const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
   const int64_t batches_to_read = values_to_read / BATCH_SIZE;
diff --git a/be/src/util/bit-stream-utils-test.cc b/be/src/util/bit-stream-utils-test.cc
index aa157e3..abdb63a 100644
--- a/be/src/util/bit-stream-utils-test.cc
+++ b/be/src/util/bit-stream-utils-test.cc
@@ -66,7 +66,9 @@ TEST(BitArray, TestBool) {
 
   // Ensure it returns the same results after Reset().
   for (int trial = 0; trial < 2; ++trial) {
-    bool batch_vals[16];
+    // We use uint8_t instead of bool because unpacking is only supported for unsigned
+    // integers.
+    uint8_t batch_vals[16];
     EXPECT_EQ(16, reader.UnpackBatch(1, 16, batch_vals));
     for (int i = 0; i < 8; ++i)  EXPECT_EQ(batch_vals[i], i % 2);
 
@@ -127,12 +129,14 @@ TEST(BitArray, TestBoolSkip) {
   for (int i = 0; i < 8; ++i) ASSERT_TRUE(writer.PutValue(0, 1));
   writer.Flush();
 
-  TestSkipping<bool>(buffer, len, 1, 0, 8);
-  TestSkipping<bool>(buffer, len, 1, 0, 16);
-  TestSkipping<bool>(buffer, len, 1, 8, 8);
-  TestSkipping<bool>(buffer, len, 1, 8, 16);
-  TestSkipping<bool>(buffer, len, 1, 16, 8);
-  TestSkipping<bool>(buffer, len, 1, 16, 16);
+  // We use uint8_t instead of bool because unpacking is only supported for unsigned
+  // integers.
+  TestSkipping<uint8_t>(buffer, len, 1, 0, 8);
+  TestSkipping<uint8_t>(buffer, len, 1, 0, 16);
+  TestSkipping<uint8_t>(buffer, len, 1, 8, 8);
+  TestSkipping<uint8_t>(buffer, len, 1, 8, 16);
+  TestSkipping<uint8_t>(buffer, len, 1, 16, 8);
+  TestSkipping<uint8_t>(buffer, len, 1, 16, 16);
 }
 
 TEST(BitArray, TestIntSkip) {
@@ -145,10 +149,10 @@ TEST(BitArray, TestIntSkip) {
     ASSERT_TRUE(writer.PutValue(i, bit_width));
   }
   int bytes_written = writer.bytes_written();
-  TestSkipping<int>(buffer, bytes_written, bit_width, 0, 4);
-  TestSkipping<int>(buffer, bytes_written, bit_width, 4, 4);
-  TestSkipping<int>(buffer, bytes_written, bit_width, 4, 8);
-  TestSkipping<int>(buffer, bytes_written, bit_width, 8, 56);
+  TestSkipping<uint32_t>(buffer, bytes_written, bit_width, 0, 4);
+  TestSkipping<uint32_t>(buffer, bytes_written, bit_width, 4, 4);
+  TestSkipping<uint32_t>(buffer, bytes_written, bit_width, 4, 8);
+  TestSkipping<uint32_t>(buffer, bytes_written, bit_width, 8, 56);
 }
 
 // Writes 'num_vals' values with width 'bit_width' starting from 'start' and increasing
@@ -172,9 +176,9 @@ void TestBitArrayValues(int bit_width, uint64_t start, uint64_t num_vals) {
   for (int trial = 0; trial < 2; ++trial) {
     // Unpack all values at once with one batched reader and in small batches with the
     // other batched reader.
-    vector<int64_t> batch_vals(num_vals);
+    vector<uint64_t> batch_vals(num_vals);
     const uint64_t BATCH_SIZE = 32;
-    vector<int64_t> batch_vals2(BATCH_SIZE);
+    vector<uint64_t> batch_vals2(BATCH_SIZE);
     EXPECT_EQ(num_vals,
         reader.UnpackBatch(bit_width, num_vals, batch_vals.data()));
     for (uint64_t i = 0; i < num_vals; ++i) {
diff --git a/be/src/util/bit-stream-utils.h b/be/src/util/bit-stream-utils.h
index 230c9dd..b94386a 100644
--- a/be/src/util/bit-stream-utils.h
+++ b/be/src/util/bit-stream-utils.h
@@ -54,7 +54,7 @@ class BitWriter {
   int buffer_len() const { return max_bytes_; }
 
   /// Writes a value to buffered_values_, flushing to buffer_ if necessary.  This is bit
-  /// packed.  Returns false if there was not enough space. num_bits must be <= 32.
+  /// packed.  Returns false if there was not enough space. num_bits must be <= 64.
   bool PutValue(uint64_t v, int num_bits);
 
   /// Writes v to the next aligned byte using num_bytes. If T is larger than num_bytes, the
@@ -130,7 +130,7 @@ class BatchedBitReader {
   }
 
   /// Gets up to 'num_values' bit-packed values, starting from the current byte in the
-  /// buffer and advance the read position. 'bit_width' must be <= 32.
+  /// buffer and advance the read position. 'bit_width' must be <= 64.
   /// If 'bit_width' * 'num_values' is not a multiple of 8, the trailing bytes are
   /// skipped and the next UnpackBatch() call will start reading from the next byte.
   ///
@@ -139,6 +139,8 @@ class BatchedBitReader {
   /// 'bit_width' * 'num_values' must be a multiple of 8. This condition is always
   /// satisfied if 'num_values' is a multiple of 32.
   ///
+  /// The output type 'T' must be an unsigned integer.
+  ///
   /// Returns the number of values read.
   template<typename T>
   int UnpackBatch(int bit_width, int num_values, T* v);