You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/16 16:03:26 UTC
[impala] 02/04: IMPALA-8796: Restrict bit unpacking to unsigned
integer types
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit d596f1c01aca2128bc15ec704f44dfe58867a19d
Author: Daniel Becker <da...@cloudera.com>
AuthorDate: Wed Aug 7 17:50:54 2019 +0200
IMPALA-8796: Restrict bit unpacking to unsigned integer types
Restrict bit unpacking to the unsigned integer types uint8_t, uint16_t,
uint32_t and uint64_t to simplify the interface of bit unpacking.
Testing:
Modified some tests that used bool or signed types for unpacking.
Change-Id: I36cb57f43cc93e5dda6b6b2ff1617f1631f8cbe1
Reviewed-on: http://gerrit.cloudera.org:8080/14031
Reviewed-by: Csaba Ringhofer <cs...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exec/parquet/parquet-bool-decoder.h | 8 +++++---
be/src/util/bit-packing.cc | 1 -
be/src/util/bit-packing.inline.h | 14 ++++++++++++++
be/src/util/bit-stream-utils-test.cc | 30 +++++++++++++++++-------------
be/src/util/bit-stream-utils.h | 6 ++++--
5 files changed, 40 insertions(+), 19 deletions(-)
diff --git a/be/src/exec/parquet/parquet-bool-decoder.h b/be/src/exec/parquet/parquet-bool-decoder.h
index 1498ca6..3e89853 100644
--- a/be/src/exec/parquet/parquet-bool-decoder.h
+++ b/be/src/exec/parquet/parquet-bool-decoder.h
@@ -52,9 +52,11 @@ class ParquetBoolDecoder {
parquet::Encoding::type encoding_;
/// A buffer to store unpacked values. Must be a multiple of 32 size to use the
- /// batch-oriented interface of BatchedBitReader.
+ /// batch-oriented interface of BatchedBitReader. We use uint8_t instead of bool because
+ /// bit unpacking is only supported for unsigned integers. The values are converted to
+ /// bool when returned to the user.
static const int UNPACKED_BUFFER_LEN = 128;
- bool unpacked_values_[UNPACKED_BUFFER_LEN];
+ uint8_t unpacked_values_[UNPACKED_BUFFER_LEN];
/// The number of valid values in 'unpacked_values_'.
int num_unpacked_values_ = 0;
@@ -66,7 +68,7 @@ class ParquetBoolDecoder {
BatchedBitReader bool_values_;
/// RLE decoder, used if 'encoding_' is RLE.
- RleBatchDecoder<bool> rle_decoder_;
+ RleBatchDecoder<uint8_t> rle_decoder_;
};
template <parquet::Encoding::type ENCODING>
diff --git a/be/src/util/bit-packing.cc b/be/src/util/bit-packing.cc
index 851bc0b..5cdcee9 100644
--- a/be/src/util/bit-packing.cc
+++ b/be/src/util/bit-packing.cc
@@ -30,7 +30,6 @@ namespace impala {
int bit_width, const uint8_t* __restrict__ in, int64_t in_bytes, \
int64_t num_values, OUT_TYPE* __restrict__ out);
-INSTANTIATE_UNPACK_VALUES(bool);
INSTANTIATE_UNPACK_VALUES(uint8_t);
INSTANTIATE_UNPACK_VALUES(uint16_t);
INSTANTIATE_UNPACK_VALUES(uint32_t);
diff --git a/be/src/util/bit-packing.inline.h b/be/src/util/bit-packing.inline.h
index a279fa7..b2d57eb 100644
--- a/be/src/util/bit-packing.inline.h
+++ b/be/src/util/bit-packing.inline.h
@@ -48,10 +48,21 @@ inline int64_t BitPacking::NumValuesToUnpack(
}
}
+template <typename T>
+constexpr bool IsSupportedUnpackingType () {
+ return std::is_same<T, uint8_t>::value
+ || std::is_same<T, uint16_t>::value
+ || std::is_same<T, uint32_t>::value
+ || std::is_same<T, uint64_t>::value;
+}
+
template <typename OutType>
std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(int bit_width,
const uint8_t* __restrict__ in, int64_t in_bytes, int64_t num_values,
OutType* __restrict__ out) {
+ static_assert(IsSupportedUnpackingType<OutType>(),
+ "Only unsigned integers are supported.");
+
#pragma push_macro("UNPACK_VALUES_CASE")
#define UNPACK_VALUES_CASE(ignore1, i, ignore2) \
case i: \
@@ -71,6 +82,9 @@ template <typename OutType, int BIT_WIDTH>
std::pair<const uint8_t*, int64_t> BitPacking::UnpackValues(
const uint8_t* __restrict__ in, int64_t in_bytes, int64_t num_values,
OutType* __restrict__ out) {
+ static_assert(IsSupportedUnpackingType<OutType>(),
+ "Only unsigned integers are supported.");
+
constexpr int BATCH_SIZE = 32;
const int64_t values_to_read = NumValuesToUnpack(BIT_WIDTH, in_bytes, num_values);
const int64_t batches_to_read = values_to_read / BATCH_SIZE;
diff --git a/be/src/util/bit-stream-utils-test.cc b/be/src/util/bit-stream-utils-test.cc
index aa157e3..abdb63a 100644
--- a/be/src/util/bit-stream-utils-test.cc
+++ b/be/src/util/bit-stream-utils-test.cc
@@ -66,7 +66,9 @@ TEST(BitArray, TestBool) {
// Ensure it returns the same results after Reset().
for (int trial = 0; trial < 2; ++trial) {
- bool batch_vals[16];
+ // We use uint8_t instead of bool because unpacking is only supported for unsigned
+ // integers.
+ uint8_t batch_vals[16];
EXPECT_EQ(16, reader.UnpackBatch(1, 16, batch_vals));
for (int i = 0; i < 8; ++i) EXPECT_EQ(batch_vals[i], i % 2);
@@ -127,12 +129,14 @@ TEST(BitArray, TestBoolSkip) {
for (int i = 0; i < 8; ++i) ASSERT_TRUE(writer.PutValue(0, 1));
writer.Flush();
- TestSkipping<bool>(buffer, len, 1, 0, 8);
- TestSkipping<bool>(buffer, len, 1, 0, 16);
- TestSkipping<bool>(buffer, len, 1, 8, 8);
- TestSkipping<bool>(buffer, len, 1, 8, 16);
- TestSkipping<bool>(buffer, len, 1, 16, 8);
- TestSkipping<bool>(buffer, len, 1, 16, 16);
+ // We use uint8_t instead of bool because unpacking is only supported for unsigned
+ // integers.
+ TestSkipping<uint8_t>(buffer, len, 1, 0, 8);
+ TestSkipping<uint8_t>(buffer, len, 1, 0, 16);
+ TestSkipping<uint8_t>(buffer, len, 1, 8, 8);
+ TestSkipping<uint8_t>(buffer, len, 1, 8, 16);
+ TestSkipping<uint8_t>(buffer, len, 1, 16, 8);
+ TestSkipping<uint8_t>(buffer, len, 1, 16, 16);
}
TEST(BitArray, TestIntSkip) {
@@ -145,10 +149,10 @@ TEST(BitArray, TestIntSkip) {
ASSERT_TRUE(writer.PutValue(i, bit_width));
}
int bytes_written = writer.bytes_written();
- TestSkipping<int>(buffer, bytes_written, bit_width, 0, 4);
- TestSkipping<int>(buffer, bytes_written, bit_width, 4, 4);
- TestSkipping<int>(buffer, bytes_written, bit_width, 4, 8);
- TestSkipping<int>(buffer, bytes_written, bit_width, 8, 56);
+ TestSkipping<uint32_t>(buffer, bytes_written, bit_width, 0, 4);
+ TestSkipping<uint32_t>(buffer, bytes_written, bit_width, 4, 4);
+ TestSkipping<uint32_t>(buffer, bytes_written, bit_width, 4, 8);
+ TestSkipping<uint32_t>(buffer, bytes_written, bit_width, 8, 56);
}
// Writes 'num_vals' values with width 'bit_width' starting from 'start' and increasing
@@ -172,9 +176,9 @@ void TestBitArrayValues(int bit_width, uint64_t start, uint64_t num_vals) {
for (int trial = 0; trial < 2; ++trial) {
// Unpack all values at once with one batched reader and in small batches with the
// other batched reader.
- vector<int64_t> batch_vals(num_vals);
+ vector<uint64_t> batch_vals(num_vals);
const uint64_t BATCH_SIZE = 32;
- vector<int64_t> batch_vals2(BATCH_SIZE);
+ vector<uint64_t> batch_vals2(BATCH_SIZE);
EXPECT_EQ(num_vals,
reader.UnpackBatch(bit_width, num_vals, batch_vals.data()));
for (uint64_t i = 0; i < num_vals; ++i) {
diff --git a/be/src/util/bit-stream-utils.h b/be/src/util/bit-stream-utils.h
index 230c9dd..b94386a 100644
--- a/be/src/util/bit-stream-utils.h
+++ b/be/src/util/bit-stream-utils.h
@@ -54,7 +54,7 @@ class BitWriter {
int buffer_len() const { return max_bytes_; }
/// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit
- /// packed. Returns false if there was not enough space. num_bits must be <= 32.
+ /// packed. Returns false if there was not enough space. num_bits must be <= 64.
bool PutValue(uint64_t v, int num_bits);
/// Writes v to the next aligned byte using num_bytes. If T is larger than num_bytes, the
@@ -130,7 +130,7 @@ class BatchedBitReader {
}
/// Gets up to 'num_values' bit-packed values, starting from the current byte in the
- /// buffer and advance the read position. 'bit_width' must be <= 32.
+ /// buffer and advance the read position. 'bit_width' must be <= 64.
/// If 'bit_width' * 'num_values' is not a multiple of 8, the trailing bytes are
/// skipped and the next UnpackBatch() call will start reading from the next byte.
///
@@ -139,6 +139,8 @@ class BatchedBitReader {
/// 'bit_width' * 'num_values' must be a multiple of 8. This condition is always
/// satisfied if 'num_values' is a multiple of 32.
///
+ /// The output type 'T' must be an unsigned integer.
+ ///
/// Returns the number of values read.
template<typename T>
int UnpackBatch(int bit_width, int num_values, T* v);