You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2018/08/01 19:14:26 UTC
[parquet-cpp] branch master updated: PARQUET-1366: [C++] Streamline use of Arrow's bit-util.h APIs
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git
The following commit(s) were added to refs/heads/master by this push:
new a0d1669 PARQUET-1366: [C++] Streamline use of Arrow's bit-util.h APIs
a0d1669 is described below
commit a0d1669cf67b055cd7b724dea04886a0ded53c8f
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed Aug 1 15:14:15 2018 -0400
PARQUET-1366: [C++] Streamline use of Arrow's bit-util.h APIs
This is required before we can remove some duplicate or little-used APIs in ARROW-2950.
Author: Antoine Pitrou <an...@python.org>
Closes #483 from pitrou/PARQUET-1366-arrow-bit-util and squashes the following commits:
1eb6ef0 [Antoine Pitrou] Avoid using FirstTimeBitmapWriter for now (this Arrow API is too recent)
57aa82c [Antoine Pitrou] Fix line size
86c4ca5 [Antoine Pitrou] PARQUET-1366: [C++] Streamline use of Arrow bit-util.h
---
src/parquet/arrow/test-util.h | 2 +-
src/parquet/column_reader.cc | 2 +-
src/parquet/column_writer-test.cc | 2 +-
src/parquet/column_writer.cc | 5 +++--
src/parquet/encoding-internal.h | 7 ++++++-
src/parquet/encoding-test.cc | 4 ++--
src/parquet/statistics-test.cc | 4 ++--
7 files changed, 16 insertions(+), 10 deletions(-)
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index bfc78c8..2babacb 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -368,7 +368,7 @@ Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());
auto null_bitmap = AllocateBuffer();
- int64_t bitmap_size = ::arrow::BitUtil::CeilByte(size) / 8;
+ int64_t bitmap_size = ::arrow::BitUtil::BytesForBits(size);
RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
memset(null_bitmap_ptr, 0, bitmap_size);
diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc
index bc3ee8a..28d0dcb 100644
--- a/src/parquet/column_reader.cc
+++ b/src/parquet/column_reader.cc
@@ -60,7 +60,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
}
case Encoding::BIT_PACKED: {
num_bytes =
- static_cast<int32_t>(BitUtil::Ceil(num_buffered_values * bit_width_, 8));
+ static_cast<int32_t>(BitUtil::BytesForBits(num_buffered_values * bit_width_));
if (!bit_packed_decoder_) {
bit_packed_decoder_.reset(new ::arrow::BitReader(data, num_bytes));
} else {
diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc
index aac582a..6c0794a 100644
--- a/src/parquet/column_writer-test.cc
+++ b/src/parquet/column_writer-test.cc
@@ -137,7 +137,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
bool enable_dictionary, bool enable_statistics,
int64_t num_rows) {
std::vector<uint8_t> valid_bits(
- BitUtil::RoundUpNumBytes(static_cast<uint32_t>(this->values_.size())) + 1, 255);
+ BitUtil::BytesForBits(static_cast<uint32_t>(this->values_.size())) + 1, 255);
ColumnProperties column_properties(encoding, compression, enable_dictionary,
enable_statistics);
std::shared_ptr<TypedColumnWriter<TestType>> writer =
diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc
index 7d47d3f..48fba55 100644
--- a/src/parquet/column_writer.cc
+++ b/src/parquet/column_writer.cc
@@ -50,7 +50,7 @@ void LevelEncoder::Init(Encoding::type encoding, int16_t max_level,
}
case Encoding::BIT_PACKED: {
int num_bytes =
- static_cast<int>(BitUtil::Ceil(num_buffered_values * bit_width_, 8));
+ static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width_));
bit_packed_encoder_.reset(new BitWriter(data, num_bytes));
break;
}
@@ -72,7 +72,8 @@ int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level,
break;
}
case Encoding::BIT_PACKED: {
- num_bytes = static_cast<int>(BitUtil::Ceil(num_buffered_values * bit_width, 8));
+ num_bytes =
+ static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width));
break;
}
default:
diff --git a/src/parquet/encoding-internal.h b/src/parquet/encoding-internal.h
index 98f9e4a..2dfb9ff 100644
--- a/src/parquet/encoding-internal.h
+++ b/src/parquet/encoding-internal.h
@@ -151,12 +151,17 @@ class PlainDecoder<BooleanType> : public Decoder<BooleanType> {
int Decode(uint8_t* buffer, int max_values) {
max_values = std::min(max_values, num_values_);
bool val;
+ ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
for (int i = 0; i < max_values; ++i) {
if (!bit_reader_.GetValue(1, &val)) {
ParquetException::EofException();
}
- BitUtil::SetArrayBit(buffer, i, val);
+ if (val) {
+ bit_writer.Set();
+ }
+ bit_writer.Next();
}
+ bit_writer.Finish();
num_values_ -= max_values;
return max_values;
}
diff --git a/src/parquet/encoding-test.cc b/src/parquet/encoding-test.cc
index 60285ab..50e1394 100644
--- a/src/parquet/encoding-test.cc
+++ b/src/parquet/encoding-test.cc
@@ -43,7 +43,7 @@ namespace test {
TEST(VectorBooleanTest, TestEncodeDecode) {
// PARQUET-454
int nvalues = 10000;
- int nbytes = static_cast<int>(BitUtil::Ceil(nvalues, 8));
+ int nbytes = static_cast<int>(BitUtil::BytesForBits(nvalues));
// seed the prng so failure is deterministic
vector<bool> draws = flip_coins_seed(nvalues, 0.5, 0);
@@ -252,7 +252,7 @@ class TestDictionaryEncoding : public TestEncodingBase<Type> {
static constexpr int TYPE = Type::type_num;
void CheckRoundtrip() {
- std::vector<uint8_t> valid_bits(BitUtil::RoundUpNumBytes(num_values_) + 1, 255);
+ std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(num_values_) + 1, 255);
DictEncoder<Type> encoder(descr_.get(), &pool_);
ASSERT_NO_THROW(encoder.Put(draws_, num_values_));
diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc
index 943d5cc..d2ecede 100644
--- a/src/parquet/statistics-test.cc
+++ b/src/parquet/statistics-test.cc
@@ -72,7 +72,7 @@ class TestRowGroupStatistics : public PrimitiveTypedTest<TestType> {
TypedStats statistics3(this->schema_.Column(0));
std::vector<uint8_t> valid_bits(
- BitUtil::RoundUpNumBytes(static_cast<uint32_t>(this->values_.size())) + 1, 255);
+ BitUtil::BytesForBits(static_cast<uint32_t>(this->values_.size())) + 1, 255);
statistics3.UpdateSpaced(this->values_ptr_, valid_bits.data(), 0,
this->values_.size(), 0);
std::string encoded_min_spaced = statistics3.EncodeMin();
@@ -722,7 +722,7 @@ TEST(TestStatisticsFloatNaN, NaNValuesSpaced) {
for (int i = 0; i < NUM_VALUES; i++) {
nan_values[i] = std::nanf("");
}
- std::vector<uint8_t> valid_bits(BitUtil::RoundUpNumBytes(NUM_VALUES) + 1, 255);
+ std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(NUM_VALUES) + 1, 255);
// Test values
TypedRowGroupStatistics<FloatType> nan_stats(&descr);