You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2018/08/01 19:14:26 UTC

[parquet-cpp] branch master updated: PARQUET-1366: [C++] Streamline use of Arrow's bit-util.h APIs

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git


The following commit(s) were added to refs/heads/master by this push:
     new a0d1669  PARQUET-1366: [C++] Streamline use of Arrow's bit-util.h APIs
a0d1669 is described below

commit a0d1669cf67b055cd7b724dea04886a0ded53c8f
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed Aug 1 15:14:15 2018 -0400

    PARQUET-1366: [C++] Streamline use of Arrow's bit-util.h APIs
    
    This is required before we can remove some duplicate or little-used APIs in ARROW-2950.
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #483 from pitrou/PARQUET-1366-arrow-bit-util and squashes the following commits:
    
    1eb6ef0 [Antoine Pitrou] Avoid using FirstTimeBitmapWriter for now (this Arrow API is too recent)
    57aa82c [Antoine Pitrou] Fix line size
    86c4ca5 [Antoine Pitrou] PARQUET-1366: [C++] Streamline use of Arrow bit-util.h
---
 src/parquet/arrow/test-util.h     | 2 +-
 src/parquet/column_reader.cc      | 2 +-
 src/parquet/column_writer-test.cc | 2 +-
 src/parquet/column_writer.cc      | 5 +++--
 src/parquet/encoding-internal.h   | 7 ++++++-
 src/parquet/encoding-test.cc      | 4 ++--
 src/parquet/statistics-test.cc    | 4 ++--
 7 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index bfc78c8..2babacb 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -368,7 +368,7 @@ Status MakeListArray(const std::shared_ptr<Array>& values, int64_t size,
   int32_t* offsets_ptr = reinterpret_cast<int32_t*>(offsets->mutable_data());
 
   auto null_bitmap = AllocateBuffer();
-  int64_t bitmap_size = ::arrow::BitUtil::CeilByte(size) / 8;
+  int64_t bitmap_size = ::arrow::BitUtil::BytesForBits(size);
   RETURN_NOT_OK(null_bitmap->Resize(bitmap_size));
   uint8_t* null_bitmap_ptr = null_bitmap->mutable_data();
   memset(null_bitmap_ptr, 0, bitmap_size);
diff --git a/src/parquet/column_reader.cc b/src/parquet/column_reader.cc
index bc3ee8a..28d0dcb 100644
--- a/src/parquet/column_reader.cc
+++ b/src/parquet/column_reader.cc
@@ -60,7 +60,7 @@ int LevelDecoder::SetData(Encoding::type encoding, int16_t max_level,
     }
     case Encoding::BIT_PACKED: {
       num_bytes =
-          static_cast<int32_t>(BitUtil::Ceil(num_buffered_values * bit_width_, 8));
+          static_cast<int32_t>(BitUtil::BytesForBits(num_buffered_values * bit_width_));
       if (!bit_packed_decoder_) {
         bit_packed_decoder_.reset(new ::arrow::BitReader(data, num_bytes));
       } else {
diff --git a/src/parquet/column_writer-test.cc b/src/parquet/column_writer-test.cc
index aac582a..6c0794a 100644
--- a/src/parquet/column_writer-test.cc
+++ b/src/parquet/column_writer-test.cc
@@ -137,7 +137,7 @@ class TestPrimitiveWriter : public PrimitiveTypedTest<TestType> {
                                        bool enable_dictionary, bool enable_statistics,
                                        int64_t num_rows) {
     std::vector<uint8_t> valid_bits(
-        BitUtil::RoundUpNumBytes(static_cast<uint32_t>(this->values_.size())) + 1, 255);
+        BitUtil::BytesForBits(static_cast<uint32_t>(this->values_.size())) + 1, 255);
     ColumnProperties column_properties(encoding, compression, enable_dictionary,
                                        enable_statistics);
     std::shared_ptr<TypedColumnWriter<TestType>> writer =
diff --git a/src/parquet/column_writer.cc b/src/parquet/column_writer.cc
index 7d47d3f..48fba55 100644
--- a/src/parquet/column_writer.cc
+++ b/src/parquet/column_writer.cc
@@ -50,7 +50,7 @@ void LevelEncoder::Init(Encoding::type encoding, int16_t max_level,
     }
     case Encoding::BIT_PACKED: {
       int num_bytes =
-          static_cast<int>(BitUtil::Ceil(num_buffered_values * bit_width_, 8));
+          static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width_));
       bit_packed_encoder_.reset(new BitWriter(data, num_bytes));
       break;
     }
@@ -72,7 +72,8 @@ int LevelEncoder::MaxBufferSize(Encoding::type encoding, int16_t max_level,
       break;
     }
     case Encoding::BIT_PACKED: {
-      num_bytes = static_cast<int>(BitUtil::Ceil(num_buffered_values * bit_width, 8));
+      num_bytes =
+          static_cast<int>(BitUtil::BytesForBits(num_buffered_values * bit_width));
       break;
     }
     default:
diff --git a/src/parquet/encoding-internal.h b/src/parquet/encoding-internal.h
index 98f9e4a..2dfb9ff 100644
--- a/src/parquet/encoding-internal.h
+++ b/src/parquet/encoding-internal.h
@@ -151,12 +151,17 @@ class PlainDecoder<BooleanType> : public Decoder<BooleanType> {
   int Decode(uint8_t* buffer, int max_values) {
     max_values = std::min(max_values, num_values_);
     bool val;
+    ::arrow::internal::BitmapWriter bit_writer(buffer, 0, max_values);
     for (int i = 0; i < max_values; ++i) {
       if (!bit_reader_.GetValue(1, &val)) {
         ParquetException::EofException();
       }
-      BitUtil::SetArrayBit(buffer, i, val);
+      if (val) {
+        bit_writer.Set();
+      }
+      bit_writer.Next();
     }
+    bit_writer.Finish();
     num_values_ -= max_values;
     return max_values;
   }
diff --git a/src/parquet/encoding-test.cc b/src/parquet/encoding-test.cc
index 60285ab..50e1394 100644
--- a/src/parquet/encoding-test.cc
+++ b/src/parquet/encoding-test.cc
@@ -43,7 +43,7 @@ namespace test {
 TEST(VectorBooleanTest, TestEncodeDecode) {
   // PARQUET-454
   int nvalues = 10000;
-  int nbytes = static_cast<int>(BitUtil::Ceil(nvalues, 8));
+  int nbytes = static_cast<int>(BitUtil::BytesForBits(nvalues));
 
   // seed the prng so failure is deterministic
   vector<bool> draws = flip_coins_seed(nvalues, 0.5, 0);
@@ -252,7 +252,7 @@ class TestDictionaryEncoding : public TestEncodingBase<Type> {
   static constexpr int TYPE = Type::type_num;
 
   void CheckRoundtrip() {
-    std::vector<uint8_t> valid_bits(BitUtil::RoundUpNumBytes(num_values_) + 1, 255);
+    std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(num_values_) + 1, 255);
     DictEncoder<Type> encoder(descr_.get(), &pool_);
 
     ASSERT_NO_THROW(encoder.Put(draws_, num_values_));
diff --git a/src/parquet/statistics-test.cc b/src/parquet/statistics-test.cc
index 943d5cc..d2ecede 100644
--- a/src/parquet/statistics-test.cc
+++ b/src/parquet/statistics-test.cc
@@ -72,7 +72,7 @@ class TestRowGroupStatistics : public PrimitiveTypedTest<TestType> {
 
     TypedStats statistics3(this->schema_.Column(0));
     std::vector<uint8_t> valid_bits(
-        BitUtil::RoundUpNumBytes(static_cast<uint32_t>(this->values_.size())) + 1, 255);
+        BitUtil::BytesForBits(static_cast<uint32_t>(this->values_.size())) + 1, 255);
     statistics3.UpdateSpaced(this->values_ptr_, valid_bits.data(), 0,
                              this->values_.size(), 0);
     std::string encoded_min_spaced = statistics3.EncodeMin();
@@ -722,7 +722,7 @@ TEST(TestStatisticsFloatNaN, NaNValuesSpaced) {
   for (int i = 0; i < NUM_VALUES; i++) {
     nan_values[i] = std::nanf("");
   }
-  std::vector<uint8_t> valid_bits(BitUtil::RoundUpNumBytes(NUM_VALUES) + 1, 255);
+  std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(NUM_VALUES) + 1, 255);
 
   // Test values
   TypedRowGroupStatistics<FloatType> nan_stats(&descr);