You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/07/29 14:42:23 UTC
[arrow] branch master updated: ARROW-6042: [C++][Parquet] Add
Dictionary32Builder that always returns 32-bit dictionary indices
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 089e3db ARROW-6042: [C++][Parquet] Add Dictionary32Builder that always returns 32-bit dictionary indices
089e3db is described below
commit 089e3db8859f48ad32657165788bb50373ddef75
Author: Wes McKinney <we...@apache.org>
AuthorDate: Mon Jul 29 09:42:08 2019 -0500
ARROW-6042: [C++][Parquet] Add Dictionary32Builder that always returns 32-bit dictionary indices
Without this, DictionaryArrays produced by different parts of a Parquet file could have different index types depending on the cardinality of each decoded part.
Refactors DictionaryBuilder (which uses AdaptiveIntBuilder) and Dictionary32Builder (which uses Int32Builder) to have a common base class.
Closes #4956 from wesm/ARROW-6042 and squashes the following commits:
5210b30be <Wes McKinney> Make implementation templates internal
8603ded84 <Wes McKinney> Change Parquet to use 32-bit dictionary builder
c2e40ed37 <Wes McKinney> Implement 32-bit-only variant of DictionaryBuilder, use common base class
Authored-by: Wes McKinney <we...@apache.org>
Signed-off-by: Wes McKinney <we...@apache.org>
---
cpp/src/arrow/array-dict-test.cc | 40 +++++-
cpp/src/arrow/array/builder_dict.h | 168 +++++++++++++++-------
cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 16 ++-
cpp/src/parquet/column_reader.cc | 2 +-
cpp/src/parquet/encoding-benchmark.cc | 21 +--
cpp/src/parquet/encoding-test.cc | 2 +-
cpp/src/parquet/encoding.cc | 24 ++--
cpp/src/parquet/encoding.h | 6 +-
8 files changed, 191 insertions(+), 88 deletions(-)
diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc
index ff00620..a930572 100644
--- a/cpp/src/arrow/array-dict-test.cc
+++ b/cpp/src/arrow/array-dict-test.cc
@@ -282,6 +282,26 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) {
ASSERT_TRUE(expected_delta2.Equals(result_delta2));
}
+TYPED_TEST(TestDictionaryBuilder, Dictionary32_BasicPrimitive) {
+ using c_type = typename TypeParam::c_type;
+ auto type = std::make_shared<TypeParam>();
+ auto dict_type = dictionary(int32(), type);
+
+ Dictionary32Builder<TypeParam> builder;
+
+ ASSERT_OK(builder.Append(static_cast<c_type>(1)));
+ ASSERT_OK(builder.Append(static_cast<c_type>(2)));
+ ASSERT_OK(builder.Append(static_cast<c_type>(1)));
+ ASSERT_OK(builder.Append(static_cast<c_type>(2)));
+ std::shared_ptr<Array> result;
+ FinishAndCheckPadding(&builder, &result);
+
+ // Build expected data for the initial dictionary
+ auto ex_dict1 = ArrayFromJSON(type, "[1, 2]");
+ DictionaryArray expected(dict_type, ArrayFromJSON(int32(), "[0, 1, 0, 1]"), ex_dict1);
+ ASSERT_TRUE(expected.Equals(result));
+}
+
TEST(TestStringDictionaryBuilder, Basic) {
// Build the dictionary Array
StringDictionaryBuilder builder;
@@ -301,11 +321,14 @@ TEST(TestStringDictionaryBuilder, Basic) {
ASSERT_TRUE(expected.Equals(result));
}
-TEST(TestStringDictionaryBuilder, AppendIndices) {
+template <typename BuilderType, typename IndexType, typename AppendCType>
+void TestStringDictionaryAppendIndices() {
+ auto index_type = TypeTraits<IndexType>::type_singleton();
+
auto ex_dict = ArrayFromJSON(utf8(), R"(["c", "a", "b", "d"])");
auto invalid_dict = ArrayFromJSON(binary(), R"(["e", "f"])");
- StringDictionaryBuilder builder;
+ BuilderType builder;
ASSERT_OK(builder.InsertMemoValues(*ex_dict));
// Inserting again should have no effect
@@ -314,7 +337,7 @@ TEST(TestStringDictionaryBuilder, AppendIndices) {
// Type mismatch
ASSERT_RAISES(Invalid, builder.InsertMemoValues(*invalid_dict));
- std::vector<int64_t> raw_indices = {0, 1, 2, -1, 3};
+ std::vector<AppendCType> raw_indices = {0, 1, 2, -1, 3};
std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};
for (int i = 0; i < 2; ++i) {
ASSERT_OK(builder.AppendIndices(
@@ -326,12 +349,19 @@ TEST(TestStringDictionaryBuilder, AppendIndices) {
std::shared_ptr<Array> result;
ASSERT_OK(builder.Finish(&result));
- auto ex_indices = ArrayFromJSON(int8(), R"([0, 1, 2, null, 3, 0, 1, 2, null, 3])");
- auto dtype = dictionary(int8(), utf8());
+ auto ex_indices = ArrayFromJSON(index_type, R"([0, 1, 2, null, 3, 0, 1, 2, null, 3])");
+ auto dtype = dictionary(index_type, utf8());
DictionaryArray expected(dtype, ex_indices, ex_dict);
ASSERT_TRUE(expected.Equals(result));
}
+TEST(TestStringDictionaryBuilder, AppendIndices) {
+ // Currently AdaptiveIntBuilder only accepts int64_t in bulk appends
+ TestStringDictionaryAppendIndices<StringDictionaryBuilder, Int8Type, int64_t>();
+
+ TestStringDictionaryAppendIndices<StringDictionary32Builder, Int32Type, int32_t>();
+}
+
TEST(TestStringDictionaryBuilder, ArrayInit) {
auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])");
auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
index e1cfe8a..4a93d95 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -20,8 +20,9 @@
#include <algorithm>
#include <memory>
-#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
-#include "arrow/array/builder_base.h" // IWYU pragma: export
+#include "arrow/array/builder_adaptive.h" // IWYU pragma: export
+#include "arrow/array/builder_base.h" // IWYU pragma: export
+#include "arrow/array/builder_primitive.h" // IWYU pragma: export
#include "arrow/array.h"
@@ -84,8 +85,6 @@ class ARROW_EXPORT DictionaryMemoTable {
std::unique_ptr<DictionaryMemoTableImpl> impl_;
};
-} // namespace internal
-
/// \brief Array builder for creating an encoded DictionaryArray from
/// a dense array
///
@@ -95,50 +94,50 @@ class ARROW_EXPORT DictionaryMemoTable {
/// build a delta dictionary when new terms occur.
///
/// data
-template <typename T>
-class DictionaryBuilder : public ArrayBuilder {
+template <typename BuilderType, typename T>
+class DictionaryBuilderBase : public ArrayBuilder {
public:
- using Scalar = typename internal::DictionaryScalar<T>::type;
+ using Scalar = typename DictionaryScalar<T>::type;
// WARNING: the type given below is the value type, not the DictionaryType.
// The DictionaryType is instantiated on the Finish() call.
template <typename T1 = T>
- DictionaryBuilder(
+ DictionaryBuilderBase(
typename std::enable_if<!std::is_base_of<FixedSizeBinaryType, T1>::value,
const std::shared_ptr<DataType>&>::type type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(type, pool),
- memo_table_(new internal::DictionaryMemoTable(type)),
+ memo_table_(new DictionaryMemoTable(type)),
delta_offset_(0),
byte_width_(-1),
values_builder_(pool) {}
template <typename T1 = T>
- explicit DictionaryBuilder(
+ explicit DictionaryBuilderBase(
typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T1>::value,
const std::shared_ptr<DataType>&>::type type,
MemoryPool* pool = default_memory_pool())
: ArrayBuilder(type, pool),
- memo_table_(new internal::DictionaryMemoTable(type)),
+ memo_table_(new DictionaryMemoTable(type)),
delta_offset_(0),
byte_width_(static_cast<const T1&>(*type).byte_width()),
values_builder_(pool) {}
template <typename T1 = T>
- explicit DictionaryBuilder(
+ explicit DictionaryBuilderBase(
typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool =
default_memory_pool())
- : DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
+ : DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}
- DictionaryBuilder(const std::shared_ptr<Array>& dictionary,
- MemoryPool* pool = default_memory_pool())
+ DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
+ MemoryPool* pool = default_memory_pool())
: ArrayBuilder(dictionary->type(), pool),
- memo_table_(new internal::DictionaryMemoTable(dictionary)),
+ memo_table_(new DictionaryMemoTable(dictionary)),
delta_offset_(0),
byte_width_(-1),
values_builder_(pool) {}
- ~DictionaryBuilder() override = default;
+ ~DictionaryBuilderBase() override = default;
/// \brief Append a scalar value
Status Append(const Scalar& value) {
@@ -189,18 +188,6 @@ class DictionaryBuilder : public ArrayBuilder {
return memo_table_->InsertValues(values);
}
- /// \brief Append dictionary indices directly without modifying memo
- ///
- /// NOTE: Experimental API
- Status AppendIndices(const int64_t* values, int64_t length,
- const uint8_t* valid_bytes = NULLPTR) {
- int64_t null_count_before = values_builder_.null_count();
- ARROW_RETURN_NOT_OK(values_builder_.AppendValues(values, length, valid_bytes));
- length_ += length;
- null_count_ += values_builder_.null_count() - null_count_before;
- return Status::OK();
- }
-
/// \brief Append a whole dense array to the builder
template <typename T1 = T>
Status AppendArray(
@@ -242,7 +229,7 @@ class DictionaryBuilder : public ArrayBuilder {
void Reset() override {
ArrayBuilder::Reset();
values_builder_.Reset();
- memo_table_.reset(new internal::DictionaryMemoTable(type_));
+ memo_table_.reset(new DictionaryMemoTable(type_));
delta_offset_ = 0;
}
@@ -291,26 +278,27 @@ class DictionaryBuilder : public ArrayBuilder {
bool is_building_delta() { return delta_offset_ > 0; }
protected:
- std::unique_ptr<internal::DictionaryMemoTable> memo_table_;
+ std::unique_ptr<DictionaryMemoTable> memo_table_;
int32_t delta_offset_;
// Only used for FixedSizeBinaryType
int32_t byte_width_;
- AdaptiveIntBuilder values_builder_;
+ BuilderType values_builder_;
};
-template <>
-class DictionaryBuilder<NullType> : public ArrayBuilder {
+template <typename BuilderType>
+class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
public:
- DictionaryBuilder(const std::shared_ptr<DataType>& type,
- MemoryPool* pool = default_memory_pool())
+ DictionaryBuilderBase(const std::shared_ptr<DataType>& type,
+ MemoryPool* pool = default_memory_pool())
: ArrayBuilder(type, pool), values_builder_(pool) {}
- explicit DictionaryBuilder(MemoryPool* pool = default_memory_pool())
+
+ explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
: ArrayBuilder(null(), pool), values_builder_(pool) {}
- DictionaryBuilder(const std::shared_ptr<Array>& dictionary,
- MemoryPool* pool = default_memory_pool())
+ DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
+ MemoryPool* pool = default_memory_pool())
: ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {}
/// \brief Append a scalar null value
@@ -362,16 +350,68 @@ class DictionaryBuilder<NullType> : public ArrayBuilder {
Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
protected:
- AdaptiveIntBuilder values_builder_;
+ BuilderType values_builder_;
};
-class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
+} // namespace internal
+
+/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
+/// smallest index size that can accommodate the dictionary indices
+template <typename T>
+class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
public:
- using DictionaryBuilder::Append;
- using DictionaryBuilder::AppendIndices;
- using DictionaryBuilder::DictionaryBuilder;
+ using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
+ using BASE::BASE;
+
+ /// \brief Append dictionary indices directly without modifying memo
+ ///
+ /// NOTE: Experimental API
+ Status AppendIndices(const int64_t* values, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR) {
+ int64_t null_count_before = this->values_builder_.null_count();
+ ARROW_RETURN_NOT_OK(this->values_builder_.AppendValues(values, length, valid_bytes));
+ this->length_ += length;
+ this->null_count_ += this->values_builder_.null_count() - null_count_before;
+ return Status::OK();
+ }
+};
- BinaryDictionaryBuilder() : BinaryDictionaryBuilder(default_memory_pool()) {}
+/// \brief A DictionaryArray builder that always returns int32 dictionary
+/// indices so that data cast to dictionary form will have a consistent index
+/// type, e.g. for creating a ChunkedArray
+template <typename T>
+class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
+ public:
+ using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
+ using BASE::BASE;
+
+ /// \brief Append dictionary indices directly without modifying memo
+ ///
+ /// NOTE: Experimental API
+ Status AppendIndices(const int32_t* values, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR) {
+ int64_t null_count_before = this->values_builder_.null_count();
+ ARROW_RETURN_NOT_OK(this->values_builder_.AppendValues(values, length, valid_bytes));
+ this->length_ += length;
+ this->null_count_ += this->values_builder_.null_count() - null_count_before;
+ return Status::OK();
+ }
+};
+
+// ----------------------------------------------------------------------
+// Binary / Unicode builders with slightly expanded APIs
+
+namespace internal {
+
+template <typename T>
+class BinaryDictionaryBuilderImpl : public DictionaryBuilder<T> {
+ public:
+ using BASE = DictionaryBuilder<T>;
+ using BASE::Append;
+ using BASE::AppendIndices;
+ using BASE::BASE;
+
+ BinaryDictionaryBuilderImpl() : BinaryDictionaryBuilderImpl(default_memory_pool()) {}
Status Append(const uint8_t* value, int32_t length) {
return Append(reinterpret_cast<const char*>(value), length);
@@ -382,14 +422,16 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType
}
};
-/// \brief Dictionary array builder with convenience methods for strings
-class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
+template <typename T>
+class BinaryDictionary32BuilderImpl : public Dictionary32Builder<T> {
public:
- using DictionaryBuilder::Append;
- using DictionaryBuilder::AppendIndices;
- using DictionaryBuilder::DictionaryBuilder;
+ using BASE = Dictionary32Builder<T>;
+ using BASE::Append;
+ using BASE::AppendIndices;
+ using BASE::BASE;
- StringDictionaryBuilder() : StringDictionaryBuilder(default_memory_pool()) {}
+ BinaryDictionary32BuilderImpl()
+ : BinaryDictionary32BuilderImpl(default_memory_pool()) {}
Status Append(const uint8_t* value, int32_t length) {
return Append(reinterpret_cast<const char*>(value), length);
@@ -400,4 +442,28 @@ class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType
}
};
+} // namespace internal
+
+class BinaryDictionaryBuilder : public internal::BinaryDictionaryBuilderImpl<BinaryType> {
+ using BASE = internal::BinaryDictionaryBuilderImpl<BinaryType>;
+ using BASE::BASE;
+};
+
+class StringDictionaryBuilder : public internal::BinaryDictionaryBuilderImpl<StringType> {
+ using BASE = BinaryDictionaryBuilderImpl<StringType>;
+ using BASE::BASE;
+};
+
+class BinaryDictionary32Builder
+ : public internal::BinaryDictionary32BuilderImpl<BinaryType> {
+ using BASE = internal::BinaryDictionary32BuilderImpl<BinaryType>;
+ using BASE::BASE;
+};
+
+class StringDictionary32Builder
+ : public internal::BinaryDictionary32BuilderImpl<StringType> {
+ using BASE = internal::BinaryDictionary32BuilderImpl<StringType>;
+ using BASE::BASE;
+};
+
} // namespace arrow
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index cfc1c1f..f733ea5 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -2736,20 +2736,22 @@ TEST(TestArrowWriterAdHoc, SchemaMismatch) {
class TestArrowReadDictionary : public ::testing::TestWithParam<double> {
public:
+ static constexpr int kNumRowGroups = 10;
+
void SetUp() override {
GenerateData(GetParam());
// Write kNumRowGroups row groups; each row group will have a different dictionary
ASSERT_NO_FATAL_FAILURE(
- WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / 4,
+ WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / kNumRowGroups,
default_arrow_writer_properties(), &buffer_));
properties_ = default_arrow_reader_properties();
}
void GenerateData(double null_probability) {
- constexpr int num_unique = 100;
- constexpr int repeat = 100;
+ constexpr int num_unique = 1000;
+ constexpr int repeat = 50;
constexpr int64_t min_length = 2;
constexpr int64_t max_length = 100;
::arrow::random::RandomArrayGenerator rag(0);
@@ -2781,7 +2783,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam<double> {
};
void AsDictionaryEncoded(const Array& arr, std::shared_ptr<Array>* out) {
- ::arrow::StringDictionaryBuilder builder(default_memory_pool());
+ ::arrow::StringDictionary32Builder builder(default_memory_pool());
const auto& string_array = static_cast<const ::arrow::StringArray&>(arr);
ASSERT_OK(builder.AppendArray(string_array));
ASSERT_OK(builder.Finish(out));
@@ -2790,9 +2792,9 @@ void AsDictionaryEncoded(const Array& arr, std::shared_ptr<Array>* out) {
TEST_P(TestArrowReadDictionary, ReadWholeFileDict) {
properties_.set_read_dictionary(0, true);
- std::vector<std::shared_ptr<Array>> chunks(4);
- const int64_t chunk_size = expected_dense_->num_rows() / 4;
- for (int i = 0; i < 4; ++i) {
+ std::vector<std::shared_ptr<Array>> chunks(kNumRowGroups);
+ const int64_t chunk_size = expected_dense_->num_rows() / kNumRowGroups;
+ for (int i = 0; i < kNumRowGroups; ++i) {
AsDictionaryEncoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]);
}
auto ex_table = MakeSimpleTable(std::make_shared<ChunkedArray>(chunks),
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 4562d7b..4626869 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -1311,7 +1311,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
private:
using BinaryDictDecoder = DictDecoder<ByteArrayType>;
- ::arrow::BinaryDictionaryBuilder builder_;
+ ::arrow::BinaryDictionary32Builder builder_;
std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
};
diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc
index 71659ea..2079f25 100644
--- a/cpp/src/parquet/encoding-benchmark.cc
+++ b/cpp/src/parquet/encoding-benchmark.cc
@@ -36,6 +36,7 @@ using arrow::default_memory_pool;
using arrow::MemoryPool;
namespace {
+
// The min/max number of values used to drive each family of encoding benchmarks
constexpr int MIN_RANGE = 1024;
constexpr int MAX_RANGE = 65536;
@@ -335,7 +336,7 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture {
std::shared_ptr<Buffer> buffer_;
};
-using ::arrow::BinaryDictionaryBuilder;
+using ::arrow::BinaryDictionary32Builder;
using ::arrow::internal::ChunkedBinaryBuilder;
template <>
@@ -346,9 +347,9 @@ std::unique_ptr<ChunkedBinaryBuilder> BenchmarkDecodeArrow::CreateBuilder() {
}
template <>
-std::unique_ptr<BinaryDictionaryBuilder> BenchmarkDecodeArrow::CreateBuilder() {
- return std::unique_ptr<BinaryDictionaryBuilder>(
- new BinaryDictionaryBuilder(default_memory_pool()));
+std::unique_ptr<BinaryDictionary32Builder> BenchmarkDecodeArrow::CreateBuilder() {
+ return std::unique_ptr<BinaryDictionary32Builder>(
+ new BinaryDictionary32Builder(default_memory_pool()));
}
// ----------------------------------------------------------------------
@@ -379,12 +380,14 @@ BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense)
->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dict)
-(benchmark::State& state) { DecodeArrowBenchmark<BinaryDictionaryBuilder>(state); }
+(benchmark::State& state) { DecodeArrowBenchmark<BinaryDictionary32Builder>(state); }
BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dict)
->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict)
-(benchmark::State& state) { DecodeArrowNonNullBenchmark<BinaryDictionaryBuilder>(state); }
+(benchmark::State& state) {
+ DecodeArrowNonNullBenchmark<BinaryDictionary32Builder>(state);
+}
BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict)
->Range(MIN_RANGE, MAX_RANGE);
@@ -439,12 +442,14 @@ BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dense)
->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrow_Dict)
-(benchmark::State& state) { DecodeArrowBenchmark<BinaryDictionaryBuilder>(state); }
+(benchmark::State& state) { DecodeArrowBenchmark<BinaryDictionary32Builder>(state); }
BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrow_Dict)
->Range(MIN_RANGE, MAX_RANGE);
BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dict)
-(benchmark::State& state) { DecodeArrowNonNullBenchmark<BinaryDictionaryBuilder>(state); }
+(benchmark::State& state) {
+ DecodeArrowNonNullBenchmark<BinaryDictionary32Builder>(state);
+}
BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dict)
->Range(MIN_RANGE, MAX_RANGE);
diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc
index 799b81d..9497534 100644
--- a/cpp/src/parquet/encoding-test.cc
+++ b/cpp/src/parquet/encoding-test.cc
@@ -328,7 +328,7 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) {
class TestArrowBuilderDecoding : public ::testing::Test {
public:
using DenseBuilder = ::arrow::internal::ChunkedBinaryBuilder;
- using DictBuilder = ::arrow::BinaryDictionaryBuilder;
+ using DictBuilder = ::arrow::BinaryDictionary32Builder;
void SetUp() override { null_probabilities_ = {0.0, 0.5, 1.0}; }
void TearDown() override {}
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index ef42e88..26b9050 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -711,7 +711,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset,
- ::arrow::BinaryDictionaryBuilder* builder) override {
+ ::arrow::BinaryDictionary32Builder* builder) override {
int result = 0;
PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
valid_bits_offset, builder, &result));
@@ -728,7 +728,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
}
int DecodeArrowNonNull(int num_values,
- ::arrow::BinaryDictionaryBuilder* builder) override {
+ ::arrow::BinaryDictionary32Builder* builder) override {
int result = 0;
PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
return result;
@@ -878,12 +878,12 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
if (num_values > 0) {
// TODO(wesm): Refactor to batch reads for improved memory use. It is not
// trivial because the null_count is relative to the entire bitmap
- PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int64_t>(
+ PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
num_values, /*shrink_to_fit=*/false));
}
auto indices_buffer =
- reinterpret_cast<int64_t*>(indices_scratch_space_->mutable_data());
+ reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
if (num_values != idx_decoder_.GetBatchSpaced(num_values, null_count, valid_bits,
valid_bits_offset, indices_buffer)) {
@@ -898,7 +898,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
bit_reader.Next();
}
- auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder);
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
PARQUET_THROW_NOT_OK(
binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data()));
num_values_ -= num_values;
@@ -912,15 +912,15 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
// TODO(wesm): Refactor to batch reads for improved memory use. This is
// relatively simple here because we don't have to do any bookkeeping of
// nulls
- PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int64_t>(
+ PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
num_values, /*shrink_to_fit=*/false));
}
auto indices_buffer =
- reinterpret_cast<int64_t*>(indices_scratch_space_->mutable_data());
+ reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) {
ParquetException::EofException();
}
- auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder);
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values));
num_values_ -= num_values;
return num_values;
@@ -952,7 +952,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
std::shared_ptr<ResizableBuffer> byte_array_offsets_;
// Reusable buffer for decoding dictionary indices to be appended to a
- // BinaryDictionaryBuilder
+ // BinaryDictionary32Builder
std::shared_ptr<ResizableBuffer> indices_scratch_space_;
::arrow::util::RleDecoder idx_decoder_;
@@ -1024,7 +1024,7 @@ void DictDecoderImpl<Type>::InsertDictionary(::arrow::ArrayBuilder* builder) {
template <>
void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
- auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder);
+ auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
// Make a BinaryArray referencing the internal dictionary data
auto arr = std::make_shared<::arrow::BinaryArray>(
@@ -1040,7 +1040,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset,
- ::arrow::BinaryDictionaryBuilder* builder) override {
+ ::arrow::BinaryDictionary32Builder* builder) override {
int result = 0;
PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
valid_bits_offset, builder, &result));
@@ -1057,7 +1057,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
}
int DecodeArrowNonNull(int num_values,
- ::arrow::BinaryDictionaryBuilder* builder) override {
+ ::arrow::BinaryDictionary32Builder* builder) override {
int result = 0;
PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
return result;
diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h
index 4918a13..5aa1fed 100644
--- a/cpp/src/parquet/encoding.h
+++ b/cpp/src/parquet/encoding.h
@@ -29,7 +29,7 @@
namespace arrow {
class ArrayBuilder;
-class BinaryDictionaryBuilder;
+class BinaryDictionary32Builder;
namespace internal {
@@ -225,10 +225,10 @@ class ByteArrayDecoder : virtual public TypedDecoder<ByteArrayType> {
virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset,
- ::arrow::BinaryDictionaryBuilder* builder) = 0;
+ ::arrow::BinaryDictionary32Builder* builder) = 0;
virtual int DecodeArrowNonNull(int num_values,
- ::arrow::BinaryDictionaryBuilder* builder) = 0;
+ ::arrow::BinaryDictionary32Builder* builder) = 0;
virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
int64_t valid_bits_offset,