You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/07/29 14:42:23 UTC

[arrow] branch master updated: ARROW-6042: [C++][Parquet] Add Dictionary32Builder that always returns 32-bit dictionary indices

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 089e3db  ARROW-6042: [C++][Parquet] Add Dictionary32Builder that always returns 32-bit dictionary indices
089e3db is described below

commit 089e3db8859f48ad32657165788bb50373ddef75
Author: Wes McKinney <we...@apache.org>
AuthorDate: Mon Jul 29 09:42:08 2019 -0500

    ARROW-6042: [C++][Parquet] Add Dictionary32Builder that always returns 32-bit dictionary indices
    
    Without this, DictionaryArrays produced by different parts of a Parquet file could have different index types depending on the cardinality of each decoded part.
    
    Refactors DictionaryBuilder (which uses AdaptiveIntBuilder) and Dictionary32Builder (which uses Int32Builder) to have a common base class.
    
    Closes #4956 from wesm/ARROW-6042 and squashes the following commits:
    
    5210b30be <Wes McKinney> Make implementation templates internal
    8603ded84 <Wes McKinney> Change Parquet to use 32-bit dictionary builder
    c2e40ed37 <Wes McKinney> Implement 32-bit-only variant of DictionaryBuilder, use common base class
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/arrow/array-dict-test.cc                  |  40 +++++-
 cpp/src/arrow/array/builder_dict.h                | 168 +++++++++++++++-------
 cpp/src/parquet/arrow/arrow-reader-writer-test.cc |  16 ++-
 cpp/src/parquet/column_reader.cc                  |   2 +-
 cpp/src/parquet/encoding-benchmark.cc             |  21 +--
 cpp/src/parquet/encoding-test.cc                  |   2 +-
 cpp/src/parquet/encoding.cc                       |  24 ++--
 cpp/src/parquet/encoding.h                        |   6 +-
 8 files changed, 191 insertions(+), 88 deletions(-)

diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc
index ff00620..a930572 100644
--- a/cpp/src/arrow/array-dict-test.cc
+++ b/cpp/src/arrow/array-dict-test.cc
@@ -282,6 +282,26 @@ TYPED_TEST(TestDictionaryBuilder, DoubleDeltaDictionary) {
   ASSERT_TRUE(expected_delta2.Equals(result_delta2));
 }
 
+TYPED_TEST(TestDictionaryBuilder, Dictionary32_BasicPrimitive) {
+  using c_type = typename TypeParam::c_type;
+  auto type = std::make_shared<TypeParam>();
+  auto dict_type = dictionary(int32(), type);
+
+  Dictionary32Builder<TypeParam> builder;
+
+  ASSERT_OK(builder.Append(static_cast<c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<c_type>(2)));
+  std::shared_ptr<Array> result;
+  FinishAndCheckPadding(&builder, &result);
+
+  // Build expected data for the initial dictionary
+  auto ex_dict1 = ArrayFromJSON(type, "[1, 2]");
+  DictionaryArray expected(dict_type, ArrayFromJSON(int32(), "[0, 1, 0, 1]"), ex_dict1);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
 TEST(TestStringDictionaryBuilder, Basic) {
   // Build the dictionary Array
   StringDictionaryBuilder builder;
@@ -301,11 +321,14 @@ TEST(TestStringDictionaryBuilder, Basic) {
   ASSERT_TRUE(expected.Equals(result));
 }
 
-TEST(TestStringDictionaryBuilder, AppendIndices) {
+template <typename BuilderType, typename IndexType, typename AppendCType>
+void TestStringDictionaryAppendIndices() {
+  auto index_type = TypeTraits<IndexType>::type_singleton();
+
   auto ex_dict = ArrayFromJSON(utf8(), R"(["c", "a", "b", "d"])");
   auto invalid_dict = ArrayFromJSON(binary(), R"(["e", "f"])");
 
-  StringDictionaryBuilder builder;
+  BuilderType builder;
   ASSERT_OK(builder.InsertMemoValues(*ex_dict));
 
   // Inserting again should have no effect
@@ -314,7 +337,7 @@ TEST(TestStringDictionaryBuilder, AppendIndices) {
   // Type mismatch
   ASSERT_RAISES(Invalid, builder.InsertMemoValues(*invalid_dict));
 
-  std::vector<int64_t> raw_indices = {0, 1, 2, -1, 3};
+  std::vector<AppendCType> raw_indices = {0, 1, 2, -1, 3};
   std::vector<uint8_t> is_valid = {1, 1, 1, 0, 1};
   for (int i = 0; i < 2; ++i) {
     ASSERT_OK(builder.AppendIndices(
@@ -326,12 +349,19 @@ TEST(TestStringDictionaryBuilder, AppendIndices) {
   std::shared_ptr<Array> result;
   ASSERT_OK(builder.Finish(&result));
 
-  auto ex_indices = ArrayFromJSON(int8(), R"([0, 1, 2, null, 3, 0, 1, 2, null, 3])");
-  auto dtype = dictionary(int8(), utf8());
+  auto ex_indices = ArrayFromJSON(index_type, R"([0, 1, 2, null, 3, 0, 1, 2, null, 3])");
+  auto dtype = dictionary(index_type, utf8());
   DictionaryArray expected(dtype, ex_indices, ex_dict);
   ASSERT_TRUE(expected.Equals(result));
 }
 
+TEST(TestStringDictionaryBuilder, AppendIndices) {
+  // Currently AdaptiveIntBuilder only accepts int64_t in bulk appends
+  TestStringDictionaryAppendIndices<StringDictionaryBuilder, Int8Type, int64_t>();
+
+  TestStringDictionaryAppendIndices<StringDictionary32Builder, Int32Type, int32_t>();
+}
+
 TEST(TestStringDictionaryBuilder, ArrayInit) {
   auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])");
   auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
index e1cfe8a..4a93d95 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -20,8 +20,9 @@
 #include <algorithm>
 #include <memory>
 
-#include "arrow/array/builder_adaptive.h"  // IWYU pragma: export
-#include "arrow/array/builder_base.h"      // IWYU pragma: export
+#include "arrow/array/builder_adaptive.h"   // IWYU pragma: export
+#include "arrow/array/builder_base.h"       // IWYU pragma: export
+#include "arrow/array/builder_primitive.h"  // IWYU pragma: export
 
 #include "arrow/array.h"
 
@@ -84,8 +85,6 @@ class ARROW_EXPORT DictionaryMemoTable {
   std::unique_ptr<DictionaryMemoTableImpl> impl_;
 };
 
-}  // namespace internal
-
 /// \brief Array builder for created encoded DictionaryArray from
 /// dense array
 ///
@@ -95,50 +94,50 @@ class ARROW_EXPORT DictionaryMemoTable {
 /// build a delta dictionary when new terms occur.
 ///
 /// data
-template <typename T>
-class DictionaryBuilder : public ArrayBuilder {
+template <typename BuilderType, typename T>
+class DictionaryBuilderBase : public ArrayBuilder {
  public:
-  using Scalar = typename internal::DictionaryScalar<T>::type;
+  using Scalar = typename DictionaryScalar<T>::type;
 
   // WARNING: the type given below is the value type, not the DictionaryType.
   // The DictionaryType is instantiated on the Finish() call.
   template <typename T1 = T>
-  DictionaryBuilder(
+  DictionaryBuilderBase(
       typename std::enable_if<!std::is_base_of<FixedSizeBinaryType, T1>::value,
                               const std::shared_ptr<DataType>&>::type type,
       MemoryPool* pool = default_memory_pool())
       : ArrayBuilder(type, pool),
-        memo_table_(new internal::DictionaryMemoTable(type)),
+        memo_table_(new DictionaryMemoTable(type)),
         delta_offset_(0),
         byte_width_(-1),
         values_builder_(pool) {}
 
   template <typename T1 = T>
-  explicit DictionaryBuilder(
+  explicit DictionaryBuilderBase(
       typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T1>::value,
                               const std::shared_ptr<DataType>&>::type type,
       MemoryPool* pool = default_memory_pool())
       : ArrayBuilder(type, pool),
-        memo_table_(new internal::DictionaryMemoTable(type)),
+        memo_table_(new DictionaryMemoTable(type)),
         delta_offset_(0),
         byte_width_(static_cast<const T1&>(*type).byte_width()),
         values_builder_(pool) {}
 
   template <typename T1 = T>
-  explicit DictionaryBuilder(
+  explicit DictionaryBuilderBase(
       typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool =
           default_memory_pool())
-      : DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
+      : DictionaryBuilderBase<BuilderType, T1>(TypeTraits<T1>::type_singleton(), pool) {}
 
-  DictionaryBuilder(const std::shared_ptr<Array>& dictionary,
-                    MemoryPool* pool = default_memory_pool())
+  DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
+                        MemoryPool* pool = default_memory_pool())
       : ArrayBuilder(dictionary->type(), pool),
-        memo_table_(new internal::DictionaryMemoTable(dictionary)),
+        memo_table_(new DictionaryMemoTable(dictionary)),
         delta_offset_(0),
         byte_width_(-1),
         values_builder_(pool) {}
 
-  ~DictionaryBuilder() override = default;
+  ~DictionaryBuilderBase() override = default;
 
   /// \brief Append a scalar value
   Status Append(const Scalar& value) {
@@ -189,18 +188,6 @@ class DictionaryBuilder : public ArrayBuilder {
     return memo_table_->InsertValues(values);
   }
 
-  /// \brief Append dictionary indices directly without modifying memo
-  ///
-  /// NOTE: Experimental API
-  Status AppendIndices(const int64_t* values, int64_t length,
-                       const uint8_t* valid_bytes = NULLPTR) {
-    int64_t null_count_before = values_builder_.null_count();
-    ARROW_RETURN_NOT_OK(values_builder_.AppendValues(values, length, valid_bytes));
-    length_ += length;
-    null_count_ += values_builder_.null_count() - null_count_before;
-    return Status::OK();
-  }
-
   /// \brief Append a whole dense array to the builder
   template <typename T1 = T>
   Status AppendArray(
@@ -242,7 +229,7 @@ class DictionaryBuilder : public ArrayBuilder {
   void Reset() override {
     ArrayBuilder::Reset();
     values_builder_.Reset();
-    memo_table_.reset(new internal::DictionaryMemoTable(type_));
+    memo_table_.reset(new DictionaryMemoTable(type_));
     delta_offset_ = 0;
   }
 
@@ -291,26 +278,27 @@ class DictionaryBuilder : public ArrayBuilder {
   bool is_building_delta() { return delta_offset_ > 0; }
 
  protected:
-  std::unique_ptr<internal::DictionaryMemoTable> memo_table_;
+  std::unique_ptr<DictionaryMemoTable> memo_table_;
 
   int32_t delta_offset_;
   // Only used for FixedSizeBinaryType
   int32_t byte_width_;
 
-  AdaptiveIntBuilder values_builder_;
+  BuilderType values_builder_;
 };
 
-template <>
-class DictionaryBuilder<NullType> : public ArrayBuilder {
+template <typename BuilderType>
+class DictionaryBuilderBase<BuilderType, NullType> : public ArrayBuilder {
  public:
-  DictionaryBuilder(const std::shared_ptr<DataType>& type,
-                    MemoryPool* pool = default_memory_pool())
+  DictionaryBuilderBase(const std::shared_ptr<DataType>& type,
+                        MemoryPool* pool = default_memory_pool())
       : ArrayBuilder(type, pool), values_builder_(pool) {}
-  explicit DictionaryBuilder(MemoryPool* pool = default_memory_pool())
+
+  explicit DictionaryBuilderBase(MemoryPool* pool = default_memory_pool())
       : ArrayBuilder(null(), pool), values_builder_(pool) {}
 
-  DictionaryBuilder(const std::shared_ptr<Array>& dictionary,
-                    MemoryPool* pool = default_memory_pool())
+  DictionaryBuilderBase(const std::shared_ptr<Array>& dictionary,
+                        MemoryPool* pool = default_memory_pool())
       : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {}
 
   /// \brief Append a scalar null value
@@ -362,16 +350,68 @@ class DictionaryBuilder<NullType> : public ArrayBuilder {
   Status Finish(std::shared_ptr<DictionaryArray>* out) { return FinishTyped(out); }
 
  protected:
-  AdaptiveIntBuilder values_builder_;
+  BuilderType values_builder_;
 };
 
-class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
+}  // namespace internal
+
+/// \brief A DictionaryArray builder that uses AdaptiveIntBuilder to return the
+/// smallest index size that can accommodate the dictionary indices
+template <typename T>
+class DictionaryBuilder : public internal::DictionaryBuilderBase<AdaptiveIntBuilder, T> {
  public:
-  using DictionaryBuilder::Append;
-  using DictionaryBuilder::AppendIndices;
-  using DictionaryBuilder::DictionaryBuilder;
+  using BASE = internal::DictionaryBuilderBase<AdaptiveIntBuilder, T>;
+  using BASE::BASE;
+
+  /// \brief Append dictionary indices directly without modifying memo
+  ///
+  /// NOTE: Experimental API
+  Status AppendIndices(const int64_t* values, int64_t length,
+                       const uint8_t* valid_bytes = NULLPTR) {
+    int64_t null_count_before = this->values_builder_.null_count();
+    ARROW_RETURN_NOT_OK(this->values_builder_.AppendValues(values, length, valid_bytes));
+    this->length_ += length;
+    this->null_count_ += this->values_builder_.null_count() - null_count_before;
+    return Status::OK();
+  }
+};
 
-  BinaryDictionaryBuilder() : BinaryDictionaryBuilder(default_memory_pool()) {}
+/// \brief A DictionaryArray builder that always returns int32 dictionary
+/// indices so that data cast to dictionary form will have a consistent index
+/// type, e.g. for creating a ChunkedArray
+template <typename T>
+class Dictionary32Builder : public internal::DictionaryBuilderBase<Int32Builder, T> {
+ public:
+  using BASE = internal::DictionaryBuilderBase<Int32Builder, T>;
+  using BASE::BASE;
+
+  /// \brief Append dictionary indices directly without modifying memo
+  ///
+  /// NOTE: Experimental API
+  Status AppendIndices(const int32_t* values, int64_t length,
+                       const uint8_t* valid_bytes = NULLPTR) {
+    int64_t null_count_before = this->values_builder_.null_count();
+    ARROW_RETURN_NOT_OK(this->values_builder_.AppendValues(values, length, valid_bytes));
+    this->length_ += length;
+    this->null_count_ += this->values_builder_.null_count() - null_count_before;
+    return Status::OK();
+  }
+};
+
+// ----------------------------------------------------------------------
+// Binary / Unicode builders with slightly expanded APIs
+
+namespace internal {
+
+template <typename T>
+class BinaryDictionaryBuilderImpl : public DictionaryBuilder<T> {
+ public:
+  using BASE = DictionaryBuilder<T>;
+  using BASE::Append;
+  using BASE::AppendIndices;
+  using BASE::BASE;
+
+  BinaryDictionaryBuilderImpl() : BinaryDictionaryBuilderImpl(default_memory_pool()) {}
 
   Status Append(const uint8_t* value, int32_t length) {
     return Append(reinterpret_cast<const char*>(value), length);
@@ -382,14 +422,16 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType
   }
 };
 
-/// \brief Dictionary array builder with convenience methods for strings
-class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
+template <typename T>
+class BinaryDictionary32BuilderImpl : public Dictionary32Builder<T> {
  public:
-  using DictionaryBuilder::Append;
-  using DictionaryBuilder::AppendIndices;
-  using DictionaryBuilder::DictionaryBuilder;
+  using BASE = Dictionary32Builder<T>;
+  using BASE::Append;
+  using BASE::AppendIndices;
+  using BASE::BASE;
 
-  StringDictionaryBuilder() : StringDictionaryBuilder(default_memory_pool()) {}
+  BinaryDictionary32BuilderImpl()
+      : BinaryDictionary32BuilderImpl(default_memory_pool()) {}
 
   Status Append(const uint8_t* value, int32_t length) {
     return Append(reinterpret_cast<const char*>(value), length);
@@ -400,4 +442,28 @@ class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType
   }
 };
 
+}  // namespace internal
+
+class BinaryDictionaryBuilder : public internal::BinaryDictionaryBuilderImpl<BinaryType> {
+  using BASE = internal::BinaryDictionaryBuilderImpl<BinaryType>;
+  using BASE::BASE;
+};
+
+class StringDictionaryBuilder : public internal::BinaryDictionaryBuilderImpl<StringType> {
+  using BASE = internal::BinaryDictionaryBuilderImpl<StringType>;
+  using BASE::BASE;
+};
+
+class BinaryDictionary32Builder
+    : public internal::BinaryDictionary32BuilderImpl<BinaryType> {
+  using BASE = internal::BinaryDictionary32BuilderImpl<BinaryType>;
+  using BASE::BASE;
+};
+
+class StringDictionary32Builder
+    : public internal::BinaryDictionary32BuilderImpl<StringType> {
+  using BASE = internal::BinaryDictionary32BuilderImpl<StringType>;
+  using BASE::BASE;
+};
+
 }  // namespace arrow
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index cfc1c1f..f733ea5 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -2736,20 +2736,22 @@ TEST(TestArrowWriterAdHoc, SchemaMismatch) {
 
 class TestArrowReadDictionary : public ::testing::TestWithParam<double> {
  public:
+  static constexpr int kNumRowGroups = 10;
+
   void SetUp() override {
     GenerateData(GetParam());
 
-    // Write 4 row groups; each row group will have a different dictionary
+    // Write kNumRowGroups row groups; each row group will have a different dictionary
     ASSERT_NO_FATAL_FAILURE(
-        WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / 4,
+        WriteTableToBuffer(expected_dense_, expected_dense_->num_rows() / kNumRowGroups,
                            default_arrow_writer_properties(), &buffer_));
 
     properties_ = default_arrow_reader_properties();
   }
 
   void GenerateData(double null_probability) {
-    constexpr int num_unique = 100;
-    constexpr int repeat = 100;
+    constexpr int num_unique = 1000;
+    constexpr int repeat = 50;
     constexpr int64_t min_length = 2;
     constexpr int64_t max_length = 100;
     ::arrow::random::RandomArrayGenerator rag(0);
@@ -2781,7 +2783,7 @@ class TestArrowReadDictionary : public ::testing::TestWithParam<double> {
 };
 
 void AsDictionaryEncoded(const Array& arr, std::shared_ptr<Array>* out) {
-  ::arrow::StringDictionaryBuilder builder(default_memory_pool());
+  ::arrow::StringDictionary32Builder builder(default_memory_pool());
   const auto& string_array = static_cast<const ::arrow::StringArray&>(arr);
   ASSERT_OK(builder.AppendArray(string_array));
   ASSERT_OK(builder.Finish(out));
@@ -2790,9 +2792,9 @@ void AsDictionaryEncoded(const Array& arr, std::shared_ptr<Array>* out) {
 TEST_P(TestArrowReadDictionary, ReadWholeFileDict) {
   properties_.set_read_dictionary(0, true);
 
-  std::vector<std::shared_ptr<Array>> chunks(4);
-  const int64_t chunk_size = expected_dense_->num_rows() / 4;
-  for (int i = 0; i < 4; ++i) {
+  std::vector<std::shared_ptr<Array>> chunks(kNumRowGroups);
+  const int64_t chunk_size = expected_dense_->num_rows() / kNumRowGroups;
+  for (int i = 0; i < kNumRowGroups; ++i) {
     AsDictionaryEncoded(*dense_values_->Slice(chunk_size * i, chunk_size), &chunks[i]);
   }
   auto ex_table = MakeSimpleTable(std::make_shared<ChunkedArray>(chunks),
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 4562d7b..4626869 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -1311,7 +1311,7 @@ class ByteArrayDictionaryRecordReader : public TypedRecordReader<ByteArrayType>,
  private:
   using BinaryDictDecoder = DictDecoder<ByteArrayType>;
 
-  ::arrow::BinaryDictionaryBuilder builder_;
+  ::arrow::BinaryDictionary32Builder builder_;
   std::vector<std::shared_ptr<::arrow::Array>> result_chunks_;
 };
 
diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc
index 71659ea..2079f25 100644
--- a/cpp/src/parquet/encoding-benchmark.cc
+++ b/cpp/src/parquet/encoding-benchmark.cc
@@ -36,6 +36,7 @@ using arrow::default_memory_pool;
 using arrow::MemoryPool;
 
 namespace {
+
 // The min/max number of values used to drive each family of encoding benchmarks
 constexpr int MIN_RANGE = 1024;
 constexpr int MAX_RANGE = 65536;
@@ -335,7 +336,7 @@ class BenchmarkDecodeArrow : public ::benchmark::Fixture {
   std::shared_ptr<Buffer> buffer_;
 };
 
-using ::arrow::BinaryDictionaryBuilder;
+using ::arrow::BinaryDictionary32Builder;
 using ::arrow::internal::ChunkedBinaryBuilder;
 
 template <>
@@ -346,9 +347,9 @@ std::unique_ptr<ChunkedBinaryBuilder> BenchmarkDecodeArrow::CreateBuilder() {
 }
 
 template <>
-std::unique_ptr<BinaryDictionaryBuilder> BenchmarkDecodeArrow::CreateBuilder() {
-  return std::unique_ptr<BinaryDictionaryBuilder>(
-      new BinaryDictionaryBuilder(default_memory_pool()));
+std::unique_ptr<BinaryDictionary32Builder> BenchmarkDecodeArrow::CreateBuilder() {
+  return std::unique_ptr<BinaryDictionary32Builder>(
+      new BinaryDictionary32Builder(default_memory_pool()));
 }
 
 // ----------------------------------------------------------------------
@@ -379,12 +380,14 @@ BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dense)
     ->Range(MIN_RANGE, MAX_RANGE);
 
 BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrow_Dict)
-(benchmark::State& state) { DecodeArrowBenchmark<BinaryDictionaryBuilder>(state); }
+(benchmark::State& state) { DecodeArrowBenchmark<BinaryDictionary32Builder>(state); }
 BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrow_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
 BENCHMARK_DEFINE_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict)
-(benchmark::State& state) { DecodeArrowNonNullBenchmark<BinaryDictionaryBuilder>(state); }
+(benchmark::State& state) {
+  DecodeArrowNonNullBenchmark<BinaryDictionary32Builder>(state);
+}
 BENCHMARK_REGISTER_F(BM_PlainDecodingByteArray, DecodeArrowNonNull_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
@@ -439,12 +442,14 @@ BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dense)
     ->Range(MIN_RANGE, MAX_RANGE);
 
 BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrow_Dict)
-(benchmark::State& state) { DecodeArrowBenchmark<BinaryDictionaryBuilder>(state); }
+(benchmark::State& state) { DecodeArrowBenchmark<BinaryDictionary32Builder>(state); }
 BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrow_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
 BENCHMARK_DEFINE_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dict)
-(benchmark::State& state) { DecodeArrowNonNullBenchmark<BinaryDictionaryBuilder>(state); }
+(benchmark::State& state) {
+  DecodeArrowNonNullBenchmark<BinaryDictionary32Builder>(state);
+}
 BENCHMARK_REGISTER_F(BM_DictDecodingByteArray, DecodeArrowNonNull_Dict)
     ->Range(MIN_RANGE, MAX_RANGE);
 
diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc
index 799b81d..9497534 100644
--- a/cpp/src/parquet/encoding-test.cc
+++ b/cpp/src/parquet/encoding-test.cc
@@ -328,7 +328,7 @@ TEST(TestDictionaryEncoding, CannotDictDecodeBoolean) {
 class TestArrowBuilderDecoding : public ::testing::Test {
  public:
   using DenseBuilder = ::arrow::internal::ChunkedBinaryBuilder;
-  using DictBuilder = ::arrow::BinaryDictionaryBuilder;
+  using DictBuilder = ::arrow::BinaryDictionary32Builder;
 
   void SetUp() override { null_probabilities_ = {0.0, 0.5, 1.0}; }
   void TearDown() override {}
diff --git a/cpp/src/parquet/encoding.cc b/cpp/src/parquet/encoding.cc
index ef42e88..26b9050 100644
--- a/cpp/src/parquet/encoding.cc
+++ b/cpp/src/parquet/encoding.cc
@@ -711,7 +711,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
 
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
-                  ::arrow::BinaryDictionaryBuilder* builder) override {
+                  ::arrow::BinaryDictionary32Builder* builder) override {
     int result = 0;
     PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
                                      valid_bits_offset, builder, &result));
@@ -728,7 +728,7 @@ class PlainByteArrayDecoder : public PlainDecoder<ByteArrayType>,
   }
 
   int DecodeArrowNonNull(int num_values,
-                         ::arrow::BinaryDictionaryBuilder* builder) override {
+                         ::arrow::BinaryDictionary32Builder* builder) override {
     int result = 0;
     PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
     return result;
@@ -878,12 +878,12 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
     if (num_values > 0) {
       // TODO(wesm): Refactor to batch reads for improved memory use. It is not
       // trivial because the null_count is relative to the entire bitmap
-      PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int64_t>(
+      PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
           num_values, /*shrink_to_fit=*/false));
     }
 
     auto indices_buffer =
-        reinterpret_cast<int64_t*>(indices_scratch_space_->mutable_data());
+        reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
 
     if (num_values != idx_decoder_.GetBatchSpaced(num_values, null_count, valid_bits,
                                                   valid_bits_offset, indices_buffer)) {
@@ -898,7 +898,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
       bit_reader.Next();
     }
 
-    auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder);
+    auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
     PARQUET_THROW_NOT_OK(
         binary_builder->AppendIndices(indices_buffer, num_values, valid_bytes.data()));
     num_values_ -= num_values;
@@ -912,15 +912,15 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
       // TODO(wesm): Refactor to batch reads for improved memory use. This is
       // relatively simple here because we don't have to do any bookkeeping of
       // nulls
-      PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int64_t>(
+      PARQUET_THROW_NOT_OK(indices_scratch_space_->TypedResize<int32_t>(
           num_values, /*shrink_to_fit=*/false));
     }
     auto indices_buffer =
-        reinterpret_cast<int64_t*>(indices_scratch_space_->mutable_data());
+        reinterpret_cast<int32_t*>(indices_scratch_space_->mutable_data());
     if (num_values != idx_decoder_.GetBatch(indices_buffer, num_values)) {
       ParquetException::EofException();
     }
-    auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder);
+    auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
     PARQUET_THROW_NOT_OK(binary_builder->AppendIndices(indices_buffer, num_values));
     num_values_ -= num_values;
     return num_values;
@@ -952,7 +952,7 @@ class DictDecoderImpl : public DecoderImpl, virtual public DictDecoder<Type> {
   std::shared_ptr<ResizableBuffer> byte_array_offsets_;
 
   // Reusable buffer for decoding dictionary indices to be appended to a
-  // BinaryDictionaryBuilder
+  // BinaryDictionary32Builder
   std::shared_ptr<ResizableBuffer> indices_scratch_space_;
 
   ::arrow::util::RleDecoder idx_decoder_;
@@ -1024,7 +1024,7 @@ void DictDecoderImpl<Type>::InsertDictionary(::arrow::ArrayBuilder* builder) {
 
 template <>
 void DictDecoderImpl<ByteArrayType>::InsertDictionary(::arrow::ArrayBuilder* builder) {
-  auto binary_builder = checked_cast<::arrow::BinaryDictionaryBuilder*>(builder);
+  auto binary_builder = checked_cast<::arrow::BinaryDictionary32Builder*>(builder);
 
-  // Make an BinaryArray referencing the internal dictionary data
+  // Make a BinaryArray referencing the internal dictionary data
   auto arr = std::make_shared<::arrow::BinaryArray>(
@@ -1040,7 +1040,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
 
   int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                   int64_t valid_bits_offset,
-                  ::arrow::BinaryDictionaryBuilder* builder) override {
+                  ::arrow::BinaryDictionary32Builder* builder) override {
     int result = 0;
     PARQUET_THROW_NOT_OK(DecodeArrow(num_values, null_count, valid_bits,
                                      valid_bits_offset, builder, &result));
@@ -1057,7 +1057,7 @@ class DictByteArrayDecoderImpl : public DictDecoderImpl<ByteArrayType>,
   }
 
   int DecodeArrowNonNull(int num_values,
-                         ::arrow::BinaryDictionaryBuilder* builder) override {
+                         ::arrow::BinaryDictionary32Builder* builder) override {
     int result = 0;
     PARQUET_THROW_NOT_OK(DecodeArrowNonNull(num_values, builder, &result));
     return result;
diff --git a/cpp/src/parquet/encoding.h b/cpp/src/parquet/encoding.h
index 4918a13..5aa1fed 100644
--- a/cpp/src/parquet/encoding.h
+++ b/cpp/src/parquet/encoding.h
@@ -29,7 +29,7 @@
 namespace arrow {
 
 class ArrayBuilder;
-class BinaryDictionaryBuilder;
+class BinaryDictionary32Builder;
 
 namespace internal {
 
@@ -225,10 +225,10 @@ class ByteArrayDecoder : virtual public TypedDecoder<ByteArrayType> {
 
   virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                           int64_t valid_bits_offset,
-                          ::arrow::BinaryDictionaryBuilder* builder) = 0;
+                          ::arrow::BinaryDictionary32Builder* builder) = 0;
 
   virtual int DecodeArrowNonNull(int num_values,
-                                 ::arrow::BinaryDictionaryBuilder* builder) = 0;
+                                 ::arrow::BinaryDictionary32Builder* builder) = 0;
 
   virtual int DecodeArrow(int num_values, int null_count, const uint8_t* valid_bits,
                           int64_t valid_bits_offset,