You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/05/12 15:14:01 UTC

[arrow] branch master updated: ARROW-4725: [C++] Enable dictionary builder tests with MinGW build

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 828c44b  ARROW-4725: [C++] Enable dictionary builder tests with MinGW build
828c44b is described below

commit 828c44baa581077d3be23a346450895f1f352770
Author: Kouhei Sutou <ko...@clear-code.com>
AuthorDate: Sun May 12 10:13:48 2019 -0500

    ARROW-4725: [C++] Enable dictionary builder tests with MinGW build
    
    This stops exporting template classes that have internal
    implementations, because MinGW does not support that. All template
    class implementations are now exposed in the .h file, and internal
    details are hidden in the .cc file using the pimpl idiom.
    
    See also ARROW-4399.
    
    Author: Kouhei Sutou <ko...@clear-code.com>
    
    Closes #4255 from kou/cpp-dictionary-build-stop-export-template and squashes the following commits:
    
    e4d7f7d0a <Kouhei Sutou> Simplify
    9f3791766 <Kouhei Sutou>  Enable dictionary builder tests with MinGW build
---
 ci/appveyor-cpp-setup-mingw.bat     |   1 +
 cpp/src/arrow/CMakeLists.txt        |  28 +---
 cpp/src/arrow/array/builder_dict.cc | 313 +++++++++++++++---------------------
 cpp/src/arrow/array/builder_dict.h  | 225 +++++++++++++++++++++++---
 cpp/src/arrow/util/hashing.h        |  22 ++-
 5 files changed, 356 insertions(+), 233 deletions(-)

diff --git a/ci/appveyor-cpp-setup-mingw.bat b/ci/appveyor-cpp-setup-mingw.bat
index d65b9bd..4e4ae75 100644
--- a/ci/appveyor-cpp-setup-mingw.bat
+++ b/ci/appveyor-cpp-setup-mingw.bat
@@ -32,6 +32,7 @@ pacman --sync --refresh --noconfirm ^
     "%MINGW_PACKAGE_PREFIX%-flatbuffers" ^
     "%MINGW_PACKAGE_PREFIX%-gflags" ^
     "%MINGW_PACKAGE_PREFIX%-gobject-introspection" ^
+    "%MINGW_PACKAGE_PREFIX%-gtest" ^
     "%MINGW_PACKAGE_PREFIX%-gtk-doc" ^
     "%MINGW_PACKAGE_PREFIX%-lz4" ^
     "%MINGW_PACKAGE_PREFIX%-meson" ^
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index f9bc964..dc1701d 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -323,26 +323,14 @@ arrow_add_pkg_config("arrow")
 #
 
 add_arrow_test(allocator-test)
-
-if(WIN32)
-  add_arrow_test(array-test
-                 SOURCES
-                 array-test.cc
-                 array-binary-test.cc
-                 array-list-test.cc
-                 array-struct-test.cc
-                 array-union-test.cc)
-else()
-  add_arrow_test(array-test
-                 SOURCES
-                 array-test.cc
-                 array-binary-test.cc
-                 array-dict-test.cc
-                 array-list-test.cc
-                 array-struct-test.cc
-                 array-union-test.cc)
-endif()
-
+add_arrow_test(array-test
+               SOURCES
+               array-test.cc
+               array-binary-test.cc
+               array-dict-test.cc
+               array-list-test.cc
+               array-struct-test.cc
+               array-union-test.cc)
 add_arrow_test(buffer-test)
 
 if(ARROW_IPC)
diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc
index 2e43234..8af3628 100644
--- a/cpp/src/arrow/array/builder_dict.cc
+++ b/cpp/src/arrow/array/builder_dict.cc
@@ -17,7 +17,6 @@
 
 #include "arrow/array/builder_dict.h"
 
-#include <algorithm>
 #include <cstdint>
 #include <limits>
 #include <sstream>
@@ -145,229 +144,173 @@ Status DictionaryType::Unify(MemoryPool* pool, const std::vector<const DataType*
 // ----------------------------------------------------------------------
 // DictionaryBuilder
 
-template <typename T>
-class DictionaryBuilder<T>::MemoTableImpl
-    : public internal::HashTraits<T>::MemoTableType {
- public:
-  using MemoTableType = typename internal::HashTraits<T>::MemoTableType;
-  using MemoTableType::MemoTableType;
-
-  MemoTableImpl(const std::shared_ptr<Array>& dictionary)
-      : MemoTableImpl(dictionary->length()) {
-    const auto& values =
-        static_cast<const typename TypeTraits<T>::ArrayType&>(*dictionary);
-    for (int64_t i = 0; i < values.length(); ++i) {
-      ARROW_IGNORE_EXPR(this->GetOrInsert(values.GetView(i)));
+class internal::DictionaryMemoTable::DictionaryMemoTableImpl {
+  struct MemoTableInitializer {
+    std::shared_ptr<DataType> value_type_;
+    std::unique_ptr<MemoTable>* memo_table_;
+
+    Status Visit(const DataType&, void* = nullptr) {
+      return Status::NotImplemented("Initialization of ", value_type_,
+                                    " memo table is not implemented");
     }
-  }
-};
 
-template <typename T>
-DictionaryBuilder<T>::~DictionaryBuilder() {}
-
-template <typename T>
-DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<Array>& dictionary,
-                                        MemoryPool* pool)
-    : ArrayBuilder(dictionary->type(), pool),
-      memo_table_(new MemoTableImpl(dictionary)),
-      delta_offset_(0),
-      byte_width_(-1),
-      values_builder_(pool) {
-  DCHECK_EQ(T::type_id, type_->id()) << "inconsistent type passed to DictionaryBuilder";
-}
+    template <typename T>
+    Status Visit(const T&,
+                 typename internal::DictionaryTraits<T>::MemoTableType* = nullptr) {
+      using MemoTable = typename internal::DictionaryTraits<T>::MemoTableType;
+      memo_table_->reset(new MemoTable(0));
+      return Status::OK();
+    }
+  };
 
-template <typename T>
-DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
-                                        MemoryPool* pool)
-    : ArrayBuilder(type, pool),
-      memo_table_(new MemoTableImpl(0)),
-      delta_offset_(0),
-      byte_width_(-1),
-      values_builder_(pool) {
-  DCHECK_EQ(T::type_id, type->id()) << "inconsistent type passed to DictionaryBuilder";
-}
+  struct ArrayValuesInserter {
+    DictionaryMemoTableImpl* impl_;
 
-DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
-                                               MemoryPool* pool)
-    : ArrayBuilder(type, pool), values_builder_(pool) {
-  DCHECK_EQ(Type::NA, type->id()) << "inconsistent type passed to DictionaryBuilder";
-}
+    template <typename T>
+    Status Visit(const T& array) {
+      return InsertValues(array.type(), array);
+    }
 
-DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<Array>& dictionary,
-                                               MemoryPool* pool)
-    : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {
-  DCHECK_EQ(Type::NA, type_->id()) << "inconsistent type passed to DictionaryBuilder";
-}
+   private:
+    template <typename DataType, typename Array>
+    Status InsertValues(const DataType& type, const Array&, void* = nullptr) {
+      return Status::NotImplemented("Inserting array values of ", type,
+                                    " is not implemented");
+    }
 
-template <>
-DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
-    const std::shared_ptr<DataType>& type, MemoryPool* pool)
-    : ArrayBuilder(type, pool),
-      memo_table_(new MemoTableImpl(0)),
-      delta_offset_(0),
-      byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
-
-template <typename T>
-void DictionaryBuilder<T>::Reset() {
-  ArrayBuilder::Reset();
-  values_builder_.Reset();
-  memo_table_.reset(new MemoTableImpl(0));
-  delta_offset_ = 0;
-}
+    template <typename DataType, typename Array>
+    Status InsertValues(
+        const DataType&, const Array& array,
+        typename internal::DictionaryTraits<DataType>::MemoTableType* = nullptr) {
+      for (int64_t i = 0; i < array.length(); ++i) {
+        ARROW_IGNORE_EXPR(impl_->GetOrInsert(array.GetView(i)));
+      }
+      return Status::OK();
+    }
+  };
+
+  struct ArrayDataGetter {
+    std::shared_ptr<DataType> value_type_;
+    MemoTable* memo_table_;
+    MemoryPool* pool_;
+    int64_t start_offset_;
+    std::shared_ptr<ArrayData>* out_;
+
+    Status Visit(const DataType&, void* = nullptr) {
+      return Status::NotImplemented("Getting array data of ", value_type_,
+                                    " is not implemented");
+    }
 
-template <typename T>
-Status DictionaryBuilder<T>::Resize(int64_t capacity) {
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-  capacity = std::max(capacity, kMinBuilderCapacity);
+    template <typename T>
+    Status Visit(const T&,
+                 typename internal::DictionaryTraits<T>::MemoTableType* = nullptr) {
+      using ConcreteMemoTable = typename internal::DictionaryTraits<T>::MemoTableType;
+      auto memo_table = static_cast<ConcreteMemoTable*>(memo_table_);
+      return internal::DictionaryTraits<T>::GetDictionaryArrayData(
+          pool_, value_type_, *memo_table, start_offset_, out_);
+    }
+  };
 
-  if (capacity_ == 0) {
-    // Initialize hash table
-    // XXX should we let the user pass additional size heuristics?
-    delta_offset_ = 0;
+ public:
+  explicit DictionaryMemoTableImpl(const std::shared_ptr<DataType>& type)
+      : type_(type), memo_table_(nullptr) {
+    MemoTableInitializer visitor{type_, &memo_table_};
+    ARROW_IGNORE_EXPR(VisitTypeInline(*type_, &visitor));
   }
-  RETURN_NOT_OK(values_builder_.Resize(capacity));
-  capacity_ = values_builder_.capacity();
-  return Status::OK();
-}
-
-Status DictionaryBuilder<NullType>::Resize(int64_t capacity) {
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-  capacity = std::max(capacity, kMinBuilderCapacity);
 
-  RETURN_NOT_OK(values_builder_.Resize(capacity));
-  capacity_ = values_builder_.capacity();
-  return Status::OK();
-}
+  Status InsertValues(const std::shared_ptr<Array>& array) {
+    ArrayValuesInserter visitor{this};
+    return VisitArrayInline(*array, &visitor);
+  }
 
-template <typename T>
-Status DictionaryBuilder<T>::Append(const Scalar& value) {
-  RETURN_NOT_OK(Reserve(1));
+  template <typename T>
+  int32_t GetOrInsert(const T& value) {
+    using ConcreteMemoTable = typename internal::DictionaryTraits<
+        typename CTypeTraits<T>::ArrowType>::MemoTableType;
+    return static_cast<ConcreteMemoTable*>(memo_table_.get())->GetOrInsert(value);
+  }
 
-  auto memo_index = memo_table_->GetOrInsert(value);
-  RETURN_NOT_OK(values_builder_.Append(memo_index));
-  length_ += 1;
+  int32_t GetOrInsert(const util::string_view& value) {
+    return static_cast<BinaryMemoTable*>(memo_table_.get())->GetOrInsert(value);
+  }
 
-  return Status::OK();
-}
+  Status GetArrayData(MemoryPool* pool, int64_t start_offset,
+                      std::shared_ptr<ArrayData>* out) {
+    ArrayDataGetter visitor{type_, memo_table_.get(), pool, start_offset, out};
+    return VisitTypeInline(*type_, &visitor);
+  }
 
-template <typename T>
-Status DictionaryBuilder<T>::AppendNull() {
-  length_ += 1;
-  null_count_ += 1;
+  int32_t size() const { return memo_table_->size(); }
 
-  return values_builder_.AppendNull();
-}
+ private:
+  std::shared_ptr<DataType> type_;
+  std::unique_ptr<MemoTable> memo_table_;
+};
 
-template <typename T>
-Status DictionaryBuilder<T>::AppendNulls(int64_t length) {
-  length_ += length;
-  null_count_ += length;
+internal::DictionaryMemoTable::DictionaryMemoTable(const std::shared_ptr<DataType>& type)
+    : impl_(new DictionaryMemoTableImpl(type)) {}
 
-  return values_builder_.AppendNulls(length);
+internal::DictionaryMemoTable::DictionaryMemoTable(
+    const std::shared_ptr<Array>& dictionary)
+    : impl_(new DictionaryMemoTableImpl(dictionary->type())) {
+  ARROW_IGNORE_EXPR(impl_->InsertValues(dictionary));
 }
 
-Status DictionaryBuilder<NullType>::AppendNull() {
-  length_ += 1;
-  null_count_ += 1;
+internal::DictionaryMemoTable::~DictionaryMemoTable() = default;
 
-  return values_builder_.AppendNull();
+int32_t internal::DictionaryMemoTable::GetOrInsert(const bool& value) {
+  return impl_->GetOrInsert(value);
 }
 
-Status DictionaryBuilder<NullType>::AppendNulls(int64_t length) {
-  length_ += length;
-  null_count_ += length;
-
-  return values_builder_.AppendNulls(length);
+int32_t internal::DictionaryMemoTable::GetOrInsert(const int8_t& value) {
+  return impl_->GetOrInsert(value);
 }
 
-template <typename T>
-Status DictionaryBuilder<T>::AppendArray(const Array& array) {
-  using ArrayType = typename TypeTraits<T>::ArrayType;
-
-  const auto& concrete_array = checked_cast<const ArrayType&>(array);
-  for (int64_t i = 0; i < array.length(); i++) {
-    if (array.IsNull(i)) {
-      RETURN_NOT_OK(AppendNull());
-    } else {
-      RETURN_NOT_OK(Append(concrete_array.GetView(i)));
-    }
-  }
-  return Status::OK();
+int32_t internal::DictionaryMemoTable::GetOrInsert(const int16_t& value) {
+  return impl_->GetOrInsert(value);
 }
 
-template <>
-Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
-  if (!type_->Equals(*array.type())) {
-    return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
-  }
-
-  const auto& typed_array = checked_cast<const FixedSizeBinaryArray&>(array);
-  for (int64_t i = 0; i < array.length(); i++) {
-    if (array.IsNull(i)) {
-      RETURN_NOT_OK(AppendNull());
-    } else {
-      RETURN_NOT_OK(Append(typed_array.GetValue(i)));
-    }
-  }
-  return Status::OK();
+int32_t internal::DictionaryMemoTable::GetOrInsert(const int32_t& value) {
+  return impl_->GetOrInsert(value);
 }
 
-Status DictionaryBuilder<NullType>::AppendArray(const Array& array) {
-  for (int64_t i = 0; i < array.length(); i++) {
-    RETURN_NOT_OK(AppendNull());
-  }
-  return Status::OK();
+int32_t internal::DictionaryMemoTable::GetOrInsert(const int64_t& value) {
+  return impl_->GetOrInsert(value);
 }
 
-template <typename T>
-Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  // Finalize indices array
-  RETURN_NOT_OK(values_builder_.FinishInternal(out));
-
-  // Generate dictionary array from hash table contents
-  std::shared_ptr<Array> dictionary;
-  std::shared_ptr<ArrayData> dictionary_data;
+int32_t internal::DictionaryMemoTable::GetOrInsert(const uint8_t& value) {
+  return impl_->GetOrInsert(value);
+}
 
-  RETURN_NOT_OK(internal::DictionaryTraits<T>::GetDictionaryArrayData(
-      pool_, type_, *memo_table_, delta_offset_, &dictionary_data));
-  dictionary = MakeArray(dictionary_data);
+int32_t internal::DictionaryMemoTable::GetOrInsert(const uint16_t& value) {
+  return impl_->GetOrInsert(value);
+}
 
-  // Set type of array data to the right dictionary type
-  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+int32_t internal::DictionaryMemoTable::GetOrInsert(const uint32_t& value) {
+  return impl_->GetOrInsert(value);
+}
 
-  // Update internals for further uses of this DictionaryBuilder
-  delta_offset_ = memo_table_->size();
-  values_builder_.Reset();
+int32_t internal::DictionaryMemoTable::GetOrInsert(const uint64_t& value) {
+  return impl_->GetOrInsert(value);
+}
 
-  return Status::OK();
+int32_t internal::DictionaryMemoTable::GetOrInsert(const float& value) {
+  return impl_->GetOrInsert(value);
 }
 
-Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  std::shared_ptr<Array> dictionary = std::make_shared<NullArray>(0);
+int32_t internal::DictionaryMemoTable::GetOrInsert(const double& value) {
+  return impl_->GetOrInsert(value);
+}
 
-  RETURN_NOT_OK(values_builder_.FinishInternal(out));
-  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+int32_t internal::DictionaryMemoTable::GetOrInsert(const util::string_view& value) {
+  return impl_->GetOrInsert(value);
+}
 
-  return Status::OK();
+Status internal::DictionaryMemoTable::GetArrayData(MemoryPool* pool, int64_t start_offset,
+                                                   std::shared_ptr<ArrayData>* out) {
+  return impl_->GetArrayData(pool, start_offset, out);
 }
 
-template class DictionaryBuilder<UInt8Type>;
-template class DictionaryBuilder<UInt16Type>;
-template class DictionaryBuilder<UInt32Type>;
-template class DictionaryBuilder<UInt64Type>;
-template class DictionaryBuilder<Int8Type>;
-template class DictionaryBuilder<Int16Type>;
-template class DictionaryBuilder<Int32Type>;
-template class DictionaryBuilder<Int64Type>;
-template class DictionaryBuilder<Date32Type>;
-template class DictionaryBuilder<Date64Type>;
-template class DictionaryBuilder<Time32Type>;
-template class DictionaryBuilder<Time64Type>;
-template class DictionaryBuilder<TimestampType>;
-template class DictionaryBuilder<FloatType>;
-template class DictionaryBuilder<DoubleType>;
-template class DictionaryBuilder<FixedSizeBinaryType>;
-template class DictionaryBuilder<BinaryType>;
-template class DictionaryBuilder<StringType>;
+int32_t internal::DictionaryMemoTable::size() const { return impl_->size(); }
 
 }  // namespace arrow
diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h
index e7f44b9..84f2e87 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -17,11 +17,14 @@
 
 #pragma once
 
+#include <algorithm>
 #include <memory>
 
 #include "arrow/array/builder_adaptive.h"  // IWYU pragma: export
 #include "arrow/array/builder_base.h"      // IWYU pragma: export
 
+#include "arrow/array.h"
+
 namespace arrow {
 
 // ----------------------------------------------------------------------
@@ -49,6 +52,35 @@ struct DictionaryScalar<FixedSizeBinaryType> {
   using type = util::string_view;
 };
 
+class ARROW_EXPORT DictionaryMemoTable {
+ public:
+  explicit DictionaryMemoTable(const std::shared_ptr<DataType>& type);
+  explicit DictionaryMemoTable(const std::shared_ptr<Array>& dictionary);
+  ~DictionaryMemoTable();
+
+  int32_t GetOrInsert(const bool& value);
+  int32_t GetOrInsert(const int8_t& value);
+  int32_t GetOrInsert(const int16_t& value);
+  int32_t GetOrInsert(const int32_t& value);
+  int32_t GetOrInsert(const int64_t& value);
+  int32_t GetOrInsert(const uint8_t& value);
+  int32_t GetOrInsert(const uint16_t& value);
+  int32_t GetOrInsert(const uint32_t& value);
+  int32_t GetOrInsert(const uint64_t& value);
+  int32_t GetOrInsert(const float& value);
+  int32_t GetOrInsert(const double& value);
+  int32_t GetOrInsert(const util::string_view& value);
+
+  Status GetArrayData(MemoryPool* pool, int64_t start_offset,
+                      std::shared_ptr<ArrayData>* out);
+
+  int32_t size() const;
+
+ private:
+  class DictionaryMemoTableImpl;
+  std::unique_ptr<DictionaryMemoTableImpl> impl_;
+};
+
 }  // namespace internal
 
 /// \brief Array builder for created encoded DictionaryArray from dense array
@@ -60,25 +92,58 @@ struct DictionaryScalar<FixedSizeBinaryType> {
 ///
 /// data
 template <typename T>
-class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
+class DictionaryBuilder : public ArrayBuilder {
  public:
   using Scalar = typename internal::DictionaryScalar<T>::type;
 
   // WARNING: the type given below is the value type, not the DictionaryType.
   // The DictionaryType is instantiated on the Finish() call.
-  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+  template <typename T1 = T>
+  DictionaryBuilder(
+      typename std::enable_if<!std::is_base_of<FixedSizeBinaryType, T1>::value,
+                              const std::shared_ptr<DataType>&>::type type,
+      MemoryPool* pool)
+      : ArrayBuilder(type, pool),
+        memo_table_(new internal::DictionaryMemoTable(type)),
+        delta_offset_(0),
+        byte_width_(-1),
+        values_builder_(pool) {}
 
-  DictionaryBuilder(const std::shared_ptr<Array>& dictionary, MemoryPool* pool);
+  template <typename T1 = T>
+  explicit DictionaryBuilder(
+      typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T1>::value,
+                              const std::shared_ptr<DataType>&>::type type,
+      MemoryPool* pool)
+      : ArrayBuilder(type, pool),
+        memo_table_(new internal::DictionaryMemoTable(type)),
+        delta_offset_(0),
+        byte_width_(static_cast<const T1&>(*type).byte_width()),
+        values_builder_(pool) {}
 
   template <typename T1 = T>
   explicit DictionaryBuilder(
       typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool)
       : DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
 
-  ~DictionaryBuilder() override;
+  DictionaryBuilder(const std::shared_ptr<Array>& dictionary, MemoryPool* pool)
+      : ArrayBuilder(dictionary->type(), pool),
+        memo_table_(new internal::DictionaryMemoTable(dictionary)),
+        delta_offset_(0),
+        byte_width_(-1),
+        values_builder_(pool) {}
+
+  ~DictionaryBuilder() override = default;
 
   /// \brief Append a scalar value
-  Status Append(const Scalar& value);
+  Status Append(const Scalar& value) {
+    ARROW_RETURN_NOT_OK(Reserve(1));
+
+    auto memo_index = memo_table_->GetOrInsert(value);
+    ARROW_RETURN_NOT_OK(values_builder_.Append(memo_index));
+    length_ += 1;
+
+    return Status::OK();
+  }
 
   /// \brief Append a fixed-width string (only for FixedSizeBinaryType)
   template <typename T1 = T>
@@ -95,16 +160,100 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
   }
 
   /// \brief Append a scalar null value
-  Status AppendNull() final;
+  Status AppendNull() final {
+    length_ += 1;
+    null_count_ += 1;
 
-  Status AppendNulls(int64_t length) final;
+    return values_builder_.AppendNull();
+  }
+
+  Status AppendNulls(int64_t length) final {
+    length_ += length;
+    null_count_ += length;
+
+    return values_builder_.AppendNulls(length);
+  }
 
   /// \brief Append a whole dense array to the builder
-  Status AppendArray(const Array& array);
+  template <typename T1 = T>
+  Status AppendArray(
+      typename std::enable_if<!std::is_base_of<FixedSizeBinaryType, T1>::value,
+                              const Array&>::type array) {
+    using ArrayType = typename TypeTraits<T>::ArrayType;
+
+    const auto& concrete_array = static_cast<const ArrayType&>(array);
+    for (int64_t i = 0; i < array.length(); i++) {
+      if (array.IsNull(i)) {
+        ARROW_RETURN_NOT_OK(AppendNull());
+      } else {
+        ARROW_RETURN_NOT_OK(Append(concrete_array.GetView(i)));
+      }
+    }
+    return Status::OK();
+  }
 
-  void Reset() override;
-  Status Resize(int64_t capacity) override;
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+  template <typename T1 = T>
+  Status AppendArray(
+      typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T1>::value,
+                              const Array&>::type array) {
+    if (!type_->Equals(*array.type())) {
+      return Status::Invalid(
+          "Cannot append FixedSizeBinary array with non-matching type");
+    }
+
+    const auto& concrete_array = static_cast<const FixedSizeBinaryArray&>(array);
+    for (int64_t i = 0; i < array.length(); i++) {
+      if (array.IsNull(i)) {
+        ARROW_RETURN_NOT_OK(AppendNull());
+      } else {
+        ARROW_RETURN_NOT_OK(Append(concrete_array.GetValue(i)));
+      }
+    }
+    return Status::OK();
+  }
+
+  void Reset() override {
+    ArrayBuilder::Reset();
+    values_builder_.Reset();
+    memo_table_.reset(new internal::DictionaryMemoTable(type_));
+    delta_offset_ = 0;
+  }
+
+  Status Resize(int64_t capacity) override {
+    ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
+    capacity = std::max(capacity, kMinBuilderCapacity);
+
+    if (capacity_ == 0) {
+      // Initialize hash table
+      // XXX should we let the user pass additional size heuristics?
+      delta_offset_ = 0;
+    }
+    ARROW_RETURN_NOT_OK(values_builder_.Resize(capacity));
+    capacity_ = values_builder_.capacity();
+    return Status::OK();
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+    // Finalize indices array
+    ARROW_RETURN_NOT_OK(values_builder_.FinishInternal(out));
+
+    // Generate dictionary array from hash table contents
+    std::shared_ptr<Array> dictionary;
+    std::shared_ptr<ArrayData> dictionary_data;
+
+    ARROW_RETURN_NOT_OK(
+        memo_table_->GetArrayData(pool_, delta_offset_, &dictionary_data));
+    dictionary = MakeArray(dictionary_data);
+
+    // Set type of array data to the right dictionary type
+    (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+
+    // Update internals for further uses of this DictionaryBuilder
+    delta_offset_ = memo_table_->size();
+    values_builder_.Reset();
+
+    return Status::OK();
+  }
 
   /// \cond FALSE
   using ArrayBuilder::Finish;
@@ -116,8 +265,7 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
   bool is_building_delta() { return delta_offset_ > 0; }
 
  protected:
-  class MemoTableImpl;
-  std::unique_ptr<MemoTableImpl> memo_table_;
+  std::unique_ptr<internal::DictionaryMemoTable> memo_table_;
 
   int32_t delta_offset_;
   // Only used for FixedSizeBinaryType
@@ -127,23 +275,56 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
 };
 
 template <>
-class ARROW_EXPORT DictionaryBuilder<NullType> : public ArrayBuilder {
+class DictionaryBuilder<NullType> : public ArrayBuilder {
  public:
-  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
-  explicit DictionaryBuilder(MemoryPool* pool);
+  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
+      : ArrayBuilder(type, pool), values_builder_(pool) {}
+  explicit DictionaryBuilder(MemoryPool* pool)
+      : ArrayBuilder(null(), pool), values_builder_(pool) {}
 
-  DictionaryBuilder(const std::shared_ptr<Array>& dictionary, MemoryPool* pool);
+  DictionaryBuilder(const std::shared_ptr<Array>& dictionary, MemoryPool* pool)
+      : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {}
 
   /// \brief Append a scalar null value
-  Status AppendNull() final;
+  Status AppendNull() final {
+    length_ += 1;
+    null_count_ += 1;
+
+    return values_builder_.AppendNull();
+  }
+
+  Status AppendNulls(int64_t length) final {
+    length_ += length;
+    null_count_ += length;
 
-  Status AppendNulls(int64_t length) final;
+    return values_builder_.AppendNulls(length);
+  }
 
   /// \brief Append a whole dense array to the builder
-  Status AppendArray(const Array& array);
+  Status AppendArray(const Array& array) {
+    for (int64_t i = 0; i < array.length(); i++) {
+      ARROW_RETURN_NOT_OK(AppendNull());
+    }
+    return Status::OK();
+  }
+
+  Status Resize(int64_t capacity) override {
+    ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
+    capacity = std::max(capacity, kMinBuilderCapacity);
 
-  Status Resize(int64_t capacity) override;
-  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+    ARROW_RETURN_NOT_OK(values_builder_.Resize(capacity));
+    capacity_ = values_builder_.capacity();
+    return Status::OK();
+  }
+
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+    std::shared_ptr<Array> dictionary = std::make_shared<NullArray>(0);
+
+    ARROW_RETURN_NOT_OK(values_builder_.FinishInternal(out));
+    (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+
+    return Status::OK();
+  }
 
   /// \cond FALSE
   using ArrayBuilder::Finish;
diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h
index 044d4e9..2730158 100644
--- a/cpp/src/arrow/util/hashing.h
+++ b/cpp/src/arrow/util/hashing.h
@@ -333,13 +333,23 @@ class HashTable {
 // XXX typedef memo_index_t int32_t ?
 
 // ----------------------------------------------------------------------
+// A base class for memoization tables.
+
+class MemoTable {
+ public:
+  virtual ~MemoTable() = default;
+
+  virtual int32_t size() const = 0;
+};
+
+// ----------------------------------------------------------------------
 // A memoization table for memory-cheap scalar values.
 
 // The memoization table remembers and allows to look up the insertion
 // index for each key.
 
 template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
-class ScalarMemoTable {
+class ScalarMemoTable : public MemoTable {
  public:
   explicit ScalarMemoTable(int64_t entries = 0)
       : hash_table_(static_cast<uint64_t>(entries)) {}
@@ -382,7 +392,7 @@ class ScalarMemoTable {
 
   // The number of entries in the memo table
   // (which is also 1 + the largest memo index)
-  int32_t size() const { return static_cast<int32_t>(hash_table_.size()); }
+  int32_t size() const override { return static_cast<int32_t>(hash_table_.size()); }
 
   // Copy values starting from index `start` into `out_data`
   void CopyValues(int32_t start, Scalar* out_data) const {
@@ -435,7 +445,7 @@ struct SmallScalarTraits<Scalar,
 };
 
 template <typename Scalar, template <class> class HashTableTemplateType = HashTable>
-class SmallScalarMemoTable {
+class SmallScalarMemoTable : public MemoTable {
  public:
   explicit SmallScalarMemoTable(int64_t entries = 0) {
     std::fill(value_to_index_, value_to_index_ + cardinality, -1);
@@ -469,7 +479,7 @@ class SmallScalarMemoTable {
 
   // The number of entries in the memo table
   // (which is also 1 + the largest memo index)
-  int32_t size() const { return static_cast<int32_t>(index_to_value_.size()); }
+  int32_t size() const override { return static_cast<int32_t>(index_to_value_.size()); }
 
   // Copy values starting from index `start` into `out_data`
   void CopyValues(int32_t start, Scalar* out_data) const {
@@ -498,7 +508,7 @@ class SmallScalarMemoTable {
 // ----------------------------------------------------------------------
 // A memoization table for variable-sized binary data.
 
-class BinaryMemoTable {
+class BinaryMemoTable : public MemoTable {
  public:
   explicit BinaryMemoTable(int64_t entries = 0, int64_t values_size = -1)
       : hash_table_(static_cast<uint64_t>(entries)) {
@@ -576,7 +586,7 @@ class BinaryMemoTable {
 
   // The number of entries in the memo table
   // (which is also 1 + the largest memo index)
-  int32_t size() const { return static_cast<int32_t>(hash_table_.size()); }
+  int32_t size() const override { return static_cast<int32_t>(hash_table_.size()); }
 
   int32_t values_size() const { return static_cast<int32_t>(values_.size()); }