You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/19 18:54:15 UTC

[arrow] branch master updated: ARROW-554: [C++] Add functions to unify dictionary types and arrays

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new f66fa80  ARROW-554: [C++] Add functions to unify dictionary types and arrays
f66fa80 is described below

commit f66fa805e89aef948581876ac802b1ffc6430f5c
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed Dec 19 12:53:09 2018 -0600

    ARROW-554: [C++] Add functions to unify dictionary types and arrays
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #3165 from pitrou/ARROW-554-conform-dicts and squashes the following commits:
    
    7d2579b30 <Antoine Pitrou> ARROW-554:  Add functions to conform dictionaries
---
 cpp/src/arrow/array-dict-test.cc    |  62 +++++++++++++
 cpp/src/arrow/array.cc              |  61 +++++++++++++
 cpp/src/arrow/array.h               |  24 ++++-
 cpp/src/arrow/array/builder_dict.cc | 172 +++++++++++++++++++++++++++---------
 cpp/src/arrow/type-test.cc          | 128 +++++++++++++++++++++++++++
 cpp/src/arrow/type.cc               |   7 +-
 cpp/src/arrow/type.h                |  18 ++++
 cpp/src/arrow/util/hashing.h        |  38 --------
 cpp/src/arrow/util/int-util-test.cc |   9 ++
 cpp/src/arrow/util/int-util.cc      |  40 +++++++++
 cpp/src/arrow/util/int-util.h       |   4 +
 cpp/src/arrow/visitor_inline.h      |   2 +-
 python/pyarrow/tests/test_types.py  |   4 +-
 13 files changed, 484 insertions(+), 85 deletions(-)

diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc
index 87cb229..730b891 100644
--- a/cpp/src/arrow/array-dict-test.cc
+++ b/cpp/src/arrow/array-dict-test.cc
@@ -31,6 +31,7 @@
 #include "arrow/test-common.h"
 #include "arrow/test-util.h"
 #include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
 
 namespace arrow {
@@ -38,6 +39,8 @@ namespace arrow {
 using std::string;
 using std::vector;
 
+using internal::checked_cast;
+
 // ----------------------------------------------------------------------
 // Dictionary tests
 
@@ -740,4 +743,63 @@ TEST(TestDictionary, FromArray) {
   ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, &arr4));
 }
 
+TEST(TestDictionary, TransposeBasic) {
+  std::shared_ptr<Array> arr, out, expected;
+
+  auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
+  auto dict_type = dictionary(int16(), dict);
+  auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]");
+  // ["B", "C", "A", "A"]
+  ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr));
+
+  // Transpose to same index type
+  {
+    auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
+    auto out_dict_type = dictionary(int16(), out_dict);
+
+    const std::vector<int32_t> transpose_map{1, 3, 2};
+    ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
+        default_memory_pool(), out_dict_type, transpose_map, &out));
+
+    auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]");
+    ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
+    AssertArraysEqual(*out, *expected);
+  }
+
+  // Transpose to other type
+  {
+    auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
+    auto out_dict_type = dictionary(int8(), out_dict);
+
+    const std::vector<int32_t> transpose_map{1, 3, 2};
+    ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
+        default_memory_pool(), out_dict_type, transpose_map, &out));
+
+    auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]");
+    ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
+    AssertArraysEqual(*expected, *out);
+  }
+}
+
+TEST(TestDictionary, TransposeNulls) {
+  std::shared_ptr<Array> arr, out, expected;
+
+  auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
+  auto dict_type = dictionary(int16(), dict);
+  auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]");
+  // ["B", "C", null, "A"]
+  ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr));
+
+  auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
+  auto out_dict_type = dictionary(int16(), out_dict);
+
+  const std::vector<int32_t> transpose_map{1, 3, 2};
+  ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
+      default_memory_pool(), out_dict_type, transpose_map, &out));
+
+  auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]");
+  ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
+  AssertArraysEqual(*expected, *out);
+}
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index ff94aa2..7e45e90 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -33,6 +33,7 @@
 #include "arrow/util/bit-util.h"
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/decimal.h"
+#include "arrow/util/int-util.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
 #include "arrow/visitor.h"
@@ -663,6 +664,66 @@ std::shared_ptr<Array> DictionaryArray::dictionary() const {
   return dict_type_->dictionary();
 }
 
+template <typename InType, typename OutType>
+static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data,
+                                   const std::shared_ptr<DataType>& type,
+                                   const std::vector<int32_t>& transpose_map,
+                                   std::shared_ptr<Array>* out) {
+  using in_c_type = typename InType::c_type;
+  using out_c_type = typename OutType::c_type;
+
+  std::shared_ptr<Buffer> out_buffer;
+  RETURN_NOT_OK(AllocateBuffer(pool, in_data.length * sizeof(out_c_type), &out_buffer));
+  // Null bitmap is unchanged
+  auto out_data = ArrayData::Make(type, in_data.length, {in_data.buffers[0], out_buffer},
+                                  in_data.null_count);
+  internal::TransposeInts(in_data.GetValues<in_c_type>(1),
+                          out_data->GetMutableValues<out_c_type>(1), in_data.length,
+                          transpose_map.data());
+  *out = MakeArray(out_data);
+  return Status::OK();
+}
+
+Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+                                  const std::vector<int32_t>& transpose_map,
+                                  std::shared_ptr<Array>* out) const {
+  DCHECK_EQ(type->id(), Type::DICTIONARY);
+  const auto& out_dict_type = checked_cast<const DictionaryType&>(*type);
+
+  // XXX We'll probably want to make this operation a kernel when we
+  // implement dictionary-to-dictionary casting.
+  auto in_type_id = dict_type_->index_type()->id();
+  auto out_type_id = out_dict_type.index_type()->id();
+
+#define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE)                        \
+  case OUT_INDEX_TYPE::type_id:                                                     \
+    return TransposeDictIndices<IN_INDEX_TYPE, OUT_INDEX_TYPE>(pool, *data(), type, \
+                                                               transpose_map, out);
+
+#define TRANSPOSE_IN_CASE(IN_INDEX_TYPE)                        \
+  case IN_INDEX_TYPE::type_id:                                  \
+    switch (out_type_id) {                                      \
+      TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int8Type)            \
+      TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int16Type)           \
+      TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int32Type)           \
+      TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int64Type)           \
+      default:                                                  \
+        return Status::NotImplemented("unexpected index type"); \
+    }
+
+  switch (in_type_id) {
+    TRANSPOSE_IN_CASE(Int8Type)
+    TRANSPOSE_IN_CASE(Int16Type)
+    TRANSPOSE_IN_CASE(Int32Type)
+    TRANSPOSE_IN_CASE(Int64Type)
+    default:
+      return Status::NotImplemented("unexpected index type");
+  }
+
+#undef TRANSPOSE_IN_OUT_CASE
+#undef TRANSPOSE_IN_CASE
+}
+
 // ----------------------------------------------------------------------
 // Implement Array::Accept as inline visitor
 
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 52c5207..aead17f 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -422,6 +422,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray {
 
   value_type Value(int64_t i) const { return raw_values()[i]; }
 
+  // For API compatibility with BinaryArray etc.
+  value_type GetView(int64_t i) const { return Value(i); }
+
  protected:
   using PrimitiveArray::PrimitiveArray;
 };
@@ -442,6 +445,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
                            i + data_->offset);
   }
 
+  bool GetView(int64_t i) const { return Value(i); }
+
  protected:
   using PrimitiveArray::PrimitiveArray;
 };
@@ -802,7 +807,7 @@ class ARROW_EXPORT DictionaryArray : public Array {
   /// This function does the validation of the indices and input type. It checks if
   /// all indices are non-negative and smaller than the size of the dictionary
   ///
-  /// \param[in] type a data type containing a dictionary
+  /// \param[in] type a dictionary type
   /// \param[in] indices an array of non-negative signed
   /// integers smaller than the size of the dictionary
   /// \param[out] out the resulting DictionaryArray instance
@@ -810,6 +815,23 @@ class ARROW_EXPORT DictionaryArray : public Array {
                            const std::shared_ptr<Array>& indices,
                            std::shared_ptr<Array>* out);
 
+  /// \brief Transpose this DictionaryArray
+  ///
+  /// This method constructs a new dictionary array with the given dictionary type,
+  /// transposing indices using the transpose map.
+  /// The type and the transpose map are typically computed using
+  /// DictionaryType::Unify.
+  ///
+  /// \param[in] pool a pool to allocate the array data from
+  /// \param[in] type a dictionary type
+  /// \param[in] transpose_map a vector transposing this array's indices
+  /// into the target array's indices
+  /// \param[out] out the resulting DictionaryArray instance
+  Status Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+                   const std::vector<int32_t>& transpose_map,
+                   std::shared_ptr<Array>* out) const;
+  // XXX Do we also want an unsafe in-place Transpose?
+
   std::shared_ptr<Array> indices() const;
   std::shared_ptr<Array> dictionary() const;
 
diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc
index 0891e4c..e534c3c 100644
--- a/cpp/src/arrow/array/builder_dict.cc
+++ b/cpp/src/arrow/array/builder_dict.cc
@@ -19,6 +19,9 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <limits>
+#include <sstream>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -30,12 +33,118 @@
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/hashing.h"
 #include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
 
 namespace arrow {
 
 using internal::checked_cast;
 
 // ----------------------------------------------------------------------
+// DictionaryType unification
+
+struct UnifyDictionaryValues {
+  MemoryPool* pool_;
+  std::shared_ptr<DataType> value_type_;
+  const std::vector<const DictionaryType*>& types_;
+  std::shared_ptr<Array>* out_values_;
+  std::vector<std::vector<int32_t>>* out_transpose_maps_;
+
+  Status Visit(const DataType&, void* = nullptr) {
+    // Default implementation for non-dictionary-supported datatypes
+    std::stringstream ss;
+    ss << "Unification of " << value_type_->ToString()
+       << " dictionaries is not implemented";
+    return Status::NotImplemented(ss.str());
+  }
+
+  template <typename T>
+  Status Visit(const T&,
+               typename internal::DictionaryTraits<T>::MemoTableType* = nullptr) {
+    using ArrayType = typename TypeTraits<T>::ArrayType;
+    using DictTraits = typename internal::DictionaryTraits<T>;
+    using MemoTableType = typename DictTraits::MemoTableType;
+
+    MemoTableType memo_table;
+    if (out_transpose_maps_ != nullptr) {
+      out_transpose_maps_->clear();
+      out_transpose_maps_->reserve(types_.size());
+    }
+    // Build up the unified dictionary values and the transpose maps
+    for (const auto& type : types_) {
+      const ArrayType& values = checked_cast<const ArrayType&>(*type->dictionary());
+      if (out_transpose_maps_ != nullptr) {
+        std::vector<int32_t> transpose_map;
+        transpose_map.reserve(values.length());
+        for (int64_t i = 0; i < values.length(); ++i) {
+          int32_t dict_index = memo_table.GetOrInsert(values.GetView(i));
+          transpose_map.push_back(dict_index);
+        }
+        out_transpose_maps_->push_back(std::move(transpose_map));
+      } else {
+        for (int64_t i = 0; i < values.length(); ++i) {
+          memo_table.GetOrInsert(values.GetView(i));
+        }
+      }
+    }
+    // Build unified dictionary array
+    std::shared_ptr<ArrayData> data;
+    RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table,
+                                                     0 /* start_offset */, &data));
+    *out_values_ = MakeArray(data);
+    return Status::OK();
+  }
+};
+
+Status DictionaryType::Unify(MemoryPool* pool, const std::vector<const DataType*>& types,
+                             std::shared_ptr<DataType>* out_type,
+                             std::vector<std::vector<int32_t>>* out_transpose_maps) {
+  if (types.size() == 0) {
+    return Status::Invalid("need at least one input type");
+  }
+  std::vector<const DictionaryType*> dict_types;
+  dict_types.reserve(types.size());
+  for (const auto& type : types) {
+    if (type->id() != Type::DICTIONARY) {
+      return Status::TypeError("input types must be dictionary types");
+    }
+    dict_types.push_back(checked_cast<const DictionaryType*>(type));
+  }
+
+  // XXX Should we check the ordered flag?
+  auto value_type = dict_types[0]->dictionary()->type();
+  for (const auto& type : dict_types) {
+    auto values = type->dictionary();
+    if (!values->type()->Equals(value_type)) {
+      return Status::TypeError("input types have different value types");
+    }
+    if (values->null_count() != 0) {
+      return Status::TypeError("input types have null values");
+    }
+  }
+
+  std::shared_ptr<Array> values;
+  {
+    UnifyDictionaryValues visitor{pool, value_type, dict_types, &values,
+                                  out_transpose_maps};
+    RETURN_NOT_OK(VisitTypeInline(*value_type, &visitor));
+  }
+
+  // Build unified dictionary type with the right index type
+  std::shared_ptr<DataType> index_type;
+  if (values->length() <= std::numeric_limits<int8_t>::max()) {
+    index_type = int8();
+  } else if (values->length() <= std::numeric_limits<int16_t>::max()) {
+    index_type = int16();
+  } else if (values->length() <= std::numeric_limits<int32_t>::max()) {
+    index_type = int32();
+  } else {
+    index_type = int64();
+  }
+  *out_type = arrow::dictionary(index_type, values);
+  return Status::OK();
+}
+
+// ----------------------------------------------------------------------
 // DictionaryBuilder
 
 template <typename T>
@@ -118,12 +227,31 @@ Status DictionaryBuilder<NullType>::AppendNull() { return values_builder_.Append
 
 template <typename T>
 Status DictionaryBuilder<T>::AppendArray(const Array& array) {
-  const auto& numeric_array = checked_cast<const NumericArray<T>&>(array);
+  using ArrayType = typename TypeTraits<T>::ArrayType;
+
+  const auto& concrete_array = checked_cast<const ArrayType&>(array);
   for (int64_t i = 0; i < array.length(); i++) {
     if (array.IsNull(i)) {
       RETURN_NOT_OK(AppendNull());
     } else {
-      RETURN_NOT_OK(Append(numeric_array.Value(i)));
+      RETURN_NOT_OK(Append(concrete_array.GetView(i)));
+    }
+  }
+  return Status::OK();
+}
+
+template <>
+Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
+  if (!type_->Equals(*array.type())) {
+    return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
+  }
+
+  const auto& typed_array = checked_cast<const FixedSizeBinaryArray&>(array);
+  for (int64_t i = 0; i < array.length(); i++) {
+    if (array.IsNull(i)) {
+      RETURN_NOT_OK(AppendNull());
+    } else {
+      RETURN_NOT_OK(Append(typed_array.GetValue(i)));
     }
   }
   return Status::OK();
@@ -168,46 +296,6 @@ Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* o
   return Status::OK();
 }
 
-//
-// StringType and BinaryType specializations
-//
-
-#define BINARY_DICTIONARY_SPECIALIZATIONS(Type)                            \
-                                                                           \
-  template <>                                                              \
-  Status DictionaryBuilder<Type>::AppendArray(const Array& array) {        \
-    using ArrayType = typename TypeTraits<Type>::ArrayType;                \
-    const ArrayType& binary_array = checked_cast<const ArrayType&>(array); \
-    for (int64_t i = 0; i < array.length(); i++) {                         \
-      if (array.IsNull(i)) {                                               \
-        RETURN_NOT_OK(AppendNull());                                       \
-      } else {                                                             \
-        RETURN_NOT_OK(Append(binary_array.GetView(i)));                    \
-      }                                                                    \
-    }                                                                      \
-    return Status::OK();                                                   \
-  }
-
-BINARY_DICTIONARY_SPECIALIZATIONS(StringType);
-BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType);
-
-template <>
-Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
-  if (!type_->Equals(*array.type())) {
-    return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
-  }
-
-  const auto& typed_array = checked_cast<const FixedSizeBinaryArray&>(array);
-  for (int64_t i = 0; i < array.length(); i++) {
-    if (array.IsNull(i)) {
-      RETURN_NOT_OK(AppendNull());
-    } else {
-      RETURN_NOT_OK(Append(typed_array.GetValue(i)));
-    }
-  }
-  return Status::OK();
-}
-
 template class DictionaryBuilder<UInt8Type>;
 template class DictionaryBuilder<UInt16Type>;
 template class DictionaryBuilder<UInt32Type>;
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc
index e0a1069..20b7aff 100644
--- a/cpp/src/arrow/type-test.cc
+++ b/cpp/src/arrow/type-test.cc
@@ -24,6 +24,8 @@
 
 #include <gtest/gtest.h>
 
+#include "arrow/memory_pool.h"
+#include "arrow/test-util.h"
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"
 
@@ -480,6 +482,132 @@ TEST(TestStructType, GetChildIndex) {
   ASSERT_EQ(-1, struct_type.GetChildIndex("not-found"));
 }
 
+TEST(TestDictionaryType, Equals) {
+  auto t1 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]"));
+  auto t2 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]"));
+  auto t3 = dictionary(int16(), ArrayFromJSON(int32(), "[3, 4, 5, 6]"));
+  auto t4 = dictionary(int8(), ArrayFromJSON(int16(), "[3, 4, 5, 6]"));
+  auto t5 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 7, 6]"));
+
+  ASSERT_TRUE(t1->Equals(t2));
+  // Different index type
+  ASSERT_FALSE(t1->Equals(t3));
+  // Different value type
+  ASSERT_FALSE(t1->Equals(t4));
+  // Different values
+  ASSERT_FALSE(t1->Equals(t5));
+}
+
+TEST(TestDictionaryType, UnifyNumeric) {
+  auto t1 = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7]"));
+  auto t2 = dictionary(int8(), ArrayFromJSON(int64(), "[1, 7, 4, 8]"));
+  auto t3 = dictionary(int8(), ArrayFromJSON(int64(), "[1, -200]"));
+
+  auto expected = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7, 1, 8, -200]"));
+
+  std::shared_ptr<DataType> dict_type;
+  ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()},
+                                  &dict_type));
+  ASSERT_TRUE(dict_type->Equals(expected));
+
+  std::vector<std::vector<int32_t>> transpose_maps;
+  ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()},
+                                  &dict_type, &transpose_maps));
+  ASSERT_TRUE(dict_type->Equals(expected));
+  ASSERT_EQ(transpose_maps.size(), 3);
+  ASSERT_EQ(transpose_maps[0], std::vector<int32_t>({0, 1, 2}));
+  ASSERT_EQ(transpose_maps[1], std::vector<int32_t>({3, 2, 1, 4}));
+  ASSERT_EQ(transpose_maps[2], std::vector<int32_t>({3, 5}));
+}
+
+TEST(TestDictionaryType, UnifyString) {
+  auto t1 = dictionary(int16(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\"]"));
+  auto t2 = dictionary(int32(), ArrayFromJSON(utf8(), "[\"quux\", \"foo\"]"));
+
+  auto expected =
+      dictionary(int8(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"quux\"]"));
+
+  std::shared_ptr<DataType> dict_type;
+  ASSERT_OK(
+      DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type));
+  ASSERT_TRUE(dict_type->Equals(expected));
+
+  std::vector<std::vector<int32_t>> transpose_maps;
+  ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type,
+                                  &transpose_maps));
+  ASSERT_TRUE(dict_type->Equals(expected));
+
+  ASSERT_EQ(transpose_maps.size(), 2);
+  ASSERT_EQ(transpose_maps[0], std::vector<int32_t>({0, 1}));
+  ASSERT_EQ(transpose_maps[1], std::vector<int32_t>({2, 0}));
+}
+
+TEST(TestDictionaryType, UnifyFixedSizeBinary) {
+  auto type = fixed_size_binary(3);
+
+  std::string data = "foobarbazqux";
+  auto buf = std::make_shared<Buffer>(data);
+  // ["foo", "bar"]
+  auto dict1 = std::make_shared<FixedSizeBinaryArray>(type, 2, SliceBuffer(buf, 0, 6));
+  auto t1 = dictionary(int16(), dict1);
+  // ["bar", "baz", "qux"]
+  auto dict2 = std::make_shared<FixedSizeBinaryArray>(type, 3, SliceBuffer(buf, 3, 9));
+  auto t2 = dictionary(int16(), dict2);
+
+  // ["foo", "bar", "baz", "qux"]
+  auto expected_dict = std::make_shared<FixedSizeBinaryArray>(type, 4, buf);
+  auto expected = dictionary(int8(), expected_dict);
+
+  std::shared_ptr<DataType> dict_type;
+  ASSERT_OK(
+      DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type));
+  ASSERT_TRUE(dict_type->Equals(expected));
+
+  std::vector<std::vector<int32_t>> transpose_maps;
+  ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type,
+                                  &transpose_maps));
+  ASSERT_TRUE(dict_type->Equals(expected));
+  ASSERT_EQ(transpose_maps.size(), 2);
+  ASSERT_EQ(transpose_maps[0], std::vector<int32_t>({0, 1}));
+  ASSERT_EQ(transpose_maps[1], std::vector<int32_t>({1, 2, 3}));
+}
+
+TEST(TestDictionaryType, UnifyLarge) {
+  // Unifying "large" dictionary types should choose the right index type
+  std::shared_ptr<Array> dict1, dict2, expected_dict;
+
+  Int32Builder builder;
+  ASSERT_OK(builder.Reserve(120));
+  for (int32_t i = 0; i < 120; ++i) {
+    builder.UnsafeAppend(i);
+  }
+  ASSERT_OK(builder.Finish(&dict1));
+  ASSERT_EQ(dict1->length(), 120);
+  auto t1 = dictionary(int8(), dict1);
+
+  ASSERT_OK(builder.Reserve(30));
+  for (int32_t i = 110; i < 140; ++i) {
+    builder.UnsafeAppend(i);
+  }
+  ASSERT_OK(builder.Finish(&dict2));
+  ASSERT_EQ(dict2->length(), 30);
+  auto t2 = dictionary(int8(), dict2);
+
+  ASSERT_OK(builder.Reserve(140));
+  for (int32_t i = 0; i < 140; ++i) {
+    builder.UnsafeAppend(i);
+  }
+  ASSERT_OK(builder.Finish(&expected_dict));
+  ASSERT_EQ(expected_dict->length(), 140);
+  // int8 would be too narrow to hold all possible index values
+  auto expected = dictionary(int16(), expected_dict);
+
+  std::shared_ptr<DataType> dict_type;
+  ASSERT_OK(
+      DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type));
+  ASSERT_TRUE(dict_type->Equals(expected));
+}
+
 TEST(TypesTest, TestDecimal128Small) {
   Decimal128Type t1(8, 4);
 
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 5f1ca8d..753cb65 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -260,7 +260,12 @@ DictionaryType::DictionaryType(const std::shared_ptr<DataType>& index_type,
     : FixedWidthType(Type::DICTIONARY),
       index_type_(index_type),
       dictionary_(dictionary),
-      ordered_(ordered) {}
+      ordered_(ordered) {
+#ifndef NDEBUG
+  const auto& int_type = checked_cast<const Integer&>(*index_type);
+  DCHECK_EQ(int_type.is_signed(), true) << "dictionary index type should be signed";
+#endif
+}
 
 int DictionaryType::bit_width() const {
   return checked_cast<const FixedWidthType&>(*index_type_).bit_width();
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 9694202..8f6cfd6 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -39,6 +39,7 @@ namespace arrow {
 
 class Array;
 class Field;
+class MemoryPool;
 
 struct Type {
   /// \brief Main data type enumeration
@@ -768,6 +769,23 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType {
 
   bool ordered() const { return ordered_; }
 
+  /// \brief Unify several dictionary types
+  ///
+  /// Compute a resulting dictionary that will allow the union of values
+  /// of all input dictionary types.  The input types must all have the
+  /// same value type.
+  /// \param[in] pool Memory pool to allocate dictionary values from
+  /// \param[in] types A sequence of input dictionary types
+  /// \param[out] out_type The unified dictionary type
+  /// \param[out] out_transpose_maps (optionally) A sequence of integer vectors,
+  ///     one per input type.  Each integer vector represents the transposition
+  ///     of input type indices into unified type indices.
+  // XXX Should we return something special (an empty transpose map?) when
+  // the transposition is the identity function?
+  static Status Unify(MemoryPool* pool, const std::vector<const DataType*>& types,
+                      std::shared_ptr<DataType>* out_type,
+                      std::vector<std::vector<int32_t>>* out_transpose_maps = NULLPTR);
+
  private:
   // Must be an integer type (not currently checked)
   std::shared_ptr<DataType> index_type_;
diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h
index ee368fb..76724b2 100644
--- a/cpp/src/arrow/util/hashing.h
+++ b/cpp/src/arrow/util/hashing.h
@@ -651,25 +651,6 @@ template <typename T>
 struct HashTraits<T, enable_if_8bit_int<T>> {
   using c_type = typename T::c_type;
   using MemoTableType = SmallScalarMemoTable<typename T::c_type>;
-
-  static Status GetDictionaryArrayData(MemoryPool* pool,
-                                       const std::shared_ptr<DataType>& type,
-                                       const MemoTableType& memo_table,
-                                       int64_t start_offset,
-                                       std::shared_ptr<ArrayData>* out) {
-    std::shared_ptr<Buffer> dict_buffer;
-    auto dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
-    // This makes a copy, but we assume a dictionary array is usually small
-    // compared to the size of the dictionary-using array.
-    // (also, copying the dictionary values is cheap compared to the cost
-    //  of building the memo table)
-    RETURN_NOT_OK(
-        AllocateBuffer(pool, TypeTraits<T>::bytes_required(dict_length), &dict_buffer));
-    memo_table.CopyValues(static_cast<int32_t>(start_offset),
-                          reinterpret_cast<c_type*>(dict_buffer->mutable_data()));
-    *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */);
-    return Status::OK();
-  }
 };
 
 template <typename T>
@@ -677,25 +658,6 @@ struct HashTraits<
     T, typename std::enable_if<has_c_type<T>::value && !is_8bit_int<T>::value>::type> {
   using c_type = typename T::c_type;
   using MemoTableType = ScalarMemoTable<c_type, HashTable>;
-
-  static Status GetDictionaryArrayData(MemoryPool* pool,
-                                       const std::shared_ptr<DataType>& type,
-                                       const MemoTableType& memo_table,
-                                       int64_t start_offset,
-                                       std::shared_ptr<ArrayData>* out) {
-    std::shared_ptr<Buffer> dict_buffer;
-    auto dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
-    // This makes a copy, but we assume a dictionary array is usually small
-    // compared to the size of the dictionary-using array.
-    // (also, copying the dictionary values is cheap compared to the cost
-    //  of building the memo table)
-    RETURN_NOT_OK(
-        AllocateBuffer(pool, TypeTraits<T>::bytes_required(dict_length), &dict_buffer));
-    memo_table.CopyValues(static_cast<int32_t>(start_offset),
-                          reinterpret_cast<c_type*>(dict_buffer->mutable_data()));
-    *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */);
-    return Status::OK();
-  }
 };
 
 template <typename T>
diff --git a/cpp/src/arrow/util/int-util-test.cc b/cpp/src/arrow/util/int-util-test.cc
index 018eeda..5eba531 100644
--- a/cpp/src/arrow/util/int-util-test.cc
+++ b/cpp/src/arrow/util/int-util-test.cc
@@ -373,5 +373,14 @@ TEST(IntWidth, NullsMany) {
   }
 }
 
+TEST(TransposeInts, Int8ToInt64) {
+  std::vector<int8_t> src = {1, 3, 5, 0, 3, 2};
+  std::vector<int32_t> transpose_map = {1111, 2222, 3333, 4444, 5555, 6666, 7777};
+  std::vector<int64_t> dest(src.size());
+
+  TransposeInts(src.data(), dest.data(), 6, transpose_map.data());
+  ASSERT_EQ(dest, std::vector<int64_t>({2222, 4444, 6666, 1111, 4444, 3333}));
+}
+
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/int-util.cc b/cpp/src/arrow/util/int-util.cc
index ced1cd1..d81044b 100644
--- a/cpp/src/arrow/util/int-util.cc
+++ b/cpp/src/arrow/util/int-util.cc
@@ -402,5 +402,45 @@ void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) {
   memcpy(dest, source, length * sizeof(int64_t));
 }
 
+template <typename InputInt, typename OutputInt>
+void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length,
+                   const int32_t* transpose_map) {
+  while (length >= 4) {
+    dest[0] = static_cast<OutputInt>(transpose_map[src[0]]);
+    dest[1] = static_cast<OutputInt>(transpose_map[src[1]]);
+    dest[2] = static_cast<OutputInt>(transpose_map[src[2]]);
+    dest[3] = static_cast<OutputInt>(transpose_map[src[3]]);
+    length -= 4;
+    src += 4;
+    dest += 4;
+  }
+  while (length > 0) {
+    *dest++ = static_cast<OutputInt>(transpose_map[*src++]);
+    --length;
+  }
+}
+
+#define INSTANTIATE(SRC, DEST)              \
+  template ARROW_EXPORT void TransposeInts( \
+      const SRC* source, DEST* dest, int64_t length, const int32_t* transpose_map);
+
+#define INSTANTIATE_ALL_DEST(DEST) \
+  INSTANTIATE(int8_t, DEST)        \
+  INSTANTIATE(int16_t, DEST)       \
+  INSTANTIATE(int32_t, DEST)       \
+  INSTANTIATE(int64_t, DEST)
+
+#define INSTANTIATE_ALL()       \
+  INSTANTIATE_ALL_DEST(int8_t)  \
+  INSTANTIATE_ALL_DEST(int16_t) \
+  INSTANTIATE_ALL_DEST(int32_t) \
+  INSTANTIATE_ALL_DEST(int64_t)
+
+INSTANTIATE_ALL()
+
+#undef INSTANTIATE
+#undef INSTANTIATE_ALL
+#undef INSTANTIATE_ALL_DEST
+
 }  // namespace internal
 }  // namespace arrow
diff --git a/cpp/src/arrow/util/int-util.h b/cpp/src/arrow/util/int-util.h
index 68355d3..66d389e 100644
--- a/cpp/src/arrow/util/int-util.h
+++ b/cpp/src/arrow/util/int-util.h
@@ -63,6 +63,10 @@ void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length);
 ARROW_EXPORT
 void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length);
 
+template <typename InputInt, typename OutputInt>
+ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length,
+                                const int32_t* transpose_map);
+
 }  // namespace internal
 }  // namespace arrow
 
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h
index b6fc1f1..a5deaa7 100644
--- a/cpp/src/arrow/visitor_inline.h
+++ b/cpp/src/arrow/visitor_inline.h
@@ -121,7 +121,7 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) {
 // The scalar value's type depends on the array data type:
 // - the type's `c_type`, if any
 // - for boolean arrays, a `bool`
-// - for binary, string and fixed-size binary arrars, a `util::string_view`
+// - for binary, string and fixed-size binary arrays, a `util::string_view`
 
 template <typename T, typename Enable = void>
 struct ArrayDataVisitor {};
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 310656d..af2d113 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -303,8 +303,8 @@ def test_dictionary_type():
     assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
     assert ty0.ordered is False
 
-    ty1 = pa.dictionary(pa.float32(), pa.array([1.0, 2.0]), ordered=True)
-    assert ty1.index_type == pa.float32()
+    ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
+    assert ty1.index_type == pa.int8()
     assert isinstance(ty0.dictionary, pa.Array)
     assert ty1.dictionary.to_pylist() == [1.0, 2.0]
     assert ty1.ordered is True