You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/19 18:54:15 UTC
[arrow] branch master updated: ARROW-554: [C++] Add functions to
unify dictionary types and arrays
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new f66fa80 ARROW-554: [C++] Add functions to unify dictionary types and arrays
f66fa80 is described below
commit f66fa805e89aef948581876ac802b1ffc6430f5c
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed Dec 19 12:53:09 2018 -0600
ARROW-554: [C++] Add functions to unify dictionary types and arrays
Author: Antoine Pitrou <an...@python.org>
Closes #3165 from pitrou/ARROW-554-conform-dicts and squashes the following commits:
7d2579b30 <Antoine Pitrou> ARROW-554: Add functions to conform dictionaries
---
cpp/src/arrow/array-dict-test.cc | 62 +++++++++++++
cpp/src/arrow/array.cc | 61 +++++++++++++
cpp/src/arrow/array.h | 24 ++++-
cpp/src/arrow/array/builder_dict.cc | 172 +++++++++++++++++++++++++++---------
cpp/src/arrow/type-test.cc | 128 +++++++++++++++++++++++++++
cpp/src/arrow/type.cc | 7 +-
cpp/src/arrow/type.h | 18 ++++
cpp/src/arrow/util/hashing.h | 38 --------
cpp/src/arrow/util/int-util-test.cc | 9 ++
cpp/src/arrow/util/int-util.cc | 40 +++++++++
cpp/src/arrow/util/int-util.h | 4 +
cpp/src/arrow/visitor_inline.h | 2 +-
python/pyarrow/tests/test_types.py | 4 +-
13 files changed, 484 insertions(+), 85 deletions(-)
diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc
index 87cb229..730b891 100644
--- a/cpp/src/arrow/array-dict-test.cc
+++ b/cpp/src/arrow/array-dict-test.cc
@@ -31,6 +31,7 @@
#include "arrow/test-common.h"
#include "arrow/test-util.h"
#include "arrow/type.h"
+#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
namespace arrow {
@@ -38,6 +39,8 @@ namespace arrow {
using std::string;
using std::vector;
+using internal::checked_cast;
+
// ----------------------------------------------------------------------
// Dictionary tests
@@ -740,4 +743,63 @@ TEST(TestDictionary, FromArray) {
ASSERT_RAISES(Invalid, DictionaryArray::FromArrays(dict_type, indices4, &arr4));
}
+TEST(TestDictionary, TransposeBasic) {
+ std::shared_ptr<Array> arr, out, expected;
+
+ auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
+ auto dict_type = dictionary(int16(), dict);
+ auto indices = ArrayFromJSON(int16(), "[1, 2, 0, 0]");
+ // ["B", "C", "A", "A"]
+ ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr));
+
+ // Transpose to same index type
+ {
+ auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
+ auto out_dict_type = dictionary(int16(), out_dict);
+
+ const std::vector<int32_t> transpose_map{1, 3, 2};
+ ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
+ default_memory_pool(), out_dict_type, transpose_map, &out));
+
+ auto expected_indices = ArrayFromJSON(int16(), "[3, 2, 1, 1]");
+ ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
+ AssertArraysEqual(*out, *expected);
+ }
+
+ // Transpose to other type
+ {
+ auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
+ auto out_dict_type = dictionary(int8(), out_dict);
+
+ const std::vector<int32_t> transpose_map{1, 3, 2};
+ ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
+ default_memory_pool(), out_dict_type, transpose_map, &out));
+
+ auto expected_indices = ArrayFromJSON(int8(), "[3, 2, 1, 1]");
+ ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
+ AssertArraysEqual(*expected, *out);
+ }
+}
+
+TEST(TestDictionary, TransposeNulls) {
+ std::shared_ptr<Array> arr, out, expected;
+
+ auto dict = ArrayFromJSON(utf8(), "[\"A\", \"B\", \"C\"]");
+ auto dict_type = dictionary(int16(), dict);
+ auto indices = ArrayFromJSON(int16(), "[1, 2, null, 0]");
+ // ["B", "C", null, "A"]
+ ASSERT_OK(DictionaryArray::FromArrays(dict_type, indices, &arr));
+
+ auto out_dict = ArrayFromJSON(utf8(), "[\"Z\", \"A\", \"C\", \"B\"]");
+ auto out_dict_type = dictionary(int16(), out_dict);
+
+ const std::vector<int32_t> transpose_map{1, 3, 2};
+ ASSERT_OK(internal::checked_cast<const DictionaryArray&>(*arr).Transpose(
+ default_memory_pool(), out_dict_type, transpose_map, &out));
+
+ auto expected_indices = ArrayFromJSON(int16(), "[3, 2, null, 1]");
+ ASSERT_OK(DictionaryArray::FromArrays(out_dict_type, expected_indices, &expected));
+ AssertArraysEqual(*expected, *out);
+}
+
} // namespace arrow
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index ff94aa2..7e45e90 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -33,6 +33,7 @@
#include "arrow/util/bit-util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
+#include "arrow/util/int-util.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
#include "arrow/visitor.h"
@@ -663,6 +664,66 @@ std::shared_ptr<Array> DictionaryArray::dictionary() const {
return dict_type_->dictionary();
}
+template <typename InType, typename OutType>
+static Status TransposeDictIndices(MemoryPool* pool, const ArrayData& in_data,
+ const std::shared_ptr<DataType>& type,
+ const std::vector<int32_t>& transpose_map,
+ std::shared_ptr<Array>* out) {
+ using in_c_type = typename InType::c_type;
+ using out_c_type = typename OutType::c_type;
+
+ std::shared_ptr<Buffer> out_buffer;
+ RETURN_NOT_OK(AllocateBuffer(pool, in_data.length * sizeof(out_c_type), &out_buffer));
+ // Null bitmap is unchanged
+ auto out_data = ArrayData::Make(type, in_data.length, {in_data.buffers[0], out_buffer},
+ in_data.null_count);
+ internal::TransposeInts(in_data.GetValues<in_c_type>(1),
+ out_data->GetMutableValues<out_c_type>(1), in_data.length,
+ transpose_map.data());
+ *out = MakeArray(out_data);
+ return Status::OK();
+}
+
+Status DictionaryArray::Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ const std::vector<int32_t>& transpose_map,
+ std::shared_ptr<Array>* out) const {
+ DCHECK_EQ(type->id(), Type::DICTIONARY);
+ const auto& out_dict_type = checked_cast<const DictionaryType&>(*type);
+
+ // XXX We'll probably want to make this operation a kernel when we
+ // implement dictionary-to-dictionary casting.
+ auto in_type_id = dict_type_->index_type()->id();
+ auto out_type_id = out_dict_type.index_type()->id();
+
+#define TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, OUT_INDEX_TYPE) \
+ case OUT_INDEX_TYPE::type_id: \
+ return TransposeDictIndices<IN_INDEX_TYPE, OUT_INDEX_TYPE>(pool, *data(), type, \
+ transpose_map, out);
+
+#define TRANSPOSE_IN_CASE(IN_INDEX_TYPE) \
+ case IN_INDEX_TYPE::type_id: \
+ switch (out_type_id) { \
+ TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int8Type) \
+ TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int16Type) \
+ TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int32Type) \
+ TRANSPOSE_IN_OUT_CASE(IN_INDEX_TYPE, Int64Type) \
+ default: \
+ return Status::NotImplemented("unexpected index type"); \
+ }
+
+ switch (in_type_id) {
+ TRANSPOSE_IN_CASE(Int8Type)
+ TRANSPOSE_IN_CASE(Int16Type)
+ TRANSPOSE_IN_CASE(Int32Type)
+ TRANSPOSE_IN_CASE(Int64Type)
+ default:
+ return Status::NotImplemented("unexpected index type");
+ }
+
+#undef TRANSPOSE_IN_OUT_CASE
+#undef TRANSPOSE_IN_CASE
+}
+
// ----------------------------------------------------------------------
// Implement Array::Accept as inline visitor
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 52c5207..aead17f 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -422,6 +422,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray {
value_type Value(int64_t i) const { return raw_values()[i]; }
+ // For API compatibility with BinaryArray etc.
+ value_type GetView(int64_t i) const { return Value(i); }
+
protected:
using PrimitiveArray::PrimitiveArray;
};
@@ -442,6 +445,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
i + data_->offset);
}
+ bool GetView(int64_t i) const { return Value(i); }
+
protected:
using PrimitiveArray::PrimitiveArray;
};
@@ -802,7 +807,7 @@ class ARROW_EXPORT DictionaryArray : public Array {
/// This function does the validation of the indices and input type. It checks if
/// all indices are non-negative and smaller than the size of the dictionary
///
- /// \param[in] type a data type containing a dictionary
+ /// \param[in] type a dictionary type
/// \param[in] indices an array of non-negative signed
/// integers smaller than the size of the dictionary
/// \param[out] out the resulting DictionaryArray instance
@@ -810,6 +815,23 @@ class ARROW_EXPORT DictionaryArray : public Array {
const std::shared_ptr<Array>& indices,
std::shared_ptr<Array>* out);
+ /// \brief Transpose this DictionaryArray
+ ///
+ /// This method constructs a new dictionary array with the given dictionary type,
+ /// transposing indices using the transpose map.
+ /// The type and the transpose map are typically computed using
+ /// DictionaryType::Unify.
+ ///
+ /// \param[in] pool a pool to allocate the array data from
+ /// \param[in] type a dictionary type
+ /// \param[in] transpose_map a vector transposing this array's indices
+ /// into the target array's indices
+ /// \param[out] out the resulting DictionaryArray instance
+ Status Transpose(MemoryPool* pool, const std::shared_ptr<DataType>& type,
+ const std::vector<int32_t>& transpose_map,
+ std::shared_ptr<Array>* out) const;
+ // XXX Do we also want an unsafe in-place Transpose?
+
std::shared_ptr<Array> indices() const;
std::shared_ptr<Array> dictionary() const;
diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc
index 0891e4c..e534c3c 100644
--- a/cpp/src/arrow/array/builder_dict.cc
+++ b/cpp/src/arrow/array/builder_dict.cc
@@ -19,6 +19,9 @@
#include <algorithm>
#include <cstdint>
+#include <limits>
+#include <sstream>
+#include <type_traits>
#include <utility>
#include <vector>
@@ -30,12 +33,118 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/hashing.h"
#include "arrow/util/logging.h"
+#include "arrow/visitor_inline.h"
namespace arrow {
using internal::checked_cast;
// ----------------------------------------------------------------------
+// DictionaryType unification
+
+struct UnifyDictionaryValues {
+ MemoryPool* pool_;
+ std::shared_ptr<DataType> value_type_;
+ const std::vector<const DictionaryType*>& types_;
+ std::shared_ptr<Array>* out_values_;
+ std::vector<std::vector<int32_t>>* out_transpose_maps_;
+
+ Status Visit(const DataType&, void* = nullptr) {
+ // Default implementation for non-dictionary-supported datatypes
+ std::stringstream ss;
+ ss << "Unification of " << value_type_->ToString()
+ << " dictionaries is not implemented";
+ return Status::NotImplemented(ss.str());
+ }
+
+ template <typename T>
+ Status Visit(const T&,
+ typename internal::DictionaryTraits<T>::MemoTableType* = nullptr) {
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+ using DictTraits = typename internal::DictionaryTraits<T>;
+ using MemoTableType = typename DictTraits::MemoTableType;
+
+ MemoTableType memo_table;
+ if (out_transpose_maps_ != nullptr) {
+ out_transpose_maps_->clear();
+ out_transpose_maps_->reserve(types_.size());
+ }
+ // Build up the unified dictionary values and the transpose maps
+ for (const auto& type : types_) {
+ const ArrayType& values = checked_cast<const ArrayType&>(*type->dictionary());
+ if (out_transpose_maps_ != nullptr) {
+ std::vector<int32_t> transpose_map;
+ transpose_map.reserve(values.length());
+ for (int64_t i = 0; i < values.length(); ++i) {
+ int32_t dict_index = memo_table.GetOrInsert(values.GetView(i));
+ transpose_map.push_back(dict_index);
+ }
+ out_transpose_maps_->push_back(std::move(transpose_map));
+ } else {
+ for (int64_t i = 0; i < values.length(); ++i) {
+ memo_table.GetOrInsert(values.GetView(i));
+ }
+ }
+ }
+ // Build unified dictionary array
+ std::shared_ptr<ArrayData> data;
+ RETURN_NOT_OK(DictTraits::GetDictionaryArrayData(pool_, value_type_, memo_table,
+ 0 /* start_offset */, &data));
+ *out_values_ = MakeArray(data);
+ return Status::OK();
+ }
+};
+
+Status DictionaryType::Unify(MemoryPool* pool, const std::vector<const DataType*>& types,
+ std::shared_ptr<DataType>* out_type,
+ std::vector<std::vector<int32_t>>* out_transpose_maps) {
+ if (types.size() == 0) {
+ return Status::Invalid("need at least one input type");
+ }
+ std::vector<const DictionaryType*> dict_types;
+ dict_types.reserve(types.size());
+ for (const auto& type : types) {
+ if (type->id() != Type::DICTIONARY) {
+ return Status::TypeError("input types must be dictionary types");
+ }
+ dict_types.push_back(checked_cast<const DictionaryType*>(type));
+ }
+
+ // XXX Should we check the ordered flag?
+ auto value_type = dict_types[0]->dictionary()->type();
+ for (const auto& type : dict_types) {
+ auto values = type->dictionary();
+ if (!values->type()->Equals(value_type)) {
+ return Status::TypeError("input types have different value types");
+ }
+ if (values->null_count() != 0) {
+ return Status::TypeError("input types have null values");
+ }
+ }
+
+ std::shared_ptr<Array> values;
+ {
+ UnifyDictionaryValues visitor{pool, value_type, dict_types, &values,
+ out_transpose_maps};
+ RETURN_NOT_OK(VisitTypeInline(*value_type, &visitor));
+ }
+
+ // Build unified dictionary type with the right index type
+ std::shared_ptr<DataType> index_type;
+ if (values->length() <= std::numeric_limits<int8_t>::max()) {
+ index_type = int8();
+ } else if (values->length() <= std::numeric_limits<int16_t>::max()) {
+ index_type = int16();
+ } else if (values->length() <= std::numeric_limits<int32_t>::max()) {
+ index_type = int32();
+ } else {
+ index_type = int64();
+ }
+ *out_type = arrow::dictionary(index_type, values);
+ return Status::OK();
+}
+
+// ----------------------------------------------------------------------
// DictionaryBuilder
template <typename T>
@@ -118,12 +227,31 @@ Status DictionaryBuilder<NullType>::AppendNull() { return values_builder_.Append
template <typename T>
Status DictionaryBuilder<T>::AppendArray(const Array& array) {
- const auto& numeric_array = checked_cast<const NumericArray<T>&>(array);
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+
+ const auto& concrete_array = checked_cast<const ArrayType&>(array);
for (int64_t i = 0; i < array.length(); i++) {
if (array.IsNull(i)) {
RETURN_NOT_OK(AppendNull());
} else {
- RETURN_NOT_OK(Append(numeric_array.Value(i)));
+ RETURN_NOT_OK(Append(concrete_array.GetView(i)));
+ }
+ }
+ return Status::OK();
+}
+
+template <>
+Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
+ if (!type_->Equals(*array.type())) {
+ return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
+ }
+
+ const auto& typed_array = checked_cast<const FixedSizeBinaryArray&>(array);
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (array.IsNull(i)) {
+ RETURN_NOT_OK(AppendNull());
+ } else {
+ RETURN_NOT_OK(Append(typed_array.GetValue(i)));
}
}
return Status::OK();
@@ -168,46 +296,6 @@ Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* o
return Status::OK();
}
-//
-// StringType and BinaryType specializations
-//
-
-#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \
- \
- template <> \
- Status DictionaryBuilder<Type>::AppendArray(const Array& array) { \
- using ArrayType = typename TypeTraits<Type>::ArrayType; \
- const ArrayType& binary_array = checked_cast<const ArrayType&>(array); \
- for (int64_t i = 0; i < array.length(); i++) { \
- if (array.IsNull(i)) { \
- RETURN_NOT_OK(AppendNull()); \
- } else { \
- RETURN_NOT_OK(Append(binary_array.GetView(i))); \
- } \
- } \
- return Status::OK(); \
- }
-
-BINARY_DICTIONARY_SPECIALIZATIONS(StringType);
-BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType);
-
-template <>
-Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
- if (!type_->Equals(*array.type())) {
- return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
- }
-
- const auto& typed_array = checked_cast<const FixedSizeBinaryArray&>(array);
- for (int64_t i = 0; i < array.length(); i++) {
- if (array.IsNull(i)) {
- RETURN_NOT_OK(AppendNull());
- } else {
- RETURN_NOT_OK(Append(typed_array.GetValue(i)));
- }
- }
- return Status::OK();
-}
-
template class DictionaryBuilder<UInt8Type>;
template class DictionaryBuilder<UInt16Type>;
template class DictionaryBuilder<UInt32Type>;
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc
index e0a1069..20b7aff 100644
--- a/cpp/src/arrow/type-test.cc
+++ b/cpp/src/arrow/type-test.cc
@@ -24,6 +24,8 @@
#include <gtest/gtest.h>
+#include "arrow/memory_pool.h"
+#include "arrow/test-util.h"
#include "arrow/type.h"
#include "arrow/util/checked_cast.h"
@@ -480,6 +482,132 @@ TEST(TestStructType, GetChildIndex) {
ASSERT_EQ(-1, struct_type.GetChildIndex("not-found"));
}
+TEST(TestDictionaryType, Equals) {
+ auto t1 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]"));
+ auto t2 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 5, 6]"));
+ auto t3 = dictionary(int16(), ArrayFromJSON(int32(), "[3, 4, 5, 6]"));
+ auto t4 = dictionary(int8(), ArrayFromJSON(int16(), "[3, 4, 5, 6]"));
+ auto t5 = dictionary(int8(), ArrayFromJSON(int32(), "[3, 4, 7, 6]"));
+
+ ASSERT_TRUE(t1->Equals(t2));
+ // Different index type
+ ASSERT_FALSE(t1->Equals(t3));
+ // Different value type
+ ASSERT_FALSE(t1->Equals(t4));
+ // Different values
+ ASSERT_FALSE(t1->Equals(t5));
+}
+
+TEST(TestDictionaryType, UnifyNumeric) {
+ auto t1 = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7]"));
+ auto t2 = dictionary(int8(), ArrayFromJSON(int64(), "[1, 7, 4, 8]"));
+ auto t3 = dictionary(int8(), ArrayFromJSON(int64(), "[1, -200]"));
+
+ auto expected = dictionary(int8(), ArrayFromJSON(int64(), "[3, 4, 7, 1, 8, -200]"));
+
+ std::shared_ptr<DataType> dict_type;
+ ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()},
+ &dict_type));
+ ASSERT_TRUE(dict_type->Equals(expected));
+
+ std::vector<std::vector<int32_t>> transpose_maps;
+ ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get(), t3.get()},
+ &dict_type, &transpose_maps));
+ ASSERT_TRUE(dict_type->Equals(expected));
+ ASSERT_EQ(transpose_maps.size(), 3);
+ ASSERT_EQ(transpose_maps[0], std::vector<int32_t>({0, 1, 2}));
+ ASSERT_EQ(transpose_maps[1], std::vector<int32_t>({3, 2, 1, 4}));
+ ASSERT_EQ(transpose_maps[2], std::vector<int32_t>({3, 5}));
+}
+
+TEST(TestDictionaryType, UnifyString) {
+ auto t1 = dictionary(int16(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\"]"));
+ auto t2 = dictionary(int32(), ArrayFromJSON(utf8(), "[\"quux\", \"foo\"]"));
+
+ auto expected =
+ dictionary(int8(), ArrayFromJSON(utf8(), "[\"foo\", \"bar\", \"quux\"]"));
+
+ std::shared_ptr<DataType> dict_type;
+ ASSERT_OK(
+ DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type));
+ ASSERT_TRUE(dict_type->Equals(expected));
+
+ std::vector<std::vector<int32_t>> transpose_maps;
+ ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type,
+ &transpose_maps));
+ ASSERT_TRUE(dict_type->Equals(expected));
+
+ ASSERT_EQ(transpose_maps.size(), 2);
+ ASSERT_EQ(transpose_maps[0], std::vector<int32_t>({0, 1}));
+ ASSERT_EQ(transpose_maps[1], std::vector<int32_t>({2, 0}));
+}
+
+TEST(TestDictionaryType, UnifyFixedSizeBinary) {
+ auto type = fixed_size_binary(3);
+
+ std::string data = "foobarbazqux";
+ auto buf = std::make_shared<Buffer>(data);
+ // ["foo", "bar"]
+ auto dict1 = std::make_shared<FixedSizeBinaryArray>(type, 2, SliceBuffer(buf, 0, 6));
+ auto t1 = dictionary(int16(), dict1);
+ // ["bar", "baz", "qux"]
+ auto dict2 = std::make_shared<FixedSizeBinaryArray>(type, 3, SliceBuffer(buf, 3, 9));
+ auto t2 = dictionary(int16(), dict2);
+
+ // ["foo", "bar", "baz", "qux"]
+ auto expected_dict = std::make_shared<FixedSizeBinaryArray>(type, 4, buf);
+ auto expected = dictionary(int8(), expected_dict);
+
+ std::shared_ptr<DataType> dict_type;
+ ASSERT_OK(
+ DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type));
+ ASSERT_TRUE(dict_type->Equals(expected));
+
+ std::vector<std::vector<int32_t>> transpose_maps;
+ ASSERT_OK(DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type,
+ &transpose_maps));
+ ASSERT_TRUE(dict_type->Equals(expected));
+ ASSERT_EQ(transpose_maps.size(), 2);
+ ASSERT_EQ(transpose_maps[0], std::vector<int32_t>({0, 1}));
+ ASSERT_EQ(transpose_maps[1], std::vector<int32_t>({1, 2, 3}));
+}
+
+TEST(TestDictionaryType, UnifyLarge) {
+ // Unifying "large" dictionary types should choose the right index type
+ std::shared_ptr<Array> dict1, dict2, expected_dict;
+
+ Int32Builder builder;
+ ASSERT_OK(builder.Reserve(120));
+ for (int32_t i = 0; i < 120; ++i) {
+ builder.UnsafeAppend(i);
+ }
+ ASSERT_OK(builder.Finish(&dict1));
+ ASSERT_EQ(dict1->length(), 120);
+ auto t1 = dictionary(int8(), dict1);
+
+ ASSERT_OK(builder.Reserve(30));
+ for (int32_t i = 110; i < 140; ++i) {
+ builder.UnsafeAppend(i);
+ }
+ ASSERT_OK(builder.Finish(&dict2));
+ ASSERT_EQ(dict2->length(), 30);
+ auto t2 = dictionary(int8(), dict2);
+
+ ASSERT_OK(builder.Reserve(140));
+ for (int32_t i = 0; i < 140; ++i) {
+ builder.UnsafeAppend(i);
+ }
+ ASSERT_OK(builder.Finish(&expected_dict));
+ ASSERT_EQ(expected_dict->length(), 140);
+ // int8 would be too narrow to hold all possible index values
+ auto expected = dictionary(int16(), expected_dict);
+
+ std::shared_ptr<DataType> dict_type;
+ ASSERT_OK(
+ DictionaryType::Unify(default_memory_pool(), {t1.get(), t2.get()}, &dict_type));
+ ASSERT_TRUE(dict_type->Equals(expected));
+}
+
TEST(TypesTest, TestDecimal128Small) {
Decimal128Type t1(8, 4);
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 5f1ca8d..753cb65 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -260,7 +260,12 @@ DictionaryType::DictionaryType(const std::shared_ptr<DataType>& index_type,
: FixedWidthType(Type::DICTIONARY),
index_type_(index_type),
dictionary_(dictionary),
- ordered_(ordered) {}
+ ordered_(ordered) {
+#ifndef NDEBUG
+ const auto& int_type = checked_cast<const Integer&>(*index_type);
+ DCHECK_EQ(int_type.is_signed(), true) << "dictionary index type should be signed";
+#endif
+}
int DictionaryType::bit_width() const {
return checked_cast<const FixedWidthType&>(*index_type_).bit_width();
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 9694202..8f6cfd6 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -39,6 +39,7 @@ namespace arrow {
class Array;
class Field;
+class MemoryPool;
struct Type {
/// \brief Main data type enumeration
@@ -768,6 +769,23 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType {
bool ordered() const { return ordered_; }
+ /// \brief Unify several dictionary types
+ ///
+ /// Compute a resulting dictionary that will allow the union of values
+ /// of all input dictionary types. The input types must all have the
+ /// same value type.
+ /// \param[in] pool Memory pool to allocate dictionary values from
+ /// \param[in] types A sequence of input dictionary types
+ /// \param[out] out_type The unified dictionary type
+ /// \param[out] out_transpose_maps (optionally) A sequence of integer vectors,
+ /// one per input type. Each integer vector represents the transposition
+ /// of input type indices into unified type indices.
+ // XXX Should we return something special (an empty transpose map?) when
+ // the transposition is the identity function?
+ static Status Unify(MemoryPool* pool, const std::vector<const DataType*>& types,
+ std::shared_ptr<DataType>* out_type,
+ std::vector<std::vector<int32_t>>* out_transpose_maps = NULLPTR);
+
private:
// Must be an integer type (not currently checked)
std::shared_ptr<DataType> index_type_;
diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h
index ee368fb..76724b2 100644
--- a/cpp/src/arrow/util/hashing.h
+++ b/cpp/src/arrow/util/hashing.h
@@ -651,25 +651,6 @@ template <typename T>
struct HashTraits<T, enable_if_8bit_int<T>> {
using c_type = typename T::c_type;
using MemoTableType = SmallScalarMemoTable<typename T::c_type>;
-
- static Status GetDictionaryArrayData(MemoryPool* pool,
- const std::shared_ptr<DataType>& type,
- const MemoTableType& memo_table,
- int64_t start_offset,
- std::shared_ptr<ArrayData>* out) {
- std::shared_ptr<Buffer> dict_buffer;
- auto dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
- // This makes a copy, but we assume a dictionary array is usually small
- // compared to the size of the dictionary-using array.
- // (also, copying the dictionary values is cheap compared to the cost
- // of building the memo table)
- RETURN_NOT_OK(
- AllocateBuffer(pool, TypeTraits<T>::bytes_required(dict_length), &dict_buffer));
- memo_table.CopyValues(static_cast<int32_t>(start_offset),
- reinterpret_cast<c_type*>(dict_buffer->mutable_data()));
- *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */);
- return Status::OK();
- }
};
template <typename T>
@@ -677,25 +658,6 @@ struct HashTraits<
T, typename std::enable_if<has_c_type<T>::value && !is_8bit_int<T>::value>::type> {
using c_type = typename T::c_type;
using MemoTableType = ScalarMemoTable<c_type, HashTable>;
-
- static Status GetDictionaryArrayData(MemoryPool* pool,
- const std::shared_ptr<DataType>& type,
- const MemoTableType& memo_table,
- int64_t start_offset,
- std::shared_ptr<ArrayData>* out) {
- std::shared_ptr<Buffer> dict_buffer;
- auto dict_length = static_cast<int64_t>(memo_table.size()) - start_offset;
- // This makes a copy, but we assume a dictionary array is usually small
- // compared to the size of the dictionary-using array.
- // (also, copying the dictionary values is cheap compared to the cost
- // of building the memo table)
- RETURN_NOT_OK(
- AllocateBuffer(pool, TypeTraits<T>::bytes_required(dict_length), &dict_buffer));
- memo_table.CopyValues(static_cast<int32_t>(start_offset),
- reinterpret_cast<c_type*>(dict_buffer->mutable_data()));
- *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */);
- return Status::OK();
- }
};
template <typename T>
diff --git a/cpp/src/arrow/util/int-util-test.cc b/cpp/src/arrow/util/int-util-test.cc
index 018eeda..5eba531 100644
--- a/cpp/src/arrow/util/int-util-test.cc
+++ b/cpp/src/arrow/util/int-util-test.cc
@@ -373,5 +373,14 @@ TEST(IntWidth, NullsMany) {
}
}
+TEST(TransposeInts, Int8ToInt64) {
+ std::vector<int8_t> src = {1, 3, 5, 0, 3, 2};
+ std::vector<int32_t> transpose_map = {1111, 2222, 3333, 4444, 5555, 6666, 7777};
+ std::vector<int64_t> dest(src.size());
+
+ TransposeInts(src.data(), dest.data(), 6, transpose_map.data());
+ ASSERT_EQ(dest, std::vector<int64_t>({2222, 4444, 6666, 1111, 4444, 3333}));
+}
+
} // namespace internal
} // namespace arrow
diff --git a/cpp/src/arrow/util/int-util.cc b/cpp/src/arrow/util/int-util.cc
index ced1cd1..d81044b 100644
--- a/cpp/src/arrow/util/int-util.cc
+++ b/cpp/src/arrow/util/int-util.cc
@@ -402,5 +402,45 @@ void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length) {
memcpy(dest, source, length * sizeof(int64_t));
}
+template <typename InputInt, typename OutputInt>
+void TransposeInts(const InputInt* src, OutputInt* dest, int64_t length,
+ const int32_t* transpose_map) {
+ while (length >= 4) {
+ dest[0] = static_cast<OutputInt>(transpose_map[src[0]]);
+ dest[1] = static_cast<OutputInt>(transpose_map[src[1]]);
+ dest[2] = static_cast<OutputInt>(transpose_map[src[2]]);
+ dest[3] = static_cast<OutputInt>(transpose_map[src[3]]);
+ length -= 4;
+ src += 4;
+ dest += 4;
+ }
+ while (length > 0) {
+ *dest++ = static_cast<OutputInt>(transpose_map[*src++]);
+ --length;
+ }
+}
+
+#define INSTANTIATE(SRC, DEST) \
+ template ARROW_EXPORT void TransposeInts( \
+ const SRC* source, DEST* dest, int64_t length, const int32_t* transpose_map);
+
+#define INSTANTIATE_ALL_DEST(DEST) \
+ INSTANTIATE(int8_t, DEST) \
+ INSTANTIATE(int16_t, DEST) \
+ INSTANTIATE(int32_t, DEST) \
+ INSTANTIATE(int64_t, DEST)
+
+#define INSTANTIATE_ALL() \
+ INSTANTIATE_ALL_DEST(int8_t) \
+ INSTANTIATE_ALL_DEST(int16_t) \
+ INSTANTIATE_ALL_DEST(int32_t) \
+ INSTANTIATE_ALL_DEST(int64_t)
+
+INSTANTIATE_ALL()
+
+#undef INSTANTIATE
+#undef INSTANTIATE_ALL
+#undef INSTANTIATE_ALL_DEST
+
} // namespace internal
} // namespace arrow
diff --git a/cpp/src/arrow/util/int-util.h b/cpp/src/arrow/util/int-util.h
index 68355d3..66d389e 100644
--- a/cpp/src/arrow/util/int-util.h
+++ b/cpp/src/arrow/util/int-util.h
@@ -63,6 +63,10 @@ void DowncastUInts(const uint64_t* source, uint32_t* dest, int64_t length);
ARROW_EXPORT
void DowncastUInts(const uint64_t* source, uint64_t* dest, int64_t length);
+template <typename InputInt, typename OutputInt>
+ARROW_EXPORT void TransposeInts(const InputInt* source, OutputInt* dest, int64_t length,
+ const int32_t* transpose_map);
+
} // namespace internal
} // namespace arrow
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h
index b6fc1f1..a5deaa7 100644
--- a/cpp/src/arrow/visitor_inline.h
+++ b/cpp/src/arrow/visitor_inline.h
@@ -121,7 +121,7 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) {
// The scalar value's type depends on the array data type:
// - the type's `c_type`, if any
// - for boolean arrays, a `bool`
-// - for binary, string and fixed-size binary arrars, a `util::string_view`
+// - for binary, string and fixed-size binary arrays, a `util::string_view`
template <typename T, typename Enable = void>
struct ArrayDataVisitor {};
diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py
index 310656d..af2d113 100644
--- a/python/pyarrow/tests/test_types.py
+++ b/python/pyarrow/tests/test_types.py
@@ -303,8 +303,8 @@ def test_dictionary_type():
assert ty0.dictionary.to_pylist() == ['a', 'b', 'c']
assert ty0.ordered is False
- ty1 = pa.dictionary(pa.float32(), pa.array([1.0, 2.0]), ordered=True)
- assert ty1.index_type == pa.float32()
+ ty1 = pa.dictionary(pa.int8(), pa.array([1.0, 2.0]), ordered=True)
+ assert ty1.index_type == pa.int8()
assert isinstance(ty0.dictionary, pa.Array)
assert ty1.dictionary.to_pylist() == [1.0, 2.0]
assert ty1.ordered is True