You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2017/09/11 10:47:41 UTC
arrow git commit: ARROW-1508: C++: Add support for
FixedSizeBinaryType in DictionaryBuilder
Repository: arrow
Updated Branches:
refs/heads/master a45b04712 -> 94f6247c4
ARROW-1508: C++: Add support for FixedSizeBinaryType in DictionaryBuilder
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #1073 from xhochy/ARROW-1508 and squashes the following commits:
cefee2c [Uwe L. Korn] ninja format
c471bfa [Uwe L. Korn] Review comments
4bf354e [Uwe L. Korn] Use non-deprecated interfaces
2538926 [Uwe L. Korn] ARROW-1508: C++: Add support for FixedSizeBinaryType in DictionaryBuilder
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/94f6247c
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/94f6247c
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/94f6247c
Branch: refs/heads/master
Commit: 94f6247c4ef49b9b4fe1cccdc172810c55c34a10
Parents: a45b047
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Mon Sep 11 12:46:37 2017 +0200
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Mon Sep 11 12:46:37 2017 +0200
----------------------------------------------------------------------
cpp/src/arrow/array-test.cc | 84 ++++++++++++++++++++++++++++++++++++++++
cpp/src/arrow/array.h | 1 +
cpp/src/arrow/builder.cc | 71 ++++++++++++++++++++++++++++++++-
cpp/src/arrow/builder.h | 11 ++++++
4 files changed, 165 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index c92c23d..a9596c5 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1764,6 +1764,90 @@ TEST(TestStringDictionaryBuilder, DoubleTableSize) {
ASSERT_TRUE(expected.Equals(result));
}
+TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
+ // Build the dictionary Array
+ DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+ default_memory_pool());
+ std::vector<uint8_t> test{12, 12, 11, 12};
+ std::vector<uint8_t> test2{12, 12, 11, 11};
+ ASSERT_OK(builder.Append(test.data()));
+ ASSERT_OK(builder.Append(test2.data()));
+ ASSERT_OK(builder.Append(test.data()));
+
+ std::shared_ptr<Array> result;
+ ASSERT_OK(builder.Finish(&result));
+
+ // Build expected data
+ FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
+ ASSERT_OK(fsb_builder.Append(test.data()));
+ ASSERT_OK(fsb_builder.Append(test2.data()));
+ std::shared_ptr<Array> fsb_array;
+ ASSERT_OK(fsb_builder.Finish(&fsb_array));
+ auto dtype = std::make_shared<DictionaryType>(int8(), fsb_array);
+
+ Int8Builder int_builder;
+ ASSERT_OK(int_builder.Append(0));
+ ASSERT_OK(int_builder.Append(1));
+ ASSERT_OK(int_builder.Append(0));
+ std::shared_ptr<Array> int_array;
+ ASSERT_OK(int_builder.Finish(&int_array));
+
+ DictionaryArray expected(dtype, int_array);
+ ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) {
+ // Build the dictionary Array
+ DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+ default_memory_pool());
+ // Build expected data
+ FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
+ Int16Builder int_builder;
+
+ // Fill with 1024 different values
+ for (int64_t i = 0; i < 1024; i++) {
+ std::vector<uint8_t> value{12, 12, static_cast<uint8_t>(i / 128),
+ static_cast<uint8_t>(i % 128)};
+ ASSERT_OK(builder.Append(value.data()));
+ ASSERT_OK(fsb_builder.Append(value.data()));
+ ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+ }
+ // Fill with an already existing value
+ std::vector<uint8_t> known_value{12, 12, 0, 1};
+ for (int64_t i = 0; i < 1024; i++) {
+ ASSERT_OK(builder.Append(known_value.data()));
+ ASSERT_OK(int_builder.Append(1));
+ }
+
+ // Finalize result
+ std::shared_ptr<Array> result;
+ ASSERT_OK(builder.Finish(&result));
+
+ // Finalize expected data
+ std::shared_ptr<Array> fsb_array;
+ ASSERT_OK(fsb_builder.Finish(&fsb_array));
+ auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
+ std::shared_ptr<Array> int_array;
+ ASSERT_OK(int_builder.Finish(&int_array));
+
+ DictionaryArray expected(dtype, int_array);
+ ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) {
+ // Build the dictionary Array
+ DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+ default_memory_pool());
+ // Build an array with different byte width
+ FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(5));
+ std::vector<uint8_t> value{100, 1, 1, 1, 1};
+ ASSERT_OK(fsb_builder.Append(value.data()));
+ std::shared_ptr<Array> fsb_array;
+ ASSERT_OK(fsb_builder.Finish(&fsb_array));
+
+ ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array));
+}
+
// ----------------------------------------------------------------------
// List tests
http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index bfeedd2..994270d 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -512,6 +512,7 @@ class ARROW_EXPORT FixedSizeBinaryArray : public PrimitiveArray {
int64_t null_count = 0, int64_t offset = 0);
const uint8_t* GetValue(int64_t i) const;
+ const uint8_t* Value(int64_t i) const { return GetValue(i); }
int32_t byte_width() const { return byte_width_; }
http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 7966241..5945677 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -26,6 +26,7 @@
#include "arrow/array.h"
#include "arrow/buffer.h"
+#include "arrow/compare.h"
#include "arrow/status.h"
#include "arrow/table.h"
#include "arrow/type.h"
@@ -818,10 +819,25 @@ DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
hash_table_(new PoolBuffer(pool)),
hash_slots_(nullptr),
dict_builder_(type, pool),
+ values_builder_(pool),
+ byte_width_(-1) {
+ if (!::arrow::CpuInfo::initialized()) {
+ ::arrow::CpuInfo::Init();
+ }
+}
+
+template <>
+DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
+ const std::shared_ptr<DataType>& type, MemoryPool* pool)
+ : ArrayBuilder(type, pool),
+ hash_table_(new PoolBuffer(pool)),
+ hash_slots_(nullptr),
+ dict_builder_(type, pool),
values_builder_(pool) {
if (!::arrow::CpuInfo::initialized()) {
::arrow::CpuInfo::Init();
}
+ byte_width_ = static_cast<const FixedSizeBinaryType&>(*type).byte_width();
}
#ifndef ARROW_NO_DEPRECATED_API
@@ -918,6 +934,24 @@ Status DictionaryBuilder<T>::AppendArray(const Array& array) {
return Status::OK();
}
+template <>
+Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
+ if (!type_->Equals(*array.type())) {
+ return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
+ }
+
+ const FixedSizeBinaryArray& numeric_array =
+ static_cast<const FixedSizeBinaryArray&>(array);
+ for (int64_t i = 0; i < array.length(); i++) {
+ if (array.IsNull(i)) {
+ RETURN_NOT_OK(AppendNull());
+ } else {
+ RETURN_NOT_OK(Append(numeric_array.Value(i)));
+ }
+ }
+ return Status::OK();
+}
+
template <typename T>
Status DictionaryBuilder<T>::AppendNull() {
return values_builder_.AppendNull();
@@ -975,17 +1009,35 @@ typename DictionaryBuilder<T>::Scalar DictionaryBuilder<T>::GetDictionaryValue(
return data[index];
}
+template <>
+const uint8_t* DictionaryBuilder<FixedSizeBinaryType>::GetDictionaryValue(int64_t index) {
+ return dict_builder_.GetValue(index);
+}
+
template <typename T>
int DictionaryBuilder<T>::HashValue(const Scalar& value) {
return HashUtil::Hash(&value, sizeof(Scalar), 0);
}
+template <>
+int DictionaryBuilder<FixedSizeBinaryType>::HashValue(const Scalar& value) {
+ return HashUtil::Hash(value, byte_width_, 0);
+}
+
template <typename T>
bool DictionaryBuilder<T>::SlotDifferent(hash_slot_t index, const Scalar& value) {
const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
return other != value;
}
+template <>
+bool DictionaryBuilder<FixedSizeBinaryType>::SlotDifferent(hash_slot_t index,
+ const Scalar& value) {
+ int32_t width = static_cast<const FixedSizeBinaryType&>(*type_).byte_width();
+ const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
+ return memcmp(other, value, width) != 0;
+}
+
template <typename T>
Status DictionaryBuilder<T>::AppendDictionary(const Scalar& value) {
return dict_builder_.Append(value);
@@ -1052,6 +1104,7 @@ template class DictionaryBuilder<Time64Type>;
template class DictionaryBuilder<TimestampType>;
template class DictionaryBuilder<FloatType>;
template class DictionaryBuilder<DoubleType>;
+template class DictionaryBuilder<FixedSizeBinaryType>;
template class DictionaryBuilder<BinaryType>;
template class DictionaryBuilder<StringType>;
@@ -1315,6 +1368,11 @@ Status FixedSizeBinaryBuilder::Finish(std::shared_ptr<Array>* out) {
return Status::OK();
}
+const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const {
+ const uint8_t* data_ptr = byte_builder_.data();
+ return data_ptr + i * byte_width_;
+}
+
// ----------------------------------------------------------------------
// Struct
@@ -1438,6 +1496,7 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
DICTIONARY_BUILDER_CASE(DOUBLE, DictionaryBuilder<DoubleType>);
DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder);
DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder);
+ DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
default:
return Status::NotImplemented(type->ToString());
}
@@ -1472,8 +1531,12 @@ Status EncodeArrayToDictionary(const Array& input, MemoryPool* pool,
DICTIONARY_ARRAY_CASE(DOUBLE, DictionaryBuilder<DoubleType>);
DICTIONARY_ARRAY_CASE(STRING, StringDictionaryBuilder);
DICTIONARY_ARRAY_CASE(BINARY, BinaryDictionaryBuilder);
+ DICTIONARY_ARRAY_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
default:
- return Status::NotImplemented(type->ToString());
+ std::stringstream ss;
+ ss << "Cannot encode array of type " << type->ToString();
+ ss << " to dictionary";
+ return Status::NotImplemented(ss.str());
}
}
#define DICTIONARY_COLUMN_CASE(ENUM, BuilderType) \
@@ -1516,8 +1579,12 @@ Status EncodeColumnToDictionary(const Column& input, MemoryPool* pool,
DICTIONARY_COLUMN_CASE(DOUBLE, DictionaryBuilder<DoubleType>);
DICTIONARY_COLUMN_CASE(STRING, StringDictionaryBuilder);
DICTIONARY_COLUMN_CASE(BINARY, BinaryDictionaryBuilder);
+ DICTIONARY_COLUMN_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
default:
- return Status::NotImplemented(type->ToString());
+ std::stringstream ss;
+ ss << "Cannot encode column of type " << type->ToString();
+ ss << " to dictionary";
+ return Status::NotImplemented(ss.str());
}
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/94f6247c/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 3e8289f..bf7b317 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -756,6 +756,11 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder {
/// \return size of values buffer so far
int64_t value_data_length() const { return byte_builder_.length(); }
+ /// Temporary access to a value.
+ ///
+ /// This pointer becomes invalid on the next modifying operation.
+ const uint8_t* GetValue(int64_t i) const;
+
protected:
int32_t byte_width_;
BufferBuilder byte_builder_;
@@ -866,6 +871,11 @@ struct DictionaryScalar<StringType> {
using type = WrappedBinary;
};
+template <>
+struct DictionaryScalar<FixedSizeBinaryType> {
+ using type = uint8_t const*;
+};
+
} // namespace internal
/// \brief Array builder for created encoded DictionaryArray from dense array
@@ -921,6 +931,7 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
typename TypeTraits<T>::BuilderType dict_builder_;
AdaptiveIntBuilder values_builder_;
+ int32_t byte_width_;
};
class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {