You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/28 00:58:18 UTC
arrow git commit: ARROW-1607: [C++] Implement DictionaryBuilder for
Decimals
Repository: arrow
Updated Branches:
refs/heads/master 808a14330 -> b62f99a60
ARROW-1607: [C++] Implement DictionaryBuilder for Decimals
Author: Phillip Cloud <cp...@gmail.com>
Closes #1128 from cpcloud/ARROW-1607 and squashes the following commits:
3137566 [Phillip Cloud] ARROW-1607: [C++] Implement DictionaryBuilder for Decimals
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/b62f99a6
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/b62f99a6
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/b62f99a6
Branch: refs/heads/master
Commit: b62f99a603a4822756c610e52e4643980d4bebf1
Parents: 808a143
Author: Phillip Cloud <cp...@gmail.com>
Authored: Wed Sep 27 20:58:08 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Wed Sep 27 20:58:08 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/array-test.cc | 92 +++++++++++++++++++++++++++++++++--
cpp/src/arrow/builder.cc | 11 +++--
cpp/src/arrow/builder.h | 4 +-
cpp/src/arrow/compute/cast.cc | 6 ++-
cpp/src/arrow/io/io-file-test.cc | 8 +--
cpp/src/arrow/io/io-hdfs-test.cc | 2 +-
6 files changed, 107 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 9731083..4ea9248 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1200,7 +1200,7 @@ class TestFWBinaryArray : public ::testing::Test {
};
TEST_F(TestFWBinaryArray, Builder) {
- const int32_t byte_width = 10;
+ constexpr int32_t byte_width = 10;
int64_t length = 4096;
int64_t nbytes = length * byte_width;
@@ -1215,8 +1215,7 @@ TEST_F(TestFWBinaryArray, Builder) {
std::shared_ptr<Array> result;
- auto CheckResult = [this, &length, &is_valid, &raw_data,
- &byte_width](const Array& result) {
+ auto CheckResult = [&length, &is_valid, &raw_data, byte_width](const Array& result) {
// Verify output
const auto& fw_result = static_cast<const FixedSizeBinaryArray&>(result);
@@ -1847,6 +1846,93 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) {
ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array));
}
+TEST(TestDecimalDictionaryBuilder, Basic) {
+ // Build the dictionary Array
+ const auto& decimal_type = arrow::decimal(2, 0);
+ DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
+
+ // Test data
+ std::vector<Decimal128> test{12, 12, 11, 12};
+ for (const auto& value : test) {
+ ASSERT_OK(builder.Append(value.ToBytes().data()));
+ }
+
+ std::shared_ptr<Array> result;
+ ASSERT_OK(builder.Finish(&result));
+
+ // Build expected data
+ FixedSizeBinaryBuilder decimal_builder(decimal_type);
+ ASSERT_OK(decimal_builder.Append(Decimal128(12).ToBytes()));
+ ASSERT_OK(decimal_builder.Append(Decimal128(11).ToBytes()));
+
+ std::shared_ptr<Array> decimal_array;
+ ASSERT_OK(decimal_builder.Finish(&decimal_array));
+ auto dtype = arrow::dictionary(int8(), decimal_array);
+
+ Int8Builder int_builder;
+ ASSERT_OK(int_builder.Append({0, 0, 1, 0}));
+ std::shared_ptr<Array> int_array;
+ ASSERT_OK(int_builder.Finish(&int_array));
+
+ DictionaryArray expected(dtype, int_array);
+ ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestDecimalDictionaryBuilder, DoubleTableSize) {
+ const auto& decimal_type = arrow::decimal(21, 0);
+
+ // Build the dictionary Array
+ DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
+
+ // Build expected data
+ FixedSizeBinaryBuilder fsb_builder(decimal_type);
+ Int16Builder int_builder;
+
+ // Fill with 1024 different values
+ for (int64_t i = 0; i < 1024; i++) {
+ const uint8_t bytes[] = {0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 12,
+ 12,
+ static_cast<uint8_t>(i / 128),
+ static_cast<uint8_t>(i % 128)};
+ ASSERT_OK(builder.Append(bytes));
+ ASSERT_OK(fsb_builder.Append(bytes));
+ ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+ }
+ // Fill with an already existing value
+ const uint8_t known_value[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1};
+ for (int64_t i = 0; i < 1024; i++) {
+ ASSERT_OK(builder.Append(known_value));
+ ASSERT_OK(int_builder.Append(1));
+ }
+
+ // Finalize result
+ std::shared_ptr<Array> result;
+ ASSERT_OK(builder.Finish(&result));
+
+ // Finalize expected data
+ std::shared_ptr<Array> fsb_array;
+ ASSERT_OK(fsb_builder.Finish(&fsb_array));
+
+ auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
+ std::shared_ptr<Array> int_array;
+ ASSERT_OK(int_builder.Finish(&int_array));
+
+ DictionaryArray expected(dtype, int_array);
+ ASSERT_TRUE(expected.Equals(result));
+}
+
// ----------------------------------------------------------------------
// List tests
http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index a194ab7..0479dc5 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -831,11 +831,11 @@ DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
hash_table_(new PoolBuffer(pool)),
hash_slots_(nullptr),
dict_builder_(type, pool),
- values_builder_(pool) {
+ values_builder_(pool),
+ byte_width_(static_cast<const FixedSizeBinaryType&>(*type).byte_width()) {
if (!::arrow::CpuInfo::initialized()) {
::arrow::CpuInfo::Init();
}
- byte_width_ = static_cast<const FixedSizeBinaryType&>(*type).byte_width();
}
#ifndef ARROW_NO_DEPRECATED_API
@@ -921,7 +921,7 @@ Status DictionaryBuilder<T>::Append(const Scalar& value) {
template <typename T>
Status DictionaryBuilder<T>::AppendArray(const Array& array) {
- const NumericArray<T>& numeric_array = static_cast<const NumericArray<T>&>(array);
+ const auto& numeric_array = static_cast<const NumericArray<T>&>(array);
for (int64_t i = 0; i < array.length(); i++) {
if (array.IsNull(i)) {
RETURN_NOT_OK(AppendNull());
@@ -938,8 +938,7 @@ Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
}
- const FixedSizeBinaryArray& numeric_array =
- static_cast<const FixedSizeBinaryArray&>(array);
+ const auto& numeric_array = static_cast<const FixedSizeBinaryArray&>(array);
for (int64_t i = 0; i < array.length(); i++) {
if (array.IsNull(i)) {
RETURN_NOT_OK(AppendNull());
@@ -1493,6 +1492,7 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder);
DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder);
DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
+ DICTIONARY_BUILDER_CASE(DECIMAL, DictionaryBuilder<FixedSizeBinaryType>);
default:
return Status::NotImplemented(type->ToString());
}
@@ -1528,6 +1528,7 @@ Status EncodeArrayToDictionary(const Array& input, MemoryPool* pool,
DICTIONARY_ARRAY_CASE(STRING, StringDictionaryBuilder);
DICTIONARY_ARRAY_CASE(BINARY, BinaryDictionaryBuilder);
DICTIONARY_ARRAY_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
+ DICTIONARY_ARRAY_CASE(DECIMAL, DictionaryBuilder<FixedSizeBinaryType>);
default:
std::stringstream ss;
ss << "Cannot encode array of type " << type->ToString();
http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 28f3cb9..da7386a 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -937,8 +937,8 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
public:
- using DictionaryBuilder::DictionaryBuilder;
using DictionaryBuilder::Append;
+ using DictionaryBuilder::DictionaryBuilder;
Status Append(const uint8_t* value, int32_t length) {
return Append(internal::WrappedBinary(value, length));
@@ -958,8 +958,8 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType
/// \brief Dictionary array builder with convenience methods for strings
class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
public:
- using DictionaryBuilder::DictionaryBuilder;
using DictionaryBuilder::Append;
+ using DictionaryBuilder::DictionaryBuilder;
Status Append(const uint8_t* value, int32_t length) {
return Append(internal::WrappedBinary(value, length));
http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/compute/cast.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc
index ee838fa..149cc36 100644
--- a/cpp/src/arrow/compute/cast.cc
+++ b/cpp/src/arrow/compute/cast.cc
@@ -490,7 +490,10 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input
if (can_pre_allocate_values) {
std::shared_ptr<Buffer> out_data;
- if (!(is_primitive(out->type->id()) || out->type->id() == Type::FIXED_SIZE_BINARY)) {
+ const Type::type type_id = out->type->id();
+
+ if (!(is_primitive(type_id) || type_id == Type::FIXED_SIZE_BINARY ||
+ type_id == Type::DECIMAL)) {
std::stringstream ss;
ss << "Cannot pre-allocate memory for type: " << out->type->ToString();
return Status::NotImplemented(ss.str());
@@ -614,6 +617,7 @@ class CastKernel : public UnaryKernel {
FN(IN_TYPE, FloatType); \
FN(IN_TYPE, DoubleType); \
FN(IN_TYPE, FixedSizeBinaryType); \
+ FN(IN_TYPE, DecimalType); \
FN(IN_TYPE, BinaryType); \
FN(IN_TYPE, StringType);
http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/io/io-file-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc
index 636fbd8..ee3beab 100644
--- a/cpp/src/arrow/io/io-file-test.cc
+++ b/cpp/src/arrow/io/io-file-test.cc
@@ -393,9 +393,9 @@ TEST_F(TestReadableFile, ThreadSafety) {
ASSERT_OK(ReadableFile::Open(path_, &pool, &file_));
std::atomic<int> correct_count(0);
- const int niter = 10000;
+ constexpr int niter = 10000;
- auto ReadData = [&correct_count, &data, niter, this]() {
+ auto ReadData = [&correct_count, &data, this, niter]() {
std::shared_ptr<Buffer> buffer;
for (int i = 0; i < niter; ++i) {
@@ -587,9 +587,9 @@ TEST_F(TestMemoryMappedFile, ThreadSafety) {
static_cast<int64_t>(data.size())));
std::atomic<int> correct_count(0);
- const int niter = 10000;
+ constexpr int niter = 10000;
- auto ReadData = [&correct_count, &data, niter, &file]() {
+ auto ReadData = [&correct_count, &data, &file, niter]() {
std::shared_ptr<Buffer> buffer;
for (int i = 0; i < niter; ++i) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/io/io-hdfs-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/io-hdfs-test.cc b/cpp/src/arrow/io/io-hdfs-test.cc
index eaf638f..5305b47 100644
--- a/cpp/src/arrow/io/io-hdfs-test.cc
+++ b/cpp/src/arrow/io/io-hdfs-test.cc
@@ -437,7 +437,7 @@ TYPED_TEST(TestHadoopFileSystem, ThreadSafety) {
ASSERT_OK(this->client_->OpenReadable(src_path, &file));
std::atomic<int> correct_count(0);
- const int niter = 1000;
+ constexpr int niter = 1000;
auto ReadData = [&file, &correct_count, &data, niter]() {
for (int i = 0; i < niter; ++i) {