You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/28 00:58:18 UTC

arrow git commit: ARROW-1607: [C++] Implement DictionaryBuilder for Decimals

Repository: arrow
Updated Branches:
  refs/heads/master 808a14330 -> b62f99a60


ARROW-1607: [C++] Implement DictionaryBuilder for Decimals

Author: Phillip Cloud <cp...@gmail.com>

Closes #1128 from cpcloud/ARROW-1607 and squashes the following commits:

3137566 [Phillip Cloud] ARROW-1607: [C++] Implement DictionaryBuilder for Decimals


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/b62f99a6
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/b62f99a6
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/b62f99a6

Branch: refs/heads/master
Commit: b62f99a603a4822756c610e52e4643980d4bebf1
Parents: 808a143
Author: Phillip Cloud <cp...@gmail.com>
Authored: Wed Sep 27 20:58:08 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Wed Sep 27 20:58:08 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/array-test.cc      | 92 +++++++++++++++++++++++++++++++++--
 cpp/src/arrow/builder.cc         | 11 +++--
 cpp/src/arrow/builder.h          |  4 +-
 cpp/src/arrow/compute/cast.cc    |  6 ++-
 cpp/src/arrow/io/io-file-test.cc |  8 +--
 cpp/src/arrow/io/io-hdfs-test.cc |  2 +-
 6 files changed, 107 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 9731083..4ea9248 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1200,7 +1200,7 @@ class TestFWBinaryArray : public ::testing::Test {
 };
 
 TEST_F(TestFWBinaryArray, Builder) {
-  const int32_t byte_width = 10;
+  constexpr int32_t byte_width = 10;
   int64_t length = 4096;
 
   int64_t nbytes = length * byte_width;
@@ -1215,8 +1215,7 @@ TEST_F(TestFWBinaryArray, Builder) {
 
   std::shared_ptr<Array> result;
 
-  auto CheckResult = [this, &length, &is_valid, &raw_data,
-                      &byte_width](const Array& result) {
+  auto CheckResult = [&length, &is_valid, &raw_data, byte_width](const Array& result) {
     // Verify output
     const auto& fw_result = static_cast<const FixedSizeBinaryArray&>(result);
 
@@ -1847,6 +1846,93 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) {
   ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array));
 }
 
+TEST(TestDecimalDictionaryBuilder, Basic) {
+  // Build the dictionary Array
+  const auto& decimal_type = arrow::decimal(2, 0);
+  DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
+
+  // Test data
+  std::vector<Decimal128> test{12, 12, 11, 12};
+  for (const auto& value : test) {
+    ASSERT_OK(builder.Append(value.ToBytes().data()));
+  }
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  FixedSizeBinaryBuilder decimal_builder(decimal_type);
+  ASSERT_OK(decimal_builder.Append(Decimal128(12).ToBytes()));
+  ASSERT_OK(decimal_builder.Append(Decimal128(11).ToBytes()));
+
+  std::shared_ptr<Array> decimal_array;
+  ASSERT_OK(decimal_builder.Finish(&decimal_array));
+  auto dtype = arrow::dictionary(int8(), decimal_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append({0, 0, 1, 0}));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestDecimalDictionaryBuilder, DoubleTableSize) {
+  const auto& decimal_type = arrow::decimal(21, 0);
+
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, default_memory_pool());
+
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder(decimal_type);
+  Int16Builder int_builder;
+
+  // Fill with 1024 different values
+  for (int64_t i = 0; i < 1024; i++) {
+    const uint8_t bytes[] = {0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             12,
+                             12,
+                             static_cast<uint8_t>(i / 128),
+                             static_cast<uint8_t>(i % 128)};
+    ASSERT_OK(builder.Append(bytes));
+    ASSERT_OK(fsb_builder.Append(bytes));
+    ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+  }
+  // Fill with an already existing value
+  const uint8_t known_value[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 0, 1};
+  for (int64_t i = 0; i < 1024; i++) {
+    ASSERT_OK(builder.Append(known_value));
+    ASSERT_OK(int_builder.Append(1));
+  }
+
+  // Finalize result
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Finalize expected data
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+
+  auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
 // ----------------------------------------------------------------------
 // List tests
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index a194ab7..0479dc5 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -831,11 +831,11 @@ DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
       hash_table_(new PoolBuffer(pool)),
       hash_slots_(nullptr),
       dict_builder_(type, pool),
-      values_builder_(pool) {
+      values_builder_(pool),
+      byte_width_(static_cast<const FixedSizeBinaryType&>(*type).byte_width()) {
   if (!::arrow::CpuInfo::initialized()) {
     ::arrow::CpuInfo::Init();
   }
-  byte_width_ = static_cast<const FixedSizeBinaryType&>(*type).byte_width();
 }
 
 #ifndef ARROW_NO_DEPRECATED_API
@@ -921,7 +921,7 @@ Status DictionaryBuilder<T>::Append(const Scalar& value) {
 
 template <typename T>
 Status DictionaryBuilder<T>::AppendArray(const Array& array) {
-  const NumericArray<T>& numeric_array = static_cast<const NumericArray<T>&>(array);
+  const auto& numeric_array = static_cast<const NumericArray<T>&>(array);
   for (int64_t i = 0; i < array.length(); i++) {
     if (array.IsNull(i)) {
       RETURN_NOT_OK(AppendNull());
@@ -938,8 +938,7 @@ Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
     return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
   }
 
-  const FixedSizeBinaryArray& numeric_array =
-      static_cast<const FixedSizeBinaryArray&>(array);
+  const auto& numeric_array = static_cast<const FixedSizeBinaryArray&>(array);
   for (int64_t i = 0; i < array.length(); i++) {
     if (array.IsNull(i)) {
       RETURN_NOT_OK(AppendNull());
@@ -1493,6 +1492,7 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
     DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder);
     DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder);
     DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
+    DICTIONARY_BUILDER_CASE(DECIMAL, DictionaryBuilder<FixedSizeBinaryType>);
     default:
       return Status::NotImplemented(type->ToString());
   }
@@ -1528,6 +1528,7 @@ Status EncodeArrayToDictionary(const Array& input, MemoryPool* pool,
     DICTIONARY_ARRAY_CASE(STRING, StringDictionaryBuilder);
     DICTIONARY_ARRAY_CASE(BINARY, BinaryDictionaryBuilder);
     DICTIONARY_ARRAY_CASE(FIXED_SIZE_BINARY, DictionaryBuilder<FixedSizeBinaryType>);
+    DICTIONARY_ARRAY_CASE(DECIMAL, DictionaryBuilder<FixedSizeBinaryType>);
     default:
       std::stringstream ss;
       ss << "Cannot encode array of type " << type->ToString();

http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 28f3cb9..da7386a 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -937,8 +937,8 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
 
 class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> {
  public:
-  using DictionaryBuilder::DictionaryBuilder;
   using DictionaryBuilder::Append;
+  using DictionaryBuilder::DictionaryBuilder;
 
   Status Append(const uint8_t* value, int32_t length) {
     return Append(internal::WrappedBinary(value, length));
@@ -958,8 +958,8 @@ class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType
 /// \brief Dictionary array builder with convenience methods for strings
 class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> {
  public:
-  using DictionaryBuilder::DictionaryBuilder;
   using DictionaryBuilder::Append;
+  using DictionaryBuilder::DictionaryBuilder;
 
   Status Append(const uint8_t* value, int32_t length) {
     return Append(internal::WrappedBinary(value, length));

http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/compute/cast.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc
index ee838fa..149cc36 100644
--- a/cpp/src/arrow/compute/cast.cc
+++ b/cpp/src/arrow/compute/cast.cc
@@ -490,7 +490,10 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input
   if (can_pre_allocate_values) {
     std::shared_ptr<Buffer> out_data;
 
-    if (!(is_primitive(out->type->id()) || out->type->id() == Type::FIXED_SIZE_BINARY)) {
+    const Type::type type_id = out->type->id();
+
+    if (!(is_primitive(type_id) || type_id == Type::FIXED_SIZE_BINARY ||
+          type_id == Type::DECIMAL)) {
       std::stringstream ss;
       ss << "Cannot pre-allocate memory for type: " << out->type->ToString();
       return Status::NotImplemented(ss.str());
@@ -614,6 +617,7 @@ class CastKernel : public UnaryKernel {
   FN(IN_TYPE, FloatType);             \
   FN(IN_TYPE, DoubleType);            \
   FN(IN_TYPE, FixedSizeBinaryType);   \
+  FN(IN_TYPE, DecimalType);           \
   FN(IN_TYPE, BinaryType);            \
   FN(IN_TYPE, StringType);
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/io/io-file-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc
index 636fbd8..ee3beab 100644
--- a/cpp/src/arrow/io/io-file-test.cc
+++ b/cpp/src/arrow/io/io-file-test.cc
@@ -393,9 +393,9 @@ TEST_F(TestReadableFile, ThreadSafety) {
   ASSERT_OK(ReadableFile::Open(path_, &pool, &file_));
 
   std::atomic<int> correct_count(0);
-  const int niter = 10000;
+  constexpr int niter = 10000;
 
-  auto ReadData = [&correct_count, &data, niter, this]() {
+  auto ReadData = [&correct_count, &data, this, niter]() {
     std::shared_ptr<Buffer> buffer;
 
     for (int i = 0; i < niter; ++i) {
@@ -587,9 +587,9 @@ TEST_F(TestMemoryMappedFile, ThreadSafety) {
                         static_cast<int64_t>(data.size())));
 
   std::atomic<int> correct_count(0);
-  const int niter = 10000;
+  constexpr int niter = 10000;
 
-  auto ReadData = [&correct_count, &data, niter, &file]() {
+  auto ReadData = [&correct_count, &data, &file, niter]() {
     std::shared_ptr<Buffer> buffer;
 
     for (int i = 0; i < niter; ++i) {

http://git-wip-us.apache.org/repos/asf/arrow/blob/b62f99a6/cpp/src/arrow/io/io-hdfs-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/io/io-hdfs-test.cc b/cpp/src/arrow/io/io-hdfs-test.cc
index eaf638f..5305b47 100644
--- a/cpp/src/arrow/io/io-hdfs-test.cc
+++ b/cpp/src/arrow/io/io-hdfs-test.cc
@@ -437,7 +437,7 @@ TYPED_TEST(TestHadoopFileSystem, ThreadSafety) {
   ASSERT_OK(this->client_->OpenReadable(src_path, &file));
 
   std::atomic<int> correct_count(0);
-  const int niter = 1000;
+  constexpr int niter = 1000;
 
   auto ReadData = [&file, &correct_count, &data, niter]() {
     for (int i = 0; i < niter; ++i) {