You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/11/28 17:33:43 UTC

[arrow] branch master updated: ARROW-1735: [C++] Test CastKernel writing into output array with non-zero offset

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5107e93  ARROW-1735: [C++] Test CastKernel writing into output array with non-zero offset
5107e93 is described below

commit 5107e93837075c90f7d64152402839db3c89148e
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Tue Nov 28 12:33:39 2017 -0500

    ARROW-1735: [C++] Test CastKernel writing into output array with non-zero offset
    
    This uncovered some bugs. I inspected the other kernels that are untested and while they look fine, at some point we may want to add some more extensive unit tests about this
    
    Author: Wes McKinney <we...@twosigma.com>
    
    Closes #1369 from wesm/ARROW-1735 and squashes the following commits:
    
    de41d929 [Wes McKinney] Test CastKernel writing into output array with non-zero offset
---
 cpp/src/arrow/array.cc                 |  6 ++--
 cpp/src/arrow/array.h                  | 11 +++++--
 cpp/src/arrow/compute/compute-test.cc  | 60 ++++++++++++++++++++++++++++++++++
 cpp/src/arrow/compute/kernels/cast.cc  | 38 ++++++++++-----------
 cpp/src/arrow/python/arrow_to_python.h |  4 +--
 cpp/src/arrow/python/numpy_to_arrow.cc |  2 +-
 6 files changed, 92 insertions(+), 29 deletions(-)

diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 4b1fabf..0b235cc 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -103,7 +103,7 @@ static inline std::shared_ptr<ArrayData> SliceData(const ArrayData& data, int64_
   length = std::min(data.length - offset, length);
   offset += data.offset;
 
-  auto new_data = data.ShallowCopy();
+  auto new_data = data.Copy();
   new_data->length = length;
   new_data->offset = offset;
   new_data->null_count = data.null_count != 0 ? kUnknownNullCount : 0;
@@ -482,14 +482,14 @@ DictionaryArray::DictionaryArray(const std::shared_ptr<DataType>& type,
     : dict_type_(static_cast<const DictionaryType*>(type.get())) {
   DCHECK_EQ(type->id(), Type::DICTIONARY);
   DCHECK_EQ(indices->type_id(), dict_type_->index_type()->id());
-  auto data = indices->data()->ShallowCopy();
+  auto data = indices->data()->Copy();
   data->type = type;
   SetData(data);
 }
 
 void DictionaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
   this->Array::SetData(data);
-  auto indices_data = data_->ShallowCopy();
+  auto indices_data = data_->Copy();
   indices_data->type = dict_type_->index_type();
   std::shared_ptr<Array> result;
   indices_ = MakeArray(indices_data);
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index ec5381d..ebe54ad 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -143,9 +143,14 @@ struct ARROW_EXPORT ArrayData {
     return *this;
   }
 
-  std::shared_ptr<ArrayData> ShallowCopy() const {
-    return std::make_shared<ArrayData>(*this);
-  }
+  std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
+
+#ifndef ARROW_NO_DEPRECATED_API
+
+  // Deprecated since 0.8.0
+  std::shared_ptr<ArrayData> ShallowCopy() const { return Copy(); }
+
+#endif
 
   std::shared_ptr<DataType> type;
   int64_t length;
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index d515897..c73bfa3 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -709,6 +709,66 @@ TEST_F(TestCast, PreallocatedMemory) {
   ASSERT_ARRAYS_EQUAL(*expected, *result);
 }
 
+template <typename InType, typename InT, typename OutType, typename OutT>
+void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr<DataType>& in_type,
+                           const vector<InT>& in_values,
+                           const std::shared_ptr<DataType>& out_type,
+                           const vector<OutT>& out_values) {
+  using OutTraits = TypeTraits<OutType>;
+
+  CastOptions options;
+
+  const int64_t length = static_cast<int64_t>(in_values.size());
+
+  shared_ptr<Array> arr, expected;
+  ArrayFromVector<InType, InT>(in_type, in_values, &arr);
+  ArrayFromVector<OutType, OutT>(out_type, out_values, &expected);
+
+  shared_ptr<Buffer> out_buffer;
+  ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer));
+
+  std::unique_ptr<UnaryKernel> kernel;
+  ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel));
+
+  const int64_t first_half = length / 2;
+
+  auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer});
+  auto out_second_data = out_data->Copy();
+  out_second_data->offset = first_half;
+
+  Datum out_first(out_data);
+  Datum out_second(out_second_data);
+
+  // Cast each bit
+  ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(0, first_half)), &out_first));
+  ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(first_half)), &out_second));
+
+  shared_ptr<Array> result = MakeArray(out_data);
+
+  ASSERT_ARRAYS_EQUAL(*expected, *result);
+}
+
+TEST_F(TestCast, OffsetOutputBuffer) {
+  // ARROW-1735
+  vector<int32_t> v1 = {0, 10000, 2000, 1000, 0};
+  vector<int64_t> e1 = {0, 10000, 2000, 1000, 0};
+
+  auto in_type = int32();
+  auto out_type = int64();
+  CheckOffsetOutputCase<Int32Type, int32_t, Int64Type, int64_t>(&this->ctx_, in_type, v1,
+                                                                out_type, e1);
+
+  vector<bool> e2 = {false, true, true, true, false};
+
+  out_type = boolean();
+  CheckOffsetOutputCase<Int32Type, int32_t, BooleanType, bool>(&this->ctx_, in_type, v1,
+                                                               boolean(), e2);
+
+  vector<int16_t> e3 = {0, 10000, 2000, 1000, 0};
+  CheckOffsetOutputCase<Int32Type, int32_t, Int16Type, int16_t>(&this->ctx_, in_type, v1,
+                                                                int16(), e3);
+}
+
 template <typename TestType>
 class TestDictionaryCast : public TestCast {};
 
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index d48d669..465be95 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -124,12 +124,7 @@ template <typename T>
 struct CastFunctor<T, NullType, typename std::enable_if<
                                     std::is_base_of<FixedWidthType, T>::value>::type> {
   void operator()(FunctionContext* ctx, const CastOptions& options,
-                  const ArrayData& input, ArrayData* output) {
-    // Simply initialize data to 0
-    auto buf = output->buffers[1];
-    DCHECK_EQ(output->offset, 0);
-    memset(buf->mutable_data(), 0, buf->size());
-  }
+                  const ArrayData& input, ArrayData* output) {}
 };
 
 template <>
@@ -199,14 +194,19 @@ struct CastFunctor<O, I, typename std::enable_if<std::is_same<BooleanType, O>::v
                                                  !std::is_same<O, I>::value>::type> {
   void operator()(FunctionContext* ctx, const CastOptions& options,
                   const ArrayData& input, ArrayData* output) {
-    using in_type = typename I::c_type;
-    DCHECK_EQ(output->offset, 0);
+    auto in_data = GetValues<typename I::c_type>(input, 1);
+    internal::BitmapWriter writer(output->buffers[1]->mutable_data(), output->offset,
+                                  input.length);
 
-    const in_type* in_data = GetValues<in_type>(input, 1);
-    uint8_t* out_data = GetMutableValues<uint8_t>(output, 1);
     for (int64_t i = 0; i < input.length; ++i) {
-      BitUtil::SetBitTo(out_data, i, (*in_data++) != 0);
+      if (*in_data++ != 0) {
+        writer.Set();
+      } else {
+        writer.Clear();
+      }
+      writer.Next();
     }
+    writer.Finish();
   }
 };
 
@@ -217,7 +217,6 @@ struct CastFunctor<O, I,
                   const ArrayData& input, ArrayData* output) {
     using in_type = typename I::c_type;
     using out_type = typename O::c_type;
-    DCHECK_EQ(output->offset, 0);
 
     auto in_offset = input.offset;
 
@@ -475,9 +474,10 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices,
 
   const index_c_type* in = GetValues<index_c_type>(*indices.data(), 1);
 
-  uint8_t* out = output->buffers[1]->mutable_data();
   int32_t byte_width =
       static_cast<const FixedSizeBinaryType&>(*output->type).byte_width();
+
+  uint8_t* out = output->buffers[1]->mutable_data() + byte_width * output->offset;
   for (int64_t i = 0; i < indices.length(); ++i) {
     if (valid_bits_reader.IsSet()) {
       const uint8_t* value = dictionary.Value(in[i]);
@@ -493,7 +493,7 @@ struct CastFunctor<
     typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T>::value>::type> {
   void operator()(FunctionContext* ctx, const CastOptions& options,
                   const ArrayData& input, ArrayData* output) {
-    DictionaryArray dict_array(input.ShallowCopy());
+    DictionaryArray dict_array(input.Copy());
 
     const DictionaryType& type = static_cast<const DictionaryType&>(*input.type);
     const DataType& values_type = *type.dictionary()->type();
@@ -565,7 +565,7 @@ struct CastFunctor<T, DictionaryType,
                    typename std::enable_if<std::is_base_of<BinaryType, T>::value>::type> {
   void operator()(FunctionContext* ctx, const CastOptions& options,
                   const ArrayData& input, ArrayData* output) {
-    DictionaryArray dict_array(input.ShallowCopy());
+    DictionaryArray dict_array(input.Copy());
 
     const DictionaryType& type = static_cast<const DictionaryType&>(*input.type);
     const DataType& values_type = *type.dictionary()->type();
@@ -605,12 +605,10 @@ struct CastFunctor<T, DictionaryType,
 template <typename IndexType, typename c_type>
 void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary,
                                c_type* out) {
-  using index_c_type = typename IndexType::c_type;
-
   internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(),
                                            indices.length());
 
-  const index_c_type* in = GetValues<index_c_type>(*indices.data(), 1);
+  auto in = GetValues<typename IndexType::c_type>(*indices.data(), 1);
   for (int64_t i = 0; i < indices.length(); ++i) {
     if (valid_bits_reader.IsSet()) {
       out[i] = dictionary[in[i]];
@@ -627,7 +625,7 @@ struct CastFunctor<T, DictionaryType,
                   const ArrayData& input, ArrayData* output) {
     using c_type = typename T::c_type;
 
-    DictionaryArray dict_array(input.ShallowCopy());
+    DictionaryArray dict_array(input.Copy());
 
     const DictionaryType& type = static_cast<const DictionaryType&>(*input.type);
     const DataType& values_type = *type.dictionary()->type();
@@ -638,7 +636,7 @@ struct CastFunctor<T, DictionaryType,
 
     const c_type* dictionary = GetValues<c_type>(*type.dictionary()->data(), 1);
 
-    auto out = reinterpret_cast<c_type*>(output->buffers[1]->mutable_data());
+    auto out = GetMutableValues<c_type>(output, 1);
     const Array& indices = *dict_array.indices();
     switch (indices.type()->id()) {
       case Type::INT8:
diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h
index 9440ffb..02a22f0 100644
--- a/cpp/src/arrow/python/arrow_to_python.h
+++ b/cpp/src/arrow/python/arrow_to_python.h
@@ -51,8 +51,8 @@ Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out);
 /// \brief Reconstruct SerializedPyObject from representation produced by
 /// SerializedPyObject::GetComponents.
 ///
-/// \param[in] num_tensors
-/// \param[in] num_buffers
+/// \param[in] num_tensors number of tensors in the object
+/// \param[in] num_buffers number of buffers in the object
 /// \param[in] data a list containing pyarrow.Buffer instances. Must be 1 +
 /// num_tensors * 2 + num_buffers in length
 /// \param[out] out the reconstructed object
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 0c0d1a9..0b1124d 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -765,7 +765,7 @@ Status NumPyConverter::ConvertObjectStrings() {
   // If we saw PyBytes, convert everything to BinaryArray
   if (global_have_bytes) {
     for (size_t i = 0; i < out_arrays_.size(); ++i) {
-      auto binary_data = out_arrays_[i]->data()->ShallowCopy();
+      auto binary_data = out_arrays_[i]->data()->Copy();
       binary_data->type = ::arrow::binary();
       out_arrays_[i] = std::make_shared<BinaryArray>(binary_data);
     }

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <co...@arrow.apache.org>'].