You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/11/28 17:33:43 UTC
[arrow] branch master updated: ARROW-1735: [C++] Test CastKernel
writing into output array with non-zero offset
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5107e93 ARROW-1735: [C++] Test CastKernel writing into output array with non-zero offset
5107e93 is described below
commit 5107e93837075c90f7d64152402839db3c89148e
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Tue Nov 28 12:33:39 2017 -0500
ARROW-1735: [C++] Test CastKernel writing into output array with non-zero offset
This uncovered some bugs. I inspected the other kernels that are untested and while they look fine, at some point we may want to add some more extensive unit tests about this
Author: Wes McKinney <we...@twosigma.com>
Closes #1369 from wesm/ARROW-1735 and squashes the following commits:
de41d929 [Wes McKinney] Test CastKernel writing into output array with non-zero offset
---
cpp/src/arrow/array.cc | 6 ++--
cpp/src/arrow/array.h | 11 +++++--
cpp/src/arrow/compute/compute-test.cc | 60 ++++++++++++++++++++++++++++++++++
cpp/src/arrow/compute/kernels/cast.cc | 38 ++++++++++-----------
cpp/src/arrow/python/arrow_to_python.h | 4 +--
cpp/src/arrow/python/numpy_to_arrow.cc | 2 +-
6 files changed, 92 insertions(+), 29 deletions(-)
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 4b1fabf..0b235cc 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -103,7 +103,7 @@ static inline std::shared_ptr<ArrayData> SliceData(const ArrayData& data, int64_
length = std::min(data.length - offset, length);
offset += data.offset;
- auto new_data = data.ShallowCopy();
+ auto new_data = data.Copy();
new_data->length = length;
new_data->offset = offset;
new_data->null_count = data.null_count != 0 ? kUnknownNullCount : 0;
@@ -482,14 +482,14 @@ DictionaryArray::DictionaryArray(const std::shared_ptr<DataType>& type,
: dict_type_(static_cast<const DictionaryType*>(type.get())) {
DCHECK_EQ(type->id(), Type::DICTIONARY);
DCHECK_EQ(indices->type_id(), dict_type_->index_type()->id());
- auto data = indices->data()->ShallowCopy();
+ auto data = indices->data()->Copy();
data->type = type;
SetData(data);
}
void DictionaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
- auto indices_data = data_->ShallowCopy();
+ auto indices_data = data_->Copy();
indices_data->type = dict_type_->index_type();
std::shared_ptr<Array> result;
indices_ = MakeArray(indices_data);
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index ec5381d..ebe54ad 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -143,9 +143,14 @@ struct ARROW_EXPORT ArrayData {
return *this;
}
- std::shared_ptr<ArrayData> ShallowCopy() const {
- return std::make_shared<ArrayData>(*this);
- }
+ std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
+
+#ifndef ARROW_NO_DEPRECATED_API
+
+ // Deprecated since 0.8.0
+ std::shared_ptr<ArrayData> ShallowCopy() const { return Copy(); }
+
+#endif
std::shared_ptr<DataType> type;
int64_t length;
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index d515897..c73bfa3 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -709,6 +709,66 @@ TEST_F(TestCast, PreallocatedMemory) {
ASSERT_ARRAYS_EQUAL(*expected, *result);
}
+template <typename InType, typename InT, typename OutType, typename OutT>
+void CheckOffsetOutputCase(FunctionContext* ctx, const std::shared_ptr<DataType>& in_type,
+ const vector<InT>& in_values,
+ const std::shared_ptr<DataType>& out_type,
+ const vector<OutT>& out_values) {
+ using OutTraits = TypeTraits<OutType>;
+
+ CastOptions options;
+
+ const int64_t length = static_cast<int64_t>(in_values.size());
+
+ shared_ptr<Array> arr, expected;
+ ArrayFromVector<InType, InT>(in_type, in_values, &arr);
+ ArrayFromVector<OutType, OutT>(out_type, out_values, &expected);
+
+ shared_ptr<Buffer> out_buffer;
+ ASSERT_OK(ctx->Allocate(OutTraits::bytes_required(length), &out_buffer));
+
+ std::unique_ptr<UnaryKernel> kernel;
+ ASSERT_OK(GetCastFunction(*in_type, out_type, options, &kernel));
+
+ const int64_t first_half = length / 2;
+
+ auto out_data = ArrayData::Make(out_type, length, {nullptr, out_buffer});
+ auto out_second_data = out_data->Copy();
+ out_second_data->offset = first_half;
+
+ Datum out_first(out_data);
+ Datum out_second(out_second_data);
+
+ // Cast each bit
+ ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(0, first_half)), &out_first));
+ ASSERT_OK(kernel->Call(ctx, Datum(arr->Slice(first_half)), &out_second));
+
+ shared_ptr<Array> result = MakeArray(out_data);
+
+ ASSERT_ARRAYS_EQUAL(*expected, *result);
+}
+
+TEST_F(TestCast, OffsetOutputBuffer) {
+ // ARROW-1735
+ vector<int32_t> v1 = {0, 10000, 2000, 1000, 0};
+ vector<int64_t> e1 = {0, 10000, 2000, 1000, 0};
+
+ auto in_type = int32();
+ auto out_type = int64();
+ CheckOffsetOutputCase<Int32Type, int32_t, Int64Type, int64_t>(&this->ctx_, in_type, v1,
+ out_type, e1);
+
+ vector<bool> e2 = {false, true, true, true, false};
+
+ out_type = boolean();
+ CheckOffsetOutputCase<Int32Type, int32_t, BooleanType, bool>(&this->ctx_, in_type, v1,
+ boolean(), e2);
+
+ vector<int16_t> e3 = {0, 10000, 2000, 1000, 0};
+ CheckOffsetOutputCase<Int32Type, int32_t, Int16Type, int16_t>(&this->ctx_, in_type, v1,
+ int16(), e3);
+}
+
template <typename TestType>
class TestDictionaryCast : public TestCast {};
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index d48d669..465be95 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -124,12 +124,7 @@ template <typename T>
struct CastFunctor<T, NullType, typename std::enable_if<
std::is_base_of<FixedWidthType, T>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
- const ArrayData& input, ArrayData* output) {
- // Simply initialize data to 0
- auto buf = output->buffers[1];
- DCHECK_EQ(output->offset, 0);
- memset(buf->mutable_data(), 0, buf->size());
- }
+ const ArrayData& input, ArrayData* output) {}
};
template <>
@@ -199,14 +194,19 @@ struct CastFunctor<O, I, typename std::enable_if<std::is_same<BooleanType, O>::v
!std::is_same<O, I>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
- using in_type = typename I::c_type;
- DCHECK_EQ(output->offset, 0);
+ auto in_data = GetValues<typename I::c_type>(input, 1);
+ internal::BitmapWriter writer(output->buffers[1]->mutable_data(), output->offset,
+ input.length);
- const in_type* in_data = GetValues<in_type>(input, 1);
- uint8_t* out_data = GetMutableValues<uint8_t>(output, 1);
for (int64_t i = 0; i < input.length; ++i) {
- BitUtil::SetBitTo(out_data, i, (*in_data++) != 0);
+ if (*in_data++ != 0) {
+ writer.Set();
+ } else {
+ writer.Clear();
+ }
+ writer.Next();
}
+ writer.Finish();
}
};
@@ -217,7 +217,6 @@ struct CastFunctor<O, I,
const ArrayData& input, ArrayData* output) {
using in_type = typename I::c_type;
using out_type = typename O::c_type;
- DCHECK_EQ(output->offset, 0);
auto in_offset = input.offset;
@@ -475,9 +474,10 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices,
const index_c_type* in = GetValues<index_c_type>(*indices.data(), 1);
- uint8_t* out = output->buffers[1]->mutable_data();
int32_t byte_width =
static_cast<const FixedSizeBinaryType&>(*output->type).byte_width();
+
+ uint8_t* out = output->buffers[1]->mutable_data() + byte_width * output->offset;
for (int64_t i = 0; i < indices.length(); ++i) {
if (valid_bits_reader.IsSet()) {
const uint8_t* value = dictionary.Value(in[i]);
@@ -493,7 +493,7 @@ struct CastFunctor<
typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
- DictionaryArray dict_array(input.ShallowCopy());
+ DictionaryArray dict_array(input.Copy());
const DictionaryType& type = static_cast<const DictionaryType&>(*input.type);
const DataType& values_type = *type.dictionary()->type();
@@ -565,7 +565,7 @@ struct CastFunctor<T, DictionaryType,
typename std::enable_if<std::is_base_of<BinaryType, T>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
- DictionaryArray dict_array(input.ShallowCopy());
+ DictionaryArray dict_array(input.Copy());
const DictionaryType& type = static_cast<const DictionaryType&>(*input.type);
const DataType& values_type = *type.dictionary()->type();
@@ -605,12 +605,10 @@ struct CastFunctor<T, DictionaryType,
template <typename IndexType, typename c_type>
void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary,
c_type* out) {
- using index_c_type = typename IndexType::c_type;
-
internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(),
indices.length());
- const index_c_type* in = GetValues<index_c_type>(*indices.data(), 1);
+ auto in = GetValues<typename IndexType::c_type>(*indices.data(), 1);
for (int64_t i = 0; i < indices.length(); ++i) {
if (valid_bits_reader.IsSet()) {
out[i] = dictionary[in[i]];
@@ -627,7 +625,7 @@ struct CastFunctor<T, DictionaryType,
const ArrayData& input, ArrayData* output) {
using c_type = typename T::c_type;
- DictionaryArray dict_array(input.ShallowCopy());
+ DictionaryArray dict_array(input.Copy());
const DictionaryType& type = static_cast<const DictionaryType&>(*input.type);
const DataType& values_type = *type.dictionary()->type();
@@ -638,7 +636,7 @@ struct CastFunctor<T, DictionaryType,
const c_type* dictionary = GetValues<c_type>(*type.dictionary()->data(), 1);
- auto out = reinterpret_cast<c_type*>(output->buffers[1]->mutable_data());
+ auto out = GetMutableValues<c_type>(output, 1);
const Array& indices = *dict_array.indices();
switch (indices.type()->id()) {
case Type::INT8:
diff --git a/cpp/src/arrow/python/arrow_to_python.h b/cpp/src/arrow/python/arrow_to_python.h
index 9440ffb..02a22f0 100644
--- a/cpp/src/arrow/python/arrow_to_python.h
+++ b/cpp/src/arrow/python/arrow_to_python.h
@@ -51,8 +51,8 @@ Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out);
/// \brief Reconstruct SerializedPyObject from representation produced by
/// SerializedPyObject::GetComponents.
///
-/// \param[in] num_tensors
-/// \param[in] num_buffers
+/// \param[in] num_tensors number of tensors in the object
+/// \param[in] num_buffers number of buffers in the object
/// \param[in] data a list containing pyarrow.Buffer instances. Must be 1 +
/// num_tensors * 2 + num_buffers in length
/// \param[out] out the reconstructed object
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 0c0d1a9..0b1124d 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -765,7 +765,7 @@ Status NumPyConverter::ConvertObjectStrings() {
// If we saw PyBytes, convert everything to BinaryArray
if (global_have_bytes) {
for (size_t i = 0; i < out_arrays_.size(); ++i) {
- auto binary_data = out_arrays_[i]->data()->ShallowCopy();
+ auto binary_data = out_arrays_[i]->data()->Copy();
binary_data->type = ::arrow::binary();
out_arrays_[i] = std::make_shared<BinaryArray>(binary_data);
}
--
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <co...@arrow.apache.org>'].