You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/08/05 09:04:46 UTC
[arrow] branch master updated: ARROW-5651: [Python] Fix Incorrect
conversion from strided Numpy array
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2088c05 ARROW-5651: [Python] Fix Incorrect conversion from strided Numpy array
2088c05 is described below
commit 2088c05fa79af84c6de3cd4ff5c36f31f40bbdcf
Author: Takuya Kato <kt...@Takuyas-MacBook-Pro.local>
AuthorDate: Mon Aug 5 11:04:05 2019 +0200
ARROW-5651: [Python] Fix Incorrect conversion from strided Numpy array
This is a second attempt to resolve ARROW-5651. The previous attempt was #4958.
I withdrew the previous PR because it introduces the performance degradation, but I confirmed that this PR does not.
Benchmarks on macOS High Sierra. on master:
```
In [3]: %timeit b = pa.array(a)
4.17 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
In [4]: %timeit b = pa.array(strided)
46.8 µs ± 1.78 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [5]: %timeit b = pa.array(a, type=pa.float64())
39.2 µs ± 1.79 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [6]: %timeit b = pa.array(strided, type=pa.float64())
78.3 µs ± 280 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
```
on this PR:
```
In [3]: %timeit b = pa.array(a)
4.77 µs ± 276 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
In [4]: %timeit b = pa.array(strided)
49.5 µs ± 2.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [5]: %timeit b = pa.array(a, type=pa.float64())
39.6 µs ± 1.53 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
In [6]: %timeit b = pa.array(strided, type=pa.float64())
76.3 µs ± 295 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
```
Closes #5005 from Ktakuya332C/ARROW-5651 and squashes the following commits:
396b00b39 <Takuya Kato> Add a test
ec10bf902 <Takuya Kato> Add NumPyStridedConverter
Authored-by: Takuya Kato <kt...@Takuyas-MacBook-Pro.local>
Signed-off-by: Antoine Pitrou <an...@python.org>
---
cpp/src/arrow/python/numpy_to_arrow.cc | 55 +++++++++++++++++++++-------------
python/pyarrow/tests/test_pandas.py | 7 +++++
2 files changed, 41 insertions(+), 21 deletions(-)
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 811a31f..6af964c 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -392,29 +392,42 @@ void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output
}
}
-template <typename ArrowType>
-Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* pool,
+class NumPyStridedConverter {
+ public:
+ static Status Convert(PyArrayObject* arr, int64_t length, MemoryPool* pool,
std::shared_ptr<Buffer>* out) {
- using traits = internal::arrow_traits<ArrowType::type_id>;
- using T = typename traits::T;
-
- // Strided, must copy into new contiguous memory
- std::shared_ptr<Buffer> new_buffer;
- RETURN_NOT_OK(AllocateBuffer(pool, sizeof(T) * length, &new_buffer));
-
- const int64_t stride = PyArray_STRIDES(arr)[0];
- if (stride % sizeof(T) == 0) {
- const int64_t stride_elements = stride / sizeof(T);
- CopyStridedNatural(reinterpret_cast<T*>(PyArray_DATA(arr)), length, stride_elements,
- reinterpret_cast<T*>(new_buffer->mutable_data()));
- } else {
- CopyStridedBytewise(reinterpret_cast<int8_t*>(PyArray_DATA(arr)), length, stride,
- reinterpret_cast<T*>(new_buffer->mutable_data()));
+ NumPyStridedConverter converter(arr, length, pool);
+ RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
+ *out = converter.buffer_;
+ return Status::OK();
}
+ template <int TYPE>
+ Status Visit(PyArrayObject* arr) {
+ using traits = internal::npy_traits<TYPE>;
+ using T = typename traits::value_type;
- *out = new_buffer;
- return Status::OK();
-}
+ RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(T) * length_, &buffer_));
+
+ const int64_t stride = PyArray_STRIDES(arr)[0];
+ if (stride % sizeof(T) == 0) {
+ const int64_t stride_elements = stride / sizeof(T);
+ CopyStridedNatural(reinterpret_cast<T*>(PyArray_DATA(arr)), length_,
+ stride_elements, reinterpret_cast<T*>(buffer_->mutable_data()));
+ } else {
+ CopyStridedBytewise(reinterpret_cast<int8_t*>(PyArray_DATA(arr)), length_, stride,
+ reinterpret_cast<T*>(buffer_->mutable_data()));
+ }
+ return Status::OK();
+ }
+
+ protected:
+ NumPyStridedConverter(PyArrayObject* arr, int64_t length, MemoryPool* pool)
+ : arr_(arr), length_(length), pool_(pool), buffer_(nullptr) {}
+ PyArrayObject* arr_;
+ int64_t length_;
+ MemoryPool* pool_;
+ std::shared_ptr<Buffer> buffer_;
+};
} // namespace
@@ -426,7 +439,7 @@ inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
}
if (is_strided()) {
- RETURN_NOT_OK(CopyStridedArray<ArrowType>(arr_, length_, pool_, data));
+ RETURN_NOT_OK(NumPyStridedConverter::Convert(arr_, length_, pool_, data));
} else if (dtype_->type_num == NPY_BOOL) {
int64_t nbytes = BitUtil::BytesForBits(length_);
std::shared_ptr<Buffer> buffer;
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 931c2b1..6ed440f 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2348,6 +2348,13 @@ class TestConvertMisc(object):
arr = pa.array(data['y'], type=pa.int16())
assert arr.to_pylist() == [-1, 2]
+ def test_array_from_strided_numpy_array(self):
+ # ARROW-5651
+ np_arr = np.arange(0, 10, dtype=np.float32)[1:-1:2]
+ pa_arr = pa.array(np_arr, type=pa.float64())
+ expected = pa.array([1.0, 3.0, 5.0, 7.0], type=pa.float64())
+ pa_arr.equals(expected)
+
def test_safe_unsafe_casts(self):
# ARROW-2799
df = pd.DataFrame({