You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/08/05 09:04:46 UTC

[arrow] branch master updated: ARROW-5651: [Python] Fix Incorrect conversion from strided Numpy array

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2088c05  ARROW-5651: [Python] Fix Incorrect conversion from strided Numpy array
2088c05 is described below

commit 2088c05fa79af84c6de3cd4ff5c36f31f40bbdcf
Author: Takuya Kato <kt...@Takuyas-MacBook-Pro.local>
AuthorDate: Mon Aug 5 11:04:05 2019 +0200

    ARROW-5651: [Python] Fix Incorrect conversion from strided Numpy array
    
    This is a second attempt to resolve ARROW-5651. The previous attempt was #4958.
    I withdrew the previous PR because it introduces the performance degradation, but I confirmed that this PR does not.
    
    Benchmarks on macOS High Sierra. on master:
    ```
    In [3]: %timeit b = pa.array(a)
    4.17 µs ± 116 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
    In [4]: %timeit b = pa.array(strided)
    46.8 µs ± 1.78 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    In [5]: %timeit b = pa.array(a, type=pa.float64())
    39.2 µs ± 1.79 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    In [6]: %timeit b = pa.array(strided, type=pa.float64())
    78.3 µs ± 280 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    ```
    on this PR:
    ```
    In [3]: %timeit b = pa.array(a)
    4.77 µs ± 276 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
    In [4]: %timeit b = pa.array(strided)
    49.5 µs ± 2.5 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    In [5]: %timeit b = pa.array(a, type=pa.float64())
    39.6 µs ± 1.53 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    In [6]: %timeit b = pa.array(strided, type=pa.float64())
    76.3 µs ± 295 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    ```
    
    Closes #5005 from Ktakuya332C/ARROW-5651 and squashes the following commits:
    
    396b00b39 <Takuya Kato> Add a test
    ec10bf902 <Takuya Kato> Add NumPyStridedConverter
    
    Authored-by: Takuya Kato <kt...@Takuyas-MacBook-Pro.local>
    Signed-off-by: Antoine Pitrou <an...@python.org>
---
 cpp/src/arrow/python/numpy_to_arrow.cc | 55 +++++++++++++++++++++-------------
 python/pyarrow/tests/test_pandas.py    |  7 +++++
 2 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 811a31f..6af964c 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -392,29 +392,42 @@ void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output
   }
 }
 
-template <typename ArrowType>
-Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* pool,
+class NumPyStridedConverter {
+ public:
+  static Status Convert(PyArrayObject* arr, int64_t length, MemoryPool* pool,
                         std::shared_ptr<Buffer>* out) {
-  using traits = internal::arrow_traits<ArrowType::type_id>;
-  using T = typename traits::T;
-
-  // Strided, must copy into new contiguous memory
-  std::shared_ptr<Buffer> new_buffer;
-  RETURN_NOT_OK(AllocateBuffer(pool, sizeof(T) * length, &new_buffer));
-
-  const int64_t stride = PyArray_STRIDES(arr)[0];
-  if (stride % sizeof(T) == 0) {
-    const int64_t stride_elements = stride / sizeof(T);
-    CopyStridedNatural(reinterpret_cast<T*>(PyArray_DATA(arr)), length, stride_elements,
-                       reinterpret_cast<T*>(new_buffer->mutable_data()));
-  } else {
-    CopyStridedBytewise(reinterpret_cast<int8_t*>(PyArray_DATA(arr)), length, stride,
-                        reinterpret_cast<T*>(new_buffer->mutable_data()));
+    NumPyStridedConverter converter(arr, length, pool);
+    RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter));
+    *out = converter.buffer_;
+    return Status::OK();
   }
+  template <int TYPE>
+  Status Visit(PyArrayObject* arr) {
+    using traits = internal::npy_traits<TYPE>;
+    using T = typename traits::value_type;
 
-  *out = new_buffer;
-  return Status::OK();
-}
+    RETURN_NOT_OK(AllocateBuffer(pool_, sizeof(T) * length_, &buffer_));
+
+    const int64_t stride = PyArray_STRIDES(arr)[0];
+    if (stride % sizeof(T) == 0) {
+      const int64_t stride_elements = stride / sizeof(T);
+      CopyStridedNatural(reinterpret_cast<T*>(PyArray_DATA(arr)), length_,
+                         stride_elements, reinterpret_cast<T*>(buffer_->mutable_data()));
+    } else {
+      CopyStridedBytewise(reinterpret_cast<int8_t*>(PyArray_DATA(arr)), length_, stride,
+                          reinterpret_cast<T*>(buffer_->mutable_data()));
+    }
+    return Status::OK();
+  }
+
+ protected:
+  NumPyStridedConverter(PyArrayObject* arr, int64_t length, MemoryPool* pool)
+      : arr_(arr), length_(length), pool_(pool), buffer_(nullptr) {}
+  PyArrayObject* arr_;
+  int64_t length_;
+  MemoryPool* pool_;
+  std::shared_ptr<Buffer> buffer_;
+};
 
 }  // namespace
 
@@ -426,7 +439,7 @@ inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
   }
 
   if (is_strided()) {
-    RETURN_NOT_OK(CopyStridedArray<ArrowType>(arr_, length_, pool_, data));
+    RETURN_NOT_OK(NumPyStridedConverter::Convert(arr_, length_, pool_, data));
   } else if (dtype_->type_num == NPY_BOOL) {
     int64_t nbytes = BitUtil::BytesForBits(length_);
     std::shared_ptr<Buffer> buffer;
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 931c2b1..6ed440f 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2348,6 +2348,13 @@ class TestConvertMisc(object):
         arr = pa.array(data['y'], type=pa.int16())
         assert arr.to_pylist() == [-1, 2]
 
+    def test_array_from_strided_numpy_array(self):
+        # ARROW-5651
+        np_arr = np.arange(0, 10, dtype=np.float32)[1:-1:2]
+        pa_arr = pa.array(np_arr, type=pa.float64())
+        expected = pa.array([1.0, 3.0, 5.0, 7.0], type=pa.float64())
+        pa_arr.equals(expected)
+
     def test_safe_unsafe_casts(self):
         # ARROW-2799
         df = pd.DataFrame({