You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ra...@apache.org on 2023/01/18 08:34:24 UTC

[arrow] 08/10: GH-20512: [Python] Numpy conversion doesn't account for ListArray offset (#15210)

This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch maint-11.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit c2199dc4902d200d186f0d23513689686e91fdce
Author: Will Jones <wi...@gmail.com>
AuthorDate: Tue Jan 17 10:15:08 2023 -0800

    GH-20512: [Python] Numpy conversion doesn't account for ListArray offset (#15210)
    
    
    * Closes: #20512
    
    Lead-authored-by: Will Jones <wi...@gmail.com>
    Co-authored-by: Joris Van den Bossche <jo...@gmail.com>
    Signed-off-by: Jacob Wujciak-Jens <ja...@wujciak.de>
---
 cpp/src/arrow/array/array_nested.h                 |  4 +-
 python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 23 ++++++++---
 python/pyarrow/tests/test_pandas.py                | 45 ++++++++++++++++++++++
 3 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h
index 489a7a3a3c..6fb3fd3c91 100644
--- a/cpp/src/arrow/array/array_nested.h
+++ b/cpp/src/arrow/array/array_nested.h
@@ -69,9 +69,11 @@ class BaseListArray : public Array {
   const TypeClass* list_type() const { return list_type_; }
 
   /// \brief Return array object containing the list's values
+  ///
+  /// Note that this array does not account for any slice offset or length.
   std::shared_ptr<Array> values() const { return values_; }
 
-  /// Note that this buffer does not account for any slice offset
+  /// Note that this buffer does not account for any slice offset or length.
   std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
 
   std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index f58c151ea6..2faf7d381a 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -738,11 +738,17 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
   ArrayVector value_arrays;
   for (int c = 0; c < data.num_chunks(); c++) {
     const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+    // values() ignores the slice offset and length, so slice it to this chunk's range.
+    // We can't use Flatten(): it drops the values hidden behind null list entries,
+    // so offsets into the original values would no longer match positions in
+    // flattened_values.
+    std::shared_ptr<Array> flattened_values = arr.values()->Slice(
+        arr.value_offset(0), arr.value_offset(arr.length()) - arr.value_offset(0));
     if (arr.value_type()->id() == Type::EXTENSION) {
-      const auto& arr_ext = checked_cast<const ExtensionArray&>(*arr.values());
+      const auto& arr_ext = checked_cast<const ExtensionArray&>(*flattened_values);
       value_arrays.emplace_back(arr_ext.storage());
     } else {
-      value_arrays.emplace_back(arr.values());
+      value_arrays.emplace_back(flattened_values);
     }
   }
 
@@ -772,8 +778,12 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
         Py_INCREF(Py_None);
         *out_values = Py_None;
       } else {
-        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset));
-        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset));
+        // Need to subtract value_offset(0) since the original chunk might be a slice
+        // into another array.
+        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset -
+                                           arr.value_offset(0)));
+        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset -
+                                         arr.value_offset(0)));
         OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr));
 
         if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) {
@@ -791,7 +801,7 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
     }
     RETURN_IF_PYERROR();
 
-    chunk_offset += arr.values()->length();
+    chunk_offset += arr.value_offset(arr.length()) - arr.value_offset(0);
   }
 
   return Status::OK();
@@ -1083,7 +1093,8 @@ struct ObjectWriterVisitor {
       OwnedRef keywords(PyDict_New());
       PyDict_SetItemString(keywords.obj(), "tzinfo", PyDateTime_TimeZone_UTC);
       OwnedRef naive_datetime_replace(PyObject_GetAttrString(naive_datetime, "replace"));
-      OwnedRef datetime_utc(PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
+      OwnedRef datetime_utc(
+          PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
       // second step: adjust the datetime to tzinfo timezone (astimezone method)
       *out = PyObject_CallMethod(datetime_utc.obj(), "astimezone", "O", tzinfo.obj());
 
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 729a4122c0..4d0ddf8754 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2308,6 +2308,51 @@ class TestConvertListTypes:
         actual = arr.to_pandas()
         tm.assert_series_equal(actual, expected, check_names=False)
 
+    def test_list_no_duplicate_base(self):
+        # ARROW-18400
+        arr = pa.array([[1, 2], [3, 4, 5], None, [6, None], [7, 8]])
+        chunked_arr = pa.chunked_array([arr.slice(0, 3), arr.slice(3, 1)])
+
+        np_arr = chunked_arr.to_numpy()
+
+        expected = np.array([[1., 2.], [3., 4., 5.], None,
+                            [6., np.NaN]], dtype="object")
+        for left, right in zip(np_arr, expected):
+            if right is None:
+                assert left == right
+            else:
+                npt.assert_array_equal(left, right)
+
+        expected_base = np.array([[1., 2., 3., 4., 5., 6., np.NaN]])
+        npt.assert_array_equal(np_arr[0].base, expected_base)
+
+        np_arr_sliced = chunked_arr.slice(1, 3).to_numpy()
+
+        expected = np.array([[3, 4, 5], None, [6, np.NaN]], dtype="object")
+        for left, right in zip(np_arr_sliced, expected):
+            if right is None:
+                assert left == right
+            else:
+                npt.assert_array_equal(left, right)
+
+        expected_base = np.array([[3., 4., 5., 6., np.NaN]])
+        npt.assert_array_equal(np_arr_sliced[0].base, expected_base)
+
+    def test_list_values_behind_null(self):
+        arr = pa.ListArray.from_arrays(
+            offsets=pa.array([0, 2, 4, 6]),
+            values=pa.array([1, 2, 99, 99, 3, None]),
+            mask=pa.array([False, True, False])
+        )
+        np_arr = arr.to_numpy(zero_copy_only=False)
+
+        expected = np.array([[1., 2.], None, [3., np.NaN]], dtype="object")
+        for left, right in zip(np_arr, expected):
+            if right is None:
+                assert left == right
+            else:
+                npt.assert_array_equal(left, right)
+
 
 class TestConvertStructTypes:
     """