You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by as...@apache.org on 2023/01/17 18:15:14 UTC
[arrow] branch master updated: GH-20512: [Python] Numpy conversion doesn't account for ListArray offset (#15210)
This is an automated email from the ASF dual-hosted git repository.
assignuser pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2b50694c10 GH-20512: [Python] Numpy conversion doesn't account for ListArray offset (#15210)
2b50694c10 is described below
commit 2b50694c10e09e4a1343b62c6b5f44ad4403d0e1
Author: Will Jones <wi...@gmail.com>
AuthorDate: Tue Jan 17 10:15:08 2023 -0800
GH-20512: [Python] Numpy conversion doesn't account for ListArray offset (#15210)
* Closes: #20512
Lead-authored-by: Will Jones <wi...@gmail.com>
Co-authored-by: Joris Van den Bossche <jo...@gmail.com>
Signed-off-by: Jacob Wujciak-Jens <ja...@wujciak.de>
---
cpp/src/arrow/array/array_nested.h | 4 +-
python/pyarrow/src/arrow/python/arrow_to_pandas.cc | 23 ++++++++---
python/pyarrow/tests/test_pandas.py | 45 ++++++++++++++++++++++
3 files changed, 65 insertions(+), 7 deletions(-)
diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h
index 489a7a3a3c..6fb3fd3c91 100644
--- a/cpp/src/arrow/array/array_nested.h
+++ b/cpp/src/arrow/array/array_nested.h
@@ -69,9 +69,11 @@ class BaseListArray : public Array {
const TypeClass* list_type() const { return list_type_; }
/// \brief Return array object containing the list's values
+ ///
+ /// Note that the returned array does not account for any slice offset or length.
std::shared_ptr<Array> values() const { return values_; }
- /// Note that this buffer does not account for any slice offset
+ /// Note that this buffer does not account for any slice offset or length.
std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }
diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
index f58c151ea6..2faf7d381a 100644
--- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
+++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -738,11 +738,17 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
ArrayVector value_arrays;
for (int c = 0; c < data.num_chunks(); c++) {
const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
+ // values() does not account for offsets, so we need to slice into it.
+ // We can't use Flatten(), because it removes the values behind a null list
+ // value, and that would make the offsets into the original list values
+ // differ from the offsets into our flattened_values array.
+ std::shared_ptr<Array> flattened_values = arr.values()->Slice(
+ arr.value_offset(0), arr.value_offset(arr.length()) - arr.value_offset(0));
if (arr.value_type()->id() == Type::EXTENSION) {
- const auto& arr_ext = checked_cast<const ExtensionArray&>(*arr.values());
+ const auto& arr_ext = checked_cast<const ExtensionArray&>(*flattened_values);
value_arrays.emplace_back(arr_ext.storage());
} else {
- value_arrays.emplace_back(arr.values());
+ value_arrays.emplace_back(flattened_values);
}
}
@@ -772,8 +778,12 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
Py_INCREF(Py_None);
*out_values = Py_None;
} else {
- OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset));
- OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset));
+ // Need to subtract value_offset(0) since the original chunk might be a slice
+ // into another array.
+ OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset -
+ arr.value_offset(0)));
+ OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset -
+ arr.value_offset(0)));
OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr));
if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) {
@@ -791,7 +801,7 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
}
RETURN_IF_PYERROR();
- chunk_offset += arr.values()->length();
+ chunk_offset += arr.value_offset(arr.length()) - arr.value_offset(0);
}
return Status::OK();
@@ -1083,7 +1093,8 @@ struct ObjectWriterVisitor {
OwnedRef keywords(PyDict_New());
PyDict_SetItemString(keywords.obj(), "tzinfo", PyDateTime_TimeZone_UTC);
OwnedRef naive_datetime_replace(PyObject_GetAttrString(naive_datetime, "replace"));
- OwnedRef datetime_utc(PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
+ OwnedRef datetime_utc(
+ PyObject_Call(naive_datetime_replace.obj(), args.obj(), keywords.obj()));
// second step: adjust the datetime to tzinfo timezone (astimezone method)
*out = PyObject_CallMethod(datetime_utc.obj(), "astimezone", "O", tzinfo.obj());
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 729a4122c0..4d0ddf8754 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2308,6 +2308,51 @@ class TestConvertListTypes:
actual = arr.to_pandas()
tm.assert_series_equal(actual, expected, check_names=False)
+ def test_list_no_duplicate_base(self):
+ # ARROW-18400
+ arr = pa.array([[1, 2], [3, 4, 5], None, [6, None], [7, 8]])
+ chunked_arr = pa.chunked_array([arr.slice(0, 3), arr.slice(3, 1)])
+
+ np_arr = chunked_arr.to_numpy()
+
+ expected = np.array([[1., 2.], [3., 4., 5.], None,
+ [6., np.NaN]], dtype="object")
+ for left, right in zip(np_arr, expected):
+ if right is None:
+ assert left == right
+ else:
+ npt.assert_array_equal(left, right)
+
+ expected_base = np.array([[1., 2., 3., 4., 5., 6., np.NaN]])
+ npt.assert_array_equal(np_arr[0].base, expected_base)
+
+ np_arr_sliced = chunked_arr.slice(1, 3).to_numpy()
+
+ expected = np.array([[3, 4, 5], None, [6, np.NaN]], dtype="object")
+ for left, right in zip(np_arr_sliced, expected):
+ if right is None:
+ assert left == right
+ else:
+ npt.assert_array_equal(left, right)
+
+ expected_base = np.array([[3., 4., 5., 6., np.NaN]])
+ npt.assert_array_equal(np_arr_sliced[0].base, expected_base)
+
+ def test_list_values_behind_null(self):
+ arr = pa.ListArray.from_arrays(
+ offsets=pa.array([0, 2, 4, 6]),
+ values=pa.array([1, 2, 99, 99, 3, None]),
+ mask=pa.array([False, True, False])
+ )
+ np_arr = arr.to_numpy(zero_copy_only=False)
+
+ expected = np.array([[1., 2.], None, [3., np.NaN]], dtype="object")
+ for left, right in zip(np_arr, expected):
+ if right is None:
+ assert left == right
+ else:
+ npt.assert_array_equal(left, right)
+
class TestConvertStructTypes:
"""