You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/05/02 09:28:04 UTC
[arrow] branch master updated: ARROW-2493: [Python] Add support for pickling to buffers and arrays
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 7323b60 ARROW-2493: [Python] Add support for pickling to buffers and arrays
7323b60 is described below
commit 7323b60ac0d30ea83eb717fc910e761233111675
Author: Korn, Uwe <Uw...@blue-yonder.com>
AuthorDate: Wed May 2 11:27:53 2018 +0200
ARROW-2493: [Python] Add support for pickling to buffers and arrays
Author: Korn, Uwe <Uw...@blue-yonder.com>
Closes #1928 from xhochy/ARROW-2493 and squashes the following commits:
e3600f99 <Korn, Uwe> Add pickling support for Arrays
17ec8055 <Korn, Uwe> ARROW-2493: Add support for pickling to buffers
---
cpp/src/arrow/array.cc | 11 ++++++
cpp/src/arrow/array.h | 15 ++++++++
python/pyarrow/array.pxi | 68 ++++++++++++++++++++++++++++++++++++
python/pyarrow/includes/libarrow.pxd | 9 +++++
python/pyarrow/io.pxi | 3 ++
python/pyarrow/tests/test_array.py | 23 ++++++++++++
python/pyarrow/tests/test_io.py | 6 ++++
7 files changed, 135 insertions(+)
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index d84c89e..e854114 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -54,6 +54,15 @@ std::shared_ptr<ArrayData> ArrayData::Make(
return std::make_shared<ArrayData>(type, length, buffers, null_count, offset);
}
+std::shared_ptr<ArrayData> ArrayData::Make(
+ const std::shared_ptr<DataType>& type, int64_t length,
+ const std::vector<std::shared_ptr<Buffer>>& buffers,
+ const std::vector<std::shared_ptr<ArrayData>>& child_data, int64_t null_count,
+ int64_t offset) {
+ return std::make_shared<ArrayData>(type, length, buffers, child_data, null_count,
+ offset);
+}
+
std::shared_ptr<ArrayData> ArrayData::Make(const std::shared_ptr<DataType>& type,
int64_t length, int64_t null_count,
int64_t offset) {
@@ -250,6 +259,8 @@ void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
raw_value_offsets_ = value_offsets == nullptr
? nullptr
: reinterpret_cast<const int32_t*>(value_offsets->data());
+
+ DCHECK_EQ(data_->child_data.size(), 1);
values_ = MakeArray(data_->child_data[0]);
}
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index dec8b39..1b152c4 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -99,6 +99,15 @@ struct ARROW_EXPORT ArrayData {
}
ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::vector<std::shared_ptr<Buffer>>& buffers,
+ const std::vector<std::shared_ptr<ArrayData>>& child_data,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+ : ArrayData(type, length, null_count, offset) {
+ this->buffers = buffers;
+ this->child_data = child_data;
+ }
+
+ ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
std::vector<std::shared_ptr<Buffer>>&& buffers,
int64_t null_count = kUnknownNullCount, int64_t offset = 0)
: ArrayData(type, length, null_count, offset) {
@@ -116,6 +125,12 @@ struct ARROW_EXPORT ArrayData {
const std::vector<std::shared_ptr<Buffer>>& buffers,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+ static std::shared_ptr<ArrayData> Make(
+ const std::shared_ptr<DataType>& type, int64_t length,
+ const std::vector<std::shared_ptr<Buffer>>& buffers,
+ const std::vector<std::shared_ptr<ArrayData>>& child_data,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
static std::shared_ptr<ArrayData> Make(const std::shared_ptr<DataType>& type,
int64_t length,
int64_t null_count = kUnknownNullCount,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 14f04ea..e9b579a 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -281,6 +281,70 @@ cdef _append_array_buffers(const CArrayData* ad, list res):
_append_array_buffers(ad.child_data[i].get(), res)
+cdef _reduce_array_data(const CArrayData* ad):
+ """
+ Recursively dissect ArrayData into (picklable) tuples.
+ """
+ cdef size_t i, n
+ assert ad != NULL
+
+ n = ad.buffers.size()
+ buffers = []
+ for i in range(n):
+ buf = ad.buffers[i]
+ buffers.append(pyarrow_wrap_buffer(buf)
+ if buf.get() != NULL else None)
+
+ children = []
+ n = ad.child_data.size()
+ for i in range(n):
+ children.append(_reduce_array_data(ad.child_data[i].get()))
+
+ return pyarrow_wrap_data_type(ad.type), ad.length, ad.null_count, \
+ ad.offset, buffers, children
+
+
+cdef shared_ptr[CArrayData] _reconstruct_array_data(data):
+ """
+ Reconstruct CArrayData objects from the tuple structure generated
+ by _reduce_array_data.
+ """
+ cdef:
+ int64_t length, null_count, offset, i
+ DataType dtype
+ Buffer buf
+ vector[shared_ptr[CBuffer]] c_buffers
+ vector[shared_ptr[CArrayData]] c_children
+
+ dtype, length, null_count, offset, buffers, children = data
+
+ for i in range(len(buffers)):
+ buf = buffers[i]
+ if buf is None:
+ c_buffers.push_back(shared_ptr[CBuffer]())
+ else:
+ c_buffers.push_back(buf.buffer)
+
+ for i in range(len(children)):
+ c_children.push_back(_reconstruct_array_data(children[i]))
+
+ return CArrayData.MakeWithChildren(
+ dtype.sp_type,
+ length,
+ c_buffers,
+ c_children,
+ null_count,
+ offset)
+
+
+def _restore_array(data):
+ """
+ Reconstruct an Array from pickled ArrayData.
+ """
+ cdef shared_ptr[CArrayData] ad = _reconstruct_array_data(data)
+ return pyarrow_wrap_array(MakeArray(ad))
+
+
cdef class Array:
def __init__(self, *args, **kwargs):
@@ -395,6 +459,10 @@ cdef class Array:
return array(obj, mask=mask, type=type, memory_pool=memory_pool,
from_pandas=True)
+ def __reduce__(self):
+ return _restore_array, \
+ (_reduce_array_data(self.sp_array.get().data().get()),)
+
@staticmethod
def from_buffers(DataType type, length, buffers, null_count=-1, offset=0):
"""
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 0df7f34..1dcff8a 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -105,6 +105,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
int64_t null_count,
int64_t offset)
+ @staticmethod
+ shared_ptr[CArrayData] MakeWithChildren" Make"(
+ const shared_ptr[CDataType]& type,
+ int64_t length,
+ vector[shared_ptr[CBuffer]]& buffers,
+ vector[shared_ptr[CArrayData]]& child_data,
+ int64_t null_count,
+ int64_t offset)
+
cdef cppclass CArray" arrow::Array":
shared_ptr[CDataType] type()
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index 3947323..ef95bd5 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -678,6 +678,9 @@ cdef class Buffer:
else:
return NotImplemented
+ def __reduce__(self):
+ return py_buffer, (self.to_pybytes(),)
+
def to_pybytes(self):
self._check_nullptr()
return cp.PyBytes_FromStringAndSize(
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 6d7892a..217dfc6 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -23,6 +23,7 @@ import sys
import numpy as np
import pandas as pd
import pandas.util.testing as tm
+import pickle
import pyarrow as pa
from pyarrow.pandas_compat import get_logical_type
@@ -584,6 +585,28 @@ def test_simple_type_construction():
@pytest.mark.parametrize(
+ ('data', 'typ'),
+ [
+ ([True, False, True, True], pa.bool_()),
+ ([1, 2, 4, 6], pa.int64()),
+ ([1.0, 2.5, None], pa.float64()),
+ (['a', None, 'b'], pa.string()),
+ ([], None),
+ ([[1, 2], [3]], pa.list_(pa.int64())),
+ ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
+ ([(1, 'a'), (2, 'c'), None],
+ pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
+ ]
+)
+def test_array_pickle(data, typ):
+ # Allocate the array here, inside the test, so that no Arrow data is allocated outside of it.
+ # This is needed to ensure that allocator tests can be reliable.
+ array = pa.array(data, type=typ)
+ result = pickle.loads(pickle.dumps(array))
+ assert array.equals(result)
+
+
+@pytest.mark.parametrize(
('type', 'expected'),
[
(pa.null(), 'empty'),
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 02851be..d43623f 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -19,6 +19,7 @@ from functools import partial
from io import BytesIO, TextIOWrapper
import gc
import os
+import pickle
import pytest
import sys
import tempfile
@@ -207,6 +208,11 @@ def test_buffer_bytes():
assert result == val
+ # Check that buffers survive a pickle roundtrip
+ result_buf = pickle.loads(pickle.dumps(buf))
+ result = result_buf.to_pybytes()
+ assert result == val
+
def test_buffer_memoryview():
val = b'some data'
--
To stop receiving notification emails like this one, please contact
uwe@apache.org.