You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/05/02 09:28:04 UTC

[arrow] branch master updated: ARROW-2493: [Python] Add support for pickling to buffers and arrays

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 7323b60  ARROW-2493: [Python] Add support for pickling to buffers and arrays
7323b60 is described below

commit 7323b60ac0d30ea83eb717fc910e761233111675
Author: Korn, Uwe <Uw...@blue-yonder.com>
AuthorDate: Wed May 2 11:27:53 2018 +0200

    ARROW-2493: [Python] Add support for pickling to buffers and arrays
    
    Author: Korn, Uwe <Uw...@blue-yonder.com>
    
    Closes #1928 from xhochy/ARROW-2493 and squashes the following commits:
    
    e3600f99 <Korn, Uwe> Add pickling support for Arrays
    17ec8055 <Korn, Uwe> ARROW-2493:  Add support for pickling to buffers
---
 cpp/src/arrow/array.cc               | 11 ++++++
 cpp/src/arrow/array.h                | 15 ++++++++
 python/pyarrow/array.pxi             | 68 ++++++++++++++++++++++++++++++++++++
 python/pyarrow/includes/libarrow.pxd |  9 +++++
 python/pyarrow/io.pxi                |  3 ++
 python/pyarrow/tests/test_array.py   | 23 ++++++++++++
 python/pyarrow/tests/test_io.py      |  6 ++++
 7 files changed, 135 insertions(+)

diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index d84c89e..e854114 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -54,6 +54,15 @@ std::shared_ptr<ArrayData> ArrayData::Make(
   return std::make_shared<ArrayData>(type, length, buffers, null_count, offset);
 }
 
+std::shared_ptr<ArrayData> ArrayData::Make(
+    const std::shared_ptr<DataType>& type, int64_t length,
+    const std::vector<std::shared_ptr<Buffer>>& buffers,
+    const std::vector<std::shared_ptr<ArrayData>>& child_data, int64_t null_count,
+    int64_t offset) {
+  return std::make_shared<ArrayData>(type, length, buffers, child_data, null_count,
+                                     offset);
+}
+
 std::shared_ptr<ArrayData> ArrayData::Make(const std::shared_ptr<DataType>& type,
                                            int64_t length, int64_t null_count,
                                            int64_t offset) {
@@ -250,6 +259,8 @@ void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
   raw_value_offsets_ = value_offsets == nullptr
                            ? nullptr
                            : reinterpret_cast<const int32_t*>(value_offsets->data());
+
+  DCHECK_EQ(data_->child_data.size(), 1);
   values_ = MakeArray(data_->child_data[0]);
 }
 
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index dec8b39..1b152c4 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -99,6 +99,15 @@ struct ARROW_EXPORT ArrayData {
   }
 
   ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
+            const std::vector<std::shared_ptr<Buffer>>& buffers,
+            const std::vector<std::shared_ptr<ArrayData>>& child_data,
+            int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+      : ArrayData(type, length, null_count, offset) {
+    this->buffers = buffers;
+    this->child_data = child_data;
+  }
+
+  ArrayData(const std::shared_ptr<DataType>& type, int64_t length,
             std::vector<std::shared_ptr<Buffer>>&& buffers,
             int64_t null_count = kUnknownNullCount, int64_t offset = 0)
       : ArrayData(type, length, null_count, offset) {
@@ -116,6 +125,12 @@ struct ARROW_EXPORT ArrayData {
       const std::vector<std::shared_ptr<Buffer>>& buffers,
       int64_t null_count = kUnknownNullCount, int64_t offset = 0);
 
+  static std::shared_ptr<ArrayData> Make(
+      const std::shared_ptr<DataType>& type, int64_t length,
+      const std::vector<std::shared_ptr<Buffer>>& buffers,
+      const std::vector<std::shared_ptr<ArrayData>>& child_data,
+      int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
   static std::shared_ptr<ArrayData> Make(const std::shared_ptr<DataType>& type,
                                          int64_t length,
                                          int64_t null_count = kUnknownNullCount,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 14f04ea..e9b579a 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -281,6 +281,70 @@ cdef _append_array_buffers(const CArrayData* ad, list res):
         _append_array_buffers(ad.child_data[i].get(), res)
 
 
+cdef _reduce_array_data(const CArrayData* ad):
+    """
+    Recursively dissect ArrayData into (picklable) tuples.
+    """
+    cdef size_t i, n
+    assert ad != NULL
+
+    n = ad.buffers.size()
+    buffers = []
+    for i in range(n):
+        buf = ad.buffers[i]
+        buffers.append(pyarrow_wrap_buffer(buf)
+                       if buf.get() != NULL else None)
+
+    children = []
+    n = ad.child_data.size()
+    for i in range(n):
+        children.append(_reduce_array_data(ad.child_data[i].get()))
+
+    return pyarrow_wrap_data_type(ad.type), ad.length, ad.null_count, \
+        ad.offset, buffers, children
+
+
+cdef shared_ptr[CArrayData] _reconstruct_array_data(data):
+    """
+    Reconstruct CArrayData objects from the tuple structure generated
+    by _reduce_array_data.
+    """
+    cdef:
+        int64_t length, null_count, offset, i
+        DataType dtype
+        Buffer buf
+        vector[shared_ptr[CBuffer]] c_buffers
+        vector[shared_ptr[CArrayData]] c_children
+
+    dtype, length, null_count, offset, buffers, children = data
+
+    for i in range(len(buffers)):
+        buf = buffers[i]
+        if buf is None:
+            c_buffers.push_back(shared_ptr[CBuffer]())
+        else:
+            c_buffers.push_back(buf.buffer)
+
+    for i in range(len(children)):
+        c_children.push_back(_reconstruct_array_data(children[i]))
+
+    return CArrayData.MakeWithChildren(
+        dtype.sp_type,
+        length,
+        c_buffers,
+        c_children,
+        null_count,
+        offset)
+
+
+def _restore_array(data):
+    """
+    Reconstruct an Array from pickled ArrayData.
+    """
+    cdef shared_ptr[CArrayData] ad = _reconstruct_array_data(data)
+    return pyarrow_wrap_array(MakeArray(ad))
+
+
 cdef class Array:
 
     def __init__(self, *args, **kwargs):
@@ -395,6 +459,10 @@ cdef class Array:
         return array(obj, mask=mask, type=type, memory_pool=memory_pool,
                      from_pandas=True)
 
+    def __reduce__(self):
+        return _restore_array, \
+            (_reduce_array_data(self.sp_array.get().data().get()),)
+
     @staticmethod
     def from_buffers(DataType type, length, buffers, null_count=-1, offset=0):
         """
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 0df7f34..1dcff8a 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -105,6 +105,15 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
                                     int64_t null_count,
                                     int64_t offset)
 
+        @staticmethod
+        shared_ptr[CArrayData] MakeWithChildren" Make"(
+            const shared_ptr[CDataType]& type,
+            int64_t length,
+            vector[shared_ptr[CBuffer]]& buffers,
+            vector[shared_ptr[CArrayData]]& child_data,
+            int64_t null_count,
+            int64_t offset)
+
     cdef cppclass CArray" arrow::Array":
         shared_ptr[CDataType] type()
 
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index 3947323..ef95bd5 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -678,6 +678,9 @@ cdef class Buffer:
         else:
             return NotImplemented
 
+    def __reduce__(self):
+        return py_buffer, (self.to_pybytes(),)
+
     def to_pybytes(self):
         self._check_nullptr()
         return cp.PyBytes_FromStringAndSize(
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 6d7892a..217dfc6 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -23,6 +23,7 @@ import sys
 import numpy as np
 import pandas as pd
 import pandas.util.testing as tm
+import pickle
 
 import pyarrow as pa
 from pyarrow.pandas_compat import get_logical_type
@@ -584,6 +585,28 @@ def test_simple_type_construction():
 
 
 @pytest.mark.parametrize(
+    ('data', 'typ'),
+    [
+        ([True, False, True, True], pa.bool_()),
+        ([1, 2, 4, 6], pa.int64()),
+        ([1.0, 2.5, None], pa.float64()),
+        (['a', None, 'b'], pa.string()),
+        ([], None),
+        ([[1, 2], [3]], pa.list_(pa.int64())),
+        ([['a'], None, ['b', 'c']], pa.list_(pa.string())),
+        ([(1, 'a'), (2, 'c'), None],
+            pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
+    ]
+)
+def test_array_pickle(data, typ):
+    # Build the array inside the test, not in the parametrize list, so that no
+    # Arrow data stays allocated outside it; allocator tests rely on this.
+    array = pa.array(data, type=typ)
+    result = pickle.loads(pickle.dumps(array))
+    assert array.equals(result)
+
+
+@pytest.mark.parametrize(
     ('type', 'expected'),
     [
         (pa.null(), 'empty'),
diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py
index 02851be..d43623f 100644
--- a/python/pyarrow/tests/test_io.py
+++ b/python/pyarrow/tests/test_io.py
@@ -19,6 +19,7 @@ from functools import partial
 from io import BytesIO, TextIOWrapper
 import gc
 import os
+import pickle
 import pytest
 import sys
 import tempfile
@@ -207,6 +208,11 @@ def test_buffer_bytes():
 
     assert result == val
 
+    # Check that buffers survive a pickle roundtrip
+    result_buf = pickle.loads(pickle.dumps(buf))
+    result = result_buf.to_pybytes()
+    assert result == val
+
 
 def test_buffer_memoryview():
     val = b'some data'

-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.