You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2018/07/30 13:31:38 UTC

[arrow] branch master updated: ARROW-2660: [Python] Experimental zero-copy pickling

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2422d9c  ARROW-2660: [Python] Experimental zero-copy pickling
2422d9c is described below

commit 2422d9c821e8998e65df4e48920864c7f268c2bb
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon Jul 30 15:31:21 2018 +0200

    ARROW-2660: [Python] Experimental zero-copy pickling
    
    Zero-copy pickling of buffers and buffer-based objects will be possible using PEP 574 (if/when accepted).  The PyPI backport "pickle5" helps us test that possibility.
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #2161 from pitrou/ARROW-2660-zero-copy-pickling and squashes the following commits:
    
    50f0491 <Antoine Pitrou> Fix test on Python 2.7 (hopefully)
    132939c <Antoine Pitrou> Add pickle5 to CI environments
    892302a <Antoine Pitrou> ARROW-2660:  Zero-copy pickling
---
 ci/cpp-python-msvc-build.bat       |  4 +++-
 ci/travis_script_python.sh         |  3 +++
 python/pyarrow/compat.py           | 10 +++++-----
 python/pyarrow/io.pxi              |  9 ++++++---
 python/pyarrow/tests/test_array.py | 38 ++++++++++++++++++++++++++++++++++----
 5 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/ci/cpp-python-msvc-build.bat b/ci/cpp-python-msvc-build.bat
index d3f540b..ecc68e0 100644
--- a/ci/cpp-python-msvc-build.bat
+++ b/ci/cpp-python-msvc-build.bat
@@ -133,6 +133,8 @@ popd
 
 pushd python
 
+pip install pickle5
+
 set PYARROW_CXXFLAGS=/WX
 set PYARROW_CMAKE_GENERATOR=%GENERATOR%
 set PYARROW_BUNDLE_ARROW_CPP=ON
@@ -167,6 +169,6 @@ pip install %WHEEL_PATH% || exit /B
 python -c "import pyarrow" || exit /B
 python -c "import pyarrow.parquet" || exit /B
 
-pip install pandas pytest pytest-faulthandler
+pip install pandas pickle5 pytest pytest-faulthandler
 
 py.test -r sxX --durations=15 --pyargs pyarrow.tests || exit /B
diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index 4eeb103..0743f86 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -102,6 +102,9 @@ pushd $ARROW_PYTHON_DIR
 
 # Other stuff pip install
 pip install -q -r requirements.txt
+if [ "$PYTHON_VERSION" == "3.6" ]; then
+    pip install -q pickle5
+fi
 if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then
     export PYARROW_GENERATE_COVERAGE=1
     pip install -q coverage
diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py
index 44e156e..bbb1bd8 100644
--- a/python/pyarrow/compat.py
+++ b/python/pyarrow/compat.py
@@ -107,7 +107,10 @@ if PY2:
     def unichar(s):
         return unichr(s)
 else:
-    import pickle as builtin_pickle
+    try:
+        import pickle5 as builtin_pickle
+    except ImportError:
+        import pickle as builtin_pickle
 
     unicode_type = str
     def lzip(*x):
@@ -142,10 +145,7 @@ else:
 try:
     import cloudpickle as pickle
 except ImportError:
-    try:
-        import cPickle as pickle
-    except ImportError:
-        import pickle
+    pickle = builtin_pickle
 
 def encode_file_path(path):
     import os
diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi
index 4566477..f0e98ab 100644
--- a/python/pyarrow/io.pxi
+++ b/python/pyarrow/io.pxi
@@ -19,7 +19,7 @@
 # arrow::ipc
 
 from libc.stdlib cimport malloc, free
-from pyarrow.compat import frombytes, tobytes, encode_file_path
+from pyarrow.compat import builtin_pickle, frombytes, tobytes, encode_file_path
 from io import BufferedIOBase, UnsupportedOperation
 
 import re
@@ -823,8 +823,11 @@ cdef class Buffer:
         else:
             return NotImplemented
 
-    def __reduce__(self):
-        return py_buffer, (self.to_pybytes(),)
+    def __reduce_ex__(self, protocol):
+        if protocol >= 5:  # PEP 574: pass the buffer itself as a PickleBuffer
+            return py_buffer, (builtin_pickle.PickleBuffer(self),)
+        else:  # older protocols: fall back to a bytes copy of the contents
+            return py_buffer, (self.to_pybytes(),)
 
     def to_pybytes(self):
         return cp.PyBytes_FromStringAndSize(
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 7ab54dd..af2708f 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -17,6 +17,7 @@
 
 import collections
 import datetime
+import pickle
 import pytest
 import struct
 import sys
@@ -24,7 +25,10 @@ import sys
 import numpy as np
 import pandas as pd
 import pandas.util.testing as tm
-import pickle
+try:
+    import pickle5
+except ImportError:
+    pickle5 = None
 
 import pyarrow as pa
 from pyarrow.pandas_compat import get_logical_type
@@ -633,7 +637,7 @@ def test_cast_date64_to_int():
     assert result.equals(expected)
 
 
-@pytest.mark.parametrize(
+pickle_test_parametrize = pytest.mark.parametrize(
     ('data', 'typ'),
     [
         ([True, False, True, True], pa.bool_()),
@@ -647,12 +651,38 @@ def test_cast_date64_to_int():
             pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())]))
     ]
 )
+
+
+@pickle_test_parametrize
 def test_array_pickle(data, typ):
     # Allocate here so that we don't have any Arrow data allocated.
     # This is needed to ensure that allocator tests can be reliable.
     array = pa.array(data, type=typ)
-    result = pickle.loads(pickle.dumps(array))
-    assert array.equals(result)
+    for proto in range(0, pickle.HIGHEST_PROTOCOL + 1):  # each stdlib protocol
+        result = pickle.loads(pickle.dumps(array, proto))
+        assert array.equals(result)
+
+
+@pickle_test_parametrize
+def test_array_pickle5(data, typ):
+    # Test zero-copy pickling with protocol 5 (PEP 574)
+    picklemod = pickle5 or pickle
+    if pickle5 is None and picklemod.HIGHEST_PROTOCOL < 5:
+        pytest.skip("need pickle5 package or Python 3.8+")
+    # Remember each buffer's address so that a copy is detectable below.
+    array = pa.array(data, type=typ)
+    addresses = [buf.address if buf is not None else 0
+                 for buf in array.buffers()]
+    # Use picklemod's limit: stdlib pickle stops at 4 when pickle5 is used.
+    for proto in range(5, picklemod.HIGHEST_PROTOCOL + 1):
+        buffers = []
+        pickled = picklemod.dumps(array, proto, buffer_callback=buffers.append)
+        result = picklemod.loads(pickled, buffers=buffers)
+        assert array.equals(result)
+        # Zero-copy: the unpickled buffers must sit at the same addresses.
+        result_addresses = [buf.address if buf is not None else 0
+                            for buf in result.buffers()]
+        assert result_addresses == addresses
 
 
 @pytest.mark.parametrize(