You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/11/29 01:06:22 UTC

[arrow] branch master updated: ARROW-1854: [Python] Use pickle to serialize numpy arrays of objects.

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0d6f5bf  ARROW-1854: [Python] Use pickle to serialize numpy arrays of objects.
0d6f5bf is described below

commit 0d6f5bf5fab44a83669cf9d9af5c023f15793dc0
Author: Wes McKinney <we...@twosigma.com>
AuthorDate: Tue Nov 28 20:06:18 2017 -0500

    ARROW-1854: [Python] Use pickle to serialize numpy arrays of objects.
    
    **Just posting this for discussion.** See the preceding discussion on https://issues.apache.org/jira/browse/ARROW-1854.
    
    I think the ideal way to solve this would actually be to improve our handling of lists, which should be possible given that pickle seems to outperform us by 6x according to the benchmarks in https://issues.apache.org/jira/browse/ARROW-1854.
    
    Note that the implementation in this PR will not handle numpy arrays of user-defined classes because it will not fall back to cloudpickle when needed.
    
    cc @pcmoritz @wesm
    
    Author: Wes McKinney <we...@twosigma.com>
    Author: Robert Nishihara <ro...@gmail.com>
    
    Closes #1360 from robertnishihara/numpyobject and squashes the following commits:
    
    c37a0a08 [Wes McKinney] Fix flake
    51915032 [Wes McKinney] Fix post rebase
    43f2c805 [Wes McKinney] Add SerializationContext.clone method. Add pandas_serialization_context member that uses pickle for NumPy arrays with unsupported tensor types
    c9440231 [Wes McKinney] Use pickle.HIGHEST_PROTOCOL, convert to Buffer then memoryview for more memory-efficient transport
    cf719c3f [Robert Nishihara] Use pickle to serialize numpy arrays of objects.
---
 python/pyarrow/__init__.py                 |  1 +
 python/pyarrow/serialization.pxi           | 16 +++++++++++
 python/pyarrow/serialization.py            | 46 +++++++++++++++++++++---------
 python/pyarrow/tests/test_serialization.py |  7 +++--
 4 files changed, 53 insertions(+), 17 deletions(-)

diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index bd31b21..a245fe6 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -125,6 +125,7 @@ from pyarrow.ipc import (Message, MessageReader,
 localfs = LocalFileSystem.get_instance()
 
 from pyarrow.serialization import (_default_serialization_context,
+                                   pandas_serialization_context,
                                    register_default_serialization_handlers)
 
 import pyarrow.types as types
diff --git a/python/pyarrow/serialization.pxi b/python/pyarrow/serialization.pxi
index bb266b2..faf164b 100644
--- a/python/pyarrow/serialization.pxi
+++ b/python/pyarrow/serialization.pxi
@@ -57,6 +57,22 @@ cdef class SerializationContext:
         self.custom_serializers = dict()
         self.custom_deserializers = dict()
 
+    def clone(self):
+        """
+        Return copy of this SerializationContext
+
+        Returns
+        -------
+        clone : SerializationContext
+        """
+        result = SerializationContext()
+        result.type_to_type_id = self.type_to_type_id.copy()
+        result.whitelisted_types = self.whitelisted_types.copy()
+        result.custom_serializers = self.custom_serializers.copy()
+        result.custom_deserializers = self.custom_deserializers.copy()
+
+        return result
+
     def register_type(self, type_, type_id,
                       custom_serializer=None, custom_deserializer=None):
         """EXPERIMENTAL: Add type to the list of types we can serialize.
diff --git a/python/pyarrow/serialization.py b/python/pyarrow/serialization.py
index ab25b63..08e6cce 100644
--- a/python/pyarrow/serialization.py
+++ b/python/pyarrow/serialization.py
@@ -22,7 +22,7 @@ import pickle
 import numpy as np
 
 from pyarrow import serialize_pandas, deserialize_pandas
-from pyarrow.lib import _default_serialization_context
+from pyarrow.lib import _default_serialization_context, frombuffer
 
 try:
     import cloudpickle
@@ -30,6 +30,28 @@ except ImportError:
     cloudpickle = pickle
 
 
+# ----------------------------------------------------------------------
+# Set up serialization for numpy with dtype object (primitive types are
+# handled efficiently with Arrow's Tensor facilities, see
+# python_to_arrow.cc)
+
+def _serialize_numpy_array_list(obj):
+    return obj.tolist(), obj.dtype.str
+
+
+def _deserialize_numpy_array_list(data):
+    return np.array(data[0], dtype=np.dtype(data[1]))
+
+
+def _serialize_numpy_array_pickle(obj):
+    pickled = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
+    return frombuffer(pickled)
+
+
+def _deserialize_numpy_array_pickle(data):
+    return pickle.loads(memoryview(data))
+
+
 def register_default_serialization_handlers(serialization_context):
 
     # ----------------------------------------------------------------------
@@ -80,21 +102,10 @@ def register_default_serialization_handlers(serialization_context):
                                         custom_serializer=cloudpickle.dumps,
                                         custom_deserializer=cloudpickle.loads)
 
-    # ----------------------------------------------------------------------
-    # Set up serialization for numpy with dtype object (primitive types are
-    # handled efficiently with Arrow's Tensor facilities, see
-    # python_to_arrow.cc)
-
-    def _serialize_numpy_array(obj):
-        return obj.tolist(), obj.dtype.str
-
-    def _deserialize_numpy_array(data):
-        return np.array(data[0], dtype=np.dtype(data[1]))
-
     serialization_context.register_type(
         np.ndarray, 'np.array',
-        custom_serializer=_serialize_numpy_array,
-        custom_deserializer=_deserialize_numpy_array)
+        custom_serializer=_serialize_numpy_array_list,
+        custom_deserializer=_deserialize_numpy_array_list)
 
     # ----------------------------------------------------------------------
     # Set up serialization for pandas Series and DataFrame
@@ -153,3 +164,10 @@ def register_default_serialization_handlers(serialization_context):
 
 
 register_default_serialization_handlers(_default_serialization_context)
+
+pandas_serialization_context = _default_serialization_context.clone()
+
+pandas_serialization_context.register_type(
+    np.ndarray, 'np.array',
+    custom_serializer=_serialize_numpy_array_pickle,
+    custom_deserializer=_deserialize_numpy_array_pickle)
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index d06beea..6d85621 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -212,11 +212,11 @@ def make_serialization_context():
 serialization_context = make_serialization_context()
 
 
-def serialization_roundtrip(value, f):
+def serialization_roundtrip(value, f, ctx=serialization_context):
     f.seek(0)
-    pa.serialize_to(value, f, serialization_context)
+    pa.serialize_to(value, f, ctx)
     f.seek(0)
-    result = pa.deserialize_from(f, None, serialization_context)
+    result = pa.deserialize_from(f, None, ctx)
     assert_equal(value, result)
 
     _check_component_roundtrip(value)
@@ -249,6 +249,7 @@ def test_primitive_serialization(large_memory_map):
     with pa.memory_map(large_memory_map, mode="r+") as mmap:
         for obj in PRIMITIVE_OBJECTS:
             serialization_roundtrip(obj, mmap)
+            serialization_roundtrip(obj, mmap, pa.pandas_serialization_context)
 
 
 def test_serialize_to_buffer():

-- 
To stop receiving notification emails like this one, please contact
['"commits@arrow.apache.org" <co...@arrow.apache.org>'].