You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/04 13:54:00 UTC

arrow git commit: ARROW-1390: [Python] Add more serialization tests

Repository: arrow
Updated Branches:
  refs/heads/master e5aeb9001 -> cc3051262


ARROW-1390: [Python] Add more serialization tests

Author: Philipp Moritz <pc...@gmail.com>

Closes #1035 from pcmoritz/serialization-tests and squashes the following commits:

636a3557 [Philipp Moritz] add more serialization tests


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/cc305126
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/cc305126
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/cc305126

Branch: refs/heads/master
Commit: cc3051262271ac4a25e54af3e81765ccbbd1fbd0
Parents: e5aeb90
Author: Philipp Moritz <pc...@gmail.com>
Authored: Mon Sep 4 09:53:58 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Sep 4 09:53:58 2017 -0400

----------------------------------------------------------------------
 python/pyarrow/tests/test_serialization.py | 78 ++++++++++++++++++++-----
 1 file changed, 63 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/cc305126/python/pyarrow/tests/test_serialization.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 5526ac6..aa6301d 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -80,23 +80,27 @@ def assert_equal(obj1, obj2):
         assert obj1 == obj2, "Objects {} and {} are different.".format(obj1,
                                                                        obj2)
 
-
-if sys.version_info >= (3, 0):
-    long_extras = [0, np.array([["hi", u"hi"], [1.3, 1]])]
-else:
-    _LONG_ZERO, _LONG_ONE = long(0), long(1)  # noqa: E501,F821
-    long_extras = [_LONG_ZERO, np.array([["hi", u"hi"],
-                                         [1.3, _LONG_ONE]])]
-
 PRIMITIVE_OBJECTS = [
     0, 0.0, 0.9, 1 << 62, 1 << 100, 1 << 999,
     [1 << 100, [1 << 100]], "a", string.printable, "\u262F",
-    u"hello world", u"\xff\xfe\x9c\x001\x000\x00", None, True,
-    False, [], (), {}, np.int8(3), np.int32(4), np.int64(5),
+    "hello world", u"hello world", u"\xff\xfe\x9c\x001\x000\x00",
+    None, True, False, [], (), {}, {(1, 2): 1}, {(): 2},
+    [1, "hello", 3.0], u"\u262F", 42.0, (1.0, "hi"),
+    [1, 2, 3, None], [(None,), 3, 1.0], ["h", "e", "l", "l", "o", None],
+    (None, None), ("hello", None), (True, False),
+    {True: "hello", False: "world"}, {"hello": "world", 1: 42, 2.5: 45},
+    np.int8(3), np.int32(4), np.int64(5),
     np.uint8(3), np.uint32(4), np.uint64(5), np.float32(1.9),
     np.float64(1.9), np.zeros([100, 100]),
     np.random.normal(size=[100, 100]), np.array(["hi", 3]),
-    np.array(["hi", 3], dtype=object)] + long_extras
+    np.array(["hi", 3], dtype=object)]
+
+if sys.version_info >= (3, 0):
+    PRIMITIVE_OBJECTS += [0, np.array([["hi", u"hi"], [1.3, 1]])]
+else:  # Python 2 only: additionally exercise values built with ``long``
+    PRIMITIVE_OBJECTS += [long(42), long(1 << 62), long(0), \
+                          np.array([["hi", u"hi"], \
+                          [1.3, long(1)]])] # noqa: E501,F821
 
 COMPLEX_OBJECTS = [
     [[[[[[[[[[[[]]]]]]]]]]]],
@@ -172,6 +176,8 @@ def make_serialization_context():
 
     context = pa.SerializationContext()
 
+    # This is for numpy arrays of "object" only; primitive types are handled
+    # efficiently with Arrow's Tensor facilities (see python_to_arrow.cc)
     context.register_type(np.ndarray, 20 * b"\x00",
                           custom_serializer=array_custom_serializer,
                           custom_deserializer=array_custom_deserializer)
@@ -217,17 +223,16 @@ def serialization_roundtrip(value, f):
 
 
 @pytest.yield_fixture(scope='session')
-def large_memory_map(tmpdir_factory):
+def large_memory_map(tmpdir_factory, size=100*1024*1024):  # size in bytes; default is 100 MB
     path = (tmpdir_factory.mktemp('data')
             .join('pyarrow-serialization-tmp-file').strpath)
 
     # Create a large memory mapped file
-    SIZE = 100 * 1024 * 1024  # 100 MB
     with open(path, 'wb') as f:
-        f.write(np.random.randint(0, 256, size=SIZE)
+        f.write(np.random.randint(0, 256, size=size)
                 .astype('u1')
                 .tobytes()
-                [:SIZE])
+                [:size])
     return path
 
 
@@ -256,6 +261,49 @@ def test_custom_serialization(large_memory_map):
         for obj in CUSTOM_OBJECTS:
             serialization_roundtrip(obj, mmap)
 
+def test_numpy_serialization(large_memory_map):
+    with pa.memory_map(large_memory_map, mode="r+") as mmap:
+        for t in ["int8", "uint8", "int16", "uint16",  # dtypes to round-trip
+                  "int32", "uint32", "float32", "float64"]:
+            obj = np.random.randint(0, 10, size=(100, 100)).astype(t)  # 100x100, values in [0, 10)
+            serialization_roundtrip(obj, mmap)
+
+def test_numpy_immutable(large_memory_map):
+    with pa.memory_map(large_memory_map, mode="r+") as mmap:
+        obj = np.zeros([10])
+        mmap.seek(0)  # write from the start of the mapped file
+        pa.serialize_to(obj, mmap, serialization_context)
+        mmap.seek(0)  # rewind so the read starts where the write began
+        result = pa.deserialize_from(mmap, None, serialization_context)
+        with pytest.raises(ValueError):  # the test expects writes to the result to fail
+            result[0] = 1.0
+
+@pytest.mark.skip(reason="extensive memory requirements")
+def test_arrow_limits(self):  # NOTE(review): module-level test takes 'self' - confirm intent
+    huge_memory_map = lambda temp_dir: large_memory_map(temp_dir, 100 * 1024 * 1024 * 1024)
+    with pa.memory_map(huge_memory_map, mode="r+") as mmap:  # NOTE(review): a lambda, not a path, is passed - confirm
+        # Test that objects that are too large for Arrow throw a Python
+        # exception. These tests give out of memory errors on Travis and need
+        # to be run on a machine with lots of RAM.
+        l = 2 ** 29 * [1.0]
+        serialization_roundtrip(l, mmap)
+        del l  # drop each large object before building the next one
+        l = 2 ** 29 * ["s"]
+        serialization_roundtrip(l, mmap)
+        del l
+        l = 2 ** 29 * [["1"], 2, 3, [{"s": 4}]]
+        serialization_roundtrip(l, mmap)
+        del l
+        l = 2 ** 29 * [{"s": 1}] + 2 ** 29 * [1.0]
+        serialization_roundtrip(l, mmap)
+        del l
+        l = np.zeros(2 ** 25)
+        serialization_roundtrip(l, mmap)
+        del l
+        l = [np.zeros(2 ** 18) for _ in range(2 ** 7)]
+        serialization_roundtrip(l, mmap)
+        del l
+
 def test_serialization_callback_error():
 
     class TempClass(object):