You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/04 13:54:00 UTC
arrow git commit: ARROW-1390: [Python] Add more serialization tests
Repository: arrow
Updated Branches:
refs/heads/master e5aeb9001 -> cc3051262
ARROW-1390: [Python] Add more serialization tests
Author: Philipp Moritz <pc...@gmail.com>
Closes #1035 from pcmoritz/serialization-tests and squashes the following commits:
636a3557 [Philipp Moritz] add more serialization tests
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/cc305126
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/cc305126
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/cc305126
Branch: refs/heads/master
Commit: cc3051262271ac4a25e54af3e81765ccbbd1fbd0
Parents: e5aeb90
Author: Philipp Moritz <pc...@gmail.com>
Authored: Mon Sep 4 09:53:58 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Sep 4 09:53:58 2017 -0400
----------------------------------------------------------------------
python/pyarrow/tests/test_serialization.py | 78 ++++++++++++++++++++-----
1 file changed, 63 insertions(+), 15 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/cc305126/python/pyarrow/tests/test_serialization.py
----------------------------------------------------------------------
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 5526ac6..aa6301d 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -80,23 +80,27 @@ def assert_equal(obj1, obj2):
assert obj1 == obj2, "Objects {} and {} are different.".format(obj1,
obj2)
-
-if sys.version_info >= (3, 0):
- long_extras = [0, np.array([["hi", u"hi"], [1.3, 1]])]
-else:
- _LONG_ZERO, _LONG_ONE = long(0), long(1) # noqa: E501,F821
- long_extras = [_LONG_ZERO, np.array([["hi", u"hi"],
- [1.3, _LONG_ONE]])]
-
PRIMITIVE_OBJECTS = [
0, 0.0, 0.9, 1 << 62, 1 << 100, 1 << 999,
[1 << 100, [1 << 100]], "a", string.printable, "\u262F",
- u"hello world", u"\xff\xfe\x9c\x001\x000\x00", None, True,
- False, [], (), {}, np.int8(3), np.int32(4), np.int64(5),
+ "hello world", u"hello world", u"\xff\xfe\x9c\x001\x000\x00",
+ None, True, False, [], (), {}, {(1, 2): 1}, {(): 2},
+ [1, "hello", 3.0], u"\u262F", 42.0, (1.0, "hi"),
+ [1, 2, 3, None], [(None,), 3, 1.0], ["h", "e", "l", "l", "o", None],
+ (None, None), ("hello", None), (True, False),
+ {True: "hello", False: "world"}, {"hello": "world", 1: 42, 2.5: 45},
+ np.int8(3), np.int32(4), np.int64(5),
np.uint8(3), np.uint32(4), np.uint64(5), np.float32(1.9),
np.float64(1.9), np.zeros([100, 100]),
np.random.normal(size=[100, 100]), np.array(["hi", 3]),
- np.array(["hi", 3], dtype=object)] + long_extras
+ np.array(["hi", 3], dtype=object)]
+
+if sys.version_info >= (3, 0):
+ PRIMITIVE_OBJECTS += [0, np.array([["hi", u"hi"], [1.3, 1]])]
+else:
+ PRIMITIVE_OBJECTS += [long(42), long(1 << 62), long(0), \
+ np.array([["hi", u"hi"], \
+ [1.3, long(1)]])] # noqa: E501,F821
COMPLEX_OBJECTS = [
[[[[[[[[[[[[]]]]]]]]]]]],
@@ -172,6 +176,8 @@ def make_serialization_context():
context = pa.SerializationContext()
+ # This is for numpy arrays of "object" only; primitive types are handled
+ # efficiently with Arrow's Tensor facilities (see python_to_arrow.cc)
context.register_type(np.ndarray, 20 * b"\x00",
custom_serializer=array_custom_serializer,
custom_deserializer=array_custom_deserializer)
@@ -217,17 +223,16 @@ def serialization_roundtrip(value, f):
@pytest.yield_fixture(scope='session')
-def large_memory_map(tmpdir_factory):
+def large_memory_map(tmpdir_factory, size=100*1024*1024):
path = (tmpdir_factory.mktemp('data')
.join('pyarrow-serialization-tmp-file').strpath)
# Create a large memory mapped file
- SIZE = 100 * 1024 * 1024 # 100 MB
with open(path, 'wb') as f:
- f.write(np.random.randint(0, 256, size=SIZE)
+ f.write(np.random.randint(0, 256, size=size)
.astype('u1')
.tobytes()
- [:SIZE])
+ [:size])
return path
@@ -256,6 +261,49 @@ def test_custom_serialization(large_memory_map):
for obj in CUSTOM_OBJECTS:
serialization_roundtrip(obj, mmap)
+def test_numpy_serialization(large_memory_map):
+ with pa.memory_map(large_memory_map, mode="r+") as mmap:
+ for t in ["int8", "uint8", "int16", "uint16",
+ "int32", "uint32", "float32", "float64"]:
+ obj = np.random.randint(0, 10, size=(100, 100)).astype(t)
+ serialization_roundtrip(obj, mmap)
+
+def test_numpy_immutable(large_memory_map):
+ with pa.memory_map(large_memory_map, mode="r+") as mmap:
+ obj = np.zeros([10])
+ mmap.seek(0)
+ pa.serialize_to(obj, mmap, serialization_context)
+ mmap.seek(0)
+ result = pa.deserialize_from(mmap, None, serialization_context)
+ with pytest.raises(ValueError):
+ result[0] = 1.0
+
+@pytest.mark.skip(reason="extensive memory requirements")
+def test_arrow_limits(self):
+ huge_memory_map = lambda temp_dir: large_memory_map(temp_dir, 100 * 1024 * 1024 * 1024)
+ with pa.memory_map(huge_memory_map, mode="r+") as mmap:
+ # Test that objects that are too large for Arrow throw a Python
+ # exception. These tests give out of memory errors on Travis and need
+ # to be run on a machine with lots of RAM.
+ l = 2 ** 29 * [1.0]
+ serialization_roundtrip(l, mmap)
+ del l
+ l = 2 ** 29 * ["s"]
+ serialization_roundtrip(l, mmap)
+ del l
+ l = 2 ** 29 * [["1"], 2, 3, [{"s": 4}]]
+ serialization_roundtrip(l, mmap)
+ del l
+ l = 2 ** 29 * [{"s": 1}] + 2 ** 29 * [1.0]
+ serialization_roundtrip(l, mmap)
+ del l
+ l = np.zeros(2 ** 25)
+ serialization_roundtrip(l, mmap)
+ del l
+ l = [np.zeros(2 ** 18) for _ in range(2 ** 7)]
+ serialization_roundtrip(l, mmap)
+ del l
+
def test_serialization_callback_error():
class TempClass(object):