You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/05/20 16:58:43 UTC
[arrow] branch master updated: ARROW-5260: [Python] Fix crash when
deserializating from components in another process
This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e643845 ARROW-5260: [Python] Fix crash when deserializating from components in another process
e643845 is described below
commit e643845dc45d8d6c03102eb05bcfc67d520a39b1
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon May 20 18:58:33 2019 +0200
ARROW-5260: [Python] Fix crash when deserializating from components in another process
Author: Antoine Pitrou <an...@python.org>
Closes #4267 from pitrou/ARROW-5260-from-components-other-process and squashes the following commits:
bc56aa668 <Antoine Pitrou> ARROW-5260: Fix crash when deserializating from components in another process
---
ci/cpp-msvc-build-main.bat | 16 ++++++++--------
cpp/src/arrow/python/deserialize.cc | 1 -
cpp/src/arrow/python/serialize.cc | 1 -
python/pyarrow/includes/libarrow.pxd | 4 ++++
python/pyarrow/lib.pyx | 4 ++++
python/pyarrow/tests/test_serialization.py | 26 +++++++++++++++++++++++++-
python/pyarrow/tests/util.py | 6 +++++-
7 files changed, 46 insertions(+), 12 deletions(-)
diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat
index c9f8b4d..74eac72 100644
--- a/ci/cpp-msvc-build-main.bat
+++ b/ci/cpp-msvc-build-main.bat
@@ -99,17 +99,17 @@ set PYARROW_PARALLEL=2
@rem ARROW-3075; pkgconfig is broken for Parquet for now
set PARQUET_HOME=%CONDA_PREFIX%\Library
-python setup.py build_ext ^
- install -q --single-version-externally-managed --record=record.text ^
- bdist_wheel -q || exit /B
+python setup.py develop -q || exit /B
-for /F %%i in ('dir /B /S dist\*.whl') do set WHEEL_PATH=%%i
+py.test -r sxX --durations=15 --pyargs pyarrow.tests || exit /B
+
+@rem
+@rem Build wheel
+@rem
-@rem Test directly from installed location
-@rem (needed for test_cython)
+python setup.py bdist_wheel -q || exit /B
-set PYARROW_PATH=%CONDA_PREFIX%\Lib\site-packages\pyarrow
-py.test -r sxX --durations=15 %PYARROW_PATH% || exit /B
+for /F %%i in ('dir /B /S dist\*.whl') do set WHEEL_PATH=%%i
popd
diff --git a/cpp/src/arrow/python/deserialize.cc b/cpp/src/arrow/python/deserialize.cc
index f1690a8..5e6e135 100644
--- a/cpp/src/arrow/python/deserialize.cc
+++ b/cpp/src/arrow/python/deserialize.cc
@@ -344,7 +344,6 @@ Status DeserializeObject(PyObject* context, const SerializedPyObject& obj, PyObj
PyObject** out) {
PyAcquireGIL lock;
PyDateTime_IMPORT;
- import_pyarrow();
return DeserializeList(context, *obj.batch->column(0), 0, obj.batch->num_rows(), base,
obj, out);
}
diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc
index 09a092d..669fd0a 100644
--- a/cpp/src/arrow/python/serialize.cc
+++ b/cpp/src/arrow/python/serialize.cc
@@ -540,7 +540,6 @@ std::shared_ptr<RecordBatch> MakeBatch(std::shared_ptr<Array> data) {
Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out) {
PyAcquireGIL lock;
PyDateTime_IMPORT;
- import_pyarrow();
SequenceBuilder builder;
RETURN_NOT_OK(internal::VisitIterable(
sequence, [&](PyObject* obj, bool* keep_going /* unused */) {
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 11141a1..57dd965 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1220,6 +1220,10 @@ cdef extern from 'arrow/python/init.h':
int arrow_init_numpy() except -1
+cdef extern from 'arrow/python/pyarrow.h' namespace 'arrow::py':
+ int import_pyarrow() except -1
+
+
cdef extern from 'arrow/python/config.h' namespace 'arrow::py':
void set_numpy_nan(object o)
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index da7815a..894ced5 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -34,7 +34,11 @@ from pyarrow.includes.common cimport PyObject_to_object
cimport pyarrow.includes.libarrow as libarrow
cimport cpython as cp
+# Initialize NumPy C API
arrow_init_numpy()
+# Initialize PyArrow C++ API
+# (used from some of our C++ code, see e.g. ARROW-5260)
+import_pyarrow()
set_numpy_nan(np.nan)
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 0a7e443..cbff0ff 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -22,6 +22,8 @@ import pytest
import collections
import datetime
import os
+import pickle
+import subprocess
import string
import sys
@@ -706,6 +708,29 @@ def test_serialize_to_components_invalid_cases():
pa.deserialize_components(components)
+def test_deserialize_components_in_different_process():
+ arr = pa.array([1, 2, 5, 6], type=pa.int8())
+ ser = pa.serialize(arr)
+ data = pickle.dumps(ser.to_components(), protocol=-1)
+
+ code = """if 1:
+ import pickle
+
+ import pyarrow as pa
+
+ data = {0!r}
+ components = pickle.loads(data)
+ arr = pa.deserialize_components(components)
+
+ assert arr.to_pylist() == [1, 2, 5, 6], arr
+ """.format(data)
+
+ subprocess_env = test_util.get_modified_env_with_pythonpath()
+ print("** sys.path =", sys.path)
+ print("** setting PYTHONPATH to:", subprocess_env['PYTHONPATH'])
+ subprocess.check_call(["python", "-c", code], env=subprocess_env)
+
+
def test_serialize_read_concatenated_records():
# ARROW-1996 -- see stream alignment work in ARROW-2840, ARROW-3212
f = pa.BufferOutputStream()
@@ -744,7 +769,6 @@ def test_deserialize_in_different_process():
def test_deserialize_buffer_in_different_process():
import tempfile
- import subprocess
f = tempfile.NamedTemporaryFile(delete=False)
b = pa.serialize(pa.py_buffer(b'hello')).to_buffer()
diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py
index 8c8d23b..cf638b3 100644
--- a/python/pyarrow/tests/util.py
+++ b/python/pyarrow/tests/util.py
@@ -104,5 +104,9 @@ def get_modified_env_with_pythonpath():
module_path = os.path.abspath(
os.path.dirname(os.path.dirname(pa.__file__)))
- env['PYTHONPATH'] = os.pathsep.join((module_path, existing_pythonpath))
+ if existing_pythonpath:
+ new_pythonpath = os.pathsep.join((module_path, existing_pythonpath))
+ else:
+ new_pythonpath = module_path
+ env['PYTHONPATH'] = new_pythonpath
return env