You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ap...@apache.org on 2019/05/20 16:58:43 UTC

[arrow] branch master updated: ARROW-5260: [Python] Fix crash when deserializating from components in another process

This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new e643845  ARROW-5260: [Python] Fix crash when deserializating from components in another process
e643845 is described below

commit e643845dc45d8d6c03102eb05bcfc67d520a39b1
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Mon May 20 18:58:33 2019 +0200

    ARROW-5260: [Python] Fix crash when deserializating from components in another process
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #4267 from pitrou/ARROW-5260-from-components-other-process and squashes the following commits:
    
    bc56aa668 <Antoine Pitrou> ARROW-5260:  Fix crash when deserializating from components in another process
---
 ci/cpp-msvc-build-main.bat                 | 16 ++++++++--------
 cpp/src/arrow/python/deserialize.cc        |  1 -
 cpp/src/arrow/python/serialize.cc          |  1 -
 python/pyarrow/includes/libarrow.pxd       |  4 ++++
 python/pyarrow/lib.pyx                     |  4 ++++
 python/pyarrow/tests/test_serialization.py | 26 +++++++++++++++++++++++++-
 python/pyarrow/tests/util.py               |  6 +++++-
 7 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/ci/cpp-msvc-build-main.bat b/ci/cpp-msvc-build-main.bat
index c9f8b4d..74eac72 100644
--- a/ci/cpp-msvc-build-main.bat
+++ b/ci/cpp-msvc-build-main.bat
@@ -99,17 +99,17 @@ set PYARROW_PARALLEL=2
 @rem ARROW-3075; pkgconfig is broken for Parquet for now
 set PARQUET_HOME=%CONDA_PREFIX%\Library
 
-python setup.py build_ext ^
-    install -q --single-version-externally-managed --record=record.text ^
-    bdist_wheel -q || exit /B
+python setup.py develop -q || exit /B
 
-for /F %%i in ('dir /B /S dist\*.whl') do set WHEEL_PATH=%%i
+py.test -r sxX --durations=15 --pyargs pyarrow.tests || exit /B
+
+@rem
+@rem Build wheel
+@rem
 
-@rem Test directly from installed location
-@rem (needed for test_cython)
+python setup.py bdist_wheel -q || exit /B
 
-set PYARROW_PATH=%CONDA_PREFIX%\Lib\site-packages\pyarrow
-py.test -r sxX --durations=15 %PYARROW_PATH% || exit /B
+for /F %%i in ('dir /B /S dist\*.whl') do set WHEEL_PATH=%%i
 
 popd
 
diff --git a/cpp/src/arrow/python/deserialize.cc b/cpp/src/arrow/python/deserialize.cc
index f1690a8..5e6e135 100644
--- a/cpp/src/arrow/python/deserialize.cc
+++ b/cpp/src/arrow/python/deserialize.cc
@@ -344,7 +344,6 @@ Status DeserializeObject(PyObject* context, const SerializedPyObject& obj, PyObj
                          PyObject** out) {
   PyAcquireGIL lock;
   PyDateTime_IMPORT;
-  import_pyarrow();
   return DeserializeList(context, *obj.batch->column(0), 0, obj.batch->num_rows(), base,
                          obj, out);
 }
diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc
index 09a092d..669fd0a 100644
--- a/cpp/src/arrow/python/serialize.cc
+++ b/cpp/src/arrow/python/serialize.cc
@@ -540,7 +540,6 @@ std::shared_ptr<RecordBatch> MakeBatch(std::shared_ptr<Array> data) {
 Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out) {
   PyAcquireGIL lock;
   PyDateTime_IMPORT;
-  import_pyarrow();
   SequenceBuilder builder;
   RETURN_NOT_OK(internal::VisitIterable(
       sequence, [&](PyObject* obj, bool* keep_going /* unused */) {
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 11141a1..57dd965 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -1220,6 +1220,10 @@ cdef extern from 'arrow/python/init.h':
     int arrow_init_numpy() except -1
 
 
+cdef extern from 'arrow/python/pyarrow.h' namespace 'arrow::py':
+    int import_pyarrow() except -1
+
+
 cdef extern from 'arrow/python/config.h' namespace 'arrow::py':
     void set_numpy_nan(object o)
 
diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx
index da7815a..894ced5 100644
--- a/python/pyarrow/lib.pyx
+++ b/python/pyarrow/lib.pyx
@@ -34,7 +34,11 @@ from pyarrow.includes.common cimport PyObject_to_object
 cimport pyarrow.includes.libarrow as libarrow
 cimport cpython as cp
 
+# Initialize NumPy C API
 arrow_init_numpy()
+# Initialize PyArrow C++ API
+# (used from some of our C++ code, see e.g. ARROW-5260)
+import_pyarrow()
 set_numpy_nan(np.nan)
 
 
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 0a7e443..cbff0ff 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -22,6 +22,8 @@ import pytest
 import collections
 import datetime
 import os
+import pickle
+import subprocess
 import string
 import sys
 
@@ -706,6 +708,29 @@ def test_serialize_to_components_invalid_cases():
         pa.deserialize_components(components)
 
 
+def test_deserialize_components_in_different_process():
+    arr = pa.array([1, 2, 5, 6], type=pa.int8())
+    ser = pa.serialize(arr)
+    data = pickle.dumps(ser.to_components(), protocol=-1)
+
+    code = """if 1:
+        import pickle
+
+        import pyarrow as pa
+
+        data = {0!r}
+        components = pickle.loads(data)
+        arr = pa.deserialize_components(components)
+
+        assert arr.to_pylist() == [1, 2, 5, 6], arr
+        """.format(data)
+
+    subprocess_env = test_util.get_modified_env_with_pythonpath()
+    print("** sys.path =", sys.path)
+    print("** setting PYTHONPATH to:", subprocess_env['PYTHONPATH'])
+    subprocess.check_call(["python", "-c", code], env=subprocess_env)
+
+
 def test_serialize_read_concatenated_records():
     # ARROW-1996 -- see stream alignment work in ARROW-2840, ARROW-3212
     f = pa.BufferOutputStream()
@@ -744,7 +769,6 @@ def test_deserialize_in_different_process():
 
 def test_deserialize_buffer_in_different_process():
     import tempfile
-    import subprocess
 
     f = tempfile.NamedTemporaryFile(delete=False)
     b = pa.serialize(pa.py_buffer(b'hello')).to_buffer()
diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py
index 8c8d23b..cf638b3 100644
--- a/python/pyarrow/tests/util.py
+++ b/python/pyarrow/tests/util.py
@@ -104,5 +104,9 @@ def get_modified_env_with_pythonpath():
     module_path = os.path.abspath(
         os.path.dirname(os.path.dirname(pa.__file__)))
 
-    env['PYTHONPATH'] = os.pathsep.join((module_path, existing_pythonpath))
+    if existing_pythonpath:
+        new_pythonpath = os.pathsep.join((module_path, existing_pythonpath))
+    else:
+        new_pythonpath = module_path
+    env['PYTHONPATH'] = new_pythonpath
     return env