Posted to commits@arrow.apache.org by ks...@apache.org on 2019/07/16 13:10:12 UTC

[arrow] 02/02: ARROW-5856: [Python] [Packaging] Fix use of C++ / Cython API from wheels

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.14.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 5bdf824e301b7b5aa8eb50e52b1fda8a55efd7f6
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Jul 16 14:22:38 2019 +0200

    ARROW-5856: [Python] [Packaging] Fix use of C++ / Cython API from wheels
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #4884 from pitrou/ARROW-5856-cython-so-version and squashes the following commits:
    
    a411d7ad7 <Antoine Pitrou> Avoid C++ ABI issues with DictionaryMemo
    eaede5be2 <Antoine Pitrou> Revert ARROW-5082 ("Stop exporting copies of shared libraries in wheel")
    4594f784e <Antoine Pitrou> ARROW-5856: Try to fix Cython API from wheels
---
 dev/tasks/python-wheels/manylinux-test.sh       |  2 +-
 dev/tasks/python-wheels/osx-build.sh            | 11 +++-
 dev/tasks/python-wheels/travis.osx.yml          |  2 +-
 dev/tasks/python-wheels/win-build.bat           |  4 +-
 dev/tasks/tasks.yml                             | 16 +++---
 python/pyarrow/__init__.py                      | 76 ++++++++++++++++++-------
 python/pyarrow/includes/libarrow.pxd            |  3 +
 python/pyarrow/ipc.pxi                          |  4 +-
 python/pyarrow/lib.pxd                          |  5 +-
 python/pyarrow/tests/conftest.py                | 22 +++++--
 python/pyarrow/tests/pyarrow_cython_example.pyx | 10 +++-
 python/pyarrow/tests/test_cython.py             | 40 ++++++++++---
 python/pyarrow/types.pxi                        |  6 +-
 python/requirements-test.txt                    |  5 +-
 python/setup.py                                 |  5 ++
 15 files changed, 155 insertions(+), 56 deletions(-)
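
Taken together, these changes make it possible to build third-party C++ / Cython extensions against a pip-installed pyarrow wheel: the wheel once again ships linkable copies of the Arrow shared libraries (reverting ARROW-5082), get_library_dirs() falls back to the pyarrow package directory, and a new get_so_version() helper exposes the shared library SO version. As a hedged sketch, a downstream setup.py would look roughly like the setup_template used in test_cython.py below (module and source names are illustrative):

    # setup.py for a hypothetical extension linking against pyarrow's C++ API
    from distutils.core import setup
    from Cython.Build import cythonize
    import pyarrow as pa

    ext_modules = cythonize("my_arrow_ext.pyx")  # illustrative source name
    for ext in ext_modules:
        # Headers and libraries shipped with (or located by) pyarrow itself;
        # extra compiler flags (e.g. -std=c++11) may be needed, as in the
        # test's setup_template.
        ext.include_dirs.append(pa.get_include())
        ext.libraries.extend(pa.get_libraries())
        ext.library_dirs.extend(pa.get_library_dirs())

    setup(ext_modules=ext_modules)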

diff --git a/dev/tasks/python-wheels/manylinux-test.sh b/dev/tasks/python-wheels/manylinux-test.sh
index 8ed2efe..a0735f3 100755
--- a/dev/tasks/python-wheels/manylinux-test.sh
+++ b/dev/tasks/python-wheels/manylinux-test.sh
@@ -27,7 +27,7 @@ pip install -q /arrow/python/$WHEEL_TAG/dist/*.whl
 # Install test dependencies (pip won't work after removing system zlib)
 pip install -q -r /arrow/python/requirements-test.txt
 # Run pyarrow tests
-pytest -v --pyargs pyarrow
+pytest -rs --pyargs pyarrow
 
 if [[ "$1" == "--remove-system-libs" ]]; then
   # Run import tests after removing the bundled dependencies from the system
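
The switch from -v to -rs is deliberate: -rs makes pytest print a short summary of skipped tests together with their skip reasons, so groups that skip silently (such as the new cython-marked tests when Cython or a C++ compiler is unavailable) remain visible in CI logs. A minimal illustration with a hypothetical test file:

    # test_skip_demo.py -- compare `pytest test_skip_demo.py` with `pytest -rs ...`
    import pytest

    @pytest.mark.skip(reason="this reason shows up only in the -rs summary")
    def test_demo():
        pass
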
diff --git a/dev/tasks/python-wheels/osx-build.sh b/dev/tasks/python-wheels/osx-build.sh
index 415f08d..c1d001d 100755
--- a/dev/tasks/python-wheels/osx-build.sh
+++ b/dev/tasks/python-wheels/osx-build.sh
@@ -183,14 +183,19 @@ function install_wheel {
     pip install $(pip_opts) \
         $(python $multibuild_dir/supported_wheels.py $wheelhouse/*.whl)
 
-    # Install test dependencies
-    pip install $(pip_opts) -r python/requirements-test.txt
     popd
 }
 
 function run_unit_tests {
+    pushd $1
+
+    # Install test dependencies
+    pip install $(pip_opts) -r python/requirements-test.txt
+
     # Run pyarrow tests
-    py.test --pyargs pyarrow
+    pytest -rs --pyargs pyarrow
+
+    popd
 }
 
 function run_import_tests {
diff --git a/dev/tasks/python-wheels/travis.osx.yml b/dev/tasks/python-wheels/travis.osx.yml
index 6c6b177..8502642 100644
--- a/dev/tasks/python-wheels/travis.osx.yml
+++ b/dev/tasks/python-wheels/travis.osx.yml
@@ -72,7 +72,7 @@ install:
   # install the built wheel and test dependencies
   - install_wheel arrow
   # run unit tests before removing the system libraries
-  - run_unit_tests
+  - run_unit_tests arrow
   # remove libz to ensure that it is properly bundled
   - sudo find /usr -name libz.* -delete
   # run the import tests
diff --git a/dev/tasks/python-wheels/win-build.bat b/dev/tasks/python-wheels/win-build.bat
index f5caac0..dbb4e47 100644
--- a/dev/tasks/python-wheels/win-build.bat
+++ b/dev/tasks/python-wheels/win-build.bat
@@ -80,7 +80,7 @@ set ARROW_TEST_DATA=%ARROW_SRC%\testing\data
 @rem test the wheel
 @rem TODO For maximum reliability, we should test in a plain virtualenv instead.
 call conda create -n wheel-test -q -y python=%PYTHON_VERSION% ^
-      numpy=%NUMPY_VERSION% pandas pytest hypothesis || exit /B
+      numpy=%NUMPY_VERSION% pandas cython pytest hypothesis || exit /B
 call activate wheel-test
 
 @rem install the built wheel
@@ -90,4 +90,4 @@ pip install -vv --no-index --find-links=%ARROW_SRC%\python\dist\ pyarrow || exit
 python -c "import pyarrow; import pyarrow.parquet; import pyarrow.flight; import pyarrow.gandiva;" || exit /B
 
 @rem run the python tests
-pytest --pyargs pyarrow || exit /B
+pytest -rs --pyargs pyarrow || exit /B
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index 031dc36..80f281d 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -180,7 +180,7 @@ tasks:
       unicode_width: 32
       wheel_tag: manylinux1
       test_docker_images:
-        - python:2.7-slim  # debian ucs4
+        - python:2.7  # debian ucs4
       test_remove_system_libs: false
     artifacts:
       - pyarrow-{no_rc_version}-cp27-cp27mu-manylinux1_x86_64.whl
@@ -194,7 +194,7 @@ tasks:
       unicode_width: 16
       wheel_tag: manylinux1
       test_docker_images:
-        - python:3.5-slim
+        - python:3.5
       test_remove_system_libs: true
     artifacts:
       - pyarrow-{no_rc_version}-cp35-cp35m-manylinux1_x86_64.whl
@@ -208,7 +208,7 @@ tasks:
       unicode_width: 16
       wheel_tag: manylinux1
       test_docker_images:
-        - python:3.6-slim
+        - python:3.6
       test_remove_system_libs: true
     artifacts:
       - pyarrow-{no_rc_version}-cp36-cp36m-manylinux1_x86_64.whl
@@ -222,7 +222,7 @@ tasks:
       unicode_width: 16
       wheel_tag: manylinux1
       test_docker_images:
-        - python:3.7-slim
+        - python:3.7
       test_remove_system_libs: true
     artifacts:
       - pyarrow-{no_rc_version}-cp37-cp37m-manylinux1_x86_64.whl
@@ -249,7 +249,7 @@ tasks:
       unicode_width: 32
       wheel_tag: manylinux2010
       test_docker_images:
-        - python:2.7-slim  # debian ucs4
+        - python:2.7  # debian ucs4
       test_remove_system_libs: false
     artifacts:
       - pyarrow-{no_rc_version}-cp27-cp27mu-manylinux2010_x86_64.whl
@@ -263,7 +263,7 @@ tasks:
       unicode_width: 16
       wheel_tag: manylinux2010
       test_docker_images:
-        - python:3.5-slim
+        - python:3.5
       test_remove_system_libs: true
     artifacts:
       - pyarrow-{no_rc_version}-cp35-cp35m-manylinux2010_x86_64.whl
@@ -277,7 +277,7 @@ tasks:
       unicode_width: 16
       wheel_tag: manylinux2010
       test_docker_images:
-        - python:3.6-slim
+        - python:3.6
       test_remove_system_libs: true
     artifacts:
       - pyarrow-{no_rc_version}-cp36-cp36m-manylinux2010_x86_64.whl
@@ -291,7 +291,7 @@ tasks:
       unicode_width: 16
       wheel_tag: manylinux2010
       test_docker_images:
-        - python:3.7-slim
+        - python:3.7
       test_remove_system_libs: true
     artifacts:
       - pyarrow-{no_rc_version}-cp37-cp37m-manylinux2010_x86_64.whl
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 487065c..f3fd738 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -209,6 +209,43 @@ def get_include():
     return _os.path.join(_os.path.dirname(__file__), 'include')
 
 
+def _get_pkg_config_executable():
+    return _os.environ.get('PKG_CONFIG', 'pkg-config')
+
+
+def _has_pkg_config(pkgname):
+    import subprocess
+    try:
+        return subprocess.call([_get_pkg_config_executable(),
+                                '--exists', pkgname]) == 0
+    except OSError:
+        # TODO: replace with FileNotFoundError once we ditch 2.7
+        return False
+
+
+def _read_pkg_config_variable(pkgname, cli_args):
+    import subprocess
+    cmd = [_get_pkg_config_executable(), pkgname] + cli_args
+    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE)
+    out, err = proc.communicate()
+    if proc.returncode != 0:
+        raise RuntimeError("pkg-config failed: " + err.decode('utf8'))
+    return out.rstrip().decode('utf8')
+
+
+def get_so_version():
+    """
+    Return the SO version for Arrow libraries.
+    """
+    if _sys.platform == 'win32':
+        raise NotImplementedError("Cannot get SO version on Windows")
+    if _has_pkg_config("arrow"):
+        return _read_pkg_config_variable("arrow", ["--variable=so_version"])
+    else:
+        return "100"  # XXX Find a way not to hardcode this?
+
+
 def get_libraries():
     """
     Return list of library names to include in the `libraries` argument for C
@@ -223,38 +260,37 @@ def get_library_dirs():
     linking C or Cython extensions using pyarrow
     """
     package_cwd = _os.path.dirname(__file__)
-
     library_dirs = [package_cwd]
 
+    def append_library_dir(library_dir):
+        if library_dir not in library_dirs:
+            library_dirs.append(library_dir)
+
     # Search library paths via pkg-config. This is necessary if the user
     # installed libarrow and the other shared libraries manually and they
     # are not shipped inside the pyarrow package (see also ARROW-2976).
-    from subprocess import call, PIPE, Popen
-    pkg_config_executable = _os.environ.get('PKG_CONFIG', None) or 'pkg-config'
-    for package in ["arrow", "plasma", "arrow_python"]:
-        cmd = '{0} --exists {1}'.format(pkg_config_executable, package).split()
-        try:
-            if call(cmd) == 0:
-                cmd = [pkg_config_executable, "--libs-only-L", package]
-                proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
-                out, err = proc.communicate()
-                library_dir = out.rstrip().decode('utf-8')[2:] # strip "-L"
-                if library_dir not in library_dirs:
-                    library_dirs.append(library_dir)
-        except FileNotFoundError:
-            pass
+    pkg_config_executable = _os.environ.get('PKG_CONFIG') or 'pkg-config'
+    for pkgname in ["arrow", "arrow_python"]:
+        if _has_pkg_config(pkgname):
+            library_dir = _read_pkg_config_variable(pkgname,
+                                                    ["--libs-only-L"])
+            assert library_dir.startswith("-L")
+            append_library_dir(library_dir[2:])
 
     if _sys.platform == 'win32':
         # TODO(wesm): Is this necessary, or does setuptools within a conda
         # installation add Library\lib to the linker path for MSVC?
         python_base_install = _os.path.dirname(_sys.executable)
-        library_lib = _os.path.join(python_base_install, 'Library', 'lib')
+        library_dir = _os.path.join(python_base_install, 'Library', 'lib')
 
-        if _os.path.exists(_os.path.join(library_lib, 'arrow.lib')):
-            library_dirs.append(library_lib)
+        if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')):
+            append_library_dir(library_dir)
 
     # ARROW-4074: Allow for ARROW_HOME to be set to some other directory
-    if 'ARROW_HOME' in _os.environ:
-        library_dirs.append(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
+    if _os.environ.get('ARROW_HOME'):
+        append_library_dir(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
+    else:
+        # Python wheels bundle the Arrow libraries in the pyarrow directory.
+        append_library_dir(_os.path.dirname(_os.path.abspath(__file__)))
 
     return library_dirs
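
Both new helpers are thin wrappers around the pkg-config CLI: get_so_version() queries the so_version variable of the installed arrow package and falls back to a hardcoded value inside wheels, where no .pc file is available. Roughly, the subprocess machinery amounts to the following (assuming a system libarrow whose arrow.pc defines so_version):

    import subprocess

    # Equivalent of: pkg-config arrow --variable=so_version
    out = subprocess.check_output(
        ['pkg-config', 'arrow', '--variable=so_version'])
    print(out.rstrip().decode('utf8'))  # e.g. "14" for an Arrow 0.14 install
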
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8798834..c2f82df 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -155,6 +155,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CFixedWidthType" arrow::FixedWidthType"(CDataType):
         int bit_width()
 
+    cdef cppclass CNullArray" arrow::NullArray"(CArray):
+        CNullArray(int64_t length)
+
     cdef cppclass CDictionaryArray" arrow::DictionaryArray"(CArray):
         CDictionaryArray(const shared_ptr[CDataType]& type,
                          const shared_ptr[CArray]& indices,
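
CNullArray is declared so that the test extension below can construct an Arrow array entirely through the C++ API, without going through the Python-level pyarrow namespace first. For comparison, the pure-Python equivalent of what make_null_array() returns:

    import pyarrow as pa

    arr = pa.array([None] * 5)  # a NullArray: type null, every slot null
    assert arr.null_count == 5
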
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index cfd1cd71..14a8b53 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -536,7 +536,7 @@ def read_schema(obj, DictionaryMemo dictionary_memo=None):
     get_reader(obj, True, &cpp_file)
 
     if dictionary_memo is not None:
-        arg_dict_memo = &dictionary_memo.memo
+        arg_dict_memo = dictionary_memo.memo
     else:
         arg_dict_memo = &temp_memo
 
@@ -575,7 +575,7 @@ def read_record_batch(obj, Schema schema,
         message = read_message(obj)
 
     if dictionary_memo is not None:
-        arg_dict_memo = &dictionary_memo.memo
+        arg_dict_memo = dictionary_memo.memo
     else:
         arg_dict_memo = &temp_memo
 
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 79ab947..9e119fb 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -74,7 +74,10 @@ cdef class StructType(DataType):
 
 cdef class DictionaryMemo:
     cdef:
-        CDictionaryMemo memo
+        # Even though the CDictionaryMemo instance is private, we allocate
+        # it on the heap so as to avoid C++ ABI issues with Python wheels.
+        shared_ptr[CDictionaryMemo] sp_memo
+        CDictionaryMemo* memo
 
 
 cdef class DictionaryType(DataType):
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 4907557..e7ca7f4 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -38,6 +38,7 @@ h.settings.load_profile(os.environ.get('HYPOTHESIS_PROFILE', 'dev'))
 
 
 groups = [
+    'cython',
     'hypothesis',
     'gandiva',
     'hdfs',
@@ -52,6 +53,7 @@ groups = [
 
 
 defaults = {
+    'cython': False,
     'hypothesis': False,
     'gandiva': False,
     'hdfs': False,
@@ -61,10 +63,17 @@ defaults = {
     'parquet': False,
     'plasma': False,
     's3': False,
-    'tensorflow': False
+    'tensorflow': False,
+    'flight': False
 }
 
 try:
+    import cython  # noqa
+    defaults['cython'] = True
+except ImportError:
+    pass
+
+try:
     import pyarrow.gandiva # noqa
     defaults['gandiva'] = True
 except ImportError:
@@ -76,14 +85,12 @@ try:
 except ImportError:
     pass
 
-
 try:
     import pandas  # noqa
     defaults['pandas'] = True
 except ImportError:
     pass
 
-
 try:
     import pyarrow.parquet  # noqa
     defaults['parquet'] = True
@@ -91,18 +98,23 @@ except ImportError:
     pass
 
 try:
-    import pyarrow.plasma as plasma  # noqa
+    import pyarrow.plasma  # noqa
     defaults['plasma'] = True
 except ImportError:
     pass
 
-
 try:
     import tensorflow  # noqa
     defaults['tensorflow'] = True
 except ImportError:
     pass
 
+try:
+    import pyarrow.flight  # noqa
+    defaults['flight'] = True
+except ImportError:
+    pass
+
 
 def pytest_configure(config):
     for mark in groups:
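
The new cython entry follows the conftest pattern used by the other groups: availability is probed with a plain import, and tests opt in via the marker that the pytest_configure loop registers. A sketch of how a test uses it:

    import pytest

    @pytest.mark.cython
    def test_needs_cython():
        # Collected always, but skipped unless the 'cython' group is
        # enabled, i.e. unless `import cython` succeeded in conftest.py.
        pass
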
diff --git a/python/pyarrow/tests/pyarrow_cython_example.pyx b/python/pyarrow/tests/pyarrow_cython_example.pyx
index 4a6f3ca..160c151 100644
--- a/python/pyarrow/tests/pyarrow_cython_example.pyx
+++ b/python/pyarrow/tests/pyarrow_cython_example.pyx
@@ -22,9 +22,17 @@ from pyarrow.lib cimport *
 
 
 def get_array_length(obj):
-    # Just an example function accessing both the pyarrow Cython API
+    # An example function accessing both the pyarrow Cython API
     # and the Arrow C++ API
     cdef shared_ptr[CArray] arr = pyarrow_unwrap_array(obj)
     if arr.get() == NULL:
         raise TypeError("not an array")
     return arr.get().length()
+
+
+def make_null_array(length):
+    # An example function that returns a PyArrow object without PyArrow
+    # being imported explicitly at the Python level.
+    cdef shared_ptr[CArray] null_array
+    null_array.reset(new CNullArray(length))
+    return pyarrow_wrap_array(null_array)
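
Once compiled, the example module behaves as follows from plain Python; importing it pulls in the bundled Arrow libraries without an explicit `import pyarrow`, which is exactly what the subprocess check added to test_cython.py asserts:

    # Assuming pyarrow_cython_example was built via the test's setup_template
    import pyarrow_cython_example as mod

    arr = mod.make_null_array(5)  # a pyarrow NullArray constructed in C++
    assert mod.get_array_length(arr) == 5
    assert arr.null_count == 5
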
diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py
index 57dbeb5..202868d 100644
--- a/python/pyarrow/tests/test_cython.py
+++ b/python/pyarrow/tests/test_cython.py
@@ -23,11 +23,12 @@ import sys
 import pytest
 
 import pyarrow as pa
-
 import pyarrow.tests.util as test_util
 
+
 here = os.path.dirname(os.path.abspath(__file__))
 
+
 setup_template = """if 1:
     from distutils.core import setup
     from Cython.Build import cythonize
@@ -50,6 +51,8 @@ setup_template = """if 1:
         if custom_ld_path:
             ext.library_dirs.append(custom_ld_path)
         ext.extra_compile_args.extend(compiler_opts)
+        print("Extension module:",
+              ext, ext.include_dirs, ext.libraries, ext.library_dirs)
 
     setup(
         ext_modules=ext_modules,
@@ -57,18 +60,15 @@ setup_template = """if 1:
 """
 
 
-@pytest.mark.skipif(
-    'ARROW_HOME' not in os.environ,
-    reason='ARROW_HOME environment variable not defined')
+@pytest.mark.cython
 def test_cython_api(tmpdir):
     """
     Basic test for the Cython API.
     """
-    pytest.importorskip('Cython')
+    # Fail early if cython is not found
+    import cython  # noqa
 
-    ld_path_default = os.path.join(os.environ['ARROW_HOME'], 'lib')
-
-    test_ld_path = os.environ.get('PYARROW_TEST_LD_PATH', ld_path_default)
+    test_ld_path = os.environ.get('PYARROW_TEST_LD_PATH', '')
 
     with tmpdir.as_cwd():
         # Set up temporary workspace
@@ -106,3 +106,27 @@ def test_cython_api(tmpdir):
                 mod.get_array_length(None)
         finally:
             sys.path = orig_path
+
+        # Check the extension module is loadable from a subprocess without
+        # pyarrow imported first.
+        code = """if 1:
+            import sys
+
+            mod = __import__({mod_name!r})
+            arr = mod.make_null_array(5)
+            assert mod.get_array_length(arr) == 5
+            assert arr.null_count == 5
+        """.format(mod_path=str(tmpdir), mod_name='pyarrow_cython_example')
+
+        if sys.platform == 'win32':
+            delim, var = ';', 'PATH'
+        else:
+            delim, var = ':', 'LD_LIBRARY_PATH'
+
+        subprocess_env[var] = delim.join(
+            pa.get_library_dirs() + [subprocess_env.get(var, '')]
+        )
+
+        subprocess.check_call([sys.executable, '-c', code],
+                              stdout=subprocess.PIPE,
+                              env=subprocess_env)
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 412ebb7..657df04 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -197,7 +197,9 @@ cdef class DictionaryMemo:
     """
     Tracking container for dictionary-encoded fields
     """
-    pass
+    def __cinit__(self):
+        self.sp_memo.reset(new CDictionaryMemo())
+        self.memo = self.sp_memo.get()
 
 
 cdef class DictionaryType(DataType):
@@ -1034,7 +1036,7 @@ cdef class Schema:
             CDictionaryMemo* arg_dict_memo
 
         if dictionary_memo is not None:
-            arg_dict_memo = &dictionary_memo.memo
+            arg_dict_memo = dictionary_memo.memo
         else:
             arg_dict_memo = &temp_memo
 
diff --git a/python/requirements-test.txt b/python/requirements-test.txt
index 2f0e9e7..73eabfe 100644
--- a/python/requirements-test.txt
+++ b/python/requirements-test.txt
@@ -1,5 +1,6 @@
+cython
+hypothesis
 pandas
+pathlib2; python_version < "3.4"
 pytest
-hypothesis
 pytz
-pathlib2; python_version < "3.4"
diff --git a/python/setup.py b/python/setup.py
index fa01adf..18d0a56 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -510,6 +510,11 @@ def _move_shared_libs_unix(build_prefix, build_lib, lib_name):
     lib_filename = os.path.basename(libs[0])
     shutil.move(pjoin(build_prefix, lib_filename),
                 pjoin(build_lib, 'pyarrow', lib_filename))
+    for lib in libs[1:]:
+        filename = os.path.basename(lib)
+        link_name = pjoin(build_lib, 'pyarrow', filename)
+        if not os.path.exists(link_name):
+            os.symlink(lib_filename, link_name)
 
 
 # If the event of not running from a git clone (e.g. from a git archive
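
The symlink loop restores the pre-ARROW-5082 layout inside the wheel: besides the real SO-versioned library file, the pyarrow/ directory again contains the sibling names (such as the bare .so) that the linker needs when a downstream extension is built with -larrow. A sketch of one loop iteration, with illustrative names (the actual direction follows whichever file the glob returned as libs[0]):

    import os

    lib_filename = 'libarrow.so.14'    # basename of libs[0], the file moved above
    link_name = 'pyarrow/libarrow.so'  # basename of one of libs[1:]
    if not os.path.exists(link_name):
        os.symlink(lib_filename, link_name)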