You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2019/07/16 13:10:12 UTC
[arrow] 02/02: ARROW-5856: [Python] [Packaging] Fix use of C++ /
Cython API from wheels
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch maint-0.14.x
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 5bdf824e301b7b5aa8eb50e52b1fda8a55efd7f6
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Tue Jul 16 14:22:38 2019 +0200
ARROW-5856: [Python] [Packaging] Fix use of C++ / Cython API from wheels
Author: Antoine Pitrou <an...@python.org>
Closes #4884 from pitrou/ARROW-5856-cython-so-version and squashes the following commits:
a411d7ad7 <Antoine Pitrou> Avoid C++ ABI issues with DictionaryMemo
eaede5be2 <Antoine Pitrou> Revert ARROW-5082 (" Stop exporting copies of shared libraries in wheel")
4594f784e <Antoine Pitrou> ARROW-5856: Try to fix Cython API from wheels
---
dev/tasks/python-wheels/manylinux-test.sh | 2 +-
dev/tasks/python-wheels/osx-build.sh | 11 +++-
dev/tasks/python-wheels/travis.osx.yml | 2 +-
dev/tasks/python-wheels/win-build.bat | 4 +-
dev/tasks/tasks.yml | 16 +++---
python/pyarrow/__init__.py | 76 ++++++++++++++++++-------
python/pyarrow/includes/libarrow.pxd | 3 +
python/pyarrow/ipc.pxi | 4 +-
python/pyarrow/lib.pxd | 5 +-
python/pyarrow/tests/conftest.py | 22 +++++--
python/pyarrow/tests/pyarrow_cython_example.pyx | 10 +++-
python/pyarrow/tests/test_cython.py | 40 ++++++++++---
python/pyarrow/types.pxi | 6 +-
python/requirements-test.txt | 5 +-
python/setup.py | 5 ++
15 files changed, 155 insertions(+), 56 deletions(-)
diff --git a/dev/tasks/python-wheels/manylinux-test.sh b/dev/tasks/python-wheels/manylinux-test.sh
index 8ed2efe..a0735f3 100755
--- a/dev/tasks/python-wheels/manylinux-test.sh
+++ b/dev/tasks/python-wheels/manylinux-test.sh
@@ -27,7 +27,7 @@ pip install -q /arrow/python/$WHEEL_TAG/dist/*.whl
# Install test dependencies (pip won't work after removing system zlib)
pip install -q -r /arrow/python/requirements-test.txt
# Run pyarrow tests
-pytest -v --pyargs pyarrow
+pytest -rs --pyargs pyarrow
if [[ "$1" == "--remove-system-libs" ]]; then
# Run import tests after removing the bundled dependencies from the system
diff --git a/dev/tasks/python-wheels/osx-build.sh b/dev/tasks/python-wheels/osx-build.sh
index 415f08d..c1d001d 100755
--- a/dev/tasks/python-wheels/osx-build.sh
+++ b/dev/tasks/python-wheels/osx-build.sh
@@ -183,14 +183,19 @@ function install_wheel {
pip install $(pip_opts) \
$(python $multibuild_dir/supported_wheels.py $wheelhouse/*.whl)
- # Install test dependencies
- pip install $(pip_opts) -r python/requirements-test.txt
popd
}
function run_unit_tests {
+ pushd $1
+
+ # Install test dependencies
+ pip install $(pip_opts) -r python/requirements-test.txt
+
# Run pyarrow tests
- py.test --pyargs pyarrow
+ pytest -rs --pyargs pyarrow
+
+ popd
}
function run_import_tests {
diff --git a/dev/tasks/python-wheels/travis.osx.yml b/dev/tasks/python-wheels/travis.osx.yml
index 6c6b177..8502642 100644
--- a/dev/tasks/python-wheels/travis.osx.yml
+++ b/dev/tasks/python-wheels/travis.osx.yml
@@ -72,7 +72,7 @@ install:
# install the built wheel and test dependencies
- install_wheel arrow
# run unit tests before removing the system libraries
- - run_unit_tests
+ - run_unit_tests arrow
# remove libz to ensure that it is properly bundled
- sudo find /usr -name libz.* -delete
# run the import tests
diff --git a/dev/tasks/python-wheels/win-build.bat b/dev/tasks/python-wheels/win-build.bat
index f5caac0..dbb4e47 100644
--- a/dev/tasks/python-wheels/win-build.bat
+++ b/dev/tasks/python-wheels/win-build.bat
@@ -80,7 +80,7 @@ set ARROW_TEST_DATA=%ARROW_SRC%\testing\data
@rem test the wheel
@rem TODO For maximum reliability, we should test in a plain virtualenv instead.
call conda create -n wheel-test -q -y python=%PYTHON_VERSION% ^
- numpy=%NUMPY_VERSION% pandas pytest hypothesis || exit /B
+ numpy=%NUMPY_VERSION% pandas cython pytest hypothesis || exit /B
call activate wheel-test
@rem install the built wheel
@@ -90,4 +90,4 @@ pip install -vv --no-index --find-links=%ARROW_SRC%\python\dist\ pyarrow || exit
python -c "import pyarrow; import pyarrow.parquet; import pyarrow.flight; import pyarrow.gandiva;" || exit /B
@rem run the python tests
-pytest --pyargs pyarrow || exit /B
+pytest -rs --pyargs pyarrow || exit /B
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index 031dc36..80f281d 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -180,7 +180,7 @@ tasks:
unicode_width: 32
wheel_tag: manylinux1
test_docker_images:
- - python:2.7-slim # debian ucs4
+ - python:2.7 # debian ucs4
test_remove_system_libs: false
artifacts:
- pyarrow-{no_rc_version}-cp27-cp27mu-manylinux1_x86_64.whl
@@ -194,7 +194,7 @@ tasks:
unicode_width: 16
wheel_tag: manylinux1
test_docker_images:
- - python:3.5-slim
+ - python:3.5
test_remove_system_libs: true
artifacts:
- pyarrow-{no_rc_version}-cp35-cp35m-manylinux1_x86_64.whl
@@ -208,7 +208,7 @@ tasks:
unicode_width: 16
wheel_tag: manylinux1
test_docker_images:
- - python:3.6-slim
+ - python:3.6
test_remove_system_libs: true
artifacts:
- pyarrow-{no_rc_version}-cp36-cp36m-manylinux1_x86_64.whl
@@ -222,7 +222,7 @@ tasks:
unicode_width: 16
wheel_tag: manylinux1
test_docker_images:
- - python:3.7-slim
+ - python:3.7
test_remove_system_libs: true
artifacts:
- pyarrow-{no_rc_version}-cp37-cp37m-manylinux1_x86_64.whl
@@ -249,7 +249,7 @@ tasks:
unicode_width: 32
wheel_tag: manylinux2010
test_docker_images:
- - python:2.7-slim # debian ucs4
+ - python:2.7 # debian ucs4
test_remove_system_libs: false
artifacts:
- pyarrow-{no_rc_version}-cp27-cp27mu-manylinux2010_x86_64.whl
@@ -263,7 +263,7 @@ tasks:
unicode_width: 16
wheel_tag: manylinux2010
test_docker_images:
- - python:3.5-slim
+ - python:3.5
test_remove_system_libs: true
artifacts:
- pyarrow-{no_rc_version}-cp35-cp35m-manylinux2010_x86_64.whl
@@ -277,7 +277,7 @@ tasks:
unicode_width: 16
wheel_tag: manylinux2010
test_docker_images:
- - python:3.6-slim
+ - python:3.6
test_remove_system_libs: true
artifacts:
- pyarrow-{no_rc_version}-cp36-cp36m-manylinux2010_x86_64.whl
@@ -291,7 +291,7 @@ tasks:
unicode_width: 16
wheel_tag: manylinux2010
test_docker_images:
- - python:3.7-slim
+ - python:3.7
test_remove_system_libs: true
artifacts:
- pyarrow-{no_rc_version}-cp37-cp37m-manylinux2010_x86_64.whl
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 487065c..f3fd738 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -209,6 +209,43 @@ def get_include():
return _os.path.join(_os.path.dirname(__file__), 'include')
+def _get_pkg_config_executable():
+ return _os.environ.get('PKG_CONFIG', 'pkg-config')
+
+
+def _has_pkg_config(pkgname):
+ import subprocess
+ try:
+ return subprocess.call([_get_pkg_config_executable(),
+ '--exists', pkgname]) == 0
+ except OSError:
+ # TODO: replace with FileNotFoundError once we ditch 2.7
+ return False
+
+
+def _read_pkg_config_variable(pkgname, cli_args):
+ import subprocess
+ cmd = [_get_pkg_config_executable(), pkgname] + cli_args
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ out, err = proc.communicate()
+ if proc.returncode != 0:
+ raise RuntimeError("pkg-config failed: " + err.decode('utf8'))
+ return out.rstrip().decode('utf8')
+
+
+def get_so_version():
+ """
+ Return the SO version for Arrow libraries.
+ """
+ if _sys.platform == 'win32':
+ raise NotImplementedError("Cannot get SO version on Windows")
+ if _has_pkg_config("arrow"):
+ return _read_pkg_config_variable("arrow", ["--variable=so_version"])
+ else:
+ return "100" # XXX Find a way not to hardcode this?
+
+
def get_libraries():
"""
Return list of library names to include in the `libraries` argument for C
@@ -223,38 +260,37 @@ def get_library_dirs():
linking C or Cython extensions using pyarrow
"""
package_cwd = _os.path.dirname(__file__)
-
library_dirs = [package_cwd]
+ def append_library_dir(library_dir):
+ if library_dir not in library_dirs:
+ library_dirs.append(library_dir)
+
# Search library paths via pkg-config. This is necessary if the user
# installed libarrow and the other shared libraries manually and they
# are not shipped inside the pyarrow package (see also ARROW-2976).
- from subprocess import call, PIPE, Popen
- pkg_config_executable = _os.environ.get('PKG_CONFIG', None) or 'pkg-config'
- for package in ["arrow", "plasma", "arrow_python"]:
- cmd = '{0} --exists {1}'.format(pkg_config_executable, package).split()
- try:
- if call(cmd) == 0:
- cmd = [pkg_config_executable, "--libs-only-L", package]
- proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
- out, err = proc.communicate()
- library_dir = out.rstrip().decode('utf-8')[2:] # strip "-L"
- if library_dir not in library_dirs:
- library_dirs.append(library_dir)
- except FileNotFoundError:
- pass
+ pkg_config_executable = _os.environ.get('PKG_CONFIG') or 'pkg-config'
+ for pkgname in ["arrow", "arrow_python"]:
+ if _has_pkg_config(pkgname):
+ library_dir = _read_pkg_config_variable(pkgname,
+ ["--libs-only-L"])
+ assert library_dir.startswith("-L")
+ append_library_dir(library_dir[2:])
if _sys.platform == 'win32':
# TODO(wesm): Is this necessary, or does setuptools within a conda
# installation add Library\lib to the linker path for MSVC?
python_base_install = _os.path.dirname(_sys.executable)
- library_lib = _os.path.join(python_base_install, 'Library', 'lib')
+ library_dir = _os.path.join(python_base_install, 'Library', 'lib')
- if _os.path.exists(_os.path.join(library_lib, 'arrow.lib')):
- library_dirs.append(library_lib)
+ if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')):
+ append_library_dir(library_dir)
# ARROW-4074: Allow for ARROW_HOME to be set to some other directory
- if 'ARROW_HOME' in _os.environ:
- library_dirs.append(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
+ if _os.environ.get('ARROW_HOME'):
+ append_library_dir(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
+ else:
+ # Python wheels bundle the Arrow libraries in the pyarrow directory.
+ append_library_dir(_os.path.dirname(_os.path.abspath(__file__)))
return library_dirs
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8798834..c2f82df 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -155,6 +155,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
cdef cppclass CFixedWidthType" arrow::FixedWidthType"(CDataType):
int bit_width()
+ cdef cppclass CNullArray" arrow::NullArray"(CArray):
+ CNullArray(int64_t length)
+
cdef cppclass CDictionaryArray" arrow::DictionaryArray"(CArray):
CDictionaryArray(const shared_ptr[CDataType]& type,
const shared_ptr[CArray]& indices,
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
index cfd1cd71..14a8b53 100644
--- a/python/pyarrow/ipc.pxi
+++ b/python/pyarrow/ipc.pxi
@@ -536,7 +536,7 @@ def read_schema(obj, DictionaryMemo dictionary_memo=None):
get_reader(obj, True, &cpp_file)
if dictionary_memo is not None:
- arg_dict_memo = &dictionary_memo.memo
+ arg_dict_memo = dictionary_memo.memo
else:
arg_dict_memo = &temp_memo
@@ -575,7 +575,7 @@ def read_record_batch(obj, Schema schema,
message = read_message(obj)
if dictionary_memo is not None:
- arg_dict_memo = &dictionary_memo.memo
+ arg_dict_memo = dictionary_memo.memo
else:
arg_dict_memo = &temp_memo
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 79ab947..9e119fb 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -74,7 +74,10 @@ cdef class StructType(DataType):
cdef class DictionaryMemo:
cdef:
- CDictionaryMemo memo
+ # Even though the CDictionaryMemo instance is private, we allocate
+ # it on the heap so as to avoid C++ ABI issues with Python wheels.
+ shared_ptr[CDictionaryMemo] sp_memo
+ CDictionaryMemo* memo
cdef class DictionaryType(DataType):
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 4907557..e7ca7f4 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -38,6 +38,7 @@ h.settings.load_profile(os.environ.get('HYPOTHESIS_PROFILE', 'dev'))
groups = [
+ 'cython',
'hypothesis',
'gandiva',
'hdfs',
@@ -52,6 +53,7 @@ groups = [
defaults = {
+ 'cython': False,
'hypothesis': False,
'gandiva': False,
'hdfs': False,
@@ -61,10 +63,17 @@ defaults = {
'parquet': False,
'plasma': False,
's3': False,
- 'tensorflow': False
+ 'tensorflow': False,
+ 'flight': False
}
try:
+ import cython # noqa
+ defaults['cython'] = True
+except ImportError:
+ pass
+
+try:
import pyarrow.gandiva # noqa
defaults['gandiva'] = True
except ImportError:
@@ -76,14 +85,12 @@ try:
except ImportError:
pass
-
try:
import pandas # noqa
defaults['pandas'] = True
except ImportError:
pass
-
try:
import pyarrow.parquet # noqa
defaults['parquet'] = True
@@ -91,18 +98,23 @@ except ImportError:
pass
try:
- import pyarrow.plasma as plasma # noqa
+ import pyarrow.plasma # noqa
defaults['plasma'] = True
except ImportError:
pass
-
try:
import tensorflow # noqa
defaults['tensorflow'] = True
except ImportError:
pass
+try:
+ import pyarrow.flight # noqa
+ defaults['flight'] = True
+except ImportError:
+ pass
+
def pytest_configure(config):
for mark in groups:
diff --git a/python/pyarrow/tests/pyarrow_cython_example.pyx b/python/pyarrow/tests/pyarrow_cython_example.pyx
index 4a6f3ca..160c151 100644
--- a/python/pyarrow/tests/pyarrow_cython_example.pyx
+++ b/python/pyarrow/tests/pyarrow_cython_example.pyx
@@ -22,9 +22,17 @@ from pyarrow.lib cimport *
def get_array_length(obj):
- # Just an example function accessing both the pyarrow Cython API
+ # An example function accessing both the pyarrow Cython API
# and the Arrow C++ API
cdef shared_ptr[CArray] arr = pyarrow_unwrap_array(obj)
if arr.get() == NULL:
raise TypeError("not an array")
return arr.get().length()
+
+
+def make_null_array(length):
+ # An example function that returns a PyArrow object without PyArrow
+ # being imported explicitly at the Python level.
+ cdef shared_ptr[CArray] null_array
+ null_array.reset(new CNullArray(length))
+ return pyarrow_wrap_array(null_array)
diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py
index 57dbeb5..202868d 100644
--- a/python/pyarrow/tests/test_cython.py
+++ b/python/pyarrow/tests/test_cython.py
@@ -23,11 +23,12 @@ import sys
import pytest
import pyarrow as pa
-
import pyarrow.tests.util as test_util
+
here = os.path.dirname(os.path.abspath(__file__))
+
setup_template = """if 1:
from distutils.core import setup
from Cython.Build import cythonize
@@ -50,6 +51,8 @@ setup_template = """if 1:
if custom_ld_path:
ext.library_dirs.append(custom_ld_path)
ext.extra_compile_args.extend(compiler_opts)
+ print("Extension module:",
+ ext, ext.include_dirs, ext.libraries, ext.library_dirs)
setup(
ext_modules=ext_modules,
@@ -57,18 +60,15 @@ setup_template = """if 1:
"""
-@pytest.mark.skipif(
- 'ARROW_HOME' not in os.environ,
- reason='ARROW_HOME environment variable not defined')
+@pytest.mark.cython
def test_cython_api(tmpdir):
"""
Basic test for the Cython API.
"""
- pytest.importorskip('Cython')
+ # Fail early if cython is not found
+ import cython # noqa
- ld_path_default = os.path.join(os.environ['ARROW_HOME'], 'lib')
-
- test_ld_path = os.environ.get('PYARROW_TEST_LD_PATH', ld_path_default)
+ test_ld_path = os.environ.get('PYARROW_TEST_LD_PATH', '')
with tmpdir.as_cwd():
# Set up temporary workspace
@@ -106,3 +106,27 @@ def test_cython_api(tmpdir):
mod.get_array_length(None)
finally:
sys.path = orig_path
+
+ # Check the extension module is loadable from a subprocess without
+ # pyarrow imported first.
+ code = """if 1:
+ import sys
+
+ mod = __import__({mod_name!r})
+ arr = mod.make_null_array(5)
+ assert mod.get_array_length(arr) == 5
+ assert arr.null_count == 5
+ """.format(mod_path=str(tmpdir), mod_name='pyarrow_cython_example')
+
+ if sys.platform == 'win32':
+ delim, var = ';', 'PATH'
+ else:
+ delim, var = ':', 'LD_LIBRARY_PATH'
+
+ subprocess_env[var] = delim.join(
+ pa.get_library_dirs() + [subprocess_env.get(var, '')]
+ )
+
+ subprocess.check_call([sys.executable, '-c', code],
+ stdout=subprocess.PIPE,
+ env=subprocess_env)
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 412ebb7..657df04 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -197,7 +197,9 @@ cdef class DictionaryMemo:
"""
Tracking container for dictionary-encoded fields
"""
- pass
+ def __cinit__(self):
+ self.sp_memo.reset(new CDictionaryMemo())
+ self.memo = self.sp_memo.get()
cdef class DictionaryType(DataType):
@@ -1034,7 +1036,7 @@ cdef class Schema:
CDictionaryMemo* arg_dict_memo
if dictionary_memo is not None:
- arg_dict_memo = &dictionary_memo.memo
+ arg_dict_memo = dictionary_memo.memo
else:
arg_dict_memo = &temp_memo
diff --git a/python/requirements-test.txt b/python/requirements-test.txt
index 2f0e9e7..73eabfe 100644
--- a/python/requirements-test.txt
+++ b/python/requirements-test.txt
@@ -1,5 +1,6 @@
+cython
+hypothesis
pandas
+pathlib2; python_version < "3.4"
pytest
-hypothesis
pytz
-pathlib2; python_version < "3.4"
diff --git a/python/setup.py b/python/setup.py
index fa01adf..18d0a56 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -510,6 +510,11 @@ def _move_shared_libs_unix(build_prefix, build_lib, lib_name):
lib_filename = os.path.basename(libs[0])
shutil.move(pjoin(build_prefix, lib_filename),
pjoin(build_lib, 'pyarrow', lib_filename))
+ for lib in libs[1:]:
+ filename = os.path.basename(lib)
+ link_name = pjoin(build_lib, 'pyarrow', filename)
+ if not os.path.exists(link_name):
+ os.symlink(lib_filename, link_name)
# If the event of not running from a git clone (e.g. from a git archive