Posted to commits@arrow.apache.org by jo...@apache.org on 2021/04/18 14:40:13 UTC
[arrow-rs] 03/14: Removed Python.
This is an automated email from the ASF dual-hosted git repository.
jorgecarleitao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
commit 4d14b301cda523d363e17ee5d03a581675915a32
Author: Jorge C. Leitao <jo...@gmail.com>
AuthorDate: Sun Apr 18 14:19:46 2021 +0000
Removed Python.
---
python/.coveragerc | 19 -
python/.flake8.cython | 20 -
python/.gitignore | 45 -
python/CMakeLists.txt | 619 ---
python/MANIFEST.in | 15 -
python/README.md | 59 -
python/asv-build.sh | 75 -
python/asv-install.sh | 21 -
python/asv-uninstall.sh | 21 -
python/asv.conf.json | 187 -
python/benchmarks/__init__.py | 16 -
python/benchmarks/array_ops.py | 34 -
python/benchmarks/common.py | 349 --
python/benchmarks/convert_builtins.py | 87 -
python/benchmarks/convert_pandas.py | 121 -
python/benchmarks/io.py | 89 -
python/benchmarks/microbenchmarks.py | 45 -
python/benchmarks/parquet.py | 156 -
python/benchmarks/plasma.py | 72 -
python/benchmarks/streaming.py | 70 -
python/cmake_modules | 1 -
python/examples/flight/client.py | 189 -
python/examples/flight/middleware.py | 167 -
python/examples/flight/server.py | 154 -
python/examples/minimal_build/Dockerfile.fedora | 31 -
python/examples/minimal_build/Dockerfile.ubuntu | 38 -
python/examples/minimal_build/README.md | 73 -
python/examples/minimal_build/build_conda.sh | 119 -
python/examples/minimal_build/build_venv.sh | 84 -
python/examples/plasma/sorting/multimerge.pyx | 102 -
python/examples/plasma/sorting/setup.py | 27 -
python/examples/plasma/sorting/sort_df.py | 203 -
python/pyarrow/__init__.pxd | 42 -
python/pyarrow/__init__.py | 504 ---
python/pyarrow/_compute.pxd | 27 -
python/pyarrow/_compute.pyx | 1092 -----
python/pyarrow/_csv.pxd | 46 -
python/pyarrow/_csv.pyx | 952 -----
python/pyarrow/_cuda.pxd | 67 -
python/pyarrow/_cuda.pyx | 1059 -----
python/pyarrow/_dataset.pyx | 2977 -------------
python/pyarrow/_flight.pyx | 2578 ------------
python/pyarrow/_fs.pxd | 94 -
python/pyarrow/_fs.pyx | 1088 -----
python/pyarrow/_hdfs.pyx | 141 -
python/pyarrow/_json.pyx | 249 --
python/pyarrow/_orc.pxd | 53 -
python/pyarrow/_orc.pyx | 111 -
python/pyarrow/_parquet.pxd | 553 ---
python/pyarrow/_parquet.pyx | 1435 -------
python/pyarrow/_plasma.pyx | 868 ----
python/pyarrow/_s3fs.pyx | 260 --
python/pyarrow/array.pxi | 2387 -----------
python/pyarrow/benchmark.pxi | 20 -
python/pyarrow/benchmark.py | 21 -
python/pyarrow/builder.pxi | 82 -
python/pyarrow/cffi.py | 71 -
python/pyarrow/compat.pxi | 65 -
python/pyarrow/compat.py | 29 -
python/pyarrow/compute.py | 493 ---
python/pyarrow/config.pxi | 74 -
python/pyarrow/csv.py | 22 -
python/pyarrow/cuda.py | 25 -
python/pyarrow/dataset.py | 779 ----
python/pyarrow/error.pxi | 231 --
python/pyarrow/feather.pxi | 105 -
python/pyarrow/feather.py | 262 --
python/pyarrow/filesystem.py | 511 ---
python/pyarrow/flight.py | 63 -
python/pyarrow/fs.py | 326 --
python/pyarrow/gandiva.pyx | 482 ---
python/pyarrow/hdfs.py | 240 --
python/pyarrow/includes/__init__.pxd | 0
python/pyarrow/includes/common.pxd | 137 -
python/pyarrow/includes/libarrow.pxd | 2356 -----------
python/pyarrow/includes/libarrow_cuda.pxd | 107 -
python/pyarrow/includes/libarrow_dataset.pxd | 384 --
python/pyarrow/includes/libarrow_flight.pxd | 552 ---
python/pyarrow/includes/libarrow_fs.pxd | 268 --
python/pyarrow/includes/libgandiva.pxd | 281 --
python/pyarrow/includes/libplasma.pxd | 25 -
python/pyarrow/io-hdfs.pxi | 470 ---
python/pyarrow/io.pxi | 1896 ---------
python/pyarrow/ipc.pxi | 968 -----
python/pyarrow/ipc.py | 233 --
python/pyarrow/json.py | 19 -
python/pyarrow/jvm.py | 335 --
python/pyarrow/lib.pxd | 597 ---
python/pyarrow/lib.pyx | 158 -
python/pyarrow/memory.pxi | 216 -
python/pyarrow/orc.py | 149 -
python/pyarrow/pandas-shim.pxi | 254 --
python/pyarrow/pandas_compat.py | 1226 ------
python/pyarrow/parquet.py | 2076 ---------
python/pyarrow/plasma.py | 152 -
python/pyarrow/public-api.pxi | 418 --
python/pyarrow/scalar.pxi | 927 -----
python/pyarrow/serialization.pxi | 556 ---
python/pyarrow/serialization.py | 504 ---
python/pyarrow/table.pxi | 2266 ----------
python/pyarrow/tensor.pxi | 892 ----
python/pyarrow/tensorflow/plasma_op.cc | 391 --
python/pyarrow/tests/__init__.py | 0
python/pyarrow/tests/arrow_7980.py | 30 -
python/pyarrow/tests/conftest.py | 277 --
.../v0.17.0.version=2-compression=lz4.feather | Bin 594 -> 0 bytes
python/pyarrow/tests/data/orc/README.md | 22 -
.../tests/data/orc/TestOrcFile.emptyFile.jsn.gz | Bin 50 -> 0 bytes
.../tests/data/orc/TestOrcFile.emptyFile.orc | Bin 523 -> 0 bytes
.../tests/data/orc/TestOrcFile.test1.jsn.gz | Bin 323 -> 0 bytes
.../pyarrow/tests/data/orc/TestOrcFile.test1.orc | Bin 1711 -> 0 bytes
.../tests/data/orc/TestOrcFile.testDate1900.jsn.gz | Bin 182453 -> 0 bytes
.../tests/data/orc/TestOrcFile.testDate1900.orc | Bin 30941 -> 0 bytes
python/pyarrow/tests/data/orc/decimal.jsn.gz | Bin 19313 -> 0 bytes
python/pyarrow/tests/data/orc/decimal.orc | Bin 16337 -> 0 bytes
.../data/parquet/v0.7.1.all-named-index.parquet | Bin 3948 -> 0 bytes
.../v0.7.1.column-metadata-handling.parquet | Bin 2012 -> 0 bytes
python/pyarrow/tests/data/parquet/v0.7.1.parquet | Bin 4372 -> 0 bytes
.../data/parquet/v0.7.1.some-named-index.parquet | Bin 4008 -> 0 bytes
python/pyarrow/tests/deserialize_buffer.py | 26 -
python/pyarrow/tests/pandas_examples.py | 172 -
python/pyarrow/tests/pandas_threaded_import.py | 44 -
python/pyarrow/tests/parquet/common.py | 177 -
python/pyarrow/tests/parquet/conftest.py | 87 -
python/pyarrow/tests/parquet/test_basic.py | 586 ---
.../tests/parquet/test_compliant_nested_type.py | 113 -
python/pyarrow/tests/parquet/test_data_types.py | 524 ---
python/pyarrow/tests/parquet/test_dataset.py | 1588 -------
python/pyarrow/tests/parquet/test_datetime.py | 373 --
python/pyarrow/tests/parquet/test_metadata.py | 477 ---
python/pyarrow/tests/parquet/test_pandas.py | 687 ---
python/pyarrow/tests/parquet/test_parquet_file.py | 258 --
.../pyarrow/tests/parquet/test_parquet_writer.py | 275 --
python/pyarrow/tests/pyarrow_cython_example.pyx | 55 -
python/pyarrow/tests/strategies.py | 414 --
python/pyarrow/tests/test_adhoc_memory_leak.py | 43 -
python/pyarrow/tests/test_array.py | 2680 ------------
python/pyarrow/tests/test_builder.py | 67 -
python/pyarrow/tests/test_cffi.py | 295 --
python/pyarrow/tests/test_compute.py | 1243 ------
python/pyarrow/tests/test_convert_builtin.py | 2156 ----------
python/pyarrow/tests/test_csv.py | 1345 ------
python/pyarrow/tests/test_cuda.py | 792 ----
python/pyarrow/tests/test_cuda_numba_interop.py | 235 --
python/pyarrow/tests/test_cython.py | 143 -
python/pyarrow/tests/test_dataset.py | 3158 --------------
python/pyarrow/tests/test_deprecations.py | 23 -
python/pyarrow/tests/test_extension_type.py | 668 ---
python/pyarrow/tests/test_feather.py | 792 ----
python/pyarrow/tests/test_filesystem.py | 67 -
python/pyarrow/tests/test_flight.py | 1808 --------
python/pyarrow/tests/test_fs.py | 1521 -------
python/pyarrow/tests/test_gandiva.py | 365 --
python/pyarrow/tests/test_hdfs.py | 442 --
python/pyarrow/tests/test_io.py | 1754 --------
python/pyarrow/tests/test_ipc.py | 962 -----
python/pyarrow/tests/test_json.py | 310 --
python/pyarrow/tests/test_jvm.py | 433 --
python/pyarrow/tests/test_memory.py | 156 -
python/pyarrow/tests/test_misc.py | 175 -
python/pyarrow/tests/test_orc.py | 165 -
python/pyarrow/tests/test_pandas.py | 4383 --------------------
python/pyarrow/tests/test_plasma.py | 1073 -----
python/pyarrow/tests/test_plasma_tf_op.py | 104 -
python/pyarrow/tests/test_scalars.py | 625 ---
python/pyarrow/tests/test_schema.py | 721 ----
python/pyarrow/tests/test_serialization.py | 1233 ------
.../pyarrow/tests/test_serialization_deprecated.py | 56 -
python/pyarrow/tests/test_sparse_tensor.py | 491 ---
python/pyarrow/tests/test_strategies.py | 61 -
python/pyarrow/tests/test_table.py | 1687 --------
python/pyarrow/tests/test_tensor.py | 215 -
python/pyarrow/tests/test_types.py | 1041 -----
python/pyarrow/tests/util.py | 231 --
python/pyarrow/types.pxi | 2781 -------------
python/pyarrow/types.py | 357 --
python/pyarrow/util.py | 152 -
python/pyarrow/vendored/__init__.py | 16 -
python/pyarrow/vendored/version.py | 545 ---
python/pyproject.toml | 26 -
python/requirements-build.txt | 4 -
python/requirements-test.txt | 7 -
python/requirements-wheel-build.txt | 6 -
python/requirements-wheel-test.txt | 11 -
python/scripts/test_imports.py | 21 -
python/scripts/test_leak.py | 110 -
python/setup.cfg | 34 -
python/setup.py | 628 ---
188 files changed, 87207 deletions(-)
diff --git a/python/.coveragerc b/python/.coveragerc
deleted file mode 100644
index f5dc6e3..0000000
--- a/python/.coveragerc
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[run]
-plugins = Cython.Coverage
diff --git a/python/.flake8.cython b/python/.flake8.cython
deleted file mode 100644
index 4bc1958..0000000
--- a/python/.flake8.cython
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-[flake8]
-filename = *.pyx,*.pxd,*.pxi
-ignore = E211,E901,E999,E225,E226,E227,W504
diff --git a/python/.gitignore b/python/.gitignore
deleted file mode 100644
index ef1237a..0000000
--- a/python/.gitignore
+++ /dev/null
@@ -1,45 +0,0 @@
-thirdparty/
-CMakeFiles/
-CMakeCache.txt
-CTestTestfile.cmake
-Makefile
-cmake_install.cmake
-build/
-Testing/
-
-# Python stuff
-
-# Editor temporary/working/backup files
-*flymake*
-
-# Generated sources
-*.c
-*.cpp
-pyarrow/*_api.h
-pyarrow/_generated_version.py
-
-# Bundled headers
-pyarrow/include
-
-# setup.py working directory
-build
-# setup.py dist directory
-dist
-# Coverage
-.coverage
-coverage.xml
-htmlcov
-# Cache
-.cache
-
-# benchmark working dir
-.asv
-pyarrow/_table_api.h
-
-# manylinux temporary files
-manylinux1/arrow
-nm_arrow.log
-visible_symbols.log
-
-# plasma store
-pyarrow/plasma-store-server
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
deleted file mode 100644
index 0714aa4..0000000
--- a/python/CMakeLists.txt
+++ /dev/null
@@ -1,619 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-# Includes code assembled from BSD/MIT/Apache-licensed code from some 3rd-party
-# projects, including Kudu, Impala, and libdynd. See python/LICENSE.txt
-
-cmake_minimum_required(VERSION 3.2)
-project(pyarrow)
-
-# Running from a Python sdist tarball
-set(LOCAL_CMAKE_MODULES "${CMAKE_SOURCE_DIR}/cmake_modules")
-if(EXISTS "${LOCAL_CMAKE_MODULES}")
- set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${LOCAL_CMAKE_MODULES})
-endif()
-
-# Running from a git source tree
-set(CPP_CMAKE_MODULES "${CMAKE_SOURCE_DIR}/../cpp/cmake_modules")
-if(EXISTS "${CPP_CMAKE_MODULES}")
- set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CPP_CMAKE_MODULES})
-endif()
-
-include(CMakeParseArguments)
-
-# Only interpret if() arguments as variables or keywords when unquoted.
-# https://www.cmake.org/cmake/help/latest/policy/CMP0054.html
-cmake_policy(SET CMP0054 NEW)
-
-# Use the first Python installation on PATH, not the newest one
-set(Python3_FIND_STRATEGY "LOCATION")
-# On Windows, use registry last, not first
-set(Python3_FIND_REGISTRY "LAST")
-# On macOS, use framework last, not first
-set(Python3_FIND_FRAMEWORK "LAST")
-
-# Allow "make install" to not depend on all targets.
-#
-# Must be declared in the top-level CMakeLists.txt.
-set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY true)
-
-set(CMAKE_MACOSX_RPATH 1)
-if(DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
- set(CMAKE_OSX_DEPLOYMENT_TARGET $ENV{MACOSX_DEPLOYMENT_TARGET})
-else()
- set(CMAKE_OSX_DEPLOYMENT_TARGET 10.9)
-endif()
-
-# Generate a Clang compile_commands.json "compilation database" file for use
-# with various development tools, such as Vim's YouCompleteMe plugin.
-# See http://clang.llvm.org/docs/JSONCompilationDatabase.html
-if("$ENV{CMAKE_EXPORT_COMPILE_COMMANDS}" STREQUAL "1")
- set(CMAKE_EXPORT_COMPILE_COMMANDS 1)
-endif()
-
-# Top level cmake dir
-if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
- option(PYARROW_BUILD_CUDA "Build the PyArrow CUDA support" OFF)
- option(PYARROW_BUILD_FLIGHT "Build the PyArrow Flight integration" OFF)
- option(PYARROW_BUILD_DATASET "Build the PyArrow Dataset integration" OFF)
- option(PYARROW_BUILD_GANDIVA "Build the PyArrow Gandiva integration" OFF)
- option(PYARROW_BUILD_PARQUET "Build the PyArrow Parquet integration" OFF)
- option(PYARROW_PARQUET_USE_SHARED "Rely on parquet shared libraries where relevant" ON)
- option(PYARROW_BOOST_USE_SHARED
- "Rely on boost shared libraries on linking static parquet" ON)
- option(PYARROW_BUILD_PLASMA "Build the PyArrow Plasma integration" OFF)
- option(PYARROW_USE_TENSORFLOW "Build PyArrow with TensorFlow support" OFF)
- option(PYARROW_BUILD_ORC "Build the PyArrow ORC integration" OFF)
- option(PYARROW_BUNDLE_ARROW_CPP "Bundle the Arrow C++ libraries" OFF)
- option(PYARROW_BUNDLE_BOOST "Bundle the Boost libraries when we bundle Arrow C++" OFF)
- option(PYARROW_GENERATE_COVERAGE "Build with Cython code coverage enabled" OFF)
- set(PYARROW_CXXFLAGS "" CACHE STRING "Compiler flags to append when compiling Arrow")
-endif()
-
-find_program(CCACHE_FOUND ccache)
-if(CCACHE_FOUND)
- set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
- set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
-endif(CCACHE_FOUND)
-
-#
-# Compiler flags
-#
-
-include(BuildUtils)
-
-# Cython generated code emits way too many warnings at CHECKIN and EVERYTHING
-set(BUILD_WARNING_LEVEL "PRODUCTION")
-
-# This must be synchronized with the definition in
-# cpp/cmake_modules/DefineOptions.cmake.
-set(ARROW_ARMV8_ARCH
- "armv8-a"
- CACHE STRING "Arm64 arch and extensions: armv8-a, armv8-a or armv8-a+crc+crypto")
-include(SetupCxxFlags)
-
-# Add common flags
-set(CMAKE_CXX_FLAGS "${CXX_COMMON_FLAGS} ${CMAKE_CXX_FLAGS}")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PYARROW_CXXFLAGS}")
-
-if(MSVC)
- # MSVC version of -Wno-return-type-c-linkage
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4190")
-
- # Cython generates some bitshift expressions that MSVC does not like in
- # __Pyx_PyFloat_DivideObjC
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4293")
-
- # Converting to/from C++ bool is pretty wonky in Cython. The C4800 warning
- # seems harmless, and probably not worth the effort of working around it
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4800")
-
- # See https://github.com/cython/cython/issues/2731. Change introduced in
- # Cython 0.29.1 causes "unsafe use of type 'bool' in operation"
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4804")
-else()
- # Enable perf and other tools to work properly
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
-
- # Suppress Cython warnings
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable -Wno-maybe-uninitialized")
-
- if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang"
- OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
- # Cython warnings in clang
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constant-logical-operand")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-declarations")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sometimes-uninitialized")
-
- # We have public Cython APIs which return C++ types, which are in an extern
- # "C" blog (no symbol mangling) and clang doesn't like this
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-type-c-linkage")
- endif()
-endif()
-
-# For any C code, use the same flags.
-set(CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS}")
-
-# Add C++-only flags, like -std=c++11
-set(CMAKE_CXX_FLAGS "${CXX_ONLY_FLAGS} ${CMAKE_CXX_FLAGS}")
-
-if(MSVC)
- # MSVC makes its own output directories based on the build configuration
- set(BUILD_SUBDIR_NAME "")
-else()
- # Set compile output directory
- string(TOLOWER ${CMAKE_BUILD_TYPE} BUILD_SUBDIR_NAME)
-endif()
-
-# If building in-source, create the latest symlink. If building out-of-source, which is
-# preferred, simply output the binaries in the build folder
-if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_BINARY_DIR})
- set(BUILD_OUTPUT_ROOT_DIRECTORY
- "${CMAKE_CURRENT_BINARY_DIR}/build/${BUILD_SUBDIR_NAME}")
- # Link build/latest to the current build directory, to avoid developers
- # accidentally running the latest debug build when in fact they're building
- # release builds.
- file(MAKE_DIRECTORY ${BUILD_OUTPUT_ROOT_DIRECTORY})
- if(NOT APPLE)
- set(MORE_ARGS "-T")
- endif()
- execute_process(COMMAND ln
- ${MORE_ARGS}
- -sf
- ${BUILD_OUTPUT_ROOT_DIRECTORY}
- ${CMAKE_CURRENT_BINARY_DIR}/build/latest)
-else()
- set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}")
-endif()
-
-message(STATUS "Generator: ${CMAKE_GENERATOR}")
-message(STATUS "Build output directory: ${BUILD_OUTPUT_ROOT_DIRECTORY}")
-
-# where to put generated archives (.a files)
-set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}")
-set(ARCHIVE_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}")
-
-# where to put generated libraries (.so files)
-set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}")
-set(LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}")
-
-# where to put generated binaries
-set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}")
-
-if(PYARROW_USE_TENSORFLOW)
- # TensorFlow uses the old GLIBCXX ABI, so we have to use it too
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
-endif()
-
-# Python and Numpy libraries
-find_package(Python3Alt REQUIRED)
-include(UseCython)
-
-include_directories(SYSTEM ${NUMPY_INCLUDE_DIRS} ${PYTHON_INCLUDE_DIRS} src)
-
-#
-# Dependencies
-#
-
-if(PYARROW_BUILD_FLIGHT)
- set(ARROW_FLIGHT TRUE)
-endif()
-
-# Arrow
-find_package(ArrowPython REQUIRED)
-include_directories(SYSTEM ${ARROW_INCLUDE_DIR})
-
-function(bundle_arrow_lib library_path)
- set(options)
- set(one_value_args SO_VERSION)
- set(multi_value_args)
- cmake_parse_arguments(ARG
- "${options}"
- "${one_value_args}"
- "${multi_value_args}"
- ${ARGN})
- if(ARG_UNPARSED_ARGUMENTS)
- message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}")
- endif()
-
- get_filename_component(LIBRARY_DIR ${${library_path}} DIRECTORY)
- get_filename_component(LIBRARY_NAME ${${library_path}} NAME_WE)
-
- # Only copy the shared library with ABI version on Linux and macOS
-
- if(MSVC)
- configure_file(
- ${${library_path}}
- ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}
- COPYONLY)
- elseif(APPLE)
- configure_file(
- ${LIBRARY_DIR}/${LIBRARY_NAME}.${ARG_SO_VERSION}${CMAKE_SHARED_LIBRARY_SUFFIX}
- ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME}.${ARG_SO_VERSION}${CMAKE_SHARED_LIBRARY_SUFFIX}
- COPYONLY)
- else()
- configure_file(
- ${${library_path}}.${ARG_SO_VERSION}
- ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}.${ARG_SO_VERSION}
- COPYONLY)
- endif()
-
-endfunction(bundle_arrow_lib)
-
-function(bundle_arrow_import_lib library_path)
- get_filename_component(LIBRARY_DIR ${${library_path}} DIRECTORY)
- get_filename_component(LIBRARY_NAME ${${library_path}} NAME_WE)
- configure_file(${${library_path}} ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME}.lib
- COPYONLY)
-endfunction(bundle_arrow_import_lib)
-
-function(bundle_boost_lib library_path)
- get_filename_component(LIBRARY_NAME ${${library_path}} NAME)
- get_filename_component(LIBRARY_NAME_WE ${${library_path}} NAME_WE)
- configure_file(${${library_path}} ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME}
- COPYONLY)
- set(Boost_SO_VERSION
- "${Boost_MAJOR_VERSION}.${Boost_MINOR_VERSION}.${Boost_SUBMINOR_VERSION}")
- if(APPLE)
- configure_file(
- ${${library_path}}
- ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME_WE}${CMAKE_SHARED_LIBRARY_SUFFIX}
- COPYONLY)
- else()
- configure_file(
- ${${library_path}}
- ${BUILD_OUTPUT_ROOT_DIRECTORY}/${LIBRARY_NAME_WE}${CMAKE_SHARED_LIBRARY_SUFFIX}.${Boost_SO_VERSION}
- COPYONLY)
- endif()
-endfunction()
-
-function(bundle_arrow_dependency library_name)
- if(MSVC)
- if(DEFINED ENV{CONDA_PREFIX})
- file(TO_CMAKE_PATH "$ENV{CONDA_PREFIX}\\Library" SHARED_LIB_HOME)
- endif()
- else()
- if(DEFINED ENV{CONDA_PREFIX})
- file(TO_CMAKE_PATH "$ENV{CONDA_PREFIX}" SHARED_LIB_HOME)
- endif()
- endif()
- if(DEFINED ENV{${library_name}_HOME})
- file(TO_CMAKE_PATH "$ENV{${library_name}_HOME}" SHARED_LIB_HOME)
- endif()
- arrow_build_shared_library_name(shared_lib_name "${library_name}")
- unset(SHARED_LIB_PATH CACHE)
- if(MSVC)
- set(CMAKE_SHARED_LIBRARY_SUFFIXES_ORIGINAL ${CMAKE_FIND_LIBRARY_SUFFIXES})
- # .dll isn't found by find_library with MSVC because .dll isn't included in
- # CMAKE_FIND_LIBRARY_SUFFIXES.
- list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")
- endif()
- if(SHARED_LIB_HOME)
- find_library(SHARED_LIB_PATH
- NAMES "${shared_lib_name}"
- PATHS "${SHARED_LIB_HOME}"
- PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES}
- NO_DEFAULT_PATH)
- else()
- find_library(SHARED_LIB_PATH
- NAMES "${shared_lib_name}"
- PATH_SUFFIXES ${ARROW_SEARCH_LIB_PATH_SUFFIXES})
- endif()
- if(MSVC)
- set(CMAKE_SHARED_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_ORIGINAL})
- endif()
- if(SHARED_LIB_PATH)
- get_filename_component(SHARED_LIB_REALPATH ${SHARED_LIB_PATH} REALPATH)
- get_filename_component(SHARED_LIB_NAME ${SHARED_LIB_PATH} NAME)
- message(
- STATUS
- "Bundle dependency ${library_name}: ${SHARED_LIB_REALPATH} as ${SHARED_LIB_NAME}")
- configure_file(${SHARED_LIB_REALPATH}
- ${BUILD_OUTPUT_ROOT_DIRECTORY}/${SHARED_LIB_NAME} COPYONLY)
- else()
- message(FATAL_ERROR "Unable to bundle dependency: ${library_name}")
- endif()
-endfunction()
-
-# Always bundle includes
-file(COPY ${ARROW_INCLUDE_DIR}/arrow DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}/include)
-
-if(PYARROW_BUNDLE_ARROW_CPP)
- # arrow
- bundle_arrow_lib(ARROW_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION})
- bundle_arrow_lib(ARROW_PYTHON_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION})
-
- # boost
- if(PYARROW_BOOST_USE_SHARED AND PYARROW_BUNDLE_BOOST)
- set(Boost_USE_STATIC_LIBS OFF)
- set(Boost_USE_MULTITHREADED ON)
- if(MSVC AND ARROW_USE_STATIC_CRT)
- set(Boost_USE_STATIC_RUNTIME ON)
- endif()
- set(Boost_ADDITIONAL_VERSIONS
- "1.66.0"
- "1.66"
- "1.65.0"
- "1.65"
- "1.64.0"
- "1.64"
- "1.63.0"
- "1.63"
- "1.62.0"
- "1.61"
- "1.61.0"
- "1.62"
- "1.60.0"
- "1.60")
- list(GET Boost_ADDITIONAL_VERSIONS 0 BOOST_LATEST_VERSION)
- string(REPLACE "." "_" BOOST_LATEST_VERSION_IN_PATH ${BOOST_LATEST_VERSION})
- if(MSVC)
- # disable autolinking in boost
- add_definitions(-DBOOST_ALL_NO_LIB)
- endif()
- find_package(Boost COMPONENTS regex REQUIRED)
- bundle_boost_lib(Boost_REGEX_LIBRARY)
- endif()
-
- if(MSVC)
- # TODO(kszucs): locate msvcp140.dll in a portable fashion and bundle it
- bundle_arrow_import_lib(ARROW_IMPORT_LIB)
- bundle_arrow_import_lib(ARROW_PYTHON_IMPORT_LIB)
- endif()
-endif()
-
-#
-# Subdirectories
-#
-
-if(UNIX)
- set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
-endif()
-
-set(CYTHON_EXTENSIONS
- lib
- _fs
- _compute
- _csv
- _json)
-
-set(LINK_LIBS arrow_shared arrow_python_shared)
-
-if(PYARROW_BUILD_S3)
- set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _s3fs)
-endif()
-
-if(PYARROW_BUILD_HDFS)
- set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _hdfs)
-endif()
-
-if(PYARROW_BUILD_CUDA)
- # Arrow CUDA
- find_package(ArrowCUDA REQUIRED)
-
- if(PYARROW_BUNDLE_ARROW_CPP)
- bundle_arrow_lib(ARROW_CUDA_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION})
- if(MSVC)
- bundle_arrow_import_lib(ARROW_CUDA_IMPORT_LIB)
- endif()
- endif()
- set(CUDA_LINK_LIBS arrow_cuda_shared)
- set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _cuda)
-endif()
-
-# Dataset
-if(PYARROW_BUILD_DATASET)
- # Arrow Dataset
- find_package(ArrowDataset REQUIRED)
-
- if(PYARROW_BUNDLE_ARROW_CPP)
- bundle_arrow_lib(ARROW_DATASET_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION})
- if(MSVC)
- bundle_arrow_import_lib(ARROW_DATASET_IMPORT_LIB)
- endif()
- endif()
-
- set(DATASET_LINK_LIBS arrow_dataset_shared)
- set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _dataset)
-endif()
-
-if(PYARROW_BUILD_PARQUET)
- # Parquet
- find_package(Parquet REQUIRED)
-
- include_directories(SYSTEM ${PARQUET_INCLUDE_DIR})
-
- if(PYARROW_BUNDLE_ARROW_CPP)
- file(COPY ${PARQUET_INCLUDE_DIR}/parquet
- DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}/include)
- endif()
-
- if(PYARROW_PARQUET_USE_SHARED)
- if(PYARROW_BUNDLE_ARROW_CPP)
- bundle_arrow_lib(PARQUET_SHARED_LIB SO_VERSION ${PARQUET_SO_VERSION})
- if(MSVC)
- bundle_arrow_import_lib(PARQUET_IMPORT_LIB)
- endif()
- endif()
- set(PARQUET_LINK_LIBS parquet_shared)
- else()
- find_package(Thrift)
- if(PYARROW_BOOST_USE_SHARED)
- set(Boost_USE_STATIC_LIBS OFF)
- else()
- set(Boost_USE_STATIC_LIBS ON)
- endif()
- find_package(Boost COMPONENTS regex REQUIRED)
- add_thirdparty_lib(boost_regex STATIC_LIB ${Boost_REGEX_LIBRARY_RELEASE})
- add_thirdparty_lib(thrift STATIC_LIB ${THRIFT_STATIC_LIB})
- set(PARQUET_LINK_LIBS parquet_static thrift_static boost_regex_static)
- endif()
- set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _parquet)
-endif()
-
-# Plasma
-if(PYARROW_BUILD_PLASMA)
- find_package(Plasma REQUIRED)
-
- include_directories(SYSTEM ${PLASMA_INCLUDE_DIR})
-
- file(COPY ${ARROW_INCLUDE_DIR}/plasma
- DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}/include)
-
- if(PYARROW_BUNDLE_ARROW_CPP)
- bundle_arrow_lib(PLASMA_SHARED_LIB SO_VERSION ${PLASMA_SO_VERSION})
- endif()
- set(PLASMA_LINK_LIBS plasma_shared)
- set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _plasma)
- file(COPY ${PLASMA_STORE_SERVER} DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY})
-endif()
-
-if(PYARROW_BUILD_ORC)
- # ORC
- set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _orc)
-endif()
-
-# Flight
-if(PYARROW_BUILD_FLIGHT)
- # Arrow Flight
- find_package(ArrowPythonFlight REQUIRED)
-
- if(PYARROW_BUNDLE_ARROW_CPP)
- bundle_arrow_lib(ARROW_FLIGHT_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION})
- bundle_arrow_lib(ARROW_PYTHON_FLIGHT_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION})
- if(MSVC)
- bundle_arrow_import_lib(ARROW_FLIGHT_IMPORT_LIB)
- bundle_arrow_import_lib(ARROW_PYTHON_FLIGHT_IMPORT_LIB)
- # XXX Hardcoded library names because CMake is too stupid to give us
- # the shared library paths.
- # https://gitlab.kitware.com/cmake/cmake/issues/16210
- # bundle_arrow_dependency(libcrypto-1_1-x64)
- # bundle_arrow_dependency(libssl-1_1-x64)
- endif()
- endif()
-
- set(FLIGHT_LINK_LIBS arrow_flight_shared arrow_python_flight_shared)
- set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _flight)
-endif()
-
-# Gandiva
-if(PYARROW_BUILD_GANDIVA)
- find_package(Gandiva REQUIRED)
-
- include_directories(SYSTEM ${GANDIVA_INCLUDE_DIR})
-
- if(PYARROW_BUNDLE_ARROW_CPP)
- file(COPY ${GANDIVA_INCLUDE_DIR}/gandiva
- DESTINATION ${BUILD_OUTPUT_ROOT_DIRECTORY}/include)
-
- bundle_arrow_lib(GANDIVA_SHARED_LIB SO_VERSION ${ARROW_SO_VERSION})
-
- if(MSVC)
- bundle_arrow_import_lib(GANDIVA_IMPORT_LIB)
- endif()
- endif()
-
- set(GANDIVA_LINK_LIBS gandiva_shared)
- set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} gandiva)
-endif()
-
-#
-# Setup and build Cython modules
-#
-
-if(PYARROW_GENERATE_COVERAGE)
- set(CYTHON_FLAGS "${CYTHON_FLAGS}" "-Xlinetrace=True")
-endif()
-
-foreach(module ${CYTHON_EXTENSIONS})
- string(REPLACE "." ";" directories ${module})
- list(GET directories -1 module_name)
- list(REMOVE_AT directories -1)
-
- string(REPLACE "." "/" module_root "${module}")
- set(module_SRC pyarrow/${module_root}.pyx)
- set_source_files_properties(${module_SRC} PROPERTIES CYTHON_IS_CXX 1)
-
- cython_add_module(${module_name} ${module_name}_pyx ${module_name}_output ${module_SRC})
-
- if(directories)
- string(REPLACE ";" "/" module_output_directory ${directories})
- set_target_properties(${module_name}
- PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${module_output_directory})
- endif()
-
- if(PYARROW_BUNDLE_ARROW_CPP)
- # In the event that we are bundling the shared libraries (e.g. in a
- # manylinux1 wheel), we need to set the RPATH of the extensions to the
- # root of the pyarrow/ package so that libarrow/libarrow_python are able
- # to be loaded properly
- if(APPLE)
- set(module_install_rpath "@loader_path/")
- else()
- set(module_install_rpath "\$ORIGIN")
- endif()
-
- # XXX(wesm): ARROW-2326 this logic is only needed when we have Cython
- # modules in interior directories. Since all of our C extensions and
- # bundled libraries are in the same place, we can skip this part
-
- # list(LENGTH directories i)
- # while(${i} GREATER 0)
- # set(module_install_rpath "${module_install_rpath}/..")
- # math(EXPR i "${i} - 1" )
- # endwhile(${i} GREATER 0)
-
- set_target_properties(${module_name} PROPERTIES INSTALL_RPATH ${module_install_rpath})
- endif()
-
- if(PYARROW_GENERATE_COVERAGE)
- set_target_properties(${module_name}
- PROPERTIES COMPILE_DEFINITIONS
- "CYTHON_TRACE=1;CYTHON_TRACE_NOGIL=1")
- endif()
-
- target_link_libraries(${module_name} PRIVATE ${LINK_LIBS})
-
- # Generated files will be moved to the right directory by setup.py.
-endforeach(module)
-
-# Additional link libraries
-
-if(PYARROW_BUILD_CUDA)
- target_link_libraries(_cuda PRIVATE ${CUDA_LINK_LIBS})
-endif()
-
-if(PYARROW_BUILD_FLIGHT)
- target_link_libraries(_flight PRIVATE ${FLIGHT_LINK_LIBS})
-endif()
-
-if(PYARROW_BUILD_DATASET)
- target_link_libraries(_dataset PRIVATE ${DATASET_LINK_LIBS})
-endif()
-
-if(PYARROW_BUILD_GANDIVA)
- target_link_libraries(gandiva PRIVATE ${GANDIVA_LINK_LIBS})
-endif()
-
-if(PYARROW_BUILD_PARQUET)
- target_link_libraries(_parquet PRIVATE ${PARQUET_LINK_LIBS})
-endif()
-
-if(PYARROW_BUILD_PLASMA)
- target_link_libraries(_plasma PRIVATE ${PLASMA_LINK_LIBS})
-endif()
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
deleted file mode 100644
index ed7012e..0000000
--- a/python/MANIFEST.in
+++ /dev/null
@@ -1,15 +0,0 @@
-include README.md
-include ../LICENSE.txt
-include ../NOTICE.txt
-
-global-include CMakeLists.txt
-graft pyarrow
-graft cmake_modules
-
-global-exclude *.so
-global-exclude *.pyc
-global-exclude *~
-global-exclude \#*
-global-exclude .git*
-global-exclude .DS_Store
-prune .asv
diff --git a/python/README.md b/python/README.md
deleted file mode 100644
index def98a3..0000000
--- a/python/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-## Python library for Apache Arrow
-
-[![pypi](https://img.shields.io/pypi/v/pyarrow.svg)](https://pypi.org/project/pyarrow/) [![conda-forge](https://img.shields.io/conda/vn/conda-forge/pyarrow.svg)](https://anaconda.org/conda-forge/pyarrow)
-
-This library provides a Python API for functionality provided by the Arrow C++
-libraries, along with tools for Arrow integration and interoperability with
-pandas, NumPy, and other software in the Python ecosystem.
-
-## Installing
-
-Across platforms, you can install a recent version of pyarrow with the conda
-package manager:
-
-```shell
-conda install pyarrow -c conda-forge
-```
-
-On Linux, macOS, and Windows, you can also install binary wheels from PyPI with
-pip:
-
-```shell
-pip install pyarrow
-```
-
-If you encounter any issues importing the pip wheels on Windows, you may need
-to install the [Visual C++ Redistributable for Visual Studio 2015][6].
-
-## Development
-
-See [Python Development][2] in the documentation subproject.
-
-### Building the documentation
-
-See [documentation build instructions][1] in the documentation subproject.
-
-[1]: https://github.com/apache/arrow/blob/master/docs/source/developers/documentation.rst
-[2]: https://github.com/apache/arrow/blob/master/docs/source/developers/python.rst
-[3]: https://github.com/pandas-dev/pandas
-[5]: https://arrow.apache.org/docs/latest/python/benchmarks.html
-[6]: https://www.microsoft.com/en-us/download/details.aspx?id=48145
\ No newline at end of file
diff --git a/python/asv-build.sh b/python/asv-build.sh
deleted file mode 100755
index 7de5ff4..0000000
--- a/python/asv-build.sh
+++ /dev/null
@@ -1,75 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-
-# ASV doesn't activate its conda environment for us
-if [ -z "$ASV_ENV_DIR" ]; then exit 1; fi
-
-if [ -z "$CONDA_HOME" ]; then
- echo "Please set \$CONDA_HOME to point to your root conda installation"
- exit 1;
-fi
-
-eval "$($CONDA_HOME/bin/conda shell.bash hook)"
-
-conda activate $ASV_ENV_DIR
-echo "== Conda Prefix for benchmarks: " $CONDA_PREFIX " =="
-
-# Build Arrow C++ libraries
-export ARROW_HOME=$CONDA_PREFIX
-export PARQUET_HOME=$CONDA_PREFIX
-export ORC_HOME=$CONDA_PREFIX
-export PROTOBUF_HOME=$CONDA_PREFIX
-export BOOST_ROOT=$CONDA_PREFIX
-
-pushd ../cpp
-mkdir -p build
-pushd build
-
-cmake -GNinja \
- -DCMAKE_BUILD_TYPE=release \
- -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
- -DARROW_CXXFLAGS=$CXXFLAGS \
- -DARROW_USE_GLOG=off \
- -DARROW_FLIGHT=on \
- -DARROW_ORC=on \
- -DARROW_PARQUET=on \
- -DARROW_PYTHON=on \
- -DARROW_PLASMA=on \
- -DARROW_S3=on \
- -DARROW_BUILD_TESTS=off \
- ..
-cmake --build . --target install
-
-popd
-popd
-
-# Build pyarrow wrappers
-export SETUPTOOLS_SCM_PRETEND_VERSION=0.0.1
-export PYARROW_BUILD_TYPE=release
-export PYARROW_PARALLEL=8
-export PYARROW_WITH_FLIGHT=1
-export PYARROW_WITH_ORC=1
-export PYARROW_WITH_PARQUET=1
-export PYARROW_WITH_PLASMA=1
-
-python setup.py clean
-find pyarrow -name "*.so" -delete
-python setup.py develop
diff --git a/python/asv-install.sh b/python/asv-install.sh
deleted file mode 100755
index beef730..0000000
--- a/python/asv-install.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Deliberately empty, but exists so that we don't have to change
-# asv.conf.json if we need specific commands here.
diff --git a/python/asv-uninstall.sh b/python/asv-uninstall.sh
deleted file mode 100755
index beef730..0000000
--- a/python/asv-uninstall.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Deliberately empty, but exists so that we don't have to change
-# asv.conf.json if we need specific commands here.
diff --git a/python/asv.conf.json b/python/asv.conf.json
deleted file mode 100644
index cdb178c..0000000
--- a/python/asv.conf.json
+++ /dev/null
@@ -1,187 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-{
- // The version of the config file format. Do not change, unless
- // you know what you are doing.
- "version": 1,
-
- // The name of the project being benchmarked
- "project": "pyarrow",
-
- // The project's homepage
- "project_url": "https://arrow.apache.org/",
-
- // The URL or local path of the source code repository for the
- // project being benchmarked
- "repo": "..",
-
- // The Python project's subdirectory in your repo. If missing or
- // the empty string, the project is assumed to be located at the root
- // of the repository.
- "repo_subdir": "python",
-
- // Custom build commands for Arrow.
- "build_command": ["/bin/bash {build_dir}/asv-build.sh"],
- "install_command": ["/bin/bash {build_dir}/asv-install.sh"],
- "uninstall_command": ["/bin/bash {build_dir}/asv-uninstall.sh"],
-
- // List of branches to benchmark. If not provided, defaults to "master"
- // (for git) or "default" (for mercurial).
- // "branches": ["master"], // for git
- // "branches": ["default"], // for mercurial
-
- // The DVCS being used. If not set, it will be automatically
- // determined from "repo" by looking at the protocol in the URL
- // (if remote), or by looking for special directories, such as
- // ".git" (if local).
- "dvcs": "git",
-
- // The tool to use to create environments. May be "conda",
- // "virtualenv" or other value depending on the plugins in use.
- // If missing or the empty string, the tool will be automatically
- // determined by looking for tools on the PATH environment
- // variable.
- "environment_type": "conda",
- // Avoid conda-forge to avoid C++ ABI issues
- "conda_channels": ["defaults"],
-
- // the base URL to show a commit for the project.
- "show_commit_url": "https://github.com/apache/arrow/commit/",
-
- // The Pythons you'd like to test against. If not provided, defaults
- // to the current version of Python used to run `asv`.
- "pythons": ["3.7"],
-
- // The matrix of dependencies to test. Each key is the name of a
- // package (in PyPI) and the values are version numbers. An empty
- // list or empty string indicates to just test against the default
- // (latest) version. null indicates that the package is to not be
- // installed. If the package to be tested is only available from
- // PyPi, and the 'environment_type' is conda, then you can preface
- // the package name by 'pip+', and the package will be installed via
- // pip (with all the conda available packages installed first,
- // followed by the pip installed packages).
- //
- // "matrix": {
- // "numpy": ["1.6", "1.7"],
- // "six": ["", null], // test with and without six installed
- // "pip+emcee": [""], // emcee is only available for install with pip.
- // },
- "matrix": {
- // Use older boost since it works on more editions of the project
- "aws-sdk-cpp": [],
- "boost-cpp": ["1.68.0"],
- "brotli": [],
- "cmake": [],
- "cython": [],
- "flatbuffers": [],
- "grpc-cpp": [],
- "libprotobuf": [],
- "lz4-c": [],
- "ninja": [],
- "numpy": [],
- "pandas": ["0.25.1"],
- "pip+setuptools_scm": [],
- "rapidjson": [],
- "re2": [],
- "snappy": [],
- "thrift-cpp": [],
- "zstd": [],
- },
-
- // Combinations of libraries/python versions can be excluded/included
- // from the set to test. Each entry is a dictionary containing additional
- // key-value pairs to include/exclude.
- //
- // An exclude entry excludes entries where all values match. The
- // values are regexps that should match the whole string.
- //
- // An include entry adds an environment. Only the packages listed
- // are installed. The 'python' key is required. The exclude rules
- // do not apply to includes.
- //
- // In addition to package names, the following keys are available:
- //
- // - python
- // Python version, as in the *pythons* variable above.
- // - environment_type
- // Environment type, as above.
- // - sys_platform
- // Platform, as in sys.platform. Possible values for the common
- // cases: 'linux2', 'win32', 'cygwin', 'darwin'.
- //
- // "exclude": [
- // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
- // {"environment_type": "conda", "six": null}, // don't run without six on conda
- // ],
- //
- // "include": [
- // // additional env for python2.7
- // {"python": "2.7", "numpy": "1.8"},
- // // additional env if run on windows+conda
- // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
- // ],
-
- // The directory (relative to the current directory) that benchmarks are
- // stored in. If not provided, defaults to "benchmarks"
- "benchmark_dir": "benchmarks",
-
- // The directory (relative to the current directory) to cache the Python
- // environments in. If not provided, defaults to "env"
- "env_dir": ".asv/env",
-
- // The directory (relative to the current directory) that raw benchmark
- // results are stored in. If not provided, defaults to "results".
- "results_dir": ".asv/results",
-
- // The directory (relative to the current directory) that the html tree
- // should be written to. If not provided, defaults to "html".
- "html_dir": "build/benchmarks/html",
-
- // The number of characters to retain in the commit hashes.
- // "hash_length": 8,
-
- // `asv` will cache wheels of the recent builds in each
- // environment, making them faster to install next time. This is
- // number of builds to keep, per environment.
- // "wheel_cache_size": 0,
-
- // The commits after which the regression search in `asv publish`
- // should start looking for regressions. Dictionary whose keys are
- // regexps matching to benchmark names, and values corresponding to
- // the commit (exclusive) after which to start looking for
- // regressions. The default is to start from the first commit
- // with results. If the commit is `null`, regression detection is
- // skipped for the matching benchmark.
- //
- // "regressions_first_commits": {
- // "some_benchmark": "352cdf", // Consider regressions only after this commit
- // "another_benchmark": null, // Skip regression detection altogether
- // }
-
- // The thresholds for relative change in results, after which `asv
- // publish` starts reporting regressions. Dictionary of the same
- // form as in ``regressions_first_commits``, with values
- // indicating the thresholds. If multiple entries match, the
- // maximum is taken. If no entry matches, the default is 5%.
- //
- // "regressions_thresholds": {
- // "some_benchmark": 0.01, // Threshold of 1%
- // "another_benchmark": 0.5, // Threshold of 50%
- // }
-}
diff --git a/python/benchmarks/__init__.py b/python/benchmarks/__init__.py
deleted file mode 100644
index 13a8339..0000000
--- a/python/benchmarks/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/python/benchmarks/array_ops.py b/python/benchmarks/array_ops.py
deleted file mode 100644
index 696b171..0000000
--- a/python/benchmarks/array_ops.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pyarrow as pa
-
-
-class ScalarAccess(object):
- n = 10 ** 5
-
- def setUp(self):
- self._array = pa.array(list(range(self.n)), type=pa.int64())
- self._array_items = list(self._array)
-
- def time_getitem(self):
- for i in range(self.n):
- self._array[i]
-
- def time_as_py(self):
- for item in self._array_items:
- item.as_py()
diff --git a/python/benchmarks/common.py b/python/benchmarks/common.py
deleted file mode 100644
index 48526a4..0000000
--- a/python/benchmarks/common.py
+++ /dev/null
@@ -1,349 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import codecs
-import decimal
-from functools import partial
-import itertools
-import sys
-import unicodedata
-
-import numpy as np
-
-import pyarrow as pa
-
-
-KILOBYTE = 1 << 10
-MEGABYTE = KILOBYTE * KILOBYTE
-
-DEFAULT_NONE_PROB = 0.3
-
-
-def _multiplicate_sequence(base, target_size):
- q, r = divmod(target_size, len(base))
- return [base] * q + [base[:r]]
-
-
-def get_random_bytes(n, seed=42):
- """
- Generate a random bytes object of size *n*.
- Note the result might be compressible.
- """
- rnd = np.random.RandomState(seed)
- # Computing a huge random bytestring can be costly, so we get at most
- # 100KB and duplicate the result as needed
- base_size = 100003
- q, r = divmod(n, base_size)
- if q == 0:
- result = rnd.bytes(r)
- else:
- base = rnd.bytes(base_size)
- result = b''.join(_multiplicate_sequence(base, n))
- assert len(result) == n
- return result
-
-
-def get_random_ascii(n, seed=42):
- """
- Get a random ASCII-only unicode string of size *n*.
- """
- arr = np.frombuffer(get_random_bytes(n, seed=seed), dtype=np.int8) & 0x7f
- result, _ = codecs.ascii_decode(arr)
- assert isinstance(result, str)
- assert len(result) == n
- return result
-
-
-def _random_unicode_letters(n, seed=42):
- """
- Generate a string of random unicode letters (slow).
- """
- def _get_more_candidates():
- return rnd.randint(0, sys.maxunicode, size=n).tolist()
-
- rnd = np.random.RandomState(seed)
- out = []
- candidates = []
-
- while len(out) < n:
- if not candidates:
- candidates = _get_more_candidates()
- ch = chr(candidates.pop())
- # XXX Do we actually care that the code points are valid?
- if unicodedata.category(ch)[0] == 'L':
- out.append(ch)
- return out
-
-
-_1024_random_unicode_letters = _random_unicode_letters(1024)
-
-
-def get_random_unicode(n, seed=42):
- """
- Get a random non-ASCII unicode string of size *n*.
- """
- indices = np.frombuffer(get_random_bytes(n * 2, seed=seed),
- dtype=np.int16) & 1023
- unicode_arr = np.array(_1024_random_unicode_letters)[indices]
-
- result = ''.join(unicode_arr.tolist())
- assert len(result) == n, (len(result), len(unicode_arr))
- return result
-
-
-class BuiltinsGenerator(object):
-
- def __init__(self, seed=42):
- self.rnd = np.random.RandomState(seed)
-
- def sprinkle(self, lst, prob, value):
- """
- Sprinkle *value* entries in list *lst* with likelihood *prob*.
- """
- for i, p in enumerate(self.rnd.random_sample(size=len(lst))):
- if p < prob:
- lst[i] = value
-
- def sprinkle_nones(self, lst, prob):
- """
- Sprinkle None entries in list *lst* with likelihood *prob*.
- """
- self.sprinkle(lst, prob, None)
-
- def generate_int_list(self, n, none_prob=DEFAULT_NONE_PROB):
- """
- Generate a list of Python ints with *none_prob* probability of
- an entry being None.
- """
- data = list(range(n))
- self.sprinkle_nones(data, none_prob)
- return data
-
- def generate_float_list(self, n, none_prob=DEFAULT_NONE_PROB,
- use_nan=False):
- """
- Generate a list of Python floats with *none_prob* probability of
- an entry being None (or NaN if *use_nan* is true).
- """
- # Make sure we get Python floats, not np.float64
- data = list(map(float, self.rnd.uniform(0.0, 1.0, n)))
- assert len(data) == n
- self.sprinkle(data, none_prob, value=float('nan') if use_nan else None)
- return data
-
- def generate_bool_list(self, n, none_prob=DEFAULT_NONE_PROB):
- """
- Generate a list of Python bools with *none_prob* probability of
- an entry being None.
- """
- # Make sure we get Python bools, not np.bool_
- data = [bool(x >= 0.5) for x in self.rnd.uniform(0.0, 1.0, n)]
- assert len(data) == n
- self.sprinkle_nones(data, none_prob)
- return data
-
- def generate_decimal_list(self, n, none_prob=DEFAULT_NONE_PROB,
- use_nan=False):
- """
- Generate a list of Python Decimals with *none_prob* probability of
- an entry being None (or NaN if *use_nan* is true).
- """
- data = [decimal.Decimal('%.9f' % f)
- for f in self.rnd.uniform(0.0, 1.0, n)]
- assert len(data) == n
- self.sprinkle(data, none_prob,
- value=decimal.Decimal('nan') if use_nan else None)
- return data
-
- def generate_object_list(self, n, none_prob=DEFAULT_NONE_PROB):
- """
- Generate a list of generic Python objects with *none_prob*
- probability of an entry being None.
- """
- data = [object() for i in range(n)]
- self.sprinkle_nones(data, none_prob)
- return data
-
- def _generate_varying_sequences(self, random_factory, n, min_size,
- max_size, none_prob):
- """
- Generate a list of *n* sequences of varying size between *min_size*
- and *max_size*, with *none_prob* probability of an entry being None.
- The base material for each sequence is obtained by calling
- `random_factory(<some size>)`
- """
- base_size = 10000
- base = random_factory(base_size + max_size)
- data = []
- for i in range(n):
- off = self.rnd.randint(base_size)
- if min_size == max_size:
- size = min_size
- else:
- size = self.rnd.randint(min_size, max_size + 1)
- data.append(base[off:off + size])
- self.sprinkle_nones(data, none_prob)
- assert len(data) == n
- return data
-
- def generate_fixed_binary_list(self, n, size, none_prob=DEFAULT_NONE_PROB):
- """
- Generate a list of bytestrings with a fixed *size*.
- """
- return self._generate_varying_sequences(get_random_bytes, n,
- size, size, none_prob)
-
- def generate_varying_binary_list(self, n, min_size, max_size,
- none_prob=DEFAULT_NONE_PROB):
- """
- Generate a list of bytestrings with a random size between
- *min_size* and *max_size*.
- """
- return self._generate_varying_sequences(get_random_bytes, n,
- min_size, max_size, none_prob)
-
- def generate_ascii_string_list(self, n, min_size, max_size,
- none_prob=DEFAULT_NONE_PROB):
- """
- Generate a list of ASCII strings with a random size between
- *min_size* and *max_size*.
- """
- return self._generate_varying_sequences(get_random_ascii, n,
- min_size, max_size, none_prob)
-
- def generate_unicode_string_list(self, n, min_size, max_size,
- none_prob=DEFAULT_NONE_PROB):
- """
- Generate a list of unicode strings with a random size between
- *min_size* and *max_size*.
- """
- return self._generate_varying_sequences(get_random_unicode, n,
- min_size, max_size, none_prob)
-
- def generate_int_list_list(self, n, min_size, max_size,
- none_prob=DEFAULT_NONE_PROB):
- """
- Generate a list of lists of Python ints with a random size between
- *min_size* and *max_size*.
- """
- return self._generate_varying_sequences(
- partial(self.generate_int_list, none_prob=none_prob),
- n, min_size, max_size, none_prob)
-
- def generate_tuple_list(self, n, none_prob=DEFAULT_NONE_PROB):
- """
- Generate a list of tuples with random values.
- Each tuple has the form `(int value, float value, bool value)`
- """
- dicts = self.generate_dict_list(n, none_prob=none_prob)
- tuples = [(d.get('u'), d.get('v'), d.get('w'))
- if d is not None else None
- for d in dicts]
- assert len(tuples) == n
- return tuples
-
- def generate_dict_list(self, n, none_prob=DEFAULT_NONE_PROB):
- """
- Generate a list of dicts with random values.
- Each dict has the form
-
- `{'u': int value, 'v': float value, 'w': bool value}`
- """
- ints = self.generate_int_list(n, none_prob=none_prob)
- floats = self.generate_float_list(n, none_prob=none_prob)
- bools = self.generate_bool_list(n, none_prob=none_prob)
- dicts = []
- # Keep half the Nones, omit the other half
- keep_nones = itertools.cycle([True, False])
- for u, v, w in zip(ints, floats, bools):
- d = {}
- if u is not None or next(keep_nones):
- d['u'] = u
- if v is not None or next(keep_nones):
- d['v'] = v
- if w is not None or next(keep_nones):
- d['w'] = w
- dicts.append(d)
- self.sprinkle_nones(dicts, none_prob)
- assert len(dicts) == n
- return dicts
-
- def get_type_and_builtins(self, n, type_name):
- """
- Return a `(arrow type, list)` tuple where the arrow type
- corresponds to the given logical *type_name*, and the list
- is a list of *n* random-generated Python objects compatible
- with the arrow type.
- """
- size = None
-
- if type_name in ('bool', 'decimal', 'ascii', 'unicode', 'int64 list', 'struct from tuples'):
- kind = type_name
- elif type_name.startswith(('int', 'uint')):
- kind = 'int'
- elif type_name.startswith('float'):
- kind = 'float'
- elif type_name.startswith('struct'):
- kind = 'struct'
- elif type_name == 'binary':
- kind = 'varying binary'
- elif type_name.startswith('binary'):
- kind = 'fixed binary'
- size = int(type_name[6:])
- assert size > 0
- else:
- raise ValueError("unrecognized type %r" % (type_name,))
-
- if kind in ('int', 'float'):
- ty = getattr(pa, type_name)()
- elif kind == 'bool':
- ty = pa.bool_()
- elif kind == 'decimal':
- ty = pa.decimal128(9, 9)
- elif kind == 'fixed binary':
- ty = pa.binary(size)
- elif kind == 'varying binary':
- ty = pa.binary()
- elif kind in ('ascii', 'unicode'):
- ty = pa.string()
- elif kind == 'int64 list':
- ty = pa.list_(pa.int64())
- elif kind in ('struct', 'struct from tuples'):
- ty = pa.struct([pa.field('u', pa.int64()),
- pa.field('v', pa.float64()),
- pa.field('w', pa.bool_())])
-
- factories = {
- 'int': self.generate_int_list,
- 'float': self.generate_float_list,
- 'bool': self.generate_bool_list,
- 'decimal': self.generate_decimal_list,
- 'fixed binary': partial(self.generate_fixed_binary_list,
- size=size),
- 'varying binary': partial(self.generate_varying_binary_list,
- min_size=3, max_size=40),
- 'ascii': partial(self.generate_ascii_string_list,
- min_size=3, max_size=40),
- 'unicode': partial(self.generate_unicode_string_list,
- min_size=3, max_size=40),
- 'int64 list': partial(self.generate_int_list_list,
- min_size=0, max_size=20),
- 'struct': self.generate_dict_list,
- 'struct from tuples': self.generate_tuple_list,
- }
- data = factories[kind](n)
- return ty, data
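
For context, a minimal sketch of how the generator above is driven by the conversion benchmarks that follow (assuming pyarrow is installed and the removed common.py is importable as `common`; the real benchmarks use a package-relative import):

    import pyarrow as pa

    import common  # the benchmark helper module shown above

    gen = common.BuiltinsGenerator()
    # Ask for 1000 random Python ints and the matching Arrow type, ...
    ty, data = gen.get_type_and_builtins(1000, 'int64')
    # ... then convert them, which is what ConvertPyListToArray times below.
    arr = pa.array(data, type=ty)
    assert arr.type == ty and len(arr) == 1000
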
diff --git a/python/benchmarks/convert_builtins.py b/python/benchmarks/convert_builtins.py
deleted file mode 100644
index 48a38fa..0000000
--- a/python/benchmarks/convert_builtins.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pyarrow as pa
-
-from . import common
-
-
-# TODO:
-# - test dates and times
-
-
-class ConvertPyListToArray(object):
- """
- Benchmark pa.array(list of values, type=...)
- """
- size = 10 ** 5
- types = ('int32', 'uint32', 'int64', 'uint64',
- 'float32', 'float64', 'bool', 'decimal',
- 'binary', 'binary10', 'ascii', 'unicode',
- 'int64 list', 'struct', 'struct from tuples')
-
- param_names = ['type']
- params = [types]
-
- def setup(self, type_name):
- gen = common.BuiltinsGenerator()
- self.ty, self.data = gen.get_type_and_builtins(self.size, type_name)
-
- def time_convert(self, *args):
- pa.array(self.data, type=self.ty)
-
-
-class InferPyListToArray(object):
- """
- Benchmark pa.array(list of values) with type inference
- """
- size = 10 ** 5
- types = ('int64', 'float64', 'bool', 'decimal', 'binary', 'ascii',
- 'unicode', 'int64 list', 'struct')
-
- param_names = ['type']
- params = [types]
-
- def setup(self, type_name):
- gen = common.BuiltinsGenerator()
- self.ty, self.data = gen.get_type_and_builtins(self.size, type_name)
-
- def time_infer(self, *args):
- arr = pa.array(self.data)
- assert arr.type == self.ty
-
-
-class ConvertArrayToPyList(object):
- """
- Benchmark pa.array.to_pylist()
- """
- size = 10 ** 5
- types = ('int32', 'uint32', 'int64', 'uint64',
- 'float32', 'float64', 'bool', 'decimal',
- 'binary', 'binary10', 'ascii', 'unicode',
- 'int64 list', 'struct')
-
- param_names = ['type']
- params = [types]
-
- def setup(self, type_name):
- gen = common.BuiltinsGenerator()
- self.ty, self.data = gen.get_type_and_builtins(self.size, type_name)
- self.arr = pa.array(self.data, type=self.ty)
-
- def time_convert(self, *args):
- self.arr.to_pylist()
diff --git a/python/benchmarks/convert_pandas.py b/python/benchmarks/convert_pandas.py
deleted file mode 100644
index 9cf6bde..0000000
--- a/python/benchmarks/convert_pandas.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-import pandas as pd
-import pandas.util.testing as tm
-
-import pyarrow as pa
-
-
-class PandasConversionsBase(object):
- def setup(self, n, dtype):
- if dtype == 'float64_nans':
- arr = np.arange(n).astype('float64')
- arr[arr % 10 == 0] = np.nan
- else:
- arr = np.arange(n).astype(dtype)
- self.data = pd.DataFrame({'column': arr})
-
-
-class PandasConversionsToArrow(PandasConversionsBase):
- param_names = ('size', 'dtype')
- params = ((10, 10 ** 6), ('int64', 'float64', 'float64_nans', 'str'))
-
- def time_from_series(self, n, dtype):
- pa.Table.from_pandas(self.data)
-
-
-class PandasConversionsFromArrow(PandasConversionsBase):
- param_names = ('size', 'dtype')
- params = ((10, 10 ** 6), ('int64', 'float64', 'float64_nans', 'str'))
-
- def setup(self, n, dtype):
- super(PandasConversionsFromArrow, self).setup(n, dtype)
- self.arrow_data = pa.Table.from_pandas(self.data)
-
- def time_to_series(self, n, dtype):
- self.arrow_data.to_pandas()
-
-
-class ToPandasStrings(object):
-
- param_names = ('uniqueness', 'total')
- params = ((0.001, 0.01, 0.1, 0.5), (1000000,))
- string_length = 25
-
- def setup(self, uniqueness, total):
- nunique = int(total * uniqueness)
- unique_values = [tm.rands(self.string_length) for i in range(nunique)]
- values = unique_values * (total // nunique)
- self.arr = pa.array(values, type=pa.string())
- self.table = pa.Table.from_arrays([self.arr], ['f0'])
-
- def time_to_pandas_dedup(self, *args):
- self.arr.to_pandas()
-
- def time_to_pandas_no_dedup(self, *args):
- self.arr.to_pandas(deduplicate_objects=False)
-
-
-class ZeroCopyPandasRead(object):
-
- def setup(self):
- # Transpose to make column-major
- values = np.random.randn(10, 100000)
-
- df = pd.DataFrame(values.T)
- ctx = pa.default_serialization_context()
-
- self.serialized = ctx.serialize(df)
- self.as_buffer = self.serialized.to_buffer()
- self.as_components = self.serialized.to_components()
-
- def time_deserialize_from_buffer(self):
- pa.deserialize(self.as_buffer)
-
- def time_deserialize_from_components(self):
- pa.deserialize_components(self.as_components)
-
-
-class SerializeDeserializePandas(object):
-
- def setup(self):
- # 10 million length
- n = 10000000
- self.df = pd.DataFrame({'data': np.random.randn(n)})
- self.serialized = pa.serialize_pandas(self.df)
-
- def time_serialize_pandas(self):
- pa.serialize_pandas(self.df)
-
- def time_deserialize_pandas(self):
- pa.deserialize_pandas(self.serialized)
-
-
-class TableFromPandasMicroperformance(object):
- # ARROW-4629
-
- def setup(self):
- ser = pd.Series(range(10000))
- df = pd.DataFrame({col: ser.copy(deep=True) for col in range(100)})
- # Simulate a real dataset by converting some columns to strings
- self.df = df.astype({col: str for col in range(50)})
-
- def time_Table_from_pandas(self):
- for _ in range(50):
- pa.Table.from_pandas(self.df, nthreads=1)
diff --git a/python/benchmarks/io.py b/python/benchmarks/io.py
deleted file mode 100644
index 01a9acb..0000000
--- a/python/benchmarks/io.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import time
-import pyarrow as pa
-
-
-class HighLatencyReader(object):
-
- def __init__(self, raw, latency):
- self.raw = raw
- self.latency = latency
-
- def close(self):
- self.raw.close()
-
- @property
- def closed(self):
- return self.raw.closed
-
- def read(self, nbytes=None):
- time.sleep(self.latency)
- return self.raw.read(nbytes)
-
-
-class HighLatencyWriter(object):
-
- def __init__(self, raw, latency):
- self.raw = raw
- self.latency = latency
-
- def close(self):
- self.raw.close()
-
- @property
- def closed(self):
- return self.raw.closed
-
- def write(self, data):
- time.sleep(self.latency)
- self.raw.write(data)
-
-
-class BufferedIOHighLatency(object):
- """Benchmark creating a parquet manifest."""
-
- increment = 1024
- total_size = 16 * (1 << 20) # 16 MB
- buffer_size = 1 << 20 # 1 MB
- latency = 0.1 # 100ms
-
- param_names = ('latency',)
- params = [0, 0.01, 0.1]
-
- def time_buffered_writes(self, latency):
- test_data = b'x' * self.increment
- bytes_written = 0
- out = pa.BufferOutputStream()
- slow_out = HighLatencyWriter(out, latency)
- buffered_out = pa.output_stream(slow_out, buffer_size=self.buffer_size)
-
- while bytes_written < self.total_size:
- buffered_out.write(test_data)
- bytes_written += self.increment
- buffered_out.flush()
-
- def time_buffered_reads(self, latency):
- bytes_read = 0
- reader = pa.input_stream(pa.py_buffer(b'x' * self.total_size))
- slow_reader = HighLatencyReader(reader, latency)
- buffered_reader = pa.input_stream(slow_reader,
- buffer_size=self.buffer_size)
- while bytes_read < self.total_size:
- buffered_reader.read(self.increment)
- bytes_read += self.increment
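
The buffered-I/O benchmark above relies on pyarrow's buffered stream wrappers; a minimal standalone sketch of the same calls (assuming only pyarrow and the standard library):

    import io

    import pyarrow as pa

    raw = io.BytesIO()
    # Buffer writes so the underlying (possibly high-latency) sink sees a few
    # large writes instead of many small 1 KB increments.
    buffered_out = pa.output_stream(raw, buffer_size=1 << 20)  # 1 MB buffer
    for _ in range(16):
        buffered_out.write(b'x' * 1024)
    buffered_out.flush()

    # The same idea on the read side: reads are served from a 1 MB buffer.
    buffered_in = pa.input_stream(io.BytesIO(raw.getvalue()),
                                  buffer_size=1 << 20)
    first_chunk = buffered_in.read(1024)
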
diff --git a/python/benchmarks/microbenchmarks.py b/python/benchmarks/microbenchmarks.py
deleted file mode 100644
index f8ba383..0000000
--- a/python/benchmarks/microbenchmarks.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import pyarrow.benchmark as pb
-
-from . import common
-
-
-class PandasObjectIsNull(object):
- size = 10 ** 5
- types = ('int', 'float', 'object', 'decimal')
-
- param_names = ['type']
- params = [types]
-
- def setup(self, type_name):
- gen = common.BuiltinsGenerator()
- if type_name == 'int':
- lst = gen.generate_int_list(self.size)
- elif type_name == 'float':
- lst = gen.generate_float_list(self.size, use_nan=True)
- elif type_name == 'object':
- lst = gen.generate_object_list(self.size)
- elif type_name == 'decimal':
- lst = gen.generate_decimal_list(self.size)
- else:
- assert 0
- self.lst = lst
-
- def time_PandasObjectIsNull(self, *args):
- pb.benchmark_PandasObjectIsNull(self.lst)
diff --git a/python/benchmarks/parquet.py b/python/benchmarks/parquet.py
deleted file mode 100644
index 3aeca42..0000000
--- a/python/benchmarks/parquet.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import shutil
-import tempfile
-
-from pandas.util.testing import rands
-import numpy as np
-import pandas as pd
-
-import pyarrow as pa
-try:
- import pyarrow.parquet as pq
-except ImportError:
- pq = None
-
-
-class ParquetManifestCreation(object):
- """Benchmark creating a parquet manifest."""
-
- size = 10 ** 6
- tmpdir = None
-
- param_names = ('num_partitions', 'num_threads')
- params = [(10, 100, 1000), (1, 8)]
-
- def setup(self, num_partitions, num_threads):
- if pq is None:
- raise NotImplementedError("Parquet support not enabled")
-
- self.tmpdir = tempfile.mkdtemp('benchmark_parquet')
- rnd = np.random.RandomState(42)
- num1 = rnd.randint(0, num_partitions, size=self.size)
- num2 = rnd.randint(0, 1000, size=self.size)
- output_df = pd.DataFrame({'num1': num1, 'num2': num2})
- output_table = pa.Table.from_pandas(output_df)
- pq.write_to_dataset(output_table, self.tmpdir, ['num1'])
-
- def teardown(self, num_partitions, num_threads):
- if self.tmpdir is not None:
- shutil.rmtree(self.tmpdir)
-
- def time_manifest_creation(self, num_partitions, num_threads):
- pq.ParquetManifest(self.tmpdir, metadata_nthreads=num_threads)
-
-
-class ParquetWriteBinary(object):
-
- def setup(self):
- nuniques = 100000
- value_size = 50
- length = 1000000
- num_cols = 10
-
- unique_values = np.array([rands(value_size) for
- i in range(nuniques)], dtype='O')
- values = unique_values[np.random.randint(0, nuniques, size=length)]
- self.table = pa.table([pa.array(values) for i in range(num_cols)],
- names=['f{}'.format(i) for i in range(num_cols)])
- self.table_df = self.table.to_pandas()
-
- def time_write_binary_table(self):
- out = pa.BufferOutputStream()
- pq.write_table(self.table, out)
-
- def time_write_binary_table_uncompressed(self):
- out = pa.BufferOutputStream()
- pq.write_table(self.table, out, compression='none')
-
- def time_write_binary_table_no_dictionary(self):
- out = pa.BufferOutputStream()
- pq.write_table(self.table, out, use_dictionary=False)
-
- def time_convert_pandas_and_write_binary_table(self):
- out = pa.BufferOutputStream()
- pq.write_table(pa.table(self.table_df), out)
-
-
-def generate_dict_strings(string_size, nunique, length, random_order=True):
- uniques = np.array([rands(string_size) for i in range(nunique)], dtype='O')
- if random_order:
- indices = np.random.randint(0, nunique, size=length).astype('i4')
- else:
- indices = np.arange(nunique).astype('i4').repeat(length // nunique)
- return pa.DictionaryArray.from_arrays(indices, uniques)
-
-
-def generate_dict_table(num_cols, string_size, nunique, length,
- random_order=True):
- data = generate_dict_strings(string_size, nunique, length,
- random_order=random_order)
- return pa.table([
- data for i in range(num_cols)
- ], names=['f{}'.format(i) for i in range(num_cols)])
-
-
-class ParquetWriteDictionaries(object):
-
- param_names = ('nunique',)
- params = [(1000), (100000)]
-
- def setup(self, nunique):
- self.num_cols = 10
- self.value_size = 32
- self.nunique = nunique
- self.length = 10000000
-
- self.table = generate_dict_table(self.num_cols, self.value_size,
- self.nunique, self.length)
- self.table_sequential = generate_dict_table(self.num_cols,
- self.value_size,
- self.nunique, self.length,
- random_order=False)
-
- def time_write_random_order(self, nunique):
- pq.write_table(self.table, pa.BufferOutputStream())
-
- def time_write_sequential(self, nunique):
- pq.write_table(self.table_sequential, pa.BufferOutputStream())
-
-
-class ParquetManyColumns(object):
-
- total_cells = 10000000
- param_names = ('num_cols',)
- params = [100, 1000, 10000]
-
- def setup(self, num_cols):
- num_rows = self.total_cells // num_cols
- self.table = pa.table({'c' + str(i): np.random.randn(num_rows)
- for i in range(num_cols)})
-
- out = pa.BufferOutputStream()
- pq.write_table(self.table, out)
- self.buf = out.getvalue()
-
- def time_write(self, num_cols):
- out = pa.BufferOutputStream()
- pq.write_table(self.table, out)
-
- def time_read(self, num_cols):
- pq.read_table(self.buf)
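
ParquetManifestCreation above exercises partitioned datasets; a minimal sketch of the same write/read path (assuming pyarrow is built with Parquet support and a writable temp directory):

    import tempfile

    import numpy as np
    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    # Write a small table partitioned on 'num1', one directory per value.
    df = pd.DataFrame({'num1': np.arange(100) % 10, 'num2': np.arange(100)})
    table = pa.Table.from_pandas(df)
    tmpdir = tempfile.mkdtemp('parquet_example')
    pq.write_to_dataset(table, tmpdir, ['num1'])

    # Reading the directory back reassembles the partitions into one table.
    round_tripped = pq.read_table(tmpdir)
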
diff --git a/python/benchmarks/plasma.py b/python/benchmarks/plasma.py
deleted file mode 100644
index 90a2845..0000000
--- a/python/benchmarks/plasma.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-import timeit
-
-try:
- import pyarrow.plasma as plasma
-except ImportError:
- # TODO(wesm): These are not asv benchmarks, so we can just fail
- # silently here
- pass
-
-
-class SimplePlasmaThroughput(object):
- """Benchmark plasma store throughput with a single client."""
-
- params = [1000, 100000, 10000000]
-
- timer = timeit.default_timer
-
- def setup(self, size):
- self.plasma_store_ctx = plasma.start_plasma_store(
- plasma_store_memory=10**9)
- plasma_store_name, p = self.plasma_store_ctx.__enter__()
- self.plasma_client = plasma.connect(plasma_store_name)
-
- self.data = np.random.randn(size // 8)
-
- def teardown(self, size):
- self.plasma_store_ctx.__exit__(None, None, None)
-
- def time_plasma_put_data(self, size):
- self.plasma_client.put(self.data)
-
-
-class SimplePlasmaLatency(object):
- """Benchmark plasma store latency with a single client."""
-
- timer = timeit.default_timer
-
- def setup(self):
- self.plasma_store_ctx = plasma.start_plasma_store(
- plasma_store_memory=10**9)
- plasma_store_name, p = self.plasma_store_ctx.__enter__()
- self.plasma_client = plasma.connect(plasma_store_name)
-
- def teardown(self):
- self.plasma_store_ctx.__exit__(None, None, None)
-
- def time_plasma_put(self):
- for i in range(1000):
- self.plasma_client.put(1)
-
- def time_plasma_putget(self):
- for i in range(1000):
- x = self.plasma_client.put(1)
- self.plasma_client.get(x)
diff --git a/python/benchmarks/streaming.py b/python/benchmarks/streaming.py
deleted file mode 100644
index c0c63e6..0000000
--- a/python/benchmarks/streaming.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-import pandas as pd
-import pyarrow as pa
-
-from . import common
-from .common import KILOBYTE, MEGABYTE
-
-
-def generate_chunks(total_size, nchunks, ncols, dtype=np.dtype('int64')):
- rowsize = total_size // nchunks // ncols
- assert rowsize % dtype.itemsize == 0
-
- def make_column(col, chunk):
- return np.frombuffer(common.get_random_bytes(
- rowsize, seed=col + 997 * chunk)).view(dtype)
-
- return [pd.DataFrame({
- 'c' + str(col): make_column(col, chunk)
- for col in range(ncols)})
- for chunk in range(nchunks)]
-
-
-class StreamReader(object):
- """
- Benchmark in-memory streaming to a Pandas dataframe.
- """
- total_size = 64 * MEGABYTE
- ncols = 8
- chunk_sizes = [16 * KILOBYTE, 256 * KILOBYTE, 8 * MEGABYTE]
-
- param_names = ['chunk_size']
- params = [chunk_sizes]
-
- def setup(self, chunk_size):
- # Note we're careful to stream different chunks instead of
- # streaming N times the same chunk, so that we avoid operating
- # entirely out of L1/L2.
- chunks = generate_chunks(self.total_size,
- nchunks=self.total_size // chunk_size,
- ncols=self.ncols)
- batches = [pa.RecordBatch.from_pandas(df)
- for df in chunks]
- schema = batches[0].schema
- sink = pa.BufferOutputStream()
- stream_writer = pa.RecordBatchStreamWriter(sink, schema)
- for batch in batches:
- stream_writer.write_batch(batch)
- self.source = sink.getvalue()
-
- def time_read_to_dataframe(self, *args):
- reader = pa.RecordBatchStreamReader(self.source)
- table = reader.read_all()
- df = table.to_pandas() # noqa
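
StreamReader above measures exactly this round trip; a compact sketch of writing an IPC stream to memory and reading it back into pandas (assuming pyarrow and pandas are installed):

    import pandas as pd
    import pyarrow as pa

    batches = [pa.RecordBatch.from_pandas(pd.DataFrame({'c0': range(1000)}))
               for _ in range(4)]

    # Write the batches to an in-memory IPC stream ...
    sink = pa.BufferOutputStream()
    writer = pa.RecordBatchStreamWriter(sink, batches[0].schema)
    for batch in batches:
        writer.write_batch(batch)
    writer.close()

    # ... then read the stream back and convert to a DataFrame.
    reader = pa.RecordBatchStreamReader(sink.getvalue())
    df = reader.read_all().to_pandas()
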
diff --git a/python/cmake_modules b/python/cmake_modules
deleted file mode 120000
index 76e2a8d..0000000
--- a/python/cmake_modules
+++ /dev/null
@@ -1 +0,0 @@
-../cpp/cmake_modules
\ No newline at end of file
diff --git a/python/examples/flight/client.py b/python/examples/flight/client.py
deleted file mode 100644
index ed6ce54..0000000
--- a/python/examples/flight/client.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""An example Flight CLI client."""
-
-import argparse
-import sys
-
-import pyarrow
-import pyarrow.flight
-import pyarrow.csv as csv
-
-
-def list_flights(args, client, connection_args={}):
- print('Flights\n=======')
- for flight in client.list_flights():
- descriptor = flight.descriptor
- if descriptor.descriptor_type == pyarrow.flight.DescriptorType.PATH:
- print("Path:", descriptor.path)
- elif descriptor.descriptor_type == pyarrow.flight.DescriptorType.CMD:
- print("Command:", descriptor.command)
- else:
- print("Unknown descriptor type")
-
- print("Total records:", end=" ")
- if flight.total_records >= 0:
- print(flight.total_records)
- else:
- print("Unknown")
-
- print("Total bytes:", end=" ")
- if flight.total_bytes >= 0:
- print(flight.total_bytes)
- else:
- print("Unknown")
-
- print("Number of endpoints:", len(flight.endpoints))
- print("Schema:")
- print(flight.schema)
- print('---')
-
- print('\nActions\n=======')
- for action in client.list_actions():
- print("Type:", action.type)
- print("Description:", action.description)
- print('---')
-
-
-def do_action(args, client, connection_args={}):
- try:
- buf = pyarrow.allocate_buffer(0)
- action = pyarrow.flight.Action(args.action_type, buf)
- print('Running action', args.action_type)
- for result in client.do_action(action):
- print("Got result", result.body.to_pybytes())
- except pyarrow.lib.ArrowIOError as e:
- print("Error calling action:", e)
-
-
-def push_data(args, client, connection_args={}):
- print('File Name:', args.file)
- my_table = csv.read_csv(args.file)
- print('Table rows=', str(len(my_table)))
- df = my_table.to_pandas()
- print(df.head())
- writer, _ = client.do_put(
- pyarrow.flight.FlightDescriptor.for_path(args.file), my_table.schema)
- writer.write_table(my_table)
- writer.close()
-
-
-def get_flight(args, client, connection_args={}):
- if args.path:
- descriptor = pyarrow.flight.FlightDescriptor.for_path(*args.path)
- else:
- descriptor = pyarrow.flight.FlightDescriptor.for_command(args.command)
-
- info = client.get_flight_info(descriptor)
- for endpoint in info.endpoints:
- print('Ticket:', endpoint.ticket)
- for location in endpoint.locations:
- print(location)
- get_client = pyarrow.flight.FlightClient(location,
- **connection_args)
- reader = get_client.do_get(endpoint.ticket)
- df = reader.read_pandas()
- print(df)
-
-
-def _add_common_arguments(parser):
- parser.add_argument('--tls', action='store_true',
- help='Enable transport-level security')
- parser.add_argument('--tls-roots', default=None,
- help='Path to trusted TLS certificate(s)')
- parser.add_argument("--mtls", nargs=2, default=None,
- metavar=('CERTFILE', 'KEYFILE'),
- help="Enable transport-level security")
- parser.add_argument('host', type=str,
- help="Address or hostname to connect to")
-
-
-def main():
- parser = argparse.ArgumentParser()
- subcommands = parser.add_subparsers()
-
- cmd_list = subcommands.add_parser('list')
- cmd_list.set_defaults(action='list')
- _add_common_arguments(cmd_list)
- cmd_list.add_argument('-l', '--list', action='store_true',
- help="Print more details.")
-
- cmd_do = subcommands.add_parser('do')
- cmd_do.set_defaults(action='do')
- _add_common_arguments(cmd_do)
- cmd_do.add_argument('action_type', type=str,
- help="The action type to run.")
-
- cmd_put = subcommands.add_parser('put')
- cmd_put.set_defaults(action='put')
- _add_common_arguments(cmd_put)
- cmd_put.add_argument('file', type=str,
- help="CSV file to upload.")
-
- cmd_get = subcommands.add_parser('get')
- cmd_get.set_defaults(action='get')
- _add_common_arguments(cmd_get)
- cmd_get_descriptor = cmd_get.add_mutually_exclusive_group(required=True)
- cmd_get_descriptor.add_argument('-p', '--path', type=str, action='append',
- help="The path for the descriptor.")
- cmd_get_descriptor.add_argument('-c', '--command', type=str,
- help="The command for the descriptor.")
-
- args = parser.parse_args()
- if not hasattr(args, 'action'):
- parser.print_help()
- sys.exit(1)
-
- commands = {
- 'list': list_flights,
- 'do': do_action,
- 'get': get_flight,
- 'put': push_data,
- }
- host, port = args.host.split(':')
- port = int(port)
- scheme = "grpc+tcp"
- connection_args = {}
- if args.tls:
- scheme = "grpc+tls"
- if args.tls_roots:
- with open(args.tls_roots, "rb") as root_certs:
- connection_args["tls_root_certs"] = root_certs.read()
- if args.mtls:
- with open(args.mtls[0], "rb") as cert_file:
- tls_cert_chain = cert_file.read()
- with open(args.mtls[1], "rb") as key_file:
- tls_private_key = key_file.read()
- connection_args["cert_chain"] = tls_cert_chain
- connection_args["private_key"] = tls_private_key
- client = pyarrow.flight.FlightClient(f"{scheme}://{host}:{port}",
- **connection_args)
- while True:
- try:
- action = pyarrow.flight.Action("healthcheck", b"")
- options = pyarrow.flight.FlightCallOptions(timeout=1)
- list(client.do_action(action, options=options))
- break
- except pyarrow.ArrowIOError as e:
- if "Deadline" in str(e):
- print("Server is not ready, waiting...")
- commands[args.action](args, client, connection_args)
-
-
-if __name__ == '__main__':
- main()
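
The client's startup logic boils down to connecting and polling the server's no-op "healthcheck" action until it answers; a minimal sketch of just that part (assuming a Flight server such as the example server below is listening on localhost:5005):

    import pyarrow
    import pyarrow.flight

    client = pyarrow.flight.FlightClient("grpc+tcp://localhost:5005")
    while True:
        try:
            options = pyarrow.flight.FlightCallOptions(timeout=1)
            action = pyarrow.flight.Action("healthcheck", b"")
            list(client.do_action(action, options=options))
            break  # the server answered, so it is ready for real calls
        except pyarrow.ArrowIOError as e:
            if "Deadline" in str(e):
                print("Server is not ready, waiting...")
            else:
                raise
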
diff --git a/python/examples/flight/middleware.py b/python/examples/flight/middleware.py
deleted file mode 100644
index 2056bae..0000000
--- a/python/examples/flight/middleware.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Example of invisibly propagating a request ID with middleware."""
-
-import argparse
-import sys
-import threading
-import uuid
-
-import pyarrow as pa
-import pyarrow.flight as flight
-
-
-class TraceContext:
- _locals = threading.local()
- _locals.trace_id = None
-
- @classmethod
- def current_trace_id(cls):
- if not getattr(cls._locals, "trace_id", None):
- cls.set_trace_id(uuid.uuid4().hex)
- return cls._locals.trace_id
-
- @classmethod
- def set_trace_id(cls, trace_id):
- cls._locals.trace_id = trace_id
-
-
-TRACE_HEADER = "x-tracing-id"
-
-
-class TracingServerMiddleware(flight.ServerMiddleware):
- def __init__(self, trace_id):
- self.trace_id = trace_id
-
- def sending_headers(self):
- return {
- TRACE_HEADER: self.trace_id,
- }
-
-
-class TracingServerMiddlewareFactory(flight.ServerMiddlewareFactory):
- def start_call(self, info, headers):
- print("Starting new call:", info)
- if TRACE_HEADER in headers:
- trace_id = headers[TRACE_HEADER][0]
- print("Found trace header with value:", trace_id)
- TraceContext.set_trace_id(trace_id)
- return TracingServerMiddleware(TraceContext.current_trace_id())
-
-
-class TracingClientMiddleware(flight.ClientMiddleware):
- def sending_headers(self):
- print("Sending trace ID:", TraceContext.current_trace_id())
- return {
- "x-tracing-id": TraceContext.current_trace_id(),
- }
-
- def received_headers(self, headers):
- if TRACE_HEADER in headers:
- trace_id = headers[TRACE_HEADER][0]
- print("Found trace header with value:", trace_id)
- # Don't overwrite our trace ID
-
-
-class TracingClientMiddlewareFactory(flight.ClientMiddlewareFactory):
- def start_call(self, info):
- print("Starting new call:", info)
- return TracingClientMiddleware()
-
-
-class FlightServer(flight.FlightServerBase):
- def __init__(self, delegate, **kwargs):
- super().__init__(**kwargs)
- if delegate:
- self.delegate = flight.connect(
- delegate,
- middleware=(TracingClientMiddlewareFactory(),))
- else:
- self.delegate = None
-
- def list_actions(self, context):
- return [
- ("get-trace-id", "Get the trace context ID."),
- ]
-
- def do_action(self, context, action):
- trace_middleware = context.get_middleware("trace")
- if trace_middleware:
- TraceContext.set_trace_id(trace_middleware.trace_id)
- if action.type == "get-trace-id":
- if self.delegate:
- for result in self.delegate.do_action(action):
- yield result
- else:
- trace_id = TraceContext.current_trace_id().encode("utf-8")
- print("Returning trace ID:", trace_id)
- buf = pa.py_buffer(trace_id)
- yield pa.flight.Result(buf)
- else:
- raise KeyError(f"Unknown action {action.type!r}")
-
-
-def main():
- parser = argparse.ArgumentParser()
-
- subparsers = parser.add_subparsers(dest="command")
- client = subparsers.add_parser("client", help="Run the client.")
- client.add_argument("server")
- client.add_argument("--request-id", default=None)
-
- server = subparsers.add_parser("server", help="Run the server.")
- server.add_argument(
- "--listen",
- required=True,
- help="The location to listen on (example: grpc://localhost:5050)",
- )
- server.add_argument(
- "--delegate",
- required=False,
- default=None,
- help=("A location to delegate to. That is, this server will "
- "simply call the given server for the response. Demonstrates "
- "propagation of the trace ID between servers."),
- )
-
- args = parser.parse_args()
- if not getattr(args, "command"):
- parser.print_help()
- return 1
-
- if args.command == "server":
- server = FlightServer(
- args.delegate,
- location=args.listen,
- middleware={"trace": TracingServerMiddlewareFactory()})
- server.serve()
- elif args.command == "client":
- client = flight.connect(
- args.server,
- middleware=(TracingClientMiddlewareFactory(),))
- if args.request_id:
- TraceContext.set_trace_id(args.request_id)
- else:
- TraceContext.set_trace_id("client-chosen-id")
-
- for result in client.do_action(flight.Action("get-trace-id", b"")):
- print(result.body.to_pybytes())
-
-
-if __name__ == "__main__":
- sys.exit(main() or 0)
diff --git a/python/examples/flight/server.py b/python/examples/flight/server.py
deleted file mode 100644
index 7a6b669..0000000
--- a/python/examples/flight/server.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""An example Flight Python server."""
-
-import argparse
-import ast
-import threading
-import time
-
-import pyarrow
-import pyarrow.flight
-
-
-class FlightServer(pyarrow.flight.FlightServerBase):
- def __init__(self, host="localhost", location=None,
- tls_certificates=None, verify_client=False,
- root_certificates=None, auth_handler=None):
- super(FlightServer, self).__init__(
- location, auth_handler, tls_certificates, verify_client,
- root_certificates)
- self.flights = {}
- self.host = host
- self.tls_certificates = tls_certificates
-
- @classmethod
- def descriptor_to_key(self, descriptor):
- return (descriptor.descriptor_type.value, descriptor.command,
- tuple(descriptor.path or tuple()))
-
- def _make_flight_info(self, key, descriptor, table):
- if self.tls_certificates:
- location = pyarrow.flight.Location.for_grpc_tls(
- self.host, self.port)
- else:
- location = pyarrow.flight.Location.for_grpc_tcp(
- self.host, self.port)
- endpoints = [pyarrow.flight.FlightEndpoint(repr(key), [location]), ]
-
- mock_sink = pyarrow.MockOutputStream()
- stream_writer = pyarrow.RecordBatchStreamWriter(
- mock_sink, table.schema)
- stream_writer.write_table(table)
- stream_writer.close()
- data_size = mock_sink.size()
-
- return pyarrow.flight.FlightInfo(table.schema,
- descriptor, endpoints,
- table.num_rows, data_size)
-
- def list_flights(self, context, criteria):
- for key, table in self.flights.items():
- if key[1] is not None:
- descriptor = \
- pyarrow.flight.FlightDescriptor.for_command(key[1])
- else:
- descriptor = pyarrow.flight.FlightDescriptor.for_path(*key[2])
-
- yield self._make_flight_info(key, descriptor, table)
-
- def get_flight_info(self, context, descriptor):
- key = FlightServer.descriptor_to_key(descriptor)
- if key in self.flights:
- table = self.flights[key]
- return self._make_flight_info(key, descriptor, table)
- raise KeyError('Flight not found.')
-
- def do_put(self, context, descriptor, reader, writer):
- key = FlightServer.descriptor_to_key(descriptor)
- print(key)
- self.flights[key] = reader.read_all()
- print(self.flights[key])
-
- def do_get(self, context, ticket):
- key = ast.literal_eval(ticket.ticket.decode())
- if key not in self.flights:
- return None
- return pyarrow.flight.RecordBatchStream(self.flights[key])
-
- def list_actions(self, context):
- return [
- ("clear", "Clear the stored flights."),
- ("shutdown", "Shut down this server."),
- ]
-
- def do_action(self, context, action):
- if action.type == "clear":
- raise NotImplementedError(
- "{} is not implemented.".format(action.type))
- elif action.type == "healthcheck":
- pass
- elif action.type == "shutdown":
- yield pyarrow.flight.Result(pyarrow.py_buffer(b'Shutdown!'))
- # Shut down on background thread to avoid blocking current
- # request
- threading.Thread(target=self._shutdown).start()
- else:
- raise KeyError("Unknown action {!r}".format(action.type))
-
- def _shutdown(self):
- """Shut down after a delay."""
- print("Server is shutting down...")
- time.sleep(2)
- self.shutdown()
-
-
-def main():
- parser = argparse.ArgumentParser()
- parser.add_argument("--host", type=str, default="localhost",
- help="Address or hostname to listen on")
- parser.add_argument("--port", type=int, default=5005,
- help="Port number to listen on")
- parser.add_argument("--tls", nargs=2, default=None,
- metavar=('CERTFILE', 'KEYFILE'),
- help="Enable transport-level security")
- parser.add_argument("--verify_client", type=bool, default=False,
- help="enable mutual TLS and verify the client if True")
-
- args = parser.parse_args()
- tls_certificates = []
- scheme = "grpc+tcp"
- if args.tls:
- scheme = "grpc+tls"
- with open(args.tls[0], "rb") as cert_file:
- tls_cert_chain = cert_file.read()
- with open(args.tls[1], "rb") as key_file:
- tls_private_key = key_file.read()
- tls_certificates.append((tls_cert_chain, tls_private_key))
-
- location = "{}://{}:{}".format(scheme, args.host, args.port)
-
- server = FlightServer(args.host, location,
- tls_certificates=tls_certificates,
- verify_client=args.verify_client)
- print("Serving on", location)
- server.serve()
-
-
-if __name__ == '__main__':
- main()
diff --git a/python/examples/minimal_build/Dockerfile.fedora b/python/examples/minimal_build/Dockerfile.fedora
deleted file mode 100644
index 7dc3291..0000000
--- a/python/examples/minimal_build/Dockerfile.fedora
+++ /dev/null
@@ -1,31 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-FROM fedora:31
-
-RUN dnf update -y && \
- dnf install -y \
- autoconf \
- gcc \
- gcc-g++ \
- git \
- wget \
- make \
- cmake \
- ninja-build \
- python3-devel \
- python3-virtualenv
\ No newline at end of file
diff --git a/python/examples/minimal_build/Dockerfile.ubuntu b/python/examples/minimal_build/Dockerfile.ubuntu
deleted file mode 100644
index d7b8408..0000000
--- a/python/examples/minimal_build/Dockerfile.ubuntu
+++ /dev/null
@@ -1,38 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-FROM ubuntu:bionic
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN apt-get update -y -q && \
- apt-get install -y -q --no-install-recommends \
- apt-transport-https \
- software-properties-common \
- wget && \
- apt-get install -y -q --no-install-recommends \
- build-essential \
- cmake \
- git \
- ninja-build \
- python3-dev \
- python3-pip && \
- apt-get clean && rm -rf /var/lib/apt/lists*
-
-RUN pip3 install wheel && \
- pip3 install -U setuptools && \
- pip3 install wheel virtualenv
\ No newline at end of file
diff --git a/python/examples/minimal_build/README.md b/python/examples/minimal_build/README.md
deleted file mode 100644
index 9803e18..0000000
--- a/python/examples/minimal_build/README.md
+++ /dev/null
@@ -1,73 +0,0 @@
-<!---
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-# Minimal Python source build on Linux
-
-This directory shows how to bootstrap a local build from source on Linux with
-an eye toward maximum portability across different Linux distributions. This
-may help contributors debug build issues caused by their local
-environments.
-
-## Fedora 31
-
-First, build the Docker image using:
-```
-docker build -t arrow_fedora_minimal -f Dockerfile.fedora .
-```
-
-Then build PyArrow with pip/virtualenv or conda, respectively:
-```
-# With pip/virtualenv
-docker run --rm -t -i -v $PWD:/io arrow_fedora_minimal /io/build_venv.sh
-
-# With conda
-docker run --rm -t -i -v $PWD:/io arrow_fedora_minimal /io/build_conda.sh
-```
-
-## Ubuntu 18.04
-
-First, build the Docker image using:
-```
-docker build -t arrow_ubuntu_minimal -f Dockerfile.ubuntu .
-```
-
-Then build PyArrow with pip/virtualenv or conda, respectively:
-```
-# With pip/virtualenv
-docker run --rm -t -i -v $PWD:/io arrow_ubuntu_minimal /io/build_venv.sh
-
-# With conda
-docker run --rm -t -i -v $PWD:/io arrow_ubuntu_minimal /io/build_conda.sh
-```
-
-## Building on Fedora - Podman and SELinux
-
-In addition to using Podman instead of Docker, you need to specify `:Z`
-for SELinux relabelling when binding a volume.
-
-First, build the image using:
-```
-podman build -t arrow_fedora_minimal -f Dockerfile.fedora
-```
-
-Then build PyArrow with pip/virtualenv:
-```
-# With pip/virtualenv
-podman run --rm -i -v $PWD:/io:Z -t arrow_fedora_minimal /io/build_venv.sh
-```
diff --git a/python/examples/minimal_build/build_conda.sh b/python/examples/minimal_build/build_conda.sh
deleted file mode 100755
index 6f93ebd..0000000
--- a/python/examples/minimal_build/build_conda.sh
+++ /dev/null
@@ -1,119 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-
-#----------------------------------------------------------------------
-# Change this to whatever makes sense for your system
-
-HOME=
-MINICONDA=$HOME/miniconda-for-arrow
-LIBRARY_INSTALL_DIR=$HOME/local-libs
-CPP_BUILD_DIR=$HOME/arrow-cpp-build
-ARROW_ROOT=/arrow
-PYTHON=3.7
-
-git clone https://github.com/apache/arrow.git /arrow
-
-#----------------------------------------------------------------------
-# Run these only once
-
-function setup_miniconda() {
- MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh"
- wget -O miniconda.sh $MINICONDA_URL
- bash miniconda.sh -b -p $MINICONDA
- rm -f miniconda.sh
- LOCAL_PATH=$PATH
- export PATH="$MINICONDA/bin:$PATH"
-
- conda update -y -q conda
- conda config --set auto_update_conda false
- conda info -a
-
- conda config --set show_channel_urls True
- conda config --add channels https://repo.continuum.io/pkgs/free
- conda config --add channels conda-forge
-
- conda create -y -n pyarrow-$PYTHON -c conda-forge \
- --file arrow/ci/conda_env_unix.yml \
- --file arrow/ci/conda_env_cpp.yml \
- --file arrow/ci/conda_env_python.yml \
- compilers \
- python=3.7 \
- pandas
-
- export PATH=$LOCAL_PATH
-}
-
-setup_miniconda
-
-#----------------------------------------------------------------------
-# Activate conda in bash and activate conda environment
-
-. $MINICONDA/etc/profile.d/conda.sh
-conda activate pyarrow-$PYTHON
-export ARROW_HOME=$CONDA_PREFIX
-
-#----------------------------------------------------------------------
-# Build C++ library
-
-mkdir -p $CPP_BUILD_DIR
-pushd $CPP_BUILD_DIR
-
-cmake -GNinja \
- -DCMAKE_BUILD_TYPE=DEBUG \
- -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
- -DCMAKE_INSTALL_LIBDIR=lib \
- -DARROW_FLIGHT=ON \
- -DARROW_WITH_BZ2=ON \
- -DARROW_WITH_ZLIB=ON \
- -DARROW_WITH_ZSTD=ON \
- -DARROW_WITH_LZ4=ON \
- -DARROW_WITH_SNAPPY=ON \
- -DARROW_WITH_BROTLI=ON \
- -DARROW_PARQUET=ON \
- -DARROW_PLASMA=ON \
- -DARROW_PYTHON=ON \
- $ARROW_ROOT/cpp
-
-ninja install
-
-popd
-
-#----------------------------------------------------------------------
-# Build and test Python library
-pushd $ARROW_ROOT/python
-
-rm -rf build/ # remove any pesky pre-existing build directory
-
-export PYARROW_BUILD_TYPE=Debug
-export PYARROW_CMAKE_GENERATOR=Ninja
-export PYARROW_WITH_FLIGHT=1
-export PYARROW_WITH_PARQUET=1
-
-# You can run either "develop" or "build_ext --inplace". Your pick
-
-# python setup.py build_ext --inplace
-python setup.py develop
-
-# git submodules are required for unit tests
-git submodule update --init
-export PARQUET_TEST_DATA="$ARROW_ROOT/cpp/submodules/parquet-testing/data"
-export ARROW_TEST_DATA="$ARROW_ROOT/testing/data"
-
-py.test pyarrow
diff --git a/python/examples/minimal_build/build_venv.sh b/python/examples/minimal_build/build_venv.sh
deleted file mode 100755
index afa4206..0000000
--- a/python/examples/minimal_build/build_venv.sh
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/bin/bash
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-
-#----------------------------------------------------------------------
-# Change this to whatever makes sense for your system
-
-WORKDIR=${WORKDIR:-$HOME}
-MINICONDA=$WORKDIR/miniconda-for-arrow
-LIBRARY_INSTALL_DIR=$WORKDIR/local-libs
-CPP_BUILD_DIR=$WORKDIR/arrow-cpp-build
-ARROW_ROOT=$WORKDIR/arrow
-export ARROW_HOME=$WORKDIR/dist
-export LD_LIBRARY_PATH=$ARROW_HOME/lib:$LD_LIBRARY_PATH
-
-virtualenv $WORKDIR/venv
-source $WORKDIR/venv/bin/activate
-
-git clone https://github.com/apache/arrow.git $ARROW_ROOT
-
-pip install -r $ARROW_ROOT/python/requirements-build.txt \
- -r $ARROW_ROOT/python/requirements-test.txt
-
-#----------------------------------------------------------------------
-# Build C++ library
-
-mkdir -p $CPP_BUILD_DIR
-pushd $CPP_BUILD_DIR
-
-cmake -GNinja \
- -DCMAKE_BUILD_TYPE=DEBUG \
- -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
- -DCMAKE_INSTALL_LIBDIR=lib \
- -DARROW_WITH_BZ2=ON \
- -DARROW_WITH_ZLIB=ON \
- -DARROW_WITH_ZSTD=ON \
- -DARROW_WITH_LZ4=ON \
- -DARROW_WITH_SNAPPY=ON \
- -DARROW_WITH_BROTLI=ON \
- -DARROW_PARQUET=ON \
- -DARROW_PYTHON=ON \
- $ARROW_ROOT/cpp
-
-ninja install
-
-popd
-
-#----------------------------------------------------------------------
-# Build and test Python library
-pushd $ARROW_ROOT/python
-
-rm -rf build/ # remove any pesky pre-existing build directory
-
-export PYARROW_BUILD_TYPE=Debug
-export PYARROW_CMAKE_GENERATOR=Ninja
-export PYARROW_WITH_PARQUET=1
-
-# You can run either "develop" or "build_ext --inplace". Your pick
-
-# python setup.py build_ext --inplace
-python setup.py develop
-
-# git submodules are required for unit tests
-git submodule update --init
-export PARQUET_TEST_DATA="$ARROW_ROOT/cpp/submodules/parquet-testing/data"
-export ARROW_TEST_DATA="$ARROW_ROOT/testing/data"
-
-py.test pyarrow
diff --git a/python/examples/plasma/sorting/multimerge.pyx b/python/examples/plasma/sorting/multimerge.pyx
deleted file mode 100644
index 5e77fdf..0000000
--- a/python/examples/plasma/sorting/multimerge.pyx
+++ /dev/null
@@ -1,102 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: profile=False
-# distutils: language = c++
-# cython: embedsignature = True
-
-from libc.stdint cimport uintptr_t
-from libcpp.vector cimport vector
-from libcpp.pair cimport pair
-
-import numpy as np
-
-cimport numpy as np
-
-cdef extern from "<queue>" namespace "std" nogil:
- cdef cppclass priority_queue[T]:
- priority_queue() except +
- priority_queue(priority_queue&) except +
- bint empty()
- void pop()
- void push(T&)
- size_t size()
- T& top()
-
-
-def multimerge2d(*arrays):
- """Merge a list of sorted 2d arrays into a sorted 2d array.
-
- This assumes C style ordering for both input and output arrays. For
- each input array we have array[i,0] <= array[i+1,0] and for the output
- array the same will hold.
-
- Ideally this code would be simpler and also support both C style
- and Fortran style ordering.
- """
- cdef int num_arrays = len(arrays)
- assert num_arrays > 0
-
- cdef int num_cols = arrays[0].shape[1]
-
- for i in range(num_arrays):
- assert arrays[i].ndim == 2
- assert arrays[i].dtype == np.float64
- assert arrays[i].shape[1] == num_cols
- assert not np.isfortran(arrays[i])
-
- cdef vector[double*] data
-
- # The indices vector keeps track of the flat element offset of the next row
- # to process in each array (rows are stored contiguously in C order).
- cdef vector[int] indices = num_arrays * [0]
-
- # The sizes vector stores the total number of elements that each array has.
- cdef vector[int] sizes
-
- cdef priority_queue[pair[double, int]] queue
- cdef pair[double, int] top
- cdef int num_rows = sum([array.shape[0] for array in arrays])
- cdef np.ndarray[np.float64_t, ndim=2] result = np.zeros(
- (num_rows, num_cols), dtype=np.float64)
- cdef double* result_ptr = <double*> np.PyArray_DATA(result)
- for i in range(num_arrays):
- if arrays[i].size > 0:
- sizes.push_back(arrays[i].size)
- data.push_back(<double*> np.PyArray_DATA(arrays[i]))
- queue.push(pair[double, int](-data[i][0], i))
-
- cdef int curr_idx = 0
- cdef int j
- cdef int col = 0
-
- for j in range(num_rows):
- top = queue.top()
- for col in range(num_cols):
- result_ptr[curr_idx + col] = (
- data[top.second][indices[top.second] + col])
-
- indices[top.second] += num_cols
- curr_idx += num_cols
-
- queue.pop()
- if indices[top.second] < sizes[top.second]:
- queue.push(
- pair[double, int](-data[top.second][indices[top.second]],
- top.second))
-
- return result
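
The Cython extension above performs a k-way merge driven by a priority queue of each array's current head row; a pure-Python reference of the same idea using heapq (illustrative only, not a drop-in replacement for the compiled module):

    import heapq

    import numpy as np

    def multimerge2d_py(*arrays):
        """Merge C-ordered float64 arrays that are each sorted by their
        first column, keeping the output sorted by that column too."""
        merged = heapq.merge(*(map(tuple, a) for a in arrays),
                             key=lambda row: row[0])
        return np.array(list(merged), dtype=np.float64)

    a = np.array([[0.1, 1.0], [0.7, 2.0]])
    b = np.array([[0.3, 3.0], [0.9, 4.0]])
    print(multimerge2d_py(a, b))  # rows interleaved by the first column
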
diff --git a/python/examples/plasma/sorting/setup.py b/python/examples/plasma/sorting/setup.py
deleted file mode 100644
index a5dfa5a..0000000
--- a/python/examples/plasma/sorting/setup.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import numpy as np
-from setuptools import setup
-from Cython.Build import cythonize
-
-setup(
- name="multimerge",
- extra_compile_args=["-O3", "-mtune=native", "-march=native"],
- ext_modules=cythonize("multimerge.pyx"),
- include_dirs=[np.get_include()],
-)
diff --git a/python/examples/plasma/sorting/sort_df.py b/python/examples/plasma/sorting/sort_df.py
deleted file mode 100644
index 2a51759..0000000
--- a/python/examples/plasma/sorting/sort_df.py
+++ /dev/null
@@ -1,203 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from multiprocessing import Pool
-import numpy as np
-import pandas as pd
-import pyarrow as pa
-import pyarrow.plasma as plasma
-import subprocess
-import time
-
-import multimerge
-
-# To run this example, you will first need to run "python setup.py install" in
-# this directory to build the Cython module.
-#
-# You will only see speedups if you run this code on more data; this is just
-# a small example that can run on a laptop.
-#
-# The values we used to get a speedup (on a m4.10xlarge instance on EC2) were
-# object_store_size = 84 * 10 ** 9
-# num_cores = 20
-# num_rows = 10 ** 9
-# num_cols = 1
-
-client = None
-object_store_size = 2 * 10 ** 9 # 2 GB
-num_cores = 8
-num_rows = 200000
-num_cols = 2
-column_names = [str(i) for i in range(num_cols)]
-column_to_sort = column_names[0]
-
-
-# Connect this process to the plasma store.
-def connect():
- global client
- client = plasma.connect('/tmp/store')
- np.random.seed(int(time.time() * 10e7) % 10000000)
-
-
-def put_df(df):
- record_batch = pa.RecordBatch.from_pandas(df)
-
- # Get size of record batch and schema
- mock_sink = pa.MockOutputStream()
- stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema)
- stream_writer.write_batch(record_batch)
- data_size = mock_sink.size()
-
- # Generate an ID and allocate a buffer in the object store for the
- # serialized DataFrame
- object_id = plasma.ObjectID(np.random.bytes(20))
- buf = client.create(object_id, data_size)
-
- # Write the serialized DataFrame to the object store
- sink = pa.FixedSizeBufferWriter(buf)
- stream_writer = pa.RecordBatchStreamWriter(sink, record_batch.schema)
- stream_writer.write_batch(record_batch)
-
- # Seal the object
- client.seal(object_id)
-
- return object_id
-
-
-def get_dfs(object_ids):
- """Retrieve dataframes from the object store given their object IDs."""
- buffers = client.get_buffers(object_ids)
- return [pa.RecordBatchStreamReader(buf).read_next_batch().to_pandas()
- for buf in buffers]
-
-
-def local_sort(object_id):
- """Sort a partition of a dataframe."""
- # Get the dataframe from the object store.
- [df] = get_dfs([object_id])
- # Sort the dataframe.
- sorted_df = df.sort_values(by=column_to_sort)
- # Get evenly spaced values from the dataframe.
- indices = np.linspace(0, len(df) - 1, num=num_cores, dtype=np.int64)
- # Put the sorted dataframe in the object store and return the corresponding
- # object ID as well as the sampled values.
- return put_df(sorted_df), sorted_df.as_matrix().take(indices)
-
-
-def local_partitions(object_id_and_pivots):
- """Take a sorted partition of a dataframe and split it into more pieces."""
- object_id, pivots = object_id_and_pivots
- [df] = get_dfs([object_id])
- split_at = df[column_to_sort].searchsorted(pivots)
- split_at = [0] + list(split_at) + [len(df)]
- # Partition the sorted dataframe and put each partition into the object
- # store.
- return [put_df(df[i:j]) for i, j in zip(split_at[:-1], split_at[1:])]
-
-
-def merge(object_ids):
- """Merge a number of sorted dataframes into a single sorted dataframe."""
- dfs = get_dfs(object_ids)
-
- # In order to use our multimerge code, we have to convert the arrays from
- # the Fortran format to the C format.
- arrays = [np.ascontiguousarray(df.as_matrix()) for df in dfs]
- for a in arrays:
- assert a.dtype == np.float64
- assert not np.isfortran(a)
-
- # Filter out empty arrays.
- arrays = [a for a in arrays if a.shape[0] > 0]
-
- if len(arrays) == 0:
- return None
-
- resulting_array = multimerge.multimerge2d(*arrays)
- merged_df2 = pd.DataFrame(resulting_array, columns=column_names)
-
- return put_df(merged_df2)
-
-
-if __name__ == '__main__':
- # Start the plasma store.
- p = subprocess.Popen(['plasma_store',
- '-s', '/tmp/store',
- '-m', str(object_store_size)])
-
- # Connect to the plasma store.
- connect()
-
- # Connect the processes in the pool.
- pool = Pool(initializer=connect, initargs=(), processes=num_cores)
-
- # Create a DataFrame from a numpy array.
- df = pd.DataFrame(np.random.randn(num_rows, num_cols),
- columns=column_names)
-
- partition_ids = [put_df(partition) for partition
- in np.split(df, num_cores)]
-
- # Begin timing the parallel sort example.
- parallel_sort_start = time.time()
-
- # Sort each partition and subsample them. The subsampled values will be
- # used to create buckets.
- sorted_df_ids, pivot_groups = list(zip(*pool.map(local_sort,
- partition_ids)))
-
- # Choose the pivots.
- all_pivots = np.concatenate(pivot_groups)
- indices = np.linspace(0, len(all_pivots) - 1, num=num_cores,
- dtype=np.int64)
- pivots = np.take(np.sort(all_pivots), indices)
-
- # Break all of the sorted partitions into even smaller partitions. Group
- # the object IDs from each bucket together.
- results = list(zip(*pool.map(local_partitions,
- zip(sorted_df_ids,
- len(sorted_df_ids) * [pivots]))))
-
- # Merge each of the buckets and store the results in the object store.
- object_ids = pool.map(merge, results)
-
- resulting_ids = [object_id for object_id in object_ids
- if object_id is not None]
-
- # Stop timing the parallel sort example.
- parallel_sort_end = time.time()
-
- print('Parallel sort took {} seconds.'
- .format(parallel_sort_end - parallel_sort_start))
-
- serial_sort_start = time.time()
-
- original_sorted_df = df.sort_values(by=column_to_sort)
-
- serial_sort_end = time.time()
-
- # Check that we sorted the DataFrame properly.
-
- sorted_dfs = get_dfs(resulting_ids)
- sorted_df = pd.concat(sorted_dfs)
-
- print('Serial sort took {} seconds.'
- .format(serial_sort_end - serial_sort_start))
-
- assert np.allclose(sorted_df.values, original_sorted_df.values)
-
- # Kill the object store.
- p.kill()
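
Stripped of the object store and the process pool, the script above is a sample sort: sort each partition locally, sample pivots from the sorted partitions, re-partition everything on the shared pivots, and merge each bucket. A single-process sketch of that control flow (NumPy/pandas only, no Plasma, and not the original implementation) might be:

    import numpy as np
    import pandas as pd

    def sample_sort(df, column, num_buckets):
        # Sort each partition locally, as local_sort() does per worker.
        parts = [p.sort_values(by=column) for p in np.array_split(df, num_buckets)]
        # Sample evenly spaced values from every sorted partition as pivot candidates.
        samples = np.concatenate([
            p[column].to_numpy().take(
                np.linspace(0, len(p) - 1, num=num_buckets, dtype=np.int64))
            for p in parts])
        idx = np.linspace(0, len(samples) - 1, num=num_buckets, dtype=np.int64)
        pivots = np.take(np.sort(samples), idx)[1:]  # num_buckets - 1 split points
        # Re-partition every sorted piece on the shared pivots (local_partitions()),
        # then merge the pieces that fall into each bucket (merge()).
        buckets = [[] for _ in range(num_buckets)]
        for p in parts:
            split_at = [0] + list(p[column].searchsorted(pivots)) + [len(p)]
            for b, (i, j) in enumerate(zip(split_at[:-1], split_at[1:])):
                buckets[b].append(p.iloc[i:j])
        merged = [pd.concat(b).sort_values(by=column) for b in buckets]
        return pd.concat(merged, ignore_index=True)
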
diff --git a/python/pyarrow/__init__.pxd b/python/pyarrow/__init__.pxd
deleted file mode 100644
index 8cc54b4..0000000
--- a/python/pyarrow/__init__.pxd
+++ /dev/null
@@ -1,42 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from libcpp.memory cimport shared_ptr
-from pyarrow.includes.libarrow cimport (CArray, CBuffer, CDataType,
- CField, CRecordBatch, CSchema,
- CTable, CTensor, CSparseCOOTensor,
- CSparseCSRMatrix, CSparseCSCMatrix,
- CSparseCSFTensor)
-
-cdef extern from "arrow/python/pyarrow.h" namespace "arrow::py":
- cdef int import_pyarrow() except -1
- cdef object wrap_buffer(const shared_ptr[CBuffer]& buffer)
- cdef object wrap_data_type(const shared_ptr[CDataType]& type)
- cdef object wrap_field(const shared_ptr[CField]& field)
- cdef object wrap_schema(const shared_ptr[CSchema]& schema)
- cdef object wrap_array(const shared_ptr[CArray]& sp_array)
- cdef object wrap_tensor(const shared_ptr[CTensor]& sp_tensor)
- cdef object wrap_sparse_tensor_coo(
- const shared_ptr[CSparseCOOTensor]& sp_sparse_tensor)
- cdef object wrap_sparse_tensor_csr(
- const shared_ptr[CSparseCSRMatrix]& sp_sparse_tensor)
- cdef object wrap_sparse_tensor_csc(
- const shared_ptr[CSparseCSCMatrix]& sp_sparse_tensor)
- cdef object wrap_sparse_tensor_csf(
- const shared_ptr[CSparseCSFTensor]& sp_sparse_tensor)
- cdef object wrap_table(const shared_ptr[CTable]& ctable)
- cdef object wrap_batch(const shared_ptr[CRecordBatch]& cbatch)
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
deleted file mode 100644
index adfd69c..0000000
--- a/python/pyarrow/__init__.py
+++ /dev/null
@@ -1,504 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# flake8: noqa
-
-"""
-PyArrow is the python implementation of Apache Arrow.
-
-Apache Arrow is a cross-language development platform for in-memory data.
-It specifies a standardized language-independent columnar memory format for
-flat and hierarchical data, organized for efficient analytic operations on
-modern hardware. It also provides computational libraries and zero-copy
-streaming messaging and interprocess communication.
-
-For more information see the official page at https://arrow.apache.org
-"""
-
-import gc as _gc
-import os as _os
-import sys as _sys
-import warnings as _warnings
-
-try:
- from ._generated_version import version as __version__
-except ImportError:
- # Package is not installed; parse the git tag at runtime
- try:
- import setuptools_scm
- # Code duplicated from setup.py to avoid a dependency on each other
-
- def parse_git(root, **kwargs):
- """
- Parse function for setuptools_scm that ignores tags for non-C++
- subprojects, e.g. apache-arrow-js-XXX tags.
- """
- from setuptools_scm.git import parse
- kwargs['describe_command'] = \
- "git describe --dirty --tags --long --match 'apache-arrow-[0-9].*'"
- return parse(root, **kwargs)
- __version__ = setuptools_scm.get_version('../',
- parse=parse_git)
- except ImportError:
- __version__ = None
-
-# ARROW-8684: Disable GC while initializing Cython extension module,
-# to workaround Cython bug in https://github.com/cython/cython/issues/3603
-_gc_enabled = _gc.isenabled()
-_gc.disable()
-import pyarrow.lib as _lib
-if _gc_enabled:
- _gc.enable()
-
-from pyarrow.lib import (BuildInfo, RuntimeInfo, VersionInfo,
- cpp_build_info, cpp_version, cpp_version_info,
- runtime_info, cpu_count, set_cpu_count,
- enable_signal_handlers)
-
-
-def show_versions():
- """
- Print various version information, to help with error reporting.
- """
- # TODO: CPU information and flags
- print("pyarrow version info\n--------------------")
- print("Package kind: {}".format(cpp_build_info.package_kind
- if len(cpp_build_info.package_kind) > 0
- else "not indicated"))
- print("Arrow C++ library version: {0}".format(cpp_build_info.version))
- print("Arrow C++ compiler: {0} {1}"
- .format(cpp_build_info.compiler_id, cpp_build_info.compiler_version))
- print("Arrow C++ compiler flags: {0}"
- .format(cpp_build_info.compiler_flags))
- print("Arrow C++ git revision: {0}".format(cpp_build_info.git_id))
- print("Arrow C++ git description: {0}"
- .format(cpp_build_info.git_description))
-
-
-from pyarrow.lib import (null, bool_,
- int8, int16, int32, int64,
- uint8, uint16, uint32, uint64,
- time32, time64, timestamp, date32, date64, duration,
- float16, float32, float64,
- binary, string, utf8,
- large_binary, large_string, large_utf8,
- decimal128, decimal256,
- list_, large_list, map_, struct,
- union, sparse_union, dense_union,
- dictionary,
- field,
- type_for_alias,
- DataType, DictionaryType, StructType,
- ListType, LargeListType, MapType, FixedSizeListType,
- UnionType, SparseUnionType, DenseUnionType,
- TimestampType, Time32Type, Time64Type, DurationType,
- FixedSizeBinaryType, Decimal128Type, Decimal256Type,
- BaseExtensionType, ExtensionType,
- PyExtensionType, UnknownExtensionType,
- register_extension_type, unregister_extension_type,
- DictionaryMemo,
- KeyValueMetadata,
- Field,
- Schema,
- schema,
- unify_schemas,
- Array, Tensor,
- array, chunked_array, record_batch, nulls, repeat,
- SparseCOOTensor, SparseCSRMatrix, SparseCSCMatrix,
- SparseCSFTensor,
- infer_type, from_numpy_dtype,
- NullArray,
- NumericArray, IntegerArray, FloatingPointArray,
- BooleanArray,
- Int8Array, UInt8Array,
- Int16Array, UInt16Array,
- Int32Array, UInt32Array,
- Int64Array, UInt64Array,
- ListArray, LargeListArray, MapArray,
- FixedSizeListArray, UnionArray,
- BinaryArray, StringArray,
- LargeBinaryArray, LargeStringArray,
- FixedSizeBinaryArray,
- DictionaryArray,
- Date32Array, Date64Array, TimestampArray,
- Time32Array, Time64Array, DurationArray,
- Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
- scalar, NA, _NULL as NULL, Scalar,
- NullScalar, BooleanScalar,
- Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
- UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar,
- HalfFloatScalar, FloatScalar, DoubleScalar,
- Decimal128Scalar, Decimal256Scalar,
- ListScalar, LargeListScalar, FixedSizeListScalar,
- Date32Scalar, Date64Scalar,
- Time32Scalar, Time64Scalar,
- BinaryScalar, LargeBinaryScalar,
- StringScalar, LargeStringScalar,
- FixedSizeBinaryScalar, DictionaryScalar,
- MapScalar, UnionScalar, StructScalar,
- TimestampScalar, DurationScalar)
-
-# Buffers, allocation
-from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer,
- Codec, compress, decompress, allocate_buffer)
-
-from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool,
- total_allocated_bytes, set_memory_pool,
- default_memory_pool, system_memory_pool,
- jemalloc_memory_pool, mimalloc_memory_pool,
- logging_memory_pool, proxy_memory_pool,
- log_memory_allocations, jemalloc_set_decay_ms)
-
-# I/O
-from pyarrow.lib import (HdfsFile, NativeFile, PythonFile,
- BufferedInputStream, BufferedOutputStream,
- CompressedInputStream, CompressedOutputStream,
- TransformInputStream, transcoding_input_stream,
- FixedSizeBufferWriter,
- BufferReader, BufferOutputStream,
- OSFile, MemoryMappedFile, memory_map,
- create_memory_map, have_libhdfs,
- MockOutputStream, input_stream, output_stream)
-
-from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table,
- concat_arrays, concat_tables)
-
-# Exceptions
-from pyarrow.lib import (ArrowCancelled,
- ArrowCapacityError,
- ArrowException,
- ArrowKeyError,
- ArrowIndexError,
- ArrowInvalid,
- ArrowIOError,
- ArrowMemoryError,
- ArrowNotImplementedError,
- ArrowTypeError,
- ArrowSerializationError)
-
-# Serialization
-from pyarrow.lib import (deserialize_from, deserialize,
- deserialize_components,
- serialize, serialize_to, read_serialized,
- SerializationCallbackError,
- DeserializationCallbackError)
-
-import pyarrow.hdfs as hdfs
-
-from pyarrow.ipc import serialize_pandas, deserialize_pandas
-import pyarrow.ipc as ipc
-
-from pyarrow.serialization import (default_serialization_context,
- register_default_serialization_handlers,
- register_torch_serialization_handlers)
-
-import pyarrow.types as types
-
-
-# deprecated top-level access
-
-
-from pyarrow.filesystem import FileSystem as _FileSystem
-from pyarrow.filesystem import LocalFileSystem as _LocalFileSystem
-from pyarrow.hdfs import HadoopFileSystem as _HadoopFileSystem
-
-from pyarrow.lib import SerializationContext as _SerializationContext
-from pyarrow.lib import SerializedPyObject as _SerializedPyObject
-
-
-_localfs = _LocalFileSystem._get_instance()
-
-
-_msg = (
- "pyarrow.{0} is deprecated as of 2.0.0, please use pyarrow.fs.{1} instead."
-)
-
-_serialization_msg = (
- "'pyarrow.{0}' is deprecated and will be removed in a future version. "
- "Use pickle or the pyarrow IPC functionality instead."
-)
-
-_deprecated = {
- "localfs": (_localfs, "LocalFileSystem"),
- "FileSystem": (_FileSystem, "FileSystem"),
- "LocalFileSystem": (_LocalFileSystem, "LocalFileSystem"),
- "HadoopFileSystem": (_HadoopFileSystem, "HadoopFileSystem"),
-}
-
-_serialization_deprecated = {
- "SerializationContext": _SerializationContext,
- "SerializedPyObject": _SerializedPyObject,
-}
-
-if _sys.version_info >= (3, 7):
- def __getattr__(name):
- if name in _deprecated:
- obj, new_name = _deprecated[name]
- _warnings.warn(_msg.format(name, new_name),
- FutureWarning, stacklevel=2)
- return obj
- elif name in _serialization_deprecated:
- _warnings.warn(_serialization_msg.format(name),
- FutureWarning, stacklevel=2)
- return _serialization_deprecated[name]
-
- raise AttributeError(
- "module 'pyarrow' has no attribute '{0}'".format(name)
- )
-else:
- localfs = _localfs
- FileSystem = _FileSystem
- LocalFileSystem = _LocalFileSystem
- HadoopFileSystem = _HadoopFileSystem
- SerializationContext = _SerializationContext
- SerializedPyObject = _SerializedPyObject
-
-
-# Entry point for starting the plasma store
-
-
-def _plasma_store_entry_point():
- """Entry point for starting the plasma store.
-
- This can be invoked from the command line, e.g.
- ``plasma_store -s /tmp/plasma -m 1000000000``,
- and starts the plasma_store executable with the
- given arguments.
- """
- import pyarrow
- plasma_store_executable = _os.path.join(pyarrow.__path__[0],
- "plasma-store-server")
- _os.execv(plasma_store_executable, _sys.argv)
-
-
-# ----------------------------------------------------------------------
-# Deprecations
-
-from pyarrow.util import _deprecate_api, _deprecate_class
-
-read_message = _deprecate_api("read_message", "ipc.read_message",
- ipc.read_message, "0.17.0")
-
-read_record_batch = _deprecate_api("read_record_batch",
- "ipc.read_record_batch",
- ipc.read_record_batch, "0.17.0")
-
-read_schema = _deprecate_api("read_schema", "ipc.read_schema",
- ipc.read_schema, "0.17.0")
-
-read_tensor = _deprecate_api("read_tensor", "ipc.read_tensor",
- ipc.read_tensor, "0.17.0")
-
-write_tensor = _deprecate_api("write_tensor", "ipc.write_tensor",
- ipc.write_tensor, "0.17.0")
-
-get_record_batch_size = _deprecate_api("get_record_batch_size",
- "ipc.get_record_batch_size",
- ipc.get_record_batch_size, "0.17.0")
-
-get_tensor_size = _deprecate_api("get_tensor_size",
- "ipc.get_tensor_size",
- ipc.get_tensor_size, "0.17.0")
-
-open_stream = _deprecate_api("open_stream", "ipc.open_stream",
- ipc.open_stream, "0.17.0")
-
-open_file = _deprecate_api("open_file", "ipc.open_file", ipc.open_file,
- "0.17.0")
-
-
-def _deprecate_scalar(ty, symbol):
- return _deprecate_class("{}Value".format(ty), symbol, "1.0.0")
-
-
-ArrayValue = _deprecate_class("ArrayValue", Scalar, "1.0.0")
-NullType = _deprecate_class("NullType", NullScalar, "1.0.0")
-
-BooleanValue = _deprecate_scalar("Boolean", BooleanScalar)
-Int8Value = _deprecate_scalar("Int8", Int8Scalar)
-Int16Value = _deprecate_scalar("Int16", Int16Scalar)
-Int32Value = _deprecate_scalar("Int32", Int32Scalar)
-Int64Value = _deprecate_scalar("Int64", Int64Scalar)
-UInt8Value = _deprecate_scalar("UInt8", UInt8Scalar)
-UInt16Value = _deprecate_scalar("UInt16", UInt16Scalar)
-UInt32Value = _deprecate_scalar("UInt32", UInt32Scalar)
-UInt64Value = _deprecate_scalar("UInt64", UInt64Scalar)
-HalfFloatValue = _deprecate_scalar("HalfFloat", HalfFloatScalar)
-FloatValue = _deprecate_scalar("Float", FloatScalar)
-DoubleValue = _deprecate_scalar("Double", DoubleScalar)
-ListValue = _deprecate_scalar("List", ListScalar)
-LargeListValue = _deprecate_scalar("LargeList", LargeListScalar)
-MapValue = _deprecate_scalar("Map", MapScalar)
-FixedSizeListValue = _deprecate_scalar("FixedSizeList", FixedSizeListScalar)
-BinaryValue = _deprecate_scalar("Binary", BinaryScalar)
-StringValue = _deprecate_scalar("String", StringScalar)
-LargeBinaryValue = _deprecate_scalar("LargeBinary", LargeBinaryScalar)
-LargeStringValue = _deprecate_scalar("LargeString", LargeStringScalar)
-FixedSizeBinaryValue = _deprecate_scalar("FixedSizeBinary",
- FixedSizeBinaryScalar)
-Decimal128Value = _deprecate_scalar("Decimal128", Decimal128Scalar)
-Decimal256Value = _deprecate_scalar("Decimal256", Decimal256Scalar)
-UnionValue = _deprecate_scalar("Union", UnionScalar)
-StructValue = _deprecate_scalar("Struct", StructScalar)
-DictionaryValue = _deprecate_scalar("Dictionary", DictionaryScalar)
-Date32Value = _deprecate_scalar("Date32", Date32Scalar)
-Date64Value = _deprecate_scalar("Date64", Date64Scalar)
-Time32Value = _deprecate_scalar("Time32", Time32Scalar)
-Time64Value = _deprecate_scalar("Time64", Time64Scalar)
-TimestampValue = _deprecate_scalar("Timestamp", TimestampScalar)
-DurationValue = _deprecate_scalar("Duration", DurationScalar)
-
-
-# TODO: Deprecate these somehow in the pyarrow namespace
-from pyarrow.ipc import (Message, MessageReader, MetadataVersion,
- RecordBatchFileReader, RecordBatchFileWriter,
- RecordBatchStreamReader, RecordBatchStreamWriter)
-
-# ----------------------------------------------------------------------
-# Return the absolute path to the pyarrow include directory (if bundled, e.g.
-# in wheels)
-
-
-def get_include():
- """
- Return absolute path to directory containing Arrow C++ include
- headers. Similar to numpy.get_include
- """
- return _os.path.join(_os.path.dirname(__file__), 'include')
-
-
-def _get_pkg_config_executable():
- return _os.environ.get('PKG_CONFIG', 'pkg-config')
-
-
-def _has_pkg_config(pkgname):
- import subprocess
- try:
- return subprocess.call([_get_pkg_config_executable(),
- '--exists', pkgname]) == 0
- except FileNotFoundError:
- return False
-
-
-def _read_pkg_config_variable(pkgname, cli_args):
- import subprocess
- cmd = [_get_pkg_config_executable(), pkgname] + cli_args
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
- stderr=subprocess.PIPE)
- out, err = proc.communicate()
- if proc.returncode != 0:
- raise RuntimeError("pkg-config failed: " + err.decode('utf8'))
- return out.rstrip().decode('utf8')
-
-
-def get_libraries():
- """
- Return list of library names to include in the `libraries` argument for C
- or Cython extensions using pyarrow
- """
- return ['arrow', 'arrow_python']
-
-
-def create_library_symlinks():
- """
- With Linux and macOS wheels, the bundled shared libraries have an embedded
- ABI version like libarrow.so.17 or libarrow.17.dylib and so linking to them
- with -larrow won't work unless we create symlinks at locations like
- site-packages/pyarrow/libarrow.so. This unfortunate workaround addresses
- prior problems we had with shipping two copies of the shared libraries to
- permit third party projects like turbodbc to build their C++ extensions
- against the pyarrow wheels.
-
- This function must only be invoked once and only when the shared libraries
- are bundled with the Python package, which should only apply to wheel-based
- installs. It requires write access to the site-packages/pyarrow directory
- and so depending on your system may need to be run with root.
- """
- import glob
- if _sys.platform == 'win32':
- return
- package_cwd = _os.path.dirname(__file__)
-
- if _sys.platform == 'linux':
- bundled_libs = glob.glob(_os.path.join(package_cwd, '*.so.*'))
-
- def get_symlink_path(hard_path):
- return hard_path.rsplit('.', 1)[0]
- else:
- bundled_libs = glob.glob(_os.path.join(package_cwd, '*.*.dylib'))
-
- def get_symlink_path(hard_path):
- return '.'.join((hard_path.rsplit('.', 2)[0], 'dylib'))
-
- for lib_hard_path in bundled_libs:
- symlink_path = get_symlink_path(lib_hard_path)
- if _os.path.exists(symlink_path):
- continue
- try:
- _os.symlink(lib_hard_path, symlink_path)
- except PermissionError:
- print("Tried creating symlink {}. If you need to link to "
- "bundled shared libraries, run "
- "pyarrow.create_library_symlinks() as root".format(symlink_path))
-
-
-def get_library_dirs():
- """
- Return lists of directories likely to contain Arrow C++ libraries for
- linking C or Cython extensions using pyarrow
- """
- package_cwd = _os.path.dirname(__file__)
- library_dirs = [package_cwd]
-
- def append_library_dir(library_dir):
- if library_dir not in library_dirs:
- library_dirs.append(library_dir)
-
- # Search library paths via pkg-config. This is necessary if the user
- # installed libarrow and the other shared libraries manually and they
- # are not shipped inside the pyarrow package (see also ARROW-2976).
- pkg_config_executable = _os.environ.get('PKG_CONFIG') or 'pkg-config'
- for pkgname in ["arrow", "arrow_python"]:
- if _has_pkg_config(pkgname):
- library_dir = _read_pkg_config_variable(pkgname,
- ["--libs-only-L"])
- # pkg-config output could be empty if Arrow is installed
- # as a system package.
- if library_dir:
- if not library_dir.startswith("-L"):
- raise ValueError(
- "pkg-config --libs-only-L returned unexpected "
- "value {!r}".format(library_dir))
- append_library_dir(library_dir[2:])
-
- if _sys.platform == 'win32':
- # TODO(wesm): Is this necessary, or does setuptools within a conda
- # installation add Library\lib to the linker path for MSVC?
- python_base_install = _os.path.dirname(_sys.executable)
- library_dir = _os.path.join(python_base_install, 'Library', 'lib')
-
- if _os.path.exists(_os.path.join(library_dir, 'arrow.lib')):
- append_library_dir(library_dir)
-
- # ARROW-4074: Allow for ARROW_HOME to be set to some other directory
- if _os.environ.get('ARROW_HOME'):
- append_library_dir(_os.path.join(_os.environ['ARROW_HOME'], 'lib'))
- else:
- # Python wheels bundle the Arrow libraries in the pyarrow directory.
- append_library_dir(_os.path.dirname(_os.path.abspath(__file__)))
-
- return library_dirs
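
get_include(), get_libraries() and get_library_dirs() above exist so that third-party C or Cython extensions can compile and link against the bundled Arrow libraries. A minimal setup.py sketch using them (module and source names below are hypothetical) could look like:

    import numpy as np
    import pyarrow as pa
    from setuptools import Extension, setup

    ext = Extension(
        "my_arrow_ext",                    # hypothetical extension name
        ["my_arrow_ext.cpp"],              # hypothetical C++ source
        include_dirs=[np.get_include(), pa.get_include()],
        libraries=pa.get_libraries(),
        library_dirs=pa.get_library_dirs(),
        language="c++",
    )

    setup(name="my_arrow_ext", ext_modules=[ext])

With wheel-based installs, pa.create_library_symlinks() may also need to be run once so that -larrow resolves, as described in the docstring above.
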
diff --git a/python/pyarrow/_compute.pxd b/python/pyarrow/_compute.pxd
deleted file mode 100644
index e187ed7..0000000
--- a/python/pyarrow/_compute.pxd
+++ /dev/null
@@ -1,27 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: language_level = 3
-
-from pyarrow.lib cimport *
-from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport *
-
-
-cdef class FunctionOptions(_Weakrefable):
-
- cdef const CFunctionOptions* get_options(self) except NULL
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
deleted file mode 100644
index 1515bdc..0000000
--- a/python/pyarrow/_compute.pyx
+++ /dev/null
@@ -1,1092 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: language_level = 3
-
-from cython.operator cimport dereference as deref
-
-from collections import namedtuple
-
-from pyarrow.lib import frombytes, tobytes, ordered_dict
-from pyarrow.lib cimport *
-from pyarrow.includes.libarrow cimport *
-import pyarrow.lib as lib
-
-import numpy as np
-
-
-cdef wrap_scalar_function(const shared_ptr[CFunction]& sp_func):
- """
- Wrap a C++ scalar Function in a ScalarFunction object.
- """
- cdef ScalarFunction func = ScalarFunction.__new__(ScalarFunction)
- func.init(sp_func)
- return func
-
-
-cdef wrap_vector_function(const shared_ptr[CFunction]& sp_func):
- """
- Wrap a C++ vector Function in a VectorFunction object.
- """
- cdef VectorFunction func = VectorFunction.__new__(VectorFunction)
- func.init(sp_func)
- return func
-
-
-cdef wrap_scalar_aggregate_function(const shared_ptr[CFunction]& sp_func):
- """
- Wrap a C++ aggregate Function in a ScalarAggregateFunction object.
- """
- cdef ScalarAggregateFunction func = (
- ScalarAggregateFunction.__new__(ScalarAggregateFunction)
- )
- func.init(sp_func)
- return func
-
-
-cdef wrap_hash_aggregate_function(const shared_ptr[CFunction]& sp_func):
- """
- Wrap a C++ aggregate Function in a HashAggregateFunction object.
- """
- cdef HashAggregateFunction func = (
- HashAggregateFunction.__new__(HashAggregateFunction)
- )
- func.init(sp_func)
- return func
-
-
-cdef wrap_meta_function(const shared_ptr[CFunction]& sp_func):
- """
- Wrap a C++ meta Function in a MetaFunction object.
- """
- cdef MetaFunction func = (
- MetaFunction.__new__(MetaFunction)
- )
- func.init(sp_func)
- return func
-
-
-cdef wrap_function(const shared_ptr[CFunction]& sp_func):
- """
- Wrap a C++ Function in a Function object.
-
- This dispatches to specialized wrappers depending on the function kind.
- """
- if sp_func.get() == NULL:
- raise ValueError('Function was NULL')
-
- cdef FunctionKind c_kind = sp_func.get().kind()
- if c_kind == FunctionKind_SCALAR:
- return wrap_scalar_function(sp_func)
- elif c_kind == FunctionKind_VECTOR:
- return wrap_vector_function(sp_func)
- elif c_kind == FunctionKind_SCALAR_AGGREGATE:
- return wrap_scalar_aggregate_function(sp_func)
- elif c_kind == FunctionKind_HASH_AGGREGATE:
- return wrap_hash_aggregate_function(sp_func)
- elif c_kind == FunctionKind_META:
- return wrap_meta_function(sp_func)
- else:
- raise NotImplementedError("Unknown Function::Kind")
-
-
-cdef wrap_scalar_kernel(const CScalarKernel* c_kernel):
- if c_kernel == NULL:
- raise ValueError('Kernel was NULL')
- cdef ScalarKernel kernel = ScalarKernel.__new__(ScalarKernel)
- kernel.init(c_kernel)
- return kernel
-
-
-cdef wrap_vector_kernel(const CVectorKernel* c_kernel):
- if c_kernel == NULL:
- raise ValueError('Kernel was NULL')
- cdef VectorKernel kernel = VectorKernel.__new__(VectorKernel)
- kernel.init(c_kernel)
- return kernel
-
-
-cdef wrap_scalar_aggregate_kernel(const CScalarAggregateKernel* c_kernel):
- if c_kernel == NULL:
- raise ValueError('Kernel was NULL')
- cdef ScalarAggregateKernel kernel = (
- ScalarAggregateKernel.__new__(ScalarAggregateKernel)
- )
- kernel.init(c_kernel)
- return kernel
-
-
-cdef wrap_hash_aggregate_kernel(const CHashAggregateKernel* c_kernel):
- if c_kernel == NULL:
- raise ValueError('Kernel was NULL')
- cdef HashAggregateKernel kernel = (
- HashAggregateKernel.__new__(HashAggregateKernel)
- )
- kernel.init(c_kernel)
- return kernel
-
-
-cdef class Kernel(_Weakrefable):
- """
- A kernel object.
-
- Kernels handle the execution of a Function for a certain signature.
- """
-
- def __init__(self):
- raise TypeError("Do not call {}'s constructor directly"
- .format(self.__class__.__name__))
-
-
-cdef class ScalarKernel(Kernel):
- cdef:
- const CScalarKernel* kernel
-
- cdef void init(self, const CScalarKernel* kernel) except *:
- self.kernel = kernel
-
- def __repr__(self):
- return ("ScalarKernel<{}>"
- .format(frombytes(self.kernel.signature.get().ToString())))
-
-
-cdef class VectorKernel(Kernel):
- cdef:
- const CVectorKernel* kernel
-
- cdef void init(self, const CVectorKernel* kernel) except *:
- self.kernel = kernel
-
- def __repr__(self):
- return ("VectorKernel<{}>"
- .format(frombytes(self.kernel.signature.get().ToString())))
-
-
-cdef class ScalarAggregateKernel(Kernel):
- cdef:
- const CScalarAggregateKernel* kernel
-
- cdef void init(self, const CScalarAggregateKernel* kernel) except *:
- self.kernel = kernel
-
- def __repr__(self):
- return ("ScalarAggregateKernel<{}>"
- .format(frombytes(self.kernel.signature.get().ToString())))
-
-
-cdef class HashAggregateKernel(Kernel):
- cdef:
- const CHashAggregateKernel* kernel
-
- cdef void init(self, const CHashAggregateKernel* kernel) except *:
- self.kernel = kernel
-
- def __repr__(self):
- return ("HashAggregateKernel<{}>"
- .format(frombytes(self.kernel.signature.get().ToString())))
-
-
-FunctionDoc = namedtuple(
- "FunctionDoc",
- ("summary", "description", "arg_names", "options_class"))
-
-
-cdef class Function(_Weakrefable):
- """
- A compute function.
-
- A function implements a certain logical computation over a range of
- possible input signatures. Each signature accepts a range of input
- types and is implemented by a given Kernel.
-
- Functions can be of different kinds:
-
- * "scalar" functions apply an item-wise computation over all items
- of their inputs. Each item in the output only depends on the values
- of the inputs at the same position. Examples: addition, comparisons,
- string predicates...
-
- * "vector" functions apply a collection-wise computation, such that
- each item in the output may depend on the values of several items
- in each input. Examples: dictionary encoding, sorting, extracting
- unique values...
-
- * "scalar_aggregate" functions reduce the dimensionality of the inputs by
- applying a reduction function. Examples: sum, min_max, mode...
-
- * "hash_aggregate" functions apply a reduction function to an input
- subdivided by grouping criteria. They may not be directly called.
- Examples: hash_sum, hash_min_max...
-
- * "meta" functions dispatch to other functions.
- """
- cdef:
- shared_ptr[CFunction] sp_func
- CFunction* base_func
-
- def __init__(self):
- raise TypeError("Do not call {}'s constructor directly"
- .format(self.__class__.__name__))
-
- cdef void init(self, const shared_ptr[CFunction]& sp_func) except *:
- self.sp_func = sp_func
- self.base_func = sp_func.get()
-
- def __repr__(self):
- return ("arrow.compute.Function<name={}, kind={}, "
- "arity={}, num_kernels={}>"
- ).format(self.name, self.kind, self.arity, self.num_kernels)
-
- def __reduce__(self):
- # Reduction uses the global registry
- return get_function, (self.name,)
-
- @property
- def name(self):
- """
- The function name.
- """
- return frombytes(self.base_func.name())
-
- @property
- def arity(self):
- """
- The function arity.
-
- If Ellipsis (i.e. `...`) is returned, the function takes a variable
- number of arguments.
- """
- cdef CArity arity = self.base_func.arity()
- if arity.is_varargs:
- return ...
- else:
- return arity.num_args
-
- @property
- def kind(self):
- """
- The function kind.
- """
- cdef FunctionKind c_kind = self.base_func.kind()
- if c_kind == FunctionKind_SCALAR:
- return 'scalar'
- elif c_kind == FunctionKind_VECTOR:
- return 'vector'
- elif c_kind == FunctionKind_SCALAR_AGGREGATE:
- return 'scalar_aggregate'
- elif c_kind == FunctionKind_HASH_AGGREGATE:
- return 'hash_aggregate'
- elif c_kind == FunctionKind_META:
- return 'meta'
- else:
- raise NotImplementedError("Unknown Function::Kind")
-
- @property
- def _doc(self):
- """
- The C++-like function documentation (for internal use).
- """
- cdef CFunctionDoc c_doc = self.base_func.doc()
-
- return FunctionDoc(frombytes(c_doc.summary),
- frombytes(c_doc.description),
- [frombytes(s) for s in c_doc.arg_names],
- frombytes(c_doc.options_class))
-
- @property
- def num_kernels(self):
- """
- The number of kernels implementing this function.
- """
- return self.base_func.num_kernels()
-
- def call(self, args, FunctionOptions options=None,
- MemoryPool memory_pool=None):
- """
- Call the function on the given arguments.
- """
- cdef:
- const CFunctionOptions* c_options = NULL
- CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
- CExecContext c_exec_ctx = CExecContext(pool)
- vector[CDatum] c_args
- CDatum result
-
- _pack_compute_args(args, &c_args)
-
- if options is not None:
- c_options = options.get_options()
-
- with nogil:
- result = GetResultValue(self.base_func.Execute(c_args,
- c_options,
- &c_exec_ctx))
-
- return wrap_datum(result)
-
-
-cdef class ScalarFunction(Function):
- cdef:
- const CScalarFunction* func
-
- cdef void init(self, const shared_ptr[CFunction]& sp_func) except *:
- Function.init(self, sp_func)
- self.func = <const CScalarFunction*> sp_func.get()
-
- @property
- def kernels(self):
- """
- The kernels implementing this function.
- """
- cdef vector[const CScalarKernel*] kernels = self.func.kernels()
- return [wrap_scalar_kernel(k) for k in kernels]
-
-
-cdef class VectorFunction(Function):
- cdef:
- const CVectorFunction* func
-
- cdef void init(self, const shared_ptr[CFunction]& sp_func) except *:
- Function.init(self, sp_func)
- self.func = <const CVectorFunction*> sp_func.get()
-
- @property
- def kernels(self):
- """
- The kernels implementing this function.
- """
- cdef vector[const CVectorKernel*] kernels = self.func.kernels()
- return [wrap_vector_kernel(k) for k in kernels]
-
-
-cdef class ScalarAggregateFunction(Function):
- cdef:
- const CScalarAggregateFunction* func
-
- cdef void init(self, const shared_ptr[CFunction]& sp_func) except *:
- Function.init(self, sp_func)
- self.func = <const CScalarAggregateFunction*> sp_func.get()
-
- @property
- def kernels(self):
- """
- The kernels implementing this function.
- """
- cdef vector[const CScalarAggregateKernel*] kernels = (
- self.func.kernels()
- )
- return [wrap_scalar_aggregate_kernel(k) for k in kernels]
-
-
-cdef class HashAggregateFunction(Function):
- cdef:
- const CHashAggregateFunction* func
-
- cdef void init(self, const shared_ptr[CFunction]& sp_func) except *:
- Function.init(self, sp_func)
- self.func = <const CHashAggregateFunction*> sp_func.get()
-
- @property
- def kernels(self):
- """
- The kernels implementing this function.
- """
- cdef vector[const CHashAggregateKernel*] kernels = (
- self.func.kernels()
- )
- return [wrap_hash_aggregate_kernel(k) for k in kernels]
-
-
-cdef class MetaFunction(Function):
- cdef:
- const CMetaFunction* func
-
- cdef void init(self, const shared_ptr[CFunction]& sp_func) except *:
- Function.init(self, sp_func)
- self.func = <const CMetaFunction*> sp_func.get()
-
- # Since num_kernels is exposed, also expose a kernels property
-
- @property
- def kernels(self):
- """
- The kernels implementing this function.
- """
- return []
-
-
-cdef _pack_compute_args(object values, vector[CDatum]* out):
- for val in values:
- if isinstance(val, (list, np.ndarray)):
- val = lib.asarray(val)
-
- if isinstance(val, Array):
- out.push_back(CDatum((<Array> val).sp_array))
- continue
- elif isinstance(val, ChunkedArray):
- out.push_back(CDatum((<ChunkedArray> val).sp_chunked_array))
- continue
- elif isinstance(val, Scalar):
- out.push_back(CDatum((<Scalar> val).unwrap()))
- continue
- elif isinstance(val, RecordBatch):
- out.push_back(CDatum((<RecordBatch> val).sp_batch))
- continue
- elif isinstance(val, Table):
- out.push_back(CDatum((<Table> val).sp_table))
- continue
- else:
- # Is it a Python scalar?
- try:
- scal = lib.scalar(val)
- except Exception:
- # Raise dedicated error below
- pass
- else:
- out.push_back(CDatum((<Scalar> scal).unwrap()))
- continue
-
- raise TypeError("Got unexpected argument type {} "
- "for compute function".format(type(val)))
-
-
-cdef class FunctionRegistry(_Weakrefable):
- cdef:
- CFunctionRegistry* registry
-
- def __init__(self):
- self.registry = GetFunctionRegistry()
-
- def list_functions(self):
- """
- Return all function names in the registry.
- """
- cdef vector[c_string] names = self.registry.GetFunctionNames()
- return [frombytes(name) for name in names]
-
- def get_function(self, name):
- """
- Look up a function by name in the registry.
- """
- cdef:
- c_string c_name = tobytes(name)
- shared_ptr[CFunction] func
- with nogil:
- func = GetResultValue(self.registry.GetFunction(c_name))
- return wrap_function(func)
-
-
-cdef FunctionRegistry _global_func_registry = FunctionRegistry()
-
-
-def function_registry():
- return _global_func_registry
-
-
-def get_function(name):
- """
- Get a function by name.
-
- The function is looked up in the global registry
- (as returned by `function_registry()`).
- """
- return _global_func_registry.get_function(name)
-
-
-def list_functions():
- """
- Return all function names in the global registry.
- """
- return _global_func_registry.list_functions()
-
-
-def call_function(name, args, options=None, memory_pool=None):
- """
- Call a named function.
-
- The function is looked up in the global registry
- (as returned by `function_registry()`).
- """
- func = _global_func_registry.get_function(name)
- return func.call(args, options=options, memory_pool=memory_pool)
-
-
-cdef class FunctionOptions(_Weakrefable):
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- raise NotImplementedError("Unimplemented base options")
-
-
-# NOTE:
-# To properly expose the constructor signature of FunctionOptions
-# subclasses, we use a two-level inheritance:
-# 1. a C extension class that implements option validation and setting
-# (won't expose function signatures because of
-# https://github.com/cython/cython/issues/3873)
-# 2. a Python derived class that implements the constructor
-
-cdef class _CastOptions(FunctionOptions):
- cdef:
- unique_ptr[CCastOptions] options
-
- __slots__ = () # avoid mistakenly creating attributes
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.options.get()
-
- def _set_options(self, DataType target_type, allow_int_overflow,
- allow_time_truncate, allow_time_overflow,
- allow_float_truncate, allow_invalid_utf8):
- self.options.reset(new CCastOptions())
- self._set_type(target_type)
- if allow_int_overflow is not None:
- self.allow_int_overflow = allow_int_overflow
- if allow_time_truncate is not None:
- self.allow_time_truncate = allow_time_truncate
- if allow_time_overflow is not None:
- self.allow_time_overflow = allow_time_overflow
- if allow_float_truncate is not None:
- self.allow_float_truncate = allow_float_truncate
- if allow_invalid_utf8 is not None:
- self.allow_invalid_utf8 = allow_invalid_utf8
-
- def _set_type(self, target_type=None):
- if target_type is not None:
- deref(self.options).to_type = (
- (<DataType> ensure_type(target_type)).sp_type
- )
-
- def _set_safe(self):
- self.options.reset(new CCastOptions(CCastOptions.Safe()))
-
- def _set_unsafe(self):
- self.options.reset(new CCastOptions(CCastOptions.Unsafe()))
-
- def is_safe(self):
- return not (
- deref(self.options).allow_int_overflow or
- deref(self.options).allow_time_truncate or
- deref(self.options).allow_time_overflow or
- deref(self.options).allow_float_truncate or
- deref(self.options).allow_invalid_utf8
- )
-
- @property
- def allow_int_overflow(self):
- return deref(self.options).allow_int_overflow
-
- @allow_int_overflow.setter
- def allow_int_overflow(self, bint flag):
- deref(self.options).allow_int_overflow = flag
-
- @property
- def allow_time_truncate(self):
- return deref(self.options).allow_time_truncate
-
- @allow_time_truncate.setter
- def allow_time_truncate(self, bint flag):
- deref(self.options).allow_time_truncate = flag
-
- @property
- def allow_time_overflow(self):
- return deref(self.options).allow_time_overflow
-
- @allow_time_overflow.setter
- def allow_time_overflow(self, bint flag):
- deref(self.options).allow_time_overflow = flag
-
- @property
- def allow_float_truncate(self):
- return deref(self.options).allow_float_truncate
-
- @allow_float_truncate.setter
- def allow_float_truncate(self, bint flag):
- deref(self.options).allow_float_truncate = flag
-
- @property
- def allow_invalid_utf8(self):
- return deref(self.options).allow_invalid_utf8
-
- @allow_invalid_utf8.setter
- def allow_invalid_utf8(self, bint flag):
- deref(self.options).allow_invalid_utf8 = flag
-
-
-class CastOptions(_CastOptions):
-
- def __init__(self, target_type=None, *, allow_int_overflow=None,
- allow_time_truncate=None, allow_time_overflow=None,
- allow_float_truncate=None, allow_invalid_utf8=None):
- self._set_options(target_type, allow_int_overflow,
- allow_time_truncate, allow_time_overflow,
- allow_float_truncate, allow_invalid_utf8)
-
- @staticmethod
- def safe(target_type=None):
- self = CastOptions()
- self._set_safe()
- self._set_type(target_type)
- return self
-
- @staticmethod
- def unsafe(target_type=None):
- self = CastOptions()
- self._set_unsafe()
- self._set_type(target_type)
- return self
-
-
-cdef class _MatchSubstringOptions(FunctionOptions):
- cdef:
- unique_ptr[CMatchSubstringOptions] match_substring_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.match_substring_options.get()
-
- def _set_options(self, pattern):
- self.match_substring_options.reset(
- new CMatchSubstringOptions(tobytes(pattern)))
-
-
-class MatchSubstringOptions(_MatchSubstringOptions):
- def __init__(self, pattern):
- self._set_options(pattern)
-
-
-cdef class _TrimOptions(FunctionOptions):
- cdef:
- unique_ptr[CTrimOptions] trim_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.trim_options.get()
-
- def _set_options(self, characters):
- self.trim_options.reset(
- new CTrimOptions(tobytes(characters)))
-
-
-class TrimOptions(_TrimOptions):
- def __init__(self, characters):
- self._set_options(characters)
-
-
-cdef class _ReplaceSubstringOptions(FunctionOptions):
- cdef:
- unique_ptr[CReplaceSubstringOptions] replace_substring_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.replace_substring_options.get()
-
- def _set_options(self, pattern, replacement, max_replacements):
- self.replace_substring_options.reset(
- new CReplaceSubstringOptions(tobytes(pattern),
- tobytes(replacement),
- max_replacements)
- )
-
-
-class ReplaceSubstringOptions(_ReplaceSubstringOptions):
- def __init__(self, pattern, replacement, max_replacements=-1):
- self._set_options(pattern, replacement, max_replacements)
-
-
-cdef class _FilterOptions(FunctionOptions):
- cdef:
- unique_ptr[CFilterOptions] filter_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.filter_options.get()
-
- def _set_options(self, null_selection_behavior):
- if null_selection_behavior == 'drop':
- self.filter_options.reset(
- new CFilterOptions(CFilterNullSelectionBehavior_DROP))
- elif null_selection_behavior == 'emit_null':
- self.filter_options.reset(
- new CFilterOptions(CFilterNullSelectionBehavior_EMIT_NULL))
- else:
- raise ValueError(
- '"{}" is not a valid null_selection_behavior'
- .format(null_selection_behavior))
-
-
-class FilterOptions(_FilterOptions):
- def __init__(self, null_selection_behavior='drop'):
- self._set_options(null_selection_behavior)
-
-
-cdef class _DictionaryEncodeOptions(FunctionOptions):
- cdef:
- unique_ptr[CDictionaryEncodeOptions] dictionary_encode_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.dictionary_encode_options.get()
-
- def _set_options(self, null_encoding_behavior):
- if null_encoding_behavior == 'encode':
- self.dictionary_encode_options.reset(
- new CDictionaryEncodeOptions(
- CDictionaryEncodeNullEncodingBehavior_ENCODE))
- elif null_encoding_behavior == 'mask':
- self.dictionary_encode_options.reset(
- new CDictionaryEncodeOptions(
- CDictionaryEncodeNullEncodingBehavior_MASK))
- else:
- raise ValueError('"{}" is not a valid null_encoding_behavior'
- .format(null_encoding_behavior))
-
-
-class DictionaryEncodeOptions(_DictionaryEncodeOptions):
- def __init__(self, null_encoding_behavior='mask'):
- self._set_options(null_encoding_behavior)
-
-
-cdef class _TakeOptions(FunctionOptions):
- cdef:
- unique_ptr[CTakeOptions] take_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.take_options.get()
-
- def _set_options(self, boundscheck):
- self.take_options.reset(new CTakeOptions(boundscheck))
-
-
-class TakeOptions(_TakeOptions):
- def __init__(self, *, boundscheck=True):
- self._set_options(boundscheck)
-
-
-cdef class _PartitionNthOptions(FunctionOptions):
- cdef:
- unique_ptr[CPartitionNthOptions] partition_nth_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.partition_nth_options.get()
-
- def _set_options(self, int64_t pivot):
- self.partition_nth_options.reset(new CPartitionNthOptions(pivot))
-
-
-class PartitionNthOptions(_PartitionNthOptions):
- def __init__(self, int64_t pivot):
- self._set_options(pivot)
-
-
-cdef class _ProjectOptions(FunctionOptions):
- cdef:
- unique_ptr[CProjectOptions] project_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.project_options.get()
-
- def _set_options(self, field_names):
- cdef:
- vector[c_string] c_field_names
- for n in field_names:
- c_field_names.push_back(tobytes(n))
- self.project_options.reset(new CProjectOptions(c_field_names))
-
-
-class ProjectOptions(_ProjectOptions):
- def __init__(self, field_names):
- self._set_options(field_names)
-
-
-cdef class _MinMaxOptions(FunctionOptions):
- cdef:
- unique_ptr[CMinMaxOptions] min_max_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.min_max_options.get()
-
- def _set_options(self, null_handling):
- if null_handling == 'skip':
- self.min_max_options.reset(
- new CMinMaxOptions(CMinMaxMode_SKIP))
- elif null_handling == 'emit_null':
- self.min_max_options.reset(
- new CMinMaxOptions(CMinMaxMode_EMIT_NULL))
- else:
- raise ValueError(
- '{!r} is not a valid null_handling'
- .format(null_handling))
-
-
-class MinMaxOptions(_MinMaxOptions):
- def __init__(self, null_handling='skip'):
- self._set_options(null_handling)
-
-
-cdef class _CountOptions(FunctionOptions):
- cdef:
- unique_ptr[CCountOptions] count_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.count_options.get()
-
- def _set_options(self, count_mode):
- if count_mode == 'count_null':
- self.count_options.reset(
- new CCountOptions(CCountMode_COUNT_NULL))
- elif count_mode == 'count_non_null':
- self.count_options.reset(
- new CCountOptions(CCountMode_COUNT_NON_NULL))
- else:
- raise ValueError(
- '{!r} is not a valid count_mode'
- .format(count_mode))
-
-
-class CountOptions(_CountOptions):
- def __init__(self, count_mode='count_non_null'):
- self._set_options(count_mode)
-
-
-cdef class _ModeOptions(FunctionOptions):
- cdef:
- unique_ptr[CModeOptions] mode_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.mode_options.get()
-
- def _set_options(self, n):
- self.mode_options.reset(new CModeOptions(n))
-
-
-class ModeOptions(_ModeOptions):
- def __init__(self, n=1):
- self._set_options(n)
-
-
-cdef class _SetLookupOptions(FunctionOptions):
- cdef:
- unique_ptr[CSetLookupOptions] set_lookup_options
- unique_ptr[CDatum] valset
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.set_lookup_options.get()
-
- def _set_options(self, value_set, c_bool skip_nulls):
- if isinstance(value_set, Array):
- self.valset.reset(new CDatum((<Array> value_set).sp_array))
- elif isinstance(value_set, ChunkedArray):
- self.valset.reset(
- new CDatum((<ChunkedArray> value_set).sp_chunked_array)
- )
- elif isinstance(value_set, Scalar):
- self.valset.reset(new CDatum((<Scalar> value_set).unwrap()))
- else:
- raise ValueError('"{}" is not a valid value_set'.format(value_set))
-
- self.set_lookup_options.reset(
- new CSetLookupOptions(deref(self.valset), skip_nulls)
- )
-
-
-class SetLookupOptions(_SetLookupOptions):
- def __init__(self, *, value_set, skip_nulls=False):
- self._set_options(value_set, skip_nulls)
-
-
-cdef class _StrptimeOptions(FunctionOptions):
- cdef:
- unique_ptr[CStrptimeOptions] strptime_options
- TimeUnit time_unit
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.strptime_options.get()
-
- def _set_options(self, format, unit):
- if unit == 's':
- self.time_unit = TimeUnit_SECOND
- elif unit == 'ms':
- self.time_unit = TimeUnit_MILLI
- elif unit == 'us':
- self.time_unit = TimeUnit_MICRO
- elif unit == 'ns':
- self.time_unit = TimeUnit_NANO
- else:
- raise ValueError('"{}" is not a valid time unit'.format(unit))
-
- self.strptime_options.reset(
- new CStrptimeOptions(tobytes(format), self.time_unit)
- )
-
-
-class StrptimeOptions(_StrptimeOptions):
- def __init__(self, format, unit):
- self._set_options(format, unit)
-
-
-cdef class _VarianceOptions(FunctionOptions):
- cdef:
- unique_ptr[CVarianceOptions] variance_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.variance_options.get()
-
- def _set_options(self, ddof):
- self.variance_options.reset(new CVarianceOptions(ddof))
-
-
-class VarianceOptions(_VarianceOptions):
- def __init__(self, *, ddof=0):
- self._set_options(ddof)
-
-
-cdef class _SplitOptions(FunctionOptions):
- cdef:
- unique_ptr[CSplitOptions] split_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.split_options.get()
-
- def _set_options(self, max_splits, reverse):
- self.split_options.reset(
- new CSplitOptions(max_splits, reverse))
-
-
-class SplitOptions(_SplitOptions):
- def __init__(self, *, max_splits=-1, reverse=False):
- self._set_options(max_splits, reverse)
-
-
-cdef class _SplitPatternOptions(FunctionOptions):
- cdef:
- unique_ptr[CSplitPatternOptions] split_pattern_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.split_pattern_options.get()
-
- def _set_options(self, pattern, max_splits, reverse):
- self.split_pattern_options.reset(
- new CSplitPatternOptions(tobytes(pattern), max_splits, reverse))
-
-
-class SplitPatternOptions(_SplitPatternOptions):
- def __init__(self, *, pattern, max_splits=-1, reverse=False):
- self._set_options(pattern, max_splits, reverse)
-
-
-cdef class _ArraySortOptions(FunctionOptions):
- cdef:
- unique_ptr[CArraySortOptions] array_sort_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.array_sort_options.get()
-
- def _set_options(self, order):
- if order == "ascending":
- self.array_sort_options.reset(
- new CArraySortOptions(CSortOrder_Ascending))
- elif order == "descending":
- self.array_sort_options.reset(
- new CArraySortOptions(CSortOrder_Descending))
- else:
- raise ValueError(
- "{!r} is not a valid order".format(order)
- )
-
-
-class ArraySortOptions(_ArraySortOptions):
- def __init__(self, *, order='ascending'):
- self._set_options(order)
-
-
-cdef class _SortOptions(FunctionOptions):
- cdef:
- unique_ptr[CSortOptions] sort_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.sort_options.get()
-
- def _set_options(self, sort_keys):
- cdef:
- vector[CSortKey] c_sort_keys
- c_string c_name
- CSortOrder c_order
-
- for name, order in sort_keys:
- if order == "ascending":
- c_order = CSortOrder_Ascending
- elif order == "descending":
- c_order = CSortOrder_Descending
- else:
- raise ValueError(
- "{!r} is not a valid order".format(order)
- )
- c_name = tobytes(name)
- c_sort_keys.push_back(CSortKey(c_name, c_order))
-
- self.sort_options.reset(new CSortOptions(c_sort_keys))
-
-
-class SortOptions(_SortOptions):
- def __init__(self, sort_keys=None):
- if sort_keys is None:
- sort_keys = []
- self._set_options(sort_keys)
-
-
-cdef class _QuantileOptions(FunctionOptions):
- cdef:
- unique_ptr[CQuantileOptions] quantile_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.quantile_options.get()
-
- def _set_options(self, quantiles, interp):
- interp_dict = {
- 'linear': CQuantileInterp_LINEAR,
- 'lower': CQuantileInterp_LOWER,
- 'higher': CQuantileInterp_HIGHER,
- 'nearest': CQuantileInterp_NEAREST,
- 'midpoint': CQuantileInterp_MIDPOINT,
- }
- if interp not in interp_dict:
- raise ValueError(
- '{!r} is not a valid interpolation'
- .format(interp))
- self.quantile_options.reset(
- new CQuantileOptions(quantiles, interp_dict[interp]))
-
-
-class QuantileOptions(_QuantileOptions):
- def __init__(self, *, q=0.5, interpolation='linear'):
- if not isinstance(q, (list, tuple, np.ndarray)):
- q = [q]
- self._set_options(q, interpolation)
-
-
-cdef class _TDigestOptions(FunctionOptions):
- cdef:
- unique_ptr[CTDigestOptions] tdigest_options
-
- cdef const CFunctionOptions* get_options(self) except NULL:
- return self.tdigest_options.get()
-
- def _set_options(self, quantiles, delta, buffer_size):
- self.tdigest_options.reset(
- new CTDigestOptions(quantiles, delta, buffer_size))
-
-
-class TDigestOptions(_TDigestOptions):
- def __init__(self, *, q=0.5, delta=100, buffer_size=500):
- if not isinstance(q, (list, tuple, np.ndarray)):
- q = [q]
- self._set_options(q, delta, buffer_size)
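A minimal usage sketch for the option classes above, going through the public pyarrow.compute API (a hedged illustration: exact wrapper names vary by pyarrow version, so the generic pc.call_function dispatcher is used and all values are placeholders):

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array([3.0, 1.0, 2.0, None])

    # Sort a plain array in descending order.
    idx = pc.call_function("array_sort_indices", [arr],
                           pc.ArraySortOptions(order="descending"))

    # Exact quantile with linear interpolation, plus its t-digest approximation.
    q = pc.call_function("quantile", [arr],
                         pc.QuantileOptions(q=0.5, interpolation="linear"))
    t = pc.call_function("tdigest", [arr],
                         pc.TDigestOptions(q=0.9, delta=100, buffer_size=500))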
diff --git a/python/pyarrow/_csv.pxd b/python/pyarrow/_csv.pxd
deleted file mode 100644
index f8e12f1..0000000
--- a/python/pyarrow/_csv.pxd
+++ /dev/null
@@ -1,46 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: language_level = 3
-
-from pyarrow.includes.libarrow cimport *
-from pyarrow.lib cimport _Weakrefable
-
-
-cdef class ConvertOptions(_Weakrefable):
- cdef:
- CCSVConvertOptions options
-
- @staticmethod
- cdef ConvertOptions wrap(CCSVConvertOptions options)
-
-
-cdef class ParseOptions(_Weakrefable):
- cdef:
- CCSVParseOptions options
-
- @staticmethod
- cdef ParseOptions wrap(CCSVParseOptions options)
-
-
-cdef class ReadOptions(_Weakrefable):
- cdef:
- CCSVReadOptions options
- public object encoding
-
- @staticmethod
- cdef ReadOptions wrap(CCSVReadOptions options)
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
deleted file mode 100644
index a98160c..0000000
--- a/python/pyarrow/_csv.pyx
+++ /dev/null
@@ -1,952 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: profile=False
-# distutils: language = c++
-# cython: embedsignature = True
-# cython: language_level = 3
-
-from cython.operator cimport dereference as deref
-
-import codecs
-from collections.abc import Mapping
-
-from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport *
-from pyarrow.lib cimport (check_status, Field, MemoryPool, Schema,
- RecordBatchReader, ensure_type,
- maybe_unbox_memory_pool, get_input_stream,
- get_writer, native_transcoding_input_stream,
- pyarrow_unwrap_batch, pyarrow_unwrap_table,
- pyarrow_wrap_schema, pyarrow_wrap_table,
- pyarrow_wrap_data_type, pyarrow_unwrap_data_type,
- Table, RecordBatch, StopToken)
-from pyarrow.lib import frombytes, tobytes, SignalStopHandler
-from pyarrow.util import _stringify_path
-
-
-cdef unsigned char _single_char(s) except 0:
- val = ord(s)
- if val == 0 or val > 127:
- raise ValueError("Expecting an ASCII character")
- return <unsigned char> val
-
-
-cdef class ReadOptions(_Weakrefable):
- """
- Options for reading CSV files.
-
- Parameters
- ----------
- use_threads : bool, optional (default True)
- Whether to use multiple threads to accelerate reading
- block_size : int, optional
- How many bytes to process at a time from the input stream.
- This will determine multi-threading granularity as well as
- the size of individual chunks in the Table.
- skip_rows: int, optional (default 0)
- The number of rows to skip before the column names (if any)
- and the CSV data.
- column_names: list, optional
- The column names of the target table. If empty, fall back on
- `autogenerate_column_names`.
- autogenerate_column_names: bool, optional (default False)
- Whether to autogenerate column names if `column_names` is empty.
- If true, column names will be of the form "f0", "f1"...
- If false, column names will be read from the first CSV row
- after `skip_rows`.
- encoding: str, optional (default 'utf8')
- The character encoding of the CSV data. Columns that cannot
- decode using this encoding can still be read as Binary.
- """
-
- # Avoid mistakenly creating attributes
- __slots__ = ()
-
- def __init__(self, *, use_threads=None, block_size=None, skip_rows=None,
- column_names=None, autogenerate_column_names=None,
- encoding='utf8'):
- self.options = CCSVReadOptions.Defaults()
- if use_threads is not None:
- self.use_threads = use_threads
- if block_size is not None:
- self.block_size = block_size
- if skip_rows is not None:
- self.skip_rows = skip_rows
- if column_names is not None:
- self.column_names = column_names
- if autogenerate_column_names is not None:
- self.autogenerate_column_names = autogenerate_column_names
- # Python-specific option
- self.encoding = encoding
-
- @property
- def use_threads(self):
- """
- Whether to use multiple threads to accelerate reading.
- """
- return self.options.use_threads
-
- @use_threads.setter
- def use_threads(self, value):
- self.options.use_threads = value
-
- @property
- def block_size(self):
- """
- How many bytes to process at a time from the input stream.
- This will determine multi-threading granularity as well as
- the size of individual chunks in the Table.
- """
- return self.options.block_size
-
- @block_size.setter
- def block_size(self, value):
- self.options.block_size = value
-
- @property
- def skip_rows(self):
- """
- The number of rows to skip before the column names (if any)
- and the CSV data.
- """
- return self.options.skip_rows
-
- @skip_rows.setter
- def skip_rows(self, value):
- self.options.skip_rows = value
-
- @property
- def column_names(self):
- """
- The column names of the target table. If empty, fall back on
- `autogenerate_column_names`.
- """
- return [frombytes(s) for s in self.options.column_names]
-
- @column_names.setter
- def column_names(self, value):
- self.options.column_names.clear()
- for item in value:
- self.options.column_names.push_back(tobytes(item))
-
- @property
- def autogenerate_column_names(self):
- """
- Whether to autogenerate column names if `column_names` is empty.
- If true, column names will be of the form "f0", "f1"...
- If false, column names will be read from the first CSV row
- after `skip_rows`.
- """
- return self.options.autogenerate_column_names
-
- @autogenerate_column_names.setter
- def autogenerate_column_names(self, value):
- self.options.autogenerate_column_names = value
-
- def equals(self, ReadOptions other):
- return (
- self.use_threads == other.use_threads and
- self.block_size == other.block_size and
- self.skip_rows == other.skip_rows and
- self.column_names == other.column_names and
- self.autogenerate_column_names ==
- other.autogenerate_column_names and
- self.encoding == other.encoding
- )
-
- @staticmethod
- cdef ReadOptions wrap(CCSVReadOptions options):
- out = ReadOptions()
- out.options = options
- out.encoding = 'utf8' # No way to know this
- return out
-
- def __getstate__(self):
- return (self.use_threads, self.block_size, self.skip_rows,
- self.column_names, self.autogenerate_column_names,
- self.encoding)
-
- def __setstate__(self, state):
- (self.use_threads, self.block_size, self.skip_rows,
- self.column_names, self.autogenerate_column_names,
- self.encoding) = state
-
- def __eq__(self, other):
- try:
- return self.equals(other)
- except TypeError:
- return False
-
-
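A hedged usage sketch for the ReadOptions class above; "data.csv" is a placeholder path and the keyword names mirror the docstring:

    import pyarrow.csv as csv

    # Skip two leading comment lines and supply explicit column names.
    opts = csv.ReadOptions(skip_rows=2, column_names=["id", "value"],
                           encoding="utf8")
    table = csv.read_csv("data.csv", read_options=opts)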
-cdef class ParseOptions(_Weakrefable):
- """
- Options for parsing CSV files.
-
- Parameters
- ----------
- delimiter: 1-character string, optional (default ',')
- The character delimiting individual cells in the CSV data.
- quote_char: 1-character string or False, optional (default '"')
- The character used optionally for quoting CSV values
- (False if quoting is not allowed).
- double_quote: bool, optional (default True)
- Whether two quotes in a quoted CSV value denote a single quote
- in the data.
- escape_char: 1-character string or False, optional (default False)
- The character used optionally for escaping special characters
- (False if escaping is not allowed).
- newlines_in_values: bool, optional (default False)
- Whether newline characters are allowed in CSV values.
- Setting this to True reduces the performance of multi-threaded
- CSV reading.
- ignore_empty_lines: bool, optional (default True)
- Whether empty lines are ignored in CSV input.
- If False, an empty line is interpreted as containing a single empty
- value (assuming a one-column CSV file).
- """
- __slots__ = ()
-
- def __init__(self, *, delimiter=None, quote_char=None, double_quote=None,
- escape_char=None, newlines_in_values=None,
- ignore_empty_lines=None):
- self.options = CCSVParseOptions.Defaults()
- if delimiter is not None:
- self.delimiter = delimiter
- if quote_char is not None:
- self.quote_char = quote_char
- if double_quote is not None:
- self.double_quote = double_quote
- if escape_char is not None:
- self.escape_char = escape_char
- if newlines_in_values is not None:
- self.newlines_in_values = newlines_in_values
- if ignore_empty_lines is not None:
- self.ignore_empty_lines = ignore_empty_lines
-
- @property
- def delimiter(self):
- """
- The character delimiting individual cells in the CSV data.
- """
- return chr(self.options.delimiter)
-
- @delimiter.setter
- def delimiter(self, value):
- self.options.delimiter = _single_char(value)
-
- @property
- def quote_char(self):
- """
- The character used optionally for quoting CSV values
- (False if quoting is not allowed).
- """
- if self.options.quoting:
- return chr(self.options.quote_char)
- else:
- return False
-
- @quote_char.setter
- def quote_char(self, value):
- if value is False:
- self.options.quoting = False
- else:
- self.options.quote_char = _single_char(value)
- self.options.quoting = True
-
- @property
- def double_quote(self):
- """
- Whether two quotes in a quoted CSV value denote a single quote
- in the data.
- """
- return self.options.double_quote
-
- @double_quote.setter
- def double_quote(self, value):
- self.options.double_quote = value
-
- @property
- def escape_char(self):
- """
- The character used optionally for escaping special characters
- (False if escaping is not allowed).
- """
- if self.options.escaping:
- return chr(self.options.escape_char)
- else:
- return False
-
- @escape_char.setter
- def escape_char(self, value):
- if value is False:
- self.options.escaping = False
- else:
- self.options.escape_char = _single_char(value)
- self.options.escaping = True
-
- @property
- def newlines_in_values(self):
- """
- Whether newline characters are allowed in CSV values.
- Setting this to True reduces the performance of multi-threaded
- CSV reading.
- """
- return self.options.newlines_in_values
-
- @newlines_in_values.setter
- def newlines_in_values(self, value):
- self.options.newlines_in_values = value
-
- @property
- def ignore_empty_lines(self):
- """
- Whether empty lines are ignored in CSV input.
- If False, an empty line is interpreted as containing a single empty
- value (assuming a one-column CSV file).
- """
- return self.options.ignore_empty_lines
-
- @ignore_empty_lines.setter
- def ignore_empty_lines(self, value):
- self.options.ignore_empty_lines = value
-
- def equals(self, ParseOptions other):
- return (
- self.delimiter == other.delimiter and
- self.quote_char == other.quote_char and
- self.double_quote == other.double_quote and
- self.escape_char == other.escape_char and
- self.newlines_in_values == other.newlines_in_values and
- self.ignore_empty_lines == other.ignore_empty_lines
- )
-
- @staticmethod
- cdef ParseOptions wrap(CCSVParseOptions options):
- out = ParseOptions()
- out.options = options
- return out
-
- def __getstate__(self):
- return (self.delimiter, self.quote_char, self.double_quote,
- self.escape_char, self.newlines_in_values,
- self.ignore_empty_lines)
-
- def __setstate__(self, state):
- (self.delimiter, self.quote_char, self.double_quote,
- self.escape_char, self.newlines_in_values,
- self.ignore_empty_lines) = state
-
- def __eq__(self, other):
- try:
- return self.equals(other)
- except TypeError:
- return False
-
-
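A corresponding sketch for the ParseOptions class above (same placeholder file name):

    import pyarrow.csv as csv

    # Semicolon-delimited input with backslash escapes and embedded newlines.
    parse = csv.ParseOptions(delimiter=";", escape_char="\\",
                             newlines_in_values=True)
    table = csv.read_csv("data.csv", parse_options=parse)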
-cdef class _ISO8601(_Weakrefable):
- """
- A special object indicating ISO-8601 parsing.
- """
- __slots__ = ()
-
- def __str__(self):
- return 'ISO8601'
-
- def __eq__(self, other):
- return isinstance(other, _ISO8601)
-
-
-ISO8601 = _ISO8601()
-
-
-cdef class ConvertOptions(_Weakrefable):
- """
- Options for converting CSV data.
-
- Parameters
- ----------
- check_utf8 : bool, optional (default True)
- Whether to check UTF8 validity of string columns.
- column_types: pa.Schema or dict, optional
- Explicitly map column names to column types. Passing this argument
- disables type inference on the defined columns.
- null_values: list, optional
- A sequence of strings that denote nulls in the data
- (defaults are appropriate in most cases). Note that by default,
- string columns are not checked for null values. To enable
- null checking for those, specify ``strings_can_be_null=True``.
- true_values: list, optional
- A sequence of strings that denote true booleans in the data
- (defaults are appropriate in most cases).
- false_values: list, optional
- A sequence of strings that denote false booleans in the data
- (defaults are appropriate in most cases).
- timestamp_parsers: list, optional
- A sequence of strptime()-compatible format strings, tried in order
- when attempting to infer or convert timestamp values (the special
- value ISO8601() can also be given). By default, a fast built-in
- ISO-8601 parser is used.
- strings_can_be_null: bool, optional (default False)
- Whether string / binary columns can have null values.
- If true, then strings in null_values are considered null for
- string columns.
- If false, then all strings are valid string values.
- auto_dict_encode: bool, optional (default False)
- Whether to try to automatically dict-encode string / binary data.
- If true, then when type inference detects a string or binary column,
- it is dict-encoded up to `auto_dict_max_cardinality` distinct values
- (per chunk), after which it switches to regular encoding.
- This setting is ignored for non-inferred columns (those in
- `column_types`).
- auto_dict_max_cardinality: int, optional
- The maximum dictionary cardinality for `auto_dict_encode`.
- This value is per chunk.
- include_columns: list, optional
- The names of columns to include in the Table.
- If empty, the Table will include all columns from the CSV file.
- If not empty, only these columns will be included, in this order.
- include_missing_columns: bool, optional (default False)
- If false, columns in `include_columns` but not in the CSV file will
- error out.
- If true, columns in `include_columns` but not in the CSV file will
- produce a column of nulls (whose type is selected using
- `column_types`, or null by default).
- This option is ignored if `include_columns` is empty.
- """
- # Avoid mistakenly creating attributes
- __slots__ = ()
-
- def __init__(self, *, check_utf8=None, column_types=None, null_values=None,
- true_values=None, false_values=None,
- strings_can_be_null=None, include_columns=None,
- include_missing_columns=None, auto_dict_encode=None,
- auto_dict_max_cardinality=None, timestamp_parsers=None):
- self.options = CCSVConvertOptions.Defaults()
- if check_utf8 is not None:
- self.check_utf8 = check_utf8
- if column_types is not None:
- self.column_types = column_types
- if null_values is not None:
- self.null_values = null_values
- if true_values is not None:
- self.true_values = true_values
- if false_values is not None:
- self.false_values = false_values
- if strings_can_be_null is not None:
- self.strings_can_be_null = strings_can_be_null
- if include_columns is not None:
- self.include_columns = include_columns
- if include_missing_columns is not None:
- self.include_missing_columns = include_missing_columns
- if auto_dict_encode is not None:
- self.auto_dict_encode = auto_dict_encode
- if auto_dict_max_cardinality is not None:
- self.auto_dict_max_cardinality = auto_dict_max_cardinality
- if timestamp_parsers is not None:
- self.timestamp_parsers = timestamp_parsers
-
- @property
- def check_utf8(self):
- """
- Whether to check UTF8 validity of string columns.
- """
- return self.options.check_utf8
-
- @check_utf8.setter
- def check_utf8(self, value):
- self.options.check_utf8 = value
-
- @property
- def strings_can_be_null(self):
- """
- Whether string / binary columns can have null values.
- """
- return self.options.strings_can_be_null
-
- @strings_can_be_null.setter
- def strings_can_be_null(self, value):
- self.options.strings_can_be_null = value
-
- @property
- def column_types(self):
- """
- Explicitly map column names to column types.
- """
- d = {frombytes(item.first): pyarrow_wrap_data_type(item.second)
- for item in self.options.column_types}
- return d
-
- @column_types.setter
- def column_types(self, value):
- cdef:
- shared_ptr[CDataType] typ
-
- if isinstance(value, Mapping):
- value = value.items()
-
- self.options.column_types.clear()
- for item in value:
- if isinstance(item, Field):
- k = item.name
- v = item.type
- else:
- k, v = item
- typ = pyarrow_unwrap_data_type(ensure_type(v))
- assert typ != NULL
- self.options.column_types[tobytes(k)] = typ
-
- @property
- def null_values(self):
- """
- A sequence of strings that denote nulls in the data.
- """
- return [frombytes(x) for x in self.options.null_values]
-
- @null_values.setter
- def null_values(self, value):
- self.options.null_values = [tobytes(x) for x in value]
-
- @property
- def true_values(self):
- """
- A sequence of strings that denote true booleans in the data.
- """
- return [frombytes(x) for x in self.options.true_values]
-
- @true_values.setter
- def true_values(self, value):
- self.options.true_values = [tobytes(x) for x in value]
-
- @property
- def false_values(self):
- """
- A sequence of strings that denote false booleans in the data.
- """
- return [frombytes(x) for x in self.options.false_values]
-
- @false_values.setter
- def false_values(self, value):
- self.options.false_values = [tobytes(x) for x in value]
-
- @property
- def auto_dict_encode(self):
- """
- Whether to try to automatically dict-encode string / binary data.
- """
- return self.options.auto_dict_encode
-
- @auto_dict_encode.setter
- def auto_dict_encode(self, value):
- self.options.auto_dict_encode = value
-
- @property
- def auto_dict_max_cardinality(self):
- """
- The maximum dictionary cardinality for `auto_dict_encode`.
-
- This value is per chunk.
- """
- return self.options.auto_dict_max_cardinality
-
- @auto_dict_max_cardinality.setter
- def auto_dict_max_cardinality(self, value):
- self.options.auto_dict_max_cardinality = value
-
- @property
- def include_columns(self):
- """
- The names of columns to include in the Table.
-
- If empty, the Table will include all columns from the CSV file.
- If not empty, only these columns will be included, in this order.
- """
- return [frombytes(s) for s in self.options.include_columns]
-
- @include_columns.setter
- def include_columns(self, value):
- self.options.include_columns.clear()
- for item in value:
- self.options.include_columns.push_back(tobytes(item))
-
- @property
- def include_missing_columns(self):
- """
- If false, columns in `include_columns` but not in the CSV file will
- error out.
- If true, columns in `include_columns` but not in the CSV file will
- produce a null column (whose type is selected using `column_types`,
- or null by default).
- This option is ignored if `include_columns` is empty.
- """
- return self.options.include_missing_columns
-
- @include_missing_columns.setter
- def include_missing_columns(self, value):
- self.options.include_missing_columns = value
-
- @property
- def timestamp_parsers(self):
- """
- A sequence of strptime()-compatible format strings, tried in order
- when attempting to infer or convert timestamp values (the special
- value ISO8601() can also be given). By default, a fast built-in
- ISO-8601 parser is used.
- """
- cdef:
- shared_ptr[CTimestampParser] c_parser
- c_string kind
-
- parsers = []
- for c_parser in self.options.timestamp_parsers:
- kind = deref(c_parser).kind()
- if kind == b'strptime':
- parsers.append(frombytes(deref(c_parser).format()))
- else:
- assert kind == b'iso8601'
- parsers.append(ISO8601)
-
- return parsers
-
- @timestamp_parsers.setter
- def timestamp_parsers(self, value):
- cdef:
- vector[shared_ptr[CTimestampParser]] c_parsers
-
- for v in value:
- if isinstance(v, str):
- c_parsers.push_back(CTimestampParser.MakeStrptime(tobytes(v)))
- elif v == ISO8601:
- c_parsers.push_back(CTimestampParser.MakeISO8601())
- else:
- raise TypeError("Expected list of str or ISO8601 objects")
-
- self.options.timestamp_parsers = move(c_parsers)
-
- @staticmethod
- cdef ConvertOptions wrap(CCSVConvertOptions options):
- out = ConvertOptions()
- out.options = options
- return out
-
- def equals(self, ConvertOptions other):
- return (
- self.check_utf8 == other.check_utf8 and
- self.column_types == other.column_types and
- self.null_values == other.null_values and
- self.true_values == other.true_values and
- self.false_values == other.false_values and
- self.timestamp_parsers == other.timestamp_parsers and
- self.strings_can_be_null == other.strings_can_be_null and
- self.auto_dict_encode == other.auto_dict_encode and
- self.auto_dict_max_cardinality ==
- other.auto_dict_max_cardinality and
- self.include_columns == other.include_columns and
- self.include_missing_columns == other.include_missing_columns
- )
-
- def __getstate__(self):
- return (self.check_utf8, self.column_types, self.null_values,
- self.true_values, self.false_values, self.timestamp_parsers,
- self.strings_can_be_null, self.auto_dict_encode,
- self.auto_dict_max_cardinality, self.include_columns,
- self.include_missing_columns)
-
- def __setstate__(self, state):
- (self.check_utf8, self.column_types, self.null_values,
- self.true_values, self.false_values, self.timestamp_parsers,
- self.strings_can_be_null, self.auto_dict_encode,
- self.auto_dict_max_cardinality, self.include_columns,
- self.include_missing_columns) = state
-
- def __eq__(self, other):
- try:
- return self.equals(other)
- except TypeError:
- return False
-
-
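A sketch combining several of the ConvertOptions knobs documented above; the column names "id" and "ts" and the extra timestamp format are illustrative:

    import pyarrow as pa
    import pyarrow.csv as csv

    convert = csv.ConvertOptions(
        column_types={"id": pa.int64(), "ts": pa.timestamp("s")},
        null_values=["", "NA"],
        strings_can_be_null=True,
        timestamp_parsers=[csv.ISO8601, "%Y/%m/%d"],
    )
    table = csv.read_csv("data.csv", convert_options=convert)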
-cdef _get_reader(input_file, ReadOptions read_options,
- shared_ptr[CInputStream]* out):
- use_memory_map = False
- get_input_stream(input_file, use_memory_map, out)
- if read_options is not None:
- out[0] = native_transcoding_input_stream(out[0],
- read_options.encoding,
- 'utf8')
-
-
-cdef _get_read_options(ReadOptions read_options, CCSVReadOptions* out):
- if read_options is None:
- out[0] = CCSVReadOptions.Defaults()
- else:
- out[0] = read_options.options
-
-
-cdef _get_parse_options(ParseOptions parse_options, CCSVParseOptions* out):
- if parse_options is None:
- out[0] = CCSVParseOptions.Defaults()
- else:
- out[0] = parse_options.options
-
-
-cdef _get_convert_options(ConvertOptions convert_options,
- CCSVConvertOptions* out):
- if convert_options is None:
- out[0] = CCSVConvertOptions.Defaults()
- else:
- out[0] = convert_options.options
-
-
-cdef class CSVStreamingReader(RecordBatchReader):
- """An object that reads record batches incrementally from a CSV file.
-
- Should not be instantiated directly by user code.
- """
- cdef readonly:
- Schema schema
-
- def __init__(self):
- raise TypeError("Do not call {}'s constructor directly, "
- "use pyarrow.csv.open_csv() instead."
- .format(self.__class__.__name__))
-
- # Note about cancellation: we cannot create a SignalStopHandler
- # by default here, as several CSVStreamingReader instances may be
- # created (including by the same thread). Handling cancellation
- # would require having the user pass the SignalStopHandler.
- # (in addition to solving ARROW-11853)
-
- cdef _open(self, shared_ptr[CInputStream] stream,
- CCSVReadOptions c_read_options,
- CCSVParseOptions c_parse_options,
- CCSVConvertOptions c_convert_options,
- MemoryPool memory_pool):
- cdef:
- shared_ptr[CSchema] c_schema
- CIOContext io_context
-
- io_context = CIOContext(maybe_unbox_memory_pool(memory_pool))
-
- with nogil:
- self.reader = <shared_ptr[CRecordBatchReader]> GetResultValue(
- CCSVStreamingReader.Make(
- io_context, stream,
- move(c_read_options), move(c_parse_options),
- move(c_convert_options)))
- c_schema = self.reader.get().schema()
-
- self.schema = pyarrow_wrap_schema(c_schema)
-
-
-def read_csv(input_file, read_options=None, parse_options=None,
- convert_options=None, MemoryPool memory_pool=None):
- """
- Read a Table from a stream of CSV data.
-
- Parameters
- ----------
- input_file: string, path or file-like object
- The location of CSV data. If a string or path, and if it ends
- with a recognized compressed file extension (e.g. ".gz" or ".bz2"),
- the data is automatically decompressed when reading.
- read_options: pyarrow.csv.ReadOptions, optional
- Options for the CSV reader (see pyarrow.csv.ReadOptions constructor
- for defaults)
- parse_options: pyarrow.csv.ParseOptions, optional
- Options for the CSV parser
- (see pyarrow.csv.ParseOptions constructor for defaults)
- convert_options: pyarrow.csv.ConvertOptions, optional
- Options for converting CSV data
- (see pyarrow.csv.ConvertOptions constructor for defaults)
- memory_pool: MemoryPool, optional
- Pool to allocate Table memory from
-
- Returns
- -------
- :class:`pyarrow.Table`
- Contents of the CSV file as an in-memory table.
- """
- cdef:
- shared_ptr[CInputStream] stream
- CCSVReadOptions c_read_options
- CCSVParseOptions c_parse_options
- CCSVConvertOptions c_convert_options
- CIOContext io_context
- shared_ptr[CCSVReader] reader
- shared_ptr[CTable] table
-
- _get_reader(input_file, read_options, &stream)
- _get_read_options(read_options, &c_read_options)
- _get_parse_options(parse_options, &c_parse_options)
- _get_convert_options(convert_options, &c_convert_options)
-
- with SignalStopHandler() as stop_handler:
- io_context = CIOContext(
- maybe_unbox_memory_pool(memory_pool),
- (<StopToken> stop_handler.stop_token).stop_token)
- reader = GetResultValue(CCSVReader.Make(
- io_context, stream,
- c_read_options, c_parse_options, c_convert_options))
-
- with nogil:
- table = GetResultValue(reader.get().Read())
-
- return pyarrow_wrap_table(table)
-
-
-def open_csv(input_file, read_options=None, parse_options=None,
- convert_options=None, MemoryPool memory_pool=None):
- """
- Open a streaming reader of CSV data.
-
- Reading using this function is always single-threaded.
-
- Parameters
- ----------
- input_file: string, path or file-like object
- The location of CSV data. If a string or path, and if it ends
- with a recognized compressed file extension (e.g. ".gz" or ".bz2"),
- the data is automatically decompressed when reading.
- read_options: pyarrow.csv.ReadOptions, optional
- Options for the CSV reader (see pyarrow.csv.ReadOptions constructor
- for defaults)
- parse_options: pyarrow.csv.ParseOptions, optional
- Options for the CSV parser
- (see pyarrow.csv.ParseOptions constructor for defaults)
- convert_options: pyarrow.csv.ConvertOptions, optional
- Options for converting CSV data
- (see pyarrow.csv.ConvertOptions constructor for defaults)
- memory_pool: MemoryPool, optional
- Pool to allocate Table memory from
-
- Returns
- -------
- :class:`pyarrow.csv.CSVStreamingReader`
- """
- cdef:
- shared_ptr[CInputStream] stream
- CCSVReadOptions c_read_options
- CCSVParseOptions c_parse_options
- CCSVConvertOptions c_convert_options
- CSVStreamingReader reader
-
- _get_reader(input_file, read_options, &stream)
- _get_read_options(read_options, &c_read_options)
- _get_parse_options(parse_options, &c_parse_options)
- _get_convert_options(convert_options, &c_convert_options)
-
- reader = CSVStreamingReader.__new__(CSVStreamingReader)
- reader._open(stream, move(c_read_options), move(c_parse_options),
- move(c_convert_options), memory_pool)
- return reader
-
-
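For the streaming path, a minimal sketch of open_csv: the returned CSVStreamingReader behaves like any RecordBatchReader and can be iterated batch by batch ("big.csv" and handle_batch are placeholders):

    import pyarrow.csv as csv

    reader = csv.open_csv("big.csv")
    for batch in reader:        # each item is a pyarrow.RecordBatch
        handle_batch(batch)     # hypothetical per-batch callback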
-cdef class WriteOptions(_Weakrefable):
- """
- Options for writing CSV files.
-
- Parameters
- ----------
- include_header : bool, optional (default True)
- Whether to write an initial header line with column names
- batch_size : int, optional (default 1024)
- How many rows to process together when converting and writing
- CSV data
- """
- cdef:
- CCSVWriteOptions options
-
- # Avoid mistakenly creating attributes
- __slots__ = ()
-
- def __init__(self, *, include_header=None, batch_size=None):
- self.options = CCSVWriteOptions.Defaults()
- if include_header is not None:
- self.include_header = include_header
- if batch_size is not None:
- self.batch_size = batch_size
-
- @property
- def include_header(self):
- """
- Whether to write an initial header line with column names.
- """
- return self.options.include_header
-
- @include_header.setter
- def include_header(self, value):
- self.options.include_header = value
-
- @property
- def batch_size(self):
- """
- How many rows to process together when converting and writing
- CSV data.
- """
- return self.options.batch_size
-
- @batch_size.setter
- def batch_size(self, value):
- self.options.batch_size = value
-
-
-cdef _get_write_options(WriteOptions write_options, CCSVWriteOptions* out):
- if write_options is None:
- out[0] = CCSVWriteOptions.Defaults()
- else:
- out[0] = write_options.options
-
-
-def write_csv(data, output_file, write_options=None,
- MemoryPool memory_pool=None):
- """
- Write record batch or table to a CSV file.
-
- Parameters
- ----------
- data: pyarrow.RecordBatch or pyarrow.Table
- The data to write.
- output_file: string, path, pyarrow.OutputStream or file-like object
- The location where to write the CSV data.
- write_options: pyarrow.csv.WriteOptions
- Options to configure writing the CSV data.
- memory_pool: MemoryPool, optional
- Pool for temporary allocations.
- """
- cdef:
- shared_ptr[COutputStream] stream
- CCSVWriteOptions c_write_options
- CMemoryPool* c_memory_pool
- CRecordBatch* batch
- CTable* table
- _get_write_options(write_options, &c_write_options)
-
- get_writer(output_file, &stream)
- c_memory_pool = maybe_unbox_memory_pool(memory_pool)
- if isinstance(data, RecordBatch):
- batch = pyarrow_unwrap_batch(data).get()
- with nogil:
- check_status(WriteCSV(deref(batch), c_write_options, c_memory_pool,
- stream.get()))
- elif isinstance(data, Table):
- table = pyarrow_unwrap_table(data).get()
- with nogil:
- check_status(WriteCSV(deref(table), c_write_options, c_memory_pool,
- stream.get()))
- else:
- raise TypeError(f"Expected Table or RecordBatch, got '{type(data)}'")
diff --git a/python/pyarrow/_cuda.pxd b/python/pyarrow/_cuda.pxd
deleted file mode 100644
index 6acb882..0000000
--- a/python/pyarrow/_cuda.pxd
+++ /dev/null
@@ -1,67 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: language_level = 3
-
-from pyarrow.lib cimport *
-from pyarrow.includes.common cimport *
-from pyarrow.includes.libarrow cimport *
-from pyarrow.includes.libarrow_cuda cimport *
-
-
-cdef class Context(_Weakrefable):
- cdef:
- shared_ptr[CCudaContext] context
- int device_number
-
- cdef void init(self, const shared_ptr[CCudaContext]& ctx)
-
-
-cdef class IpcMemHandle(_Weakrefable):
- cdef:
- shared_ptr[CCudaIpcMemHandle] handle
-
- cdef void init(self, shared_ptr[CCudaIpcMemHandle]& h)
-
-
-cdef class CudaBuffer(Buffer):
- cdef:
- shared_ptr[CCudaBuffer] cuda_buffer
- object base
-
- cdef void init_cuda(self,
- const shared_ptr[CCudaBuffer]& buffer,
- object base)
-
-
-cdef class HostBuffer(Buffer):
- cdef:
- shared_ptr[CCudaHostBuffer] host_buffer
-
- cdef void init_host(self, const shared_ptr[CCudaHostBuffer]& buffer)
-
-
-cdef class BufferReader(NativeFile):
- cdef:
- CCudaBufferReader* reader
- CudaBuffer buffer
-
-
-cdef class BufferWriter(NativeFile):
- cdef:
- CCudaBufferWriter* writer
- CudaBuffer buffer
diff --git a/python/pyarrow/_cuda.pyx b/python/pyarrow/_cuda.pyx
deleted file mode 100644
index f4ca763..0000000
--- a/python/pyarrow/_cuda.pyx
+++ /dev/null
@@ -1,1059 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-from pyarrow.lib import tobytes
-from pyarrow.lib cimport *
-from pyarrow.includes.libarrow_cuda cimport *
-from pyarrow.lib import py_buffer, allocate_buffer, as_buffer, ArrowTypeError
-from pyarrow.util import get_contiguous_span
-cimport cpython as cp
-
-
-cdef class Context(_Weakrefable):
- """
- CUDA driver context.
- """
-
- def __init__(self, *args, **kwargs):
- """
- Create a CUDA driver context for a particular device.
-
- If a CUDA context handle is passed, it is wrapped, otherwise
- a default CUDA context for the given device is requested.
-
- Parameters
- ----------
- device_number : int (default 0)
- Specify the GPU device for which the CUDA driver context is
- requested.
- handle : int, optional
- Specify CUDA handle for a shared context that has been created
- by another library.
- """
- # This method exposed because autodoc doesn't pick __cinit__
-
- def __cinit__(self, int device_number=0, uintptr_t handle=0):
- cdef CCudaDeviceManager* manager
- manager = GetResultValue(CCudaDeviceManager.Instance())
- cdef int n = manager.num_devices()
- if device_number >= n or device_number < 0:
- self.context.reset()
- raise ValueError('device_number argument must be '
- 'non-negative and less than %s' % (n))
- if handle == 0:
- self.context = GetResultValue(manager.GetContext(device_number))
- else:
- self.context = GetResultValue(manager.GetSharedContext(
- device_number, <void*>handle))
- self.device_number = device_number
-
- @staticmethod
- def from_numba(context=None):
- """
- Create a Context instance from a Numba CUDA context.
-
- Parameters
- ----------
- context : {numba.cuda.cudadrv.driver.Context, None}
- A Numba CUDA context instance.
- If None, the current Numba context is used.
-
- Returns
- -------
- shared_context : pyarrow.cuda.Context
- Context instance.
- """
- if context is None:
- import numba.cuda
- context = numba.cuda.current_context()
- return Context(device_number=context.device.id,
- handle=context.handle.value)
-
- def to_numba(self):
- """
- Convert Context to a Numba CUDA context.
-
- Returns
- -------
- context : numba.cuda.cudadrv.driver.Context
- Numba CUDA context instance.
- """
- import ctypes
- import numba.cuda
- device = numba.cuda.gpus[self.device_number]
- handle = ctypes.c_void_p(self.handle)
- context = numba.cuda.cudadrv.driver.Context(device, handle)
-
- class DummyPendingDeallocs(object):
- # Context is managed by pyarrow
- def add_item(self, *args, **kwargs):
- pass
-
- context.deallocations = DummyPendingDeallocs()
- return context
-
- @staticmethod
- def get_num_devices():
- """ Return the number of GPU devices.
- """
- cdef CCudaDeviceManager* manager
- manager = GetResultValue(CCudaDeviceManager.Instance())
- return manager.num_devices()
-
- @property
- def device_number(self):
- """ Return context device number.
- """
- return self.device_number
-
- @property
- def handle(self):
- """ Return pointer to context handle.
- """
- return <uintptr_t>self.context.get().handle()
-
- cdef void init(self, const shared_ptr[CCudaContext]& ctx):
- self.context = ctx
-
- def synchronize(self):
- """Blocks until the device has completed all preceding requested
- tasks.
- """
- check_status(self.context.get().Synchronize())
-
- @property
- def bytes_allocated(self):
- """Return the number of allocated bytes.
- """
- return self.context.get().bytes_allocated()
-
- def get_device_address(self, uintptr_t address):
- """Return the device address that is reachable from kernels running in
- the context
-
- Parameters
- ----------
- address : int
- Specify memory address value
-
- Returns
- -------
- device_address : int
- Device address accessible from device context
-
- Notes
- -----
- The device address is defined as a memory address accessible
- by the device. It is often a device memory address, but it can
- also be a host memory address, for instance when the memory is
- allocated as host memory (using cudaMallocHost or cudaHostAlloc),
- as managed memory (using cudaMallocManaged), or when the host
- memory is page-locked (using cudaHostRegister).
- """
- return GetResultValue(self.context.get().GetDeviceAddress(address))
-
- def new_buffer(self, int64_t nbytes):
- """Return new device buffer.
-
- Parameters
- ----------
- nbytes : int
- Specify the number of bytes to be allocated.
-
- Returns
- -------
- buf : CudaBuffer
- Allocated buffer.
- """
- cdef:
- shared_ptr[CCudaBuffer] cudabuf
- with nogil:
- cudabuf = GetResultValue(self.context.get().Allocate(nbytes))
- return pyarrow_wrap_cudabuffer(cudabuf)
-
- def foreign_buffer(self, address, size, base=None):
- """Create device buffer from address and size as a view.
-
- The caller is responsible for allocating and freeing the
- memory. When `address==size==0` then a new zero-sized buffer
- is returned.
-
- Parameters
- ----------
- address : int
- Specify the starting address of the buffer. The address can
- refer to both device or host memory but it must be
- accessible from device after mapping it with
- `get_device_address` method.
- size : int
- Specify the size of device buffer in bytes.
- base : {None, object}
- Specify object that owns the referenced memory.
-
- Returns
- -------
- cbuf : CudaBuffer
- Device buffer as a view of device reachable memory.
-
- """
- if not address and size == 0:
- return self.new_buffer(0)
- cdef:
- uintptr_t c_addr = self.get_device_address(address)
- int64_t c_size = size
- shared_ptr[CCudaBuffer] cudabuf
-
- cudabuf = GetResultValue(self.context.get().View(
- <uint8_t*>c_addr, c_size))
- return pyarrow_wrap_cudabuffer_base(cudabuf, base)
-
- def open_ipc_buffer(self, ipc_handle):
- """ Open existing CUDA IPC memory handle
-
- Parameters
- ----------
- ipc_handle : IpcMemHandle
- Specify opaque pointer to CUipcMemHandle (driver API).
-
- Returns
- -------
- buf : CudaBuffer
- referencing device buffer
- """
- handle = pyarrow_unwrap_cudaipcmemhandle(ipc_handle)
- cdef shared_ptr[CCudaBuffer] cudabuf
- with nogil:
- cudabuf = GetResultValue(
- self.context.get().OpenIpcBuffer(handle.get()[0]))
- return pyarrow_wrap_cudabuffer(cudabuf)
-
- def buffer_from_data(self, object data, int64_t offset=0, int64_t size=-1):
- """Create device buffer and initialize with data.
-
- Parameters
- ----------
- data : {CudaBuffer, HostBuffer, Buffer, array-like}
- Specify data to be copied to device buffer.
- offset : int
- Specify the offset of input buffer for device data
- buffering. Default: 0.
- size : int
- Specify the size of device buffer in bytes. Default: all
- (starting from input offset)
-
- Returns
- -------
- cbuf : CudaBuffer
- Device buffer with copied data.
- """
- is_host_data = not pyarrow_is_cudabuffer(data)
- buf = as_buffer(data) if is_host_data else data
-
- bsize = buf.size
- if offset < 0 or (bsize and offset >= bsize):
- raise ValueError('offset argument is out-of-range')
- if size < 0:
- size = bsize - offset
- elif offset + size > bsize:
- raise ValueError(
- 'requested larger slice than available in device buffer')
-
- if offset != 0 or size != bsize:
- buf = buf.slice(offset, size)
-
- result = self.new_buffer(size)
- if is_host_data:
- result.copy_from_host(buf, position=0, nbytes=size)
- else:
- result.copy_from_device(buf, position=0, nbytes=size)
- return result
-
- def buffer_from_object(self, obj):
- """Create device buffer view of arbitrary object that references
- device accessible memory.
-
- When the object contains a non-contiguous view of device
- accessible memory, the returned device buffer will contain a
- contiguous view of the memory, that is, including the
- intermediate data that is otherwise invisible to the input
- object.
-
- Parameters
- ----------
- obj : {object, Buffer, HostBuffer, CudaBuffer, ...}
- Specify an object that holds (device or host) address that
- can be accessed from device. This includes objects with
- types defined in pyarrow.cuda as well as arbitrary objects
- that implement the CUDA array interface as defined by numba.
-
- Returns
- -------
- cbuf : CudaBuffer
- Device buffer as a view of device accessible memory.
-
- """
- if isinstance(obj, HostBuffer):
- return self.foreign_buffer(obj.address, obj.size, base=obj)
- elif isinstance(obj, Buffer):
- return CudaBuffer.from_buffer(obj)
- elif isinstance(obj, CudaBuffer):
- return obj
- elif hasattr(obj, '__cuda_array_interface__'):
- desc = obj.__cuda_array_interface__
- addr = desc['data'][0]
- if addr is None:
- return self.new_buffer(0)
- import numpy as np
- start, end = get_contiguous_span(
- desc['shape'], desc.get('strides'),
- np.dtype(desc['typestr']).itemsize)
- return self.foreign_buffer(addr + start, end - start, base=obj)
- raise ArrowTypeError('cannot create device buffer view from'
- ' `%s` object' % (type(obj)))
-
-
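A hedged sketch of the Context API above; it assumes a pyarrow build with CUDA enabled and at least one visible GPU:

    import numpy as np
    from pyarrow import cuda

    ctx = cuda.Context(0)                        # default context for device 0
    host = np.arange(10, dtype=np.int32)
    dbuf = ctx.buffer_from_data(host.tobytes())  # allocate and copy to the device
    view = ctx.buffer_from_object(dbuf)          # CudaBuffer in, same buffer out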
-cdef class IpcMemHandle(_Weakrefable):
- """A serializable container for a CUDA IPC handle.
- """
- cdef void init(self, shared_ptr[CCudaIpcMemHandle]& h):
- self.handle = h
-
- @staticmethod
- def from_buffer(Buffer opaque_handle):
- """Create IpcMemHandle from opaque buffer (e.g. from another
- process)
-
- Parameters
- ----------
- opaque_handle :
- a CUipcMemHandle as a const void*
-
- Returns
- -------
- ipc_handle : IpcMemHandle
- """
- c_buf = pyarrow_unwrap_buffer(opaque_handle)
- cdef:
- shared_ptr[CCudaIpcMemHandle] handle
-
- handle = GetResultValue(
- CCudaIpcMemHandle.FromBuffer(c_buf.get().data()))
- return pyarrow_wrap_cudaipcmemhandle(handle)
-
- def serialize(self, pool=None):
- """Write IpcMemHandle to a Buffer
-
- Parameters
- ----------
- pool : {MemoryPool, None}
- Specify a pool to allocate memory from
-
- Returns
- -------
- buf : Buffer
- The serialized buffer.
- """
- cdef CMemoryPool* pool_ = maybe_unbox_memory_pool(pool)
- cdef shared_ptr[CBuffer] buf
- cdef CCudaIpcMemHandle* h = self.handle.get()
- with nogil:
- buf = GetResultValue(h.Serialize(pool_))
- return pyarrow_wrap_buffer(buf)
-
-
-cdef class CudaBuffer(Buffer):
- """An Arrow buffer with data located in a GPU device.
-
- To create a CudaBuffer instance, use Context.new_buffer() or Context.buffer_from_data().
-
- The memory allocated in a CudaBuffer is freed when the buffer object
- is deleted.
- """
-
- def __init__(self):
- raise TypeError("Do not call CudaBuffer's constructor directly, use "
- "`<pyarrow.Context instance>.device_buffer`"
- " method instead.")
-
- cdef void init_cuda(self,
- const shared_ptr[CCudaBuffer]& buffer,
- object base):
- self.cuda_buffer = buffer
- self.init(<shared_ptr[CBuffer]> buffer)
- self.base = base
-
- @staticmethod
- def from_buffer(buf):
- """ Convert back generic buffer into CudaBuffer
-
- Parameters
- ----------
- buf : Buffer
- Specify buffer containing CudaBuffer
-
- Returns
- -------
- dbuf : CudaBuffer
- Resulting device buffer.
- """
- c_buf = pyarrow_unwrap_buffer(buf)
- cuda_buffer = GetResultValue(CCudaBuffer.FromBuffer(c_buf))
- return pyarrow_wrap_cudabuffer(cuda_buffer)
-
- @staticmethod
- def from_numba(mem):
- """Create a CudaBuffer view from numba MemoryPointer instance.
-
- Parameters
- ----------
- mem : numba.cuda.cudadrv.driver.MemoryPointer
-
- Returns
- -------
- cbuf : CudaBuffer
- Device buffer as a view of numba MemoryPointer.
- """
- ctx = Context.from_numba(mem.context)
- if mem.device_pointer.value is None and mem.size == 0:
- return ctx.new_buffer(0)
- return ctx.foreign_buffer(mem.device_pointer.value, mem.size, base=mem)
-
- def to_numba(self):
- """Return numba memory pointer of CudaBuffer instance.
- """
- import ctypes
- from numba.cuda.cudadrv.driver import MemoryPointer
- return MemoryPointer(self.context.to_numba(),
- pointer=ctypes.c_void_p(self.address),
- size=self.size)
-
- cdef getitem(self, int64_t i):
- return self.copy_to_host(position=i, nbytes=1)[0]
-
- def copy_to_host(self, int64_t position=0, int64_t nbytes=-1,
- Buffer buf=None,
- MemoryPool memory_pool=None, c_bool resizable=False):
- """Copy memory from GPU device to CPU host
-
- Caller is responsible for ensuring that all tasks affecting
- the memory are finished. Use
-
- `<CudaBuffer instance>.context.synchronize()`
-
- when needed.
-
- Parameters
- ----------
- position : int
- Specify the starting position of the source data in GPU
- device buffer. Default: 0.
- nbytes : int
- Specify the number of bytes to copy. Default: -1 (all from
- the position until host buffer is full).
- buf : Buffer
- Specify a pre-allocated output buffer in host. Default: None
- (allocate new output buffer).
- memory_pool : MemoryPool
- resizable : bool
- Specify extra arguments to allocate_buffer. Used only when
- buf is None.
-
- Returns
- -------
- buf : Buffer
- Output buffer in host.
-
- """
- if position < 0 or (self.size and position > self.size) \
- or (self.size == 0 and position != 0):
- raise ValueError('position argument is out-of-range')
- cdef:
- int64_t c_nbytes
- if buf is None:
- if nbytes < 0:
- # copy all starting from position to new host buffer
- c_nbytes = self.size - position
- else:
- if nbytes > self.size - position:
- raise ValueError(
- 'requested more to copy than available from '
- 'device buffer')
- # copy nbytes starting from position to new host buffer
- c_nbytes = nbytes
- buf = allocate_buffer(c_nbytes, memory_pool=memory_pool,
- resizable=resizable)
- else:
- if nbytes < 0:
- # copy all from position until given host buffer is full
- c_nbytes = min(self.size - position, buf.size)
- else:
- if nbytes > buf.size:
- raise ValueError(
- 'requested copy does not fit into host buffer')
- # copy nbytes from position to given host buffer
- c_nbytes = nbytes
-
- cdef:
- shared_ptr[CBuffer] c_buf = pyarrow_unwrap_buffer(buf)
- int64_t c_position = position
- with nogil:
- check_status(self.cuda_buffer.get()
- .CopyToHost(c_position, c_nbytes,
- c_buf.get().mutable_data()))
- return buf
-
- def copy_from_host(self, data, int64_t position=0, int64_t nbytes=-1):
- """Copy data from host to device.
-
- The device buffer must be pre-allocated.
-
- Parameters
- ----------
- data : {Buffer, array-like}
- Specify data in host. It can be array-like that is valid
- argument to py_buffer
- position : int
- Specify the starting position of the copy in device buffer.
- Default: 0.
- nbytes : int
- Specify the number of bytes to copy. Default: -1 (all from
- source until device buffer, starting from position, is full)
-
- Returns
- -------
- nbytes : int
- Number of bytes copied.
- """
- if position < 0 or position > self.size:
- raise ValueError('position argument is out-of-range')
- cdef:
- int64_t c_nbytes
- buf = as_buffer(data)
-
- if nbytes < 0:
- # copy from host buffer to device buffer starting from
- # position until device buffer is full
- c_nbytes = min(self.size - position, buf.size)
- else:
- if nbytes > buf.size:
- raise ValueError(
- 'requested more to copy than available from host buffer')
- if nbytes > self.size - position:
- raise ValueError(
- 'requested more to copy than available in device buffer')
- # copy nbytes from host buffer to device buffer starting
- # from position
- c_nbytes = nbytes
-
- cdef:
- shared_ptr[CBuffer] c_buf = pyarrow_unwrap_buffer(buf)
- int64_t c_position = position
- with nogil:
- check_status(self.cuda_buffer.get().
- CopyFromHost(c_position, c_buf.get().data(),
- c_nbytes))
- return c_nbytes
-
- def copy_from_device(self, buf, int64_t position=0, int64_t nbytes=-1):
- """Copy data from device to device.
-
- Parameters
- ----------
- buf : CudaBuffer
- Specify source device buffer.
- position : int
- Specify the starting position of the copy in device buffer.
- Default: 0.
- nbytes : int
- Specify the number of bytes to copy. Default: -1 (all from
- source until device buffer, starting from position, is full)
-
- Returns
- -------
- nbytes : int
- Number of bytes copied.
-
- """
- if position < 0 or position > self.size:
- raise ValueError('position argument is out-of-range')
- cdef:
- int64_t c_nbytes
-
- if nbytes < 0:
- # copy from source device buffer to device buffer starting
- # from position until device buffer is full
- c_nbytes = min(self.size - position, buf.size)
- else:
- if nbytes > buf.size:
- raise ValueError(
- 'requested more to copy than available from device buffer')
- if nbytes > self.size - position:
- raise ValueError(
- 'requested more to copy than available in device buffer')
- # copy nbytes from source device buffer to device buffer
- # starting from position
- c_nbytes = nbytes
-
- cdef:
- shared_ptr[CCudaBuffer] c_buf = pyarrow_unwrap_cudabuffer(buf)
- int64_t c_position = position
- shared_ptr[CCudaContext] c_src_ctx = pyarrow_unwrap_cudacontext(
- buf.context)
- void* c_source_data = <void*>(c_buf.get().address())
-
- if self.context.handle != buf.context.handle:
- with nogil:
- check_status(self.cuda_buffer.get().
- CopyFromAnotherDevice(c_src_ctx, c_position,
- c_source_data, c_nbytes))
- else:
- with nogil:
- check_status(self.cuda_buffer.get().
- CopyFromDevice(c_position, c_source_data,
- c_nbytes))
- return c_nbytes
-
- def export_for_ipc(self):
- """
- Expose this device buffer as IPC memory which can be used in other
- processes.
-
- After calling this function, this device memory will not be
- freed when the CudaBuffer is destructed.
-
- Returns
- -------
- ipc_handle : IpcMemHandle
- The exported IPC handle
-
- """
- cdef shared_ptr[CCudaIpcMemHandle] handle
- with nogil:
- handle = GetResultValue(self.cuda_buffer.get().ExportForIpc())
- return pyarrow_wrap_cudaipcmemhandle(handle)
-
- @property
- def context(self):
- """Returns the CUDA driver context of this buffer.
- """
- return pyarrow_wrap_cudacontext(self.cuda_buffer.get().context())
-
- def slice(self, offset=0, length=None):
- """Return slice of device buffer
-
- Parameters
- ----------
- offset : int, default 0
- Specify offset from the start of device buffer to slice
- length : int, default None
- Specify the length of slice (default is until end of device
- buffer starting from offset). If the length is larger than
- the data available, the returned slice will have a size of
- the available data starting from the offset.
-
- Returns
- -------
- sliced : CudaBuffer
- Zero-copy slice of device buffer.
-
- """
- if offset < 0 or (self.size and offset >= self.size):
- raise ValueError('offset argument is out-of-range')
- cdef int64_t offset_ = offset
- cdef int64_t size
- if length is None:
- size = self.size - offset_
- elif offset + length <= self.size:
- size = length
- else:
- size = self.size - offset
- parent = pyarrow_unwrap_cudabuffer(self)
- return pyarrow_wrap_cudabuffer(make_shared[CCudaBuffer](parent,
- offset_, size))
-
- def to_pybytes(self):
- """Return device buffer content as Python bytes.
- """
- return self.copy_to_host().to_pybytes()
-
- def __getbuffer__(self, cp.Py_buffer* buffer, int flags):
- # Device buffer contains data pointers on the device. Hence,
- # cannot support buffer protocol PEP-3118 for CudaBuffer.
- raise BufferError('buffer protocol for device buffer not supported')
-
-
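A round-trip sketch for the CudaBuffer copy methods documented above (again assuming a CUDA-enabled build):

    from pyarrow import cuda

    ctx = cuda.Context(0)
    dbuf = ctx.buffer_from_data(b"hello")               # 5 bytes on the device
    assert dbuf.copy_to_host().to_pybytes() == b"hello"
    dbuf.copy_from_host(b"world", position=0, nbytes=5)
    assert dbuf.slice(0, 5).copy_to_host().to_pybytes() == b"world"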
-cdef class HostBuffer(Buffer):
- """Device-accessible CPU memory created using cudaHostAlloc.
-
- To create a HostBuffer instance, use
-
- cuda.new_host_buffer(<nbytes>)
- """
-
- def __init__(self):
- raise TypeError("Do not call HostBuffer's constructor directly,"
- " use `cuda.new_host_buffer` function instead.")
-
- cdef void init_host(self, const shared_ptr[CCudaHostBuffer]& buffer):
- self.host_buffer = buffer
- self.init(<shared_ptr[CBuffer]> buffer)
-
- @property
- def size(self):
- return self.host_buffer.get().size()
-
-
-cdef class BufferReader(NativeFile):
- """File interface for zero-copy read from CUDA buffers.
-
- Note: Read methods return pointers to device memory. This means
- you must be careful using this interface with any Arrow code which
- may expect to be able to do anything other than pointer arithmetic
- on the returned buffers.
- """
-
- def __cinit__(self, CudaBuffer obj):
- self.buffer = obj
- self.reader = new CCudaBufferReader(self.buffer.buffer)
- self.set_random_access_file(
- shared_ptr[CRandomAccessFile](self.reader))
- self.is_readable = True
-
- def read_buffer(self, nbytes=None):
- """Return a slice view of the underlying device buffer.
-
- The slice will start at the current reader position and will
- have specified size in bytes.
-
- Parameters
- ----------
- nbytes : int, default None
- Specify the number of bytes to read. Default: None (read all
- remaining bytes).
-
- Returns
- -------
- cbuf : CudaBuffer
- New device buffer.
-
- """
- cdef:
- int64_t c_nbytes
- int64_t bytes_read = 0
- shared_ptr[CCudaBuffer] output
-
- if nbytes is None:
- c_nbytes = self.size() - self.tell()
- else:
- c_nbytes = nbytes
-
- with nogil:
- output = static_pointer_cast[CCudaBuffer, CBuffer](
- GetResultValue(self.reader.Read(c_nbytes)))
-
- return pyarrow_wrap_cudabuffer(output)
-
-
-cdef class BufferWriter(NativeFile):
- """File interface for writing to CUDA buffers.
-
- By default writes are unbuffered. Use set_buffer_size to enable
- buffering.
- """
-
- def __cinit__(self, CudaBuffer buffer):
- self.buffer = buffer
- self.writer = new CCudaBufferWriter(self.buffer.cuda_buffer)
- self.set_output_stream(shared_ptr[COutputStream](self.writer))
- self.is_writable = True
-
- def writeat(self, int64_t position, object data):
- """Write data to buffer starting from position.
-
- Parameters
- ----------
- position : int
- Specify device buffer position where the data will be
- written.
- data : array-like
- Specify data, the data instance must implement buffer
- protocol.
- """
- cdef:
- Buffer buf = as_buffer(data)
- const uint8_t* c_data = buf.buffer.get().data()
- int64_t c_size = buf.buffer.get().size()
-
- with nogil:
- check_status(self.writer.WriteAt(position, c_data, c_size))
-
- def flush(self):
- """ Flush the buffer stream """
- with nogil:
- check_status(self.writer.Flush())
-
- def seek(self, int64_t position, int whence=0):
- # TODO: remove this method after NativeFile.seek supports
- # writable files.
- cdef int64_t offset
-
- with nogil:
- if whence == 0:
- offset = position
- elif whence == 1:
- offset = GetResultValue(self.writer.Tell())
- offset = offset + position
- else:
- with gil:
- raise ValueError("Invalid value of whence: {0}"
- .format(whence))
- check_status(self.writer.Seek(offset))
- return self.tell()
-
- @property
- def buffer_size(self):
- """Returns size of host (CPU) buffer, 0 for unbuffered
- """
- return self.writer.buffer_size()
-
- @buffer_size.setter
- def buffer_size(self, int64_t buffer_size):
- """Set CPU buffer size to limit calls to cudaMemcpy
-
- Parameters
- ----------
- buffer_size : int
- Specify the size of CPU buffer to allocate in bytes.
- """
- with nogil:
- check_status(self.writer.SetBufferSize(buffer_size))
-
- @property
- def num_bytes_buffered(self):
- """Returns number of bytes buffered on host
- """
- return self.writer.num_bytes_buffered()
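-
- # A minimal usage sketch for BufferWriter (illustrative only; assumes a
- # CUDA-enabled pyarrow build):
- from pyarrow import cuda
- ctx = cuda.Context(0)
- dbuf = ctx.new_buffer(16)                    # allocate 16 bytes on the device
- writer = cuda.BufferWriter(dbuf)
- writer.writeat(0, b"hello")                  # write host bytes at device offset 0
- writer.flush()
- dbuf.copy_to_host(0, 5).to_pybytes()         # -> b"hello"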
-
-# Functions
-
-
-def new_host_buffer(const int64_t size, int device=0):
- """Return buffer with CUDA-accessible memory on CPU host
-
- Parameters
- ----------
- size : int
- Specify the number of bytes to be allocated.
- device : int
- Specify GPU device number.
-
- Returns
- -------
- dbuf : HostBuffer
- Allocated host buffer
- """
- cdef shared_ptr[CCudaHostBuffer] buffer
- with nogil:
- buffer = GetResultValue(AllocateCudaHostBuffer(device, size))
- return pyarrow_wrap_cudahostbuffer(buffer)
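-
- # For illustration (assuming a CUDA-enabled pyarrow build), allocating pinned
- # host memory that device copies can target directly:
- from pyarrow import cuda
- hbuf = cuda.new_host_buffer(1 << 20)         # 1 MiB of CUDA-accessible host memory
- hbuf.size                                    # -> 1048576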
-
-
-def serialize_record_batch(object batch, object ctx):
- """ Write record batch message to GPU device memory
-
- Parameters
- ----------
- batch : RecordBatch
- Record batch to write
- ctx : Context
- CUDA Context to allocate device memory from
-
- Returns
- -------
- dbuf : CudaBuffer
- device buffer which contains the record batch message
- """
- cdef shared_ptr[CCudaBuffer] buffer
- cdef CRecordBatch* batch_ = pyarrow_unwrap_batch(batch).get()
- cdef CCudaContext* ctx_ = pyarrow_unwrap_cudacontext(ctx).get()
- with nogil:
- buffer = GetResultValue(CudaSerializeRecordBatch(batch_[0], ctx_))
- return pyarrow_wrap_cudabuffer(buffer)
-
-
-def read_message(object source, pool=None):
- """ Read Arrow IPC message located on GPU device
-
- Parameters
- ----------
- source : {CudaBuffer, cuda.BufferReader}
- Device buffer or reader of device buffer.
- pool : MemoryPool (optional)
- Pool to allocate CPU memory for the metadata
-
- Returns
- -------
- message : Message
- The deserialized message, body still on device
- """
- cdef:
- Message result = Message.__new__(Message)
- CMemoryPool* pool_ = maybe_unbox_memory_pool(pool)
- cdef BufferReader reader
- if isinstance(source, BufferReader):
- reader = source
- else:
- reader = BufferReader(source)
- with nogil:
- result.message = move(
- GetResultValue(ReadMessage(reader.reader, pool_)))
- return result
-
-
-def read_record_batch(object buffer, object schema, *,
- DictionaryMemo dictionary_memo=None, pool=None):
- """Construct RecordBatch referencing IPC message located on CUDA device.
-
- While the metadata is copied to host memory for deserialization,
- the record batch data remains on the device.
-
- Parameters
- ----------
- buffer : CudaBuffer
- Device buffer containing the complete IPC message
- schema : Schema
- The schema for the record batch
- dictionary_memo : DictionaryMemo, optional
- If message contains dictionaries, must pass a populated
- DictionaryMemo
- pool : MemoryPool (optional)
- Pool to allocate metadata from
-
- Returns
- -------
- batch : RecordBatch
- Reconstructed record batch, with device pointers
-
- """
- cdef:
- shared_ptr[CSchema] schema_ = pyarrow_unwrap_schema(schema)
- shared_ptr[CCudaBuffer] buffer_ = pyarrow_unwrap_cudabuffer(buffer)
- CDictionaryMemo temp_memo
- CDictionaryMemo* arg_dict_memo
- CMemoryPool* pool_ = maybe_unbox_memory_pool(pool)
- shared_ptr[CRecordBatch] batch
-
- if dictionary_memo is not None:
- arg_dict_memo = dictionary_memo.memo
- else:
- arg_dict_memo = &temp_memo
-
- with nogil:
- batch = GetResultValue(CudaReadRecordBatch(
- schema_, arg_dict_memo, buffer_, pool_))
- return pyarrow_wrap_batch(batch)
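-
- # A minimal round-trip sketch for serialize_record_batch and read_record_batch
- # (illustrative only; assumes a CUDA-enabled pyarrow build):
- import pyarrow as pa
- from pyarrow import cuda
- ctx = cuda.Context(0)
- batch = pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], names=["x"])
- dbuf = cuda.serialize_record_batch(batch, ctx)       # IPC message in device memory
- cbatch = cuda.read_record_batch(dbuf, batch.schema)  # batch whose buffers stay on device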
-
-
-# Public API
-
-
-cdef public api bint pyarrow_is_buffer(object buffer):
- return isinstance(buffer, Buffer)
-
-# cudabuffer
-
-cdef public api bint pyarrow_is_cudabuffer(object buffer):
- return isinstance(buffer, CudaBuffer)
-
-
-cdef public api object \
- pyarrow_wrap_cudabuffer_base(const shared_ptr[CCudaBuffer]& buf, base):
- cdef CudaBuffer result = CudaBuffer.__new__(CudaBuffer)
- result.init_cuda(buf, base)
- return result
-
-
-cdef public api object \
- pyarrow_wrap_cudabuffer(const shared_ptr[CCudaBuffer]& buf):
- cdef CudaBuffer result = CudaBuffer.__new__(CudaBuffer)
- result.init_cuda(buf, None)
- return result
-
-
-cdef public api shared_ptr[CCudaBuffer] pyarrow_unwrap_cudabuffer(object obj):
- if pyarrow_is_cudabuffer(obj):
- return (<CudaBuffer>obj).cuda_buffer
- raise TypeError('expected CudaBuffer instance, got %s'
- % (type(obj).__name__))
-
-# cudahostbuffer
-
-cdef public api bint pyarrow_is_cudahostbuffer(object buffer):
- return isinstance(buffer, HostBuffer)
-
-
-cdef public api object \
- pyarrow_wrap_cudahostbuffer(const shared_ptr[CCudaHostBuffer]& buf):
- cdef HostBuffer result = HostBuffer.__new__(HostBuffer)
- result.init_host(buf)
- return result
-
-
-cdef public api shared_ptr[CCudaHostBuffer] \
- pyarrow_unwrap_cudahostbuffer(object obj):
- if pyarrow_is_cudahostbuffer(obj):
- return (<HostBuffer>obj).host_buffer
- raise TypeError('expected HostBuffer instance, got %s'
- % (type(obj).__name__))
-
-# cudacontext
-
-cdef public api bint pyarrow_is_cudacontext(object ctx):
- return isinstance(ctx, Context)
-
-
-cdef public api object \
- pyarrow_wrap_cudacontext(const shared_ptr[CCudaContext]& ctx):
- cdef Context result = Context.__new__(Context)
- result.init(ctx)
- return result
-
-
-cdef public api shared_ptr[CCudaContext] \
- pyarrow_unwrap_cudacontext(object obj):
- if pyarrow_is_cudacontext(obj):
- return (<Context>obj).context
- raise TypeError('expected Context instance, got %s'
- % (type(obj).__name__))
-
-# cudaipcmemhandle
-
-cdef public api bint pyarrow_is_cudaipcmemhandle(object handle):
- return isinstance(handle, IpcMemHandle)
-
-
-cdef public api object \
- pyarrow_wrap_cudaipcmemhandle(shared_ptr[CCudaIpcMemHandle]& h):
- cdef IpcMemHandle result = IpcMemHandle.__new__(IpcMemHandle)
- result.init(h)
- return result
-
-
-cdef public api shared_ptr[CCudaIpcMemHandle] \
- pyarrow_unwrap_cudaipcmemhandle(object obj):
- if pyarrow_is_cudaipcmemhandle(obj):
- return (<IpcMemHandle>obj).handle
- raise TypeError('expected IpcMemHandle instance, got %s'
- % (type(obj).__name__))
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
deleted file mode 100644
index 6199428..0000000
--- a/python/pyarrow/_dataset.pyx
+++ /dev/null
@@ -1,2977 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# cython: language_level = 3
-
-"""Dataset is currently unstable. APIs subject to change without notice."""
-
-from cpython.object cimport Py_LT, Py_EQ, Py_GT, Py_LE, Py_NE, Py_GE
-from cython.operator cimport dereference as deref
-
-import collections
-import os
-import warnings
-
-import pyarrow as pa
-from pyarrow.lib cimport *
-from pyarrow.lib import ArrowTypeError, frombytes, tobytes
-from pyarrow.includes.libarrow_dataset cimport *
-from pyarrow._fs cimport FileSystem, FileInfo, FileSelector
-from pyarrow._csv cimport ConvertOptions, ParseOptions, ReadOptions
-from pyarrow.util import _is_iterable, _is_path_like, _stringify_path
-
-from pyarrow._parquet cimport (
- _create_writer_properties, _create_arrow_writer_properties,
- FileMetaData, RowGroupMetaData, ColumnChunkMetaData
-)
-
-
-def _forbid_instantiation(klass, subclasses_instead=True):
- msg = '{} is an abstract class and thus cannot be initialized.'.format(
- klass.__name__
- )
- if subclasses_instead:
- subclasses = [cls.__name__ for cls in klass.__subclasses__()]
- msg += ' Use one of the subclasses instead: {}'.format(
- ', '.join(subclasses)
- )
- raise TypeError(msg)
-
-
-cdef CFileSource _make_file_source(object file, FileSystem filesystem=None):
-
- cdef:
- CFileSource c_source
- shared_ptr[CFileSystem] c_filesystem
- c_string c_path
- shared_ptr[CRandomAccessFile] c_file
- shared_ptr[CBuffer] c_buffer
-
- if isinstance(file, Buffer):
- c_buffer = pyarrow_unwrap_buffer(file)
- c_source = CFileSource(move(c_buffer))
-
- elif _is_path_like(file):
- if filesystem is None:
- raise ValueError("cannot construct a FileSource from "
- "a path without a FileSystem")
- c_filesystem = filesystem.unwrap()
- c_path = tobytes(_stringify_path(file))
- c_source = CFileSource(move(c_path), move(c_filesystem))
-
- elif hasattr(file, 'read'):
- # Optimistically hope this is file-like
- c_file = get_native_file(file, False).get_random_access_file()
- c_source = CFileSource(move(c_file))
-
- else:
- raise TypeError("cannot construct a FileSource "
- "from " + str(file))
-
- return c_source
-
-
-cdef class Expression(_Weakrefable):
- """
- A logical expression to be evaluated against some input.
-
- To create an expression:
-
- - Use the factory function ``pyarrow.dataset.scalar()`` to create a
- scalar (not necessary when combined, see example below).
- - Use the factory function ``pyarrow.dataset.field()`` to reference
- a field (column in table).
- - Compare fields and scalars with ``<``, ``<=``, ``==``, ``>=``, ``>``.
- - Combine expressions using python operators ``&`` (logical and),
- ``|`` (logical or) and ``~`` (logical not).
- Note: python keywords ``and``, ``or`` and ``not`` cannot be used
- to combine expressions.
- - Check whether the expression is contained in a list of values with
- the ``pyarrow.dataset.Expression.isin()`` member function.
-
- Examples
- --------
-
- >>> import pyarrow.dataset as ds
- >>> (ds.field("a") < ds.scalar(3)) | (ds.field("b") > 7)
- <pyarrow.dataset.Expression ((a < 3:int64) or (b > 7:int64))>
- >>> ds.field('a') != 3
- <pyarrow.dataset.Expression (a != 3)>
- >>> ds.field('a').isin([1, 2, 3])
- <pyarrow.dataset.Expression (a is in [
- 1,
- 2,
- 3
- ])>
- """
- cdef:
- CExpression expr
-
- def __init__(self):
- _forbid_instantiation(self.__class__)
-
- cdef void init(self, const CExpression& sp):
- self.expr = sp
-
- @staticmethod
- cdef wrap(const CExpression& sp):
- cdef Expression self = Expression.__new__(Expression)
- self.init(sp)
- return self
-
- cdef inline CExpression unwrap(self):
- return self.expr
-
- def equals(self, Expression other):
- return self.expr.Equals(other.unwrap())
-
- def __str__(self):
- return frombytes(self.expr.ToString())
-
- def __repr__(self):
- return "<pyarrow.dataset.{0} {1}>".format(
- self.__class__.__name__, str(self)
- )
-
- @staticmethod
- def _deserialize(Buffer buffer not None):
- return Expression.wrap(GetResultValue(CDeserializeExpression(
- pyarrow_unwrap_buffer(buffer))))
-
- def __reduce__(self):
- buffer = pyarrow_wrap_buffer(GetResultValue(
- CSerializeExpression(self.expr)))
- return Expression._deserialize, (buffer,)
-
- @staticmethod
- cdef Expression _expr_or_scalar(object expr):
- if isinstance(expr, Expression):
- return (<Expression> expr)
- return (<Expression> Expression._scalar(expr))
-
- @staticmethod
- cdef Expression _call(str function_name, list arguments,
- shared_ptr[CFunctionOptions] options=(
- <shared_ptr[CFunctionOptions]> nullptr)):
- cdef:
- vector[CExpression] c_arguments
-
- for argument in arguments:
- c_arguments.push_back((<Expression> argument).expr)
-
- return Expression.wrap(CMakeCallExpression(tobytes(function_name),
- move(c_arguments), options))
-
- def __richcmp__(self, other, int op):
- other = Expression._expr_or_scalar(other)
- return Expression._call({
- Py_EQ: "equal",
- Py_NE: "not_equal",
- Py_GT: "greater",
- Py_GE: "greater_equal",
- Py_LT: "less",
- Py_LE: "less_equal",
- }[op], [self, other])
-
- def __bool__(self):
- raise ValueError(
- "An Expression cannot be evaluated to python True or False. "
- "If you are using the 'and', 'or' or 'not' operators, use '&', "
- "'|' or '~' instead."
- )
-
- def __invert__(self):
- return Expression._call("invert", [self])
-
- def __and__(Expression self, other):
- other = Expression._expr_or_scalar(other)
- return Expression._call("and_kleene", [self, other])
-
- def __or__(Expression self, other):
- other = Expression._expr_or_scalar(other)
- return Expression._call("or_kleene", [self, other])
-
- def __add__(Expression self, other):
- other = Expression._expr_or_scalar(other)
- return Expression._call("add_checked", [self, other])
-
- def __mul__(Expression self, other):
- other = Expression._expr_or_scalar(other)
- return Expression._call("multiply_checked", [self, other])
-
- def __sub__(Expression self, other):
- other = Expression._expr_or_scalar(other)
- return Expression._call("subtract_checked", [self, other])
-
- def __truediv__(Expression self, other):
- other = Expression._expr_or_scalar(other)
- return Expression._call("divide_checked", [self, other])
-
- def is_valid(self):
- """Checks whether the expression is not-null (valid)"""
- return Expression._call("is_valid", [self])
-
- def is_null(self):
- """Checks whether the expression is null"""
- return Expression._call("is_null", [self])
-
- def cast(self, type, bint safe=True):
- """Explicitly change the expression's data type"""
- cdef shared_ptr[CCastOptions] c_options
- c_options.reset(new CCastOptions(safe))
- c_options.get().to_type = pyarrow_unwrap_data_type(ensure_type(type))
- return Expression._call("cast", [self],
- <shared_ptr[CFunctionOptions]> c_options)
-
- def isin(self, values):
- """Checks whether the expression is contained in values"""
- cdef:
- shared_ptr[CFunctionOptions] c_options
- CDatum c_values
-
- if not isinstance(values, pa.Array):
- values = pa.array(values)
-
- c_values = CDatum(pyarrow_unwrap_array(values))
- c_options.reset(new CSetLookupOptions(c_values, True))
- return Expression._call("is_in", [self], c_options)
-
- @staticmethod
- def _field(str name not None):
- return Expression.wrap(CMakeFieldExpression(tobytes(name)))
-
- @staticmethod
- def _scalar(value):
- cdef:
- Scalar scalar
-
- if isinstance(value, Scalar):
- scalar = value
- else:
- scalar = pa.scalar(value)
-
- return Expression.wrap(CMakeScalarExpression(scalar.unwrap()))
-
-
-_deserialize = Expression._deserialize
-cdef Expression _true = Expression._scalar(True)
-
-
-cdef class Dataset(_Weakrefable):
- """
- Collection of data fragments and potentially child datasets.
-
- Arrow Datasets allow you to query against data that has been split across
- multiple files. This sharding of data may indicate partitioning, which
- can accelerate queries that only touch some partitions (files).
- """
-
- cdef:
- shared_ptr[CDataset] wrapped
- CDataset* dataset
-
- def __init__(self):
- _forbid_instantiation(self.__class__)
-
- cdef void init(self, const shared_ptr[CDataset]& sp):
- self.wrapped = sp
- self.dataset = sp.get()
-
- @staticmethod
- cdef wrap(const shared_ptr[CDataset]& sp):
- type_name = frombytes(sp.get().type_name())
-
- classes = {
- 'union': UnionDataset,
- 'filesystem': FileSystemDataset,
- }
-
- class_ = classes.get(type_name, None)
- if class_ is None:
- raise TypeError(type_name)
-
- cdef Dataset self = class_.__new__(class_)
- self.init(sp)
- return self
-
- cdef shared_ptr[CDataset] unwrap(self) nogil:
- return self.wrapped
-
- @property
- def partition_expression(self):
- """
- An Expression which evaluates to true for all data viewed by this
- Dataset.
- """
- return Expression.wrap(self.dataset.partition_expression())
-
- def replace_schema(self, Schema schema not None):
- """
- Return a copy of this Dataset with a different schema.
-
- The copy will view the same Fragments. If the new schema is not
- compatible with the original dataset's schema then an error will
- be raised.
- """
- cdef shared_ptr[CDataset] copy = GetResultValue(
- self.dataset.ReplaceSchema(pyarrow_unwrap_schema(schema)))
- return Dataset.wrap(move(copy))
-
- def get_fragments(self, Expression filter=None):
- """Returns an iterator over the fragments in this dataset.
-
- Parameters
- ----------
- filter : Expression, default None
- Return fragments matching the optional filter, either using the
- partition_expression or internal information like Parquet's
- statistics.
-
- Returns
- -------
- fragments : iterator of Fragment
- """
- cdef:
- CExpression c_filter
- CFragmentIterator c_iterator
-
- if filter is None:
- c_fragments = move(GetResultValue(self.dataset.GetFragments()))
- else:
- c_filter = _bind(filter, self.schema)
- c_fragments = move(GetResultValue(
- self.dataset.GetFragments(c_filter)))
-
- for maybe_fragment in c_fragments:
- yield Fragment.wrap(GetResultValue(move(maybe_fragment)))
-
- def _scanner(self, **kwargs):
- return Scanner.from_dataset(self, **kwargs)
-
- def scan(self, **kwargs):
- """Builds a scan operation against the dataset.
-
- It produces a stream of ScanTasks, each of which is meant to be a unit
- of work to be dispatched. The tasks are not executed automatically; the
- user is responsible for executing and dispatching the individual tasks,
- so custom local task scheduling can be implemented.
-
- .. deprecated:: 4.0.0
- Use `to_batches` instead.
-
- Parameters
- ----------
- columns : list of str, default None
- The columns to project. This can be a list of column names to
- include (order and duplicates will be preserved), or a dictionary
- with {new_column_name: expression} values for more advanced
- projections.
- The columns will be passed down to Datasets and corresponding data
- fragments to avoid loading, copying, and deserializing columns
- that will not be required further down the compute chain.
- By default all of the available columns are projected. Raises
- an exception if any of the referenced column names does not exist
- in the dataset's Schema.
- filter : Expression, default None
- Scan will return only the rows matching the filter.
- If possible the predicate will be pushed down to exploit the
- partition information or internal metadata found in the data
- source, e.g. Parquet statistics. Otherwise filters the loaded
- RecordBatches before yielding them.
- batch_size : int, default 1M
- The maximum row count for scanned record batches. If scanned
- record batches are overflowing memory then this value can be
- lowered to reduce their size.
- use_threads : bool, default True
- If enabled, maximum parallelism will be used, as determined by
- the number of available CPU cores.
- memory_pool : MemoryPool, default None
- For memory allocations, if required. If not specified, uses the
- default pool.
- fragment_scan_options : FragmentScanOptions, default None
- Options specific to a particular scan and fragment type, which
- can change between different scans of the same dataset.
-
- Returns
- -------
- scan_tasks : iterator of ScanTask
-
- Examples
- --------
- >>> import pyarrow.dataset as ds
- >>> dataset = ds.dataset("path/to/dataset")
-
- Selecting a subset of the columns:
-
- >>> dataset.scan(columns=["A", "B"])
-
- Projecting selected columns using an expression:
-
- >>> dataset.scan(columns={"A_int": ds.field("A").cast("int64")})
-
- Filtering rows while scanning:
-
- >>> dataset.scan(filter=ds.field("A") > 0)
- """
- return self._scanner(**kwargs).scan()
-
- def to_batches(self, **kwargs):
- """Read the dataset as materialized record batches.
-
- Builds a scan operation against the dataset and sequentially executes
- the ScanTasks as the returned generator gets consumed.
-
- See scan method parameters documentation.
-
- Returns
- -------
- record_batches : iterator of RecordBatch
- """
- return self._scanner(**kwargs).to_batches()
-
- def to_table(self, **kwargs):
- """Read the dataset to an arrow table.
-
- Note that this method reads all the selected data from the dataset
- into memory.
-
- See scan method parameters documentation.
-
- Returns
- -------
- table : Table instance
- """
- return self._scanner(**kwargs).to_table()
-
- def head(self, int num_rows, **kwargs):
- """Load the first N rows of the dataset.
-
- See scan method parameters documentation.
-
- Returns
- -------
- table : Table instance
- """
- return self._scanner(**kwargs).head(num_rows)
-
- @property
- def schema(self):
- """The common schema of the full Dataset"""
- return pyarrow_wrap_schema(self.dataset.schema())
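-
- # A minimal usage sketch for materializing a Dataset (illustrative; the path
- # and column names below are hypothetical):
- import pyarrow.dataset as ds
- dataset = ds.dataset("path/to/dataset", format="parquet")
- table = dataset.to_table(columns=["A", "B"], filter=ds.field("A") > 0)
- batches = list(dataset.to_batches(columns=["A"]))
- preview = dataset.head(5)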
-
-
-cdef class InMemoryDataset(Dataset):
- """A Dataset wrapping in-memory data.
-
- Parameters
- ----------
- source
- The data for this dataset. Can be a RecordBatch, Table, list of
- RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader.
- If an iterable is provided, the schema must also be provided.
- schema : Schema, optional
- Only required if passing an iterable as the source.
- """
-
- cdef:
- CInMemoryDataset* in_memory_dataset
-
- def __init__(self, source, Schema schema=None):
- cdef:
- RecordBatchReader reader
- shared_ptr[CInMemoryDataset] in_memory_dataset
-
- if isinstance(source, (pa.RecordBatch, pa.Table)):
- source = [source]
-
- if isinstance(source, (list, tuple)):
- batches = []
- for item in source:
- if isinstance(item, pa.RecordBatch):
- batches.append(item)
- elif isinstance(item, pa.Table):
- batches.extend(item.to_batches())
- else:
- raise TypeError(
- 'Expected a list of tables or batches. The given list '
- 'contains a ' + type(item).__name__)
- if schema is None:
- schema = item.schema
- elif not schema.equals(item.schema):
- raise ArrowTypeError(
- f'Item has schema\n{item.schema}\nwhich does not '
- f'match expected schema\n{schema}')
- if not batches and schema is None:
- raise ValueError('Must provide schema to construct in-memory '
- 'dataset from an empty list')
- table = pa.Table.from_batches(batches, schema=schema)
- in_memory_dataset = make_shared[CInMemoryDataset](
- pyarrow_unwrap_table(table))
- elif isinstance(source, pa.ipc.RecordBatchReader):
- reader = source
- in_memory_dataset = make_shared[CInMemoryDataset](reader.reader)
- elif _is_iterable(source):
- if schema is None:
- raise ValueError('Must provide schema to construct in-memory '
- 'dataset from an iterable')
- reader = pa.ipc.RecordBatchReader.from_batches(schema, source)
- in_memory_dataset = make_shared[CInMemoryDataset](reader.reader)
- else:
- raise TypeError(
- 'Expected a table, batch, iterable of tables/batches, or a '
- 'record batch reader instead of the given type: ' +
- type(source).__name__
- )
-
- self.init(<shared_ptr[CDataset]> in_memory_dataset)
-
- cdef void init(self, const shared_ptr[CDataset]& sp):
- Dataset.init(self, sp)
- self.in_memory_dataset = <CInMemoryDataset*> sp.get()
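-
- # A minimal usage sketch for InMemoryDataset (illustrative only):
- import pyarrow as pa
- import pyarrow.dataset as ds
- table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
- dataset = ds.InMemoryDataset(table)
- dataset.to_table(filter=ds.field("a") > 1).num_rows   # -> 2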
-
-
-cdef class UnionDataset(Dataset):
- """A Dataset wrapping child datasets.
-
- Children's schemas must agree with the provided schema.
-
- Parameters
- ----------
- schema : Schema
- A known schema to conform to.
- children : list of Dataset
- One or more input children
- """
-
- cdef:
- CUnionDataset* union_dataset
-
- def __init__(self, Schema schema not None, children):
- cdef:
- Dataset child
- CDatasetVector c_children
- shared_ptr[CUnionDataset] union_dataset
-
- for child in children:
- c_children.push_back(child.wrapped)
-
- union_dataset = GetResultValue(CUnionDataset.Make(
- pyarrow_unwrap_schema(schema), move(c_children)))
- self.init(<shared_ptr[CDataset]> union_dataset)
-
- cdef void init(self, const shared_ptr[CDataset]& sp):
- Dataset.init(self, sp)
- self.union_dataset = <CUnionDataset*> sp.get()
-
- def __reduce__(self):
- return UnionDataset, (self.schema, self.children)
-
- @property
- def children(self):
- cdef CDatasetVector children = self.union_dataset.children()
- return [Dataset.wrap(children[i]) for i in range(children.size())]
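-
- # A minimal usage sketch for UnionDataset (illustrative only):
- import pyarrow as pa
- import pyarrow.dataset as ds
- table = pa.table({"a": [1, 2, 3]})
- children = [ds.InMemoryDataset(table), ds.InMemoryDataset(table)]
- union = ds.UnionDataset(table.schema, children)
- union.to_table().num_rows   # -> 6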
-
-
-cdef class FileSystemDataset(Dataset):
- """A Dataset of file fragments.
-
- A FileSystemDataset is composed of one or more FileFragment.
-
- Parameters
- ----------
- fragments : list of Fragment
- List of fragments to consume.
- schema : Schema
- The top-level schema of the Dataset.
- format : FileFormat
- File format of the fragments, currently only ParquetFileFormat,
- IpcFileFormat, and CsvFileFormat are supported.
- filesystem : FileSystem
- FileSystem of the fragments.
- root_partition : Expression, optional
- The top-level partition of the Dataset.
- """
-
- cdef:
- CFileSystemDataset* filesystem_dataset
-
- def __init__(self, fragments, Schema schema, FileFormat format,
- FileSystem filesystem=None, root_partition=None):
- cdef:
- FileFragment fragment=None
- vector[shared_ptr[CFileFragment]] c_fragments
- CResult[shared_ptr[CDataset]] result
- shared_ptr[CFileSystem] c_filesystem
-
- if root_partition is None:
- root_partition = _true
- elif not isinstance(root_partition, Expression):
- raise TypeError(
- "Argument 'root_partition' has incorrect type (expected "
- "Epression, got {0})".format(type(root_partition))
- )
-
- for fragment in fragments:
- c_fragments.push_back(
- static_pointer_cast[CFileFragment, CFragment](
- fragment.unwrap()))
-
- if filesystem is None:
- filesystem = fragment.filesystem
-
- if filesystem is not None:
- c_filesystem = filesystem.unwrap()
-
- result = CFileSystemDataset.Make(
- pyarrow_unwrap_schema(schema),
- (<Expression> root_partition).unwrap(),
- format.unwrap(),
- c_filesystem,
- c_fragments
- )
- self.init(GetResultValue(result))
-
- @property
- def filesystem(self):
- return FileSystem.wrap(self.filesystem_dataset.filesystem())
-
- cdef void init(self, const shared_ptr[CDataset]& sp):
- Dataset.init(self, sp)
- self.filesystem_dataset = <CFileSystemDataset*> sp.get()
-
- def __reduce__(self):
- return FileSystemDataset, (
- list(self.get_fragments()),
- self.schema,
- self.format,
- self.filesystem,
- self.partition_expression
- )
-
- @classmethod
- def from_paths(cls, paths, schema=None, format=None,
- filesystem=None, partitions=None, root_partition=None):
- """A Dataset created from a list of paths on a particular filesystem.
-
- Parameters
- ----------
- paths : list of str
- List of file paths to create the fragments from.
- schema : Schema
- The top-level schema of the Dataset.
- format : FileFormat
- File format to create fragments from, currently only
- ParquetFileFormat, IpcFileFormat, and CsvFileFormat are supported.
- filesystem : FileSystem
- The filesystem which files are from.
- partitions : List[Expression], optional
- Attach additional partition information for the file paths.
- root_partition : Expression, optional
- The top-level partition of the Dataset.
- """
- cdef:
- FileFragment fragment
-
- if root_partition is None:
- root_partition = _true
-
- for arg, class_, name in [
- (schema, Schema, 'schema'),
- (format, FileFormat, 'format'),
- (filesystem, FileSystem, 'filesystem'),
- (root_partition, Expression, 'root_partition')
- ]:
- if not isinstance(arg, class_):
- raise TypeError(
- "Argument '{0}' has incorrect type (expected {1}, "
- "got {2})".format(name, class_.__name__, type(arg))
- )
-
- partitions = partitions or [_true] * len(paths)
-
- if len(paths) != len(partitions):
- raise ValueError(
- 'The number of files resulting from paths_or_selector '
- 'must be equal to the number of partitions.'
- )
-
- fragments = [
- format.make_fragment(path, filesystem, partitions[i])
- for i, path in enumerate(paths)
- ]
- return FileSystemDataset(fragments, schema, format,
- filesystem, root_partition)
-
- @property
- def files(self):
- """List of the files"""
- cdef vector[c_string] files = self.filesystem_dataset.files()
- return [frombytes(f) for f in files]
-
- @property
- def format(self):
- """The FileFormat of this source."""
- return FileFormat.wrap(self.filesystem_dataset.format())
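-
- # A minimal usage sketch for FileSystemDataset.from_paths (illustrative; the
- # paths and schema below are hypothetical and must match the actual files):
- import pyarrow as pa
- import pyarrow.dataset as ds
- from pyarrow import fs
- schema = pa.schema([("x", pa.int64())])
- dataset = ds.FileSystemDataset.from_paths(
-     ["data/part-0.parquet", "data/part-1.parquet"],
-     schema=schema, format=ds.ParquetFileFormat(),
-     filesystem=fs.LocalFileSystem())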
-
-
-cdef CExpression _bind(Expression filter, Schema schema) except *:
- assert schema is not None
-
- if filter is None:
- return _true.unwrap()
-
- return GetResultValue(filter.unwrap().Bind(
- deref(pyarrow_unwrap_schema(schema).get())))
-
-
-cdef class FileWriteOptions(_Weakrefable):
-
- cdef:
- shared_ptr[CFileWriteOptions] wrapped
- CFileWriteOptions* options
-
- def __init__(self):
- _forbid_instantiation(self.__class__)
-
- cdef void init(self, const shared_ptr[CFileWriteOptions]& sp):
- self.wrapped = sp
- self.options = sp.get()
-
- @staticmethod
- cdef wrap(const shared_ptr[CFileWriteOptions]& sp):
- type_name = frombytes(sp.get().type_name())
-
- classes = {
- 'ipc': IpcFileWriteOptions,
- 'parquet': ParquetFileWriteOptions,
- }
-
- class_ = classes.get(type_name, None)
- if class_ is None:
- raise TypeError(type_name)
-
- cdef FileWriteOptions self = class_.__new__(class_)
- self.init(sp)
- return self
-
- @property
- def format(self):
- return FileFormat.wrap(self.options.format())
-
- cdef inline shared_ptr[CFileWriteOptions] unwrap(self):
- return self.wrapped
-
-
-cdef class FileFormat(_Weakrefable):
-
- cdef:
- shared_ptr[CFileFormat] wrapped
- CFileFormat* format
-
- def __init__(self):
- _forbid_instantiation(self.__class__)
-
- cdef void init(self, const shared_ptr[CFileFormat]& sp):
- self.wrapped = sp
- self.format = sp.get()
-
- @staticmethod
- cdef wrap(const shared_ptr[CFileFormat]& sp):
- type_name = frombytes(sp.get().type_name())
-
- classes = {
- 'ipc': IpcFileFormat,
- 'csv': CsvFileFormat,
- 'parquet': ParquetFileFormat,
- }
-
- class_ = classes.get(type_name, None)
- if class_ is None:
- raise TypeError(type_name)
-
- cdef FileFormat self = class_.__new__(class_)
- self.init(sp)
- return self
-
- cdef inline shared_ptr[CFileFormat] unwrap(self):
- return self.wrapped
-
- def inspect(self, file, filesystem=None):
- """Infer the schema of a file."""
- c_source = _make_file_source(file, filesystem)
- c_schema = GetResultValue(self.format.Inspect(c_source))
- return pyarrow_wrap_schema(move(c_schema))
-
- def make_fragment(self, file, filesystem=None,
- Expression partition_expression=None):
- """
- Make a FileFragment of this FileFormat. The filter may not reference
- fields absent from the provided schema. If no schema is provided then
- one will be inferred.
- """
- if partition_expression is None:
- partition_expression = _true
-
- c_source = _make_file_source(file, filesystem)
- c_fragment = <shared_ptr[CFragment]> GetResultValue(
- self.format.MakeFragment(move(c_source),
- partition_expression.unwrap(),
- <shared_ptr[CSchema]>nullptr))
- return Fragment.wrap(move(c_fragment))
-
- def make_write_options(self):
- return FileWriteOptions.wrap(self.format.DefaultWriteOptions())
-
- @property
- def default_extname(self):
- return frombytes(self.format.type_name())
-
- @property
- def default_fragment_scan_options(self):
- return FragmentScanOptions.wrap(
- self.wrapped.get().default_fragment_scan_options)
-
- @default_fragment_scan_options.setter
- def default_fragment_scan_options(self, FragmentScanOptions options):
- if options is None:
- self.wrapped.get().default_fragment_scan_options =\
- <shared_ptr[CFragmentScanOptions]>nullptr
- else:
- self._set_default_fragment_scan_options(options)
-
- cdef _set_default_fragment_scan_options(self, FragmentScanOptions options):
- raise ValueError(f"Cannot set fragment scan options for "
- f"'{options.type_name}' on {self.__class__.__name__}")
-
- def __eq__(self, other):
- try:
- return self.equals(other)
- except TypeError:
- return False
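-
- # A minimal usage sketch for a concrete FileFormat (illustrative; the path
- # below is hypothetical):
- import pyarrow.dataset as ds
- from pyarrow import fs
- local = fs.LocalFileSystem()
- fmt = ds.ParquetFileFormat()
- schema = fmt.inspect("data/part-0.parquet", filesystem=local)
- fragment = fmt.make_fragment("data/part-0.parquet", filesystem=local)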
-
-
-cdef class Fragment(_Weakrefable):
- """Fragment of data from a Dataset."""
-
- cdef:
- shared_ptr[CFragment] wrapped
- CFragment* fragment
-
- def __init__(self):
- _forbid_instantiation(self.__class__)
-
- cdef void init(self, const shared_ptr[CFragment]& sp):
- self.wrapped = sp
- self.fragment = sp.get()
-
- @staticmethod
- cdef wrap(const shared_ptr[CFragment]& sp):
- type_name = frombytes(sp.get().type_name())
-
- classes = {
- # IpcFileFormat and CsvFileFormat do not have corresponding
- # subclasses of FileFragment
- 'ipc': FileFragment,
- 'csv': FileFragment,
- 'parquet': ParquetFileFragment,
- }
-
- class_ = classes.get(type_name, None)
- if class_ is None:
- class_ = Fragment
-
- cdef Fragment self = class_.__new__(class_)
- self.init(sp)
- return self
-
- cdef inline shared_ptr[CFragment] unwrap(self):
- return self.wrapped
-
- @property
- def physical_schema(self):
- """Return the physical schema of this Fragment. This schema can be
- different from the dataset read schema."""
- cdef:
- shared_ptr[CSchema] c_schema
-
- c_schema = GetResultValue(self.fragment.ReadPhysicalSchema())
- return pyarrow_wrap_schema(c_schema)
-
- @property
- def partition_expression(self):
- """An Expression which evaluates to true for all data viewed by this
- Fragment.
- """
- return Expression.wrap(self.fragment.partition_expression())
-
- def _scanner(self, **kwargs):
- return Scanner.from_fragment(self, **kwargs)
-
- def scan(self, Schema schema=None, **kwargs):
- """Builds a scan operation against the dataset.
-
- It produces a stream of ScanTasks, each of which is meant to be a unit
- of work to be dispatched. The tasks are not executed automatically; the
- user is responsible for executing and dispatching the individual tasks,
- so custom local task scheduling can be implemented.
-
- .. deprecated:: 4.0.0
- Use `to_batches` instead.
-
- Parameters
- ----------
- schema : Schema
- Schema to use for scanning. This is used to unify a Fragment to
- its Dataset's schema. If not specified this will use the
- Fragment's physical schema which might differ for each Fragment.
- columns : list of str, default None
- The columns to project. This can be a list of column names to
- include (order and duplicates will be preserved), or a dictionary
- with {new_column_name: expression} values for more advanced
- projections.
- The columns will be passed down to Datasets and corresponding data
- fragments to avoid loading, copying, and deserializing columns
- that will not be required further down the compute chain.
- By default all of the available columns are projected. Raises
- an exception if any of the referenced column names does not exist
- in the dataset's Schema.
- filter : Expression, default None
- Scan will return only the rows matching the filter.
- If possible the predicate will be pushed down to exploit the
- partition information or internal metadata found in the data
- source, e.g. Parquet statistics. Otherwise filters the loaded
- RecordBatches before yielding them.
- batch_size : int, default 1M
- The maximum row count for scanned record batches. If scanned
- record batches are overflowing memory then this value can be
- lowered to reduce their size.
- use_threads : bool, default True
- If enabled, maximum parallelism will be used, as determined by
- the number of available CPU cores.
- memory_pool : MemoryPool, default None
- For memory allocations, if required. If not specified, uses the
- default pool.
- fragment_scan_options : FragmentScanOptions, default None
- Options specific to a particular scan and fragment type, which
- can change between different scans of the same dataset.
-
- Returns
- -------
- scan_tasks : iterator of ScanTask
- """
- return self._scanner(schema=schema, **kwargs).scan()
-
- def to_batches(self, Schema schema=None, **kwargs):
- """Read the fragment as materialized record batches.
-
- See scan method parameters documentation.
-
- Returns
- -------
- record_batches : iterator of RecordBatch
- """
- return self._scanner(schema=schema, **kwargs).to_batches()
-
- def to_table(self, Schema schema=None, **kwargs):
- """Convert this Fragment into a Table.
-
- Use this convenience utility with care. This will serially materialize
- the Scan result in memory before creating the Table.
-
- See scan method parameters documentation.
-
- Returns
- -------
- table : Table
- """
- return self._scanner(schema=schema, **kwargs).to_table()
-
- def head(self, int num_rows, **kwargs):
- """Load the first N rows of the fragment.
-
- See scan method parameters documentation.
-
- Returns
- -------
- table : Table instance
- """
- return self._scanner(**kwargs).head(num_rows)
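-
- # A minimal usage sketch for working with fragments (illustrative; the path
- # and column name are hypothetical):
- import pyarrow.dataset as ds
- dataset = ds.dataset("path/to/dataset", format="parquet")
- fragment = next(dataset.get_fragments())
- table = fragment.to_table(schema=dataset.schema, filter=ds.field("A") > 0)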
-
-
-cdef class FileFragment(Fragment):
- """A Fragment representing a data file."""
-
- cdef:
- CFileFragment* file_fragment
-
- cdef void init(self, const shared_ptr[CFragment]& sp):
- Fragment.init(self, sp)
- self.file_fragment = <CFileFragment*> sp.get()
-
- def __reduce__(self):
- buffer = self.buffer
- return self.format.make_fragment, (
- self.path if buffer is None else buffer,
- self.filesystem,
- self.partition_expression
- )
-
- @property
- def path(self):
- """
- The path of the data file viewed by this fragment, if it views a
- file. If instead it views a buffer, this will be "<Buffer>".
- """
- return frombytes(self.file_fragment.source().path())
-
- @property
- def filesystem(self):
- """
- The FileSystem containing the data file viewed by this fragment, if
- it views a file. If instead it views a buffer, this will be None.
- """
- cdef:
- shared_ptr[CFileSystem] c_fs
- c_fs = self.file_fragment.source().filesystem()
-
- if c_fs.get() == nullptr:
- return None
-
- return FileSystem.wrap(c_fs)
-
- @property
- def buffer(self):
- """
- The buffer viewed by this fragment, if it views a buffer. If
- instead it views a file, this will be None.
- """
- cdef:
- shared_ptr[CBuffer] c_buffer
- c_buffer = self.file_fragment.source().buffer()
-
- if c_buffer.get() == nullptr:
- return None
-
- return pyarrow_wrap_buffer(c_buffer)
-
- @property
- def format(self):
- """
- The format of the data file viewed by this fragment.
- """
- return FileFormat.wrap(self.file_fragment.format())
-
-
-class RowGroupInfo:
- """A wrapper class for RowGroup information"""
-
- def __init__(self, id, metadata, schema):
- self.id = id
- self.metadata = metadata
- self.schema = schema
-
- @property
- def num_rows(self):
- return self.metadata.num_rows
-
- @property
- def total_byte_size(self):
- return self.metadata.total_byte_size
-
- @property
- def statistics(self):
- def name_stats(i):
- col = self.metadata.column(i)
-
- stats = col.statistics
- if stats is None or not stats.has_min_max:
- return None, None
-
- name = col.path_in_schema
- field_index = self.schema.get_field_index(name)
- if field_index < 0:
- return None, None
-
- typ = self.schema.field(field_index).type
- return col.path_in_schema, {
- 'min': pa.scalar(stats.min, type=typ).as_py(),
- 'max': pa.scalar(stats.max, type=typ).as_py()
- }
-
- return {
- name: stats for name, stats
- in map(name_stats, range(self.metadata.num_columns))
- if stats is not None
- }
-
- def __repr__(self):
- return "RowGroupInfo({})".format(self.id)
-
- def __eq__(self, other):
- if isinstance(other, int):
- return self.id == other
- if not isinstance(other, RowGroupInfo):
- return False
- return self.id == other.id
-
-
-cdef class FragmentScanOptions(_Weakrefable):
- """Scan options specific to a particular fragment and scan operation."""
-
- cdef:
- shared_ptr[CFragmentScanOptions] wrapped
-
- def __init__(self):
- _forbid_instantiation(self.__class__)
-
- cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp):
- self.wrapped = sp
-
- @staticmethod
- cdef wrap(const shared_ptr[CFragmentScanOptions]& sp):
- if not sp:
- return None
-
- type_name = frombytes(sp.get().type_name())
-
- classes = {
- 'csv': CsvFragmentScanOptions,
- 'parquet': ParquetFragmentScanOptions,
- }
-
- class_ = classes.get(type_name, None)
- if class_ is None:
- raise TypeError(type_name)
-
- cdef FragmentScanOptions self = class_.__new__(class_)
- self.init(sp)
- return self
-
- @property
- def type_name(self):
- return frombytes(self.wrapped.get().type_name())
-
- def __eq__(self, other):
- try:
- return self.equals(other)
- except TypeError:
- return False
-
-
-cdef class ParquetFileFragment(FileFragment):
- """A Fragment representing a parquet file."""
-
- cdef:
- CParquetFileFragment* parquet_file_fragment
-
- cdef void init(self, const shared_ptr[CFragment]& sp):
- FileFragment.init(self, sp)
- self.parquet_file_fragment = <CParquetFileFragment*> sp.get()
-
- def __reduce__(self):
- buffer = self.buffer
- row_groups = [row_group.id for row_group in self.row_groups]
- return self.format.make_fragment, (
- self.path if buffer is None else buffer,
- self.filesystem,
- self.partition_expression,
- row_groups
- )
-
- def ensure_complete_metadata(self):
- """
- Ensure that all metadata (statistics, physical schema, ...) have
- been read and cached in this fragment.
- """
- check_status(self.parquet_file_fragment.EnsureCompleteMetadata())
-
- @property
- def row_groups(self):
- metadata = self.metadata
- cdef vector[int] row_groups = self.parquet_file_fragment.row_groups()
- return [RowGroupInfo(i, metadata.row_group(i), self.physical_schema)
- for i in row_groups]
-
- @property
- def metadata(self):
- self.ensure_complete_metadata()
- cdef FileMetaData metadata = FileMetaData()
- metadata.init(self.parquet_file_fragment.metadata())
- return metadata
-
- @property
- def num_row_groups(self):
- """
- Return the number of row groups viewed by this fragment (not the
- number of row groups in the original file).
- """
- self.ensure_complete_metadata()
- return self.parquet_file_fragment.row_groups().size()
-
- def split_by_row_group(self, Expression filter=None,
- Schema schema=None):
- """
- Split the fragment into multiple fragments.
-
- Yield a Fragment wrapping each row group in this ParquetFileFragment.
- Row groups whose metadata contradicts the optional filter will be
- excluded.
-
- Parameters
- ----------
- filter : Expression, default None
- Only include the row groups which satisfy this predicate (using
- the Parquet RowGroup statistics).
- schema : Schema, default None
- Schema to use when filtering row groups. Defaults to the
- Fragment's physical schema
-
- Returns
- -------
- A list of Fragments
- """
- cdef:
- vector[shared_ptr[CFragment]] c_fragments
- CExpression c_filter
- shared_ptr[CFragment] c_fragment
-
- schema = schema or self.physical_schema
- c_filter = _bind(filter, schema)
- with nogil:
- c_fragments = move(GetResultValue(
- self.parquet_file_fragment.SplitByRowGroup(move(c_filter))))
-
- return [Fragment.wrap(c_fragment) for c_fragment in c_fragments]
-
- def subset(self, Expression filter=None, Schema schema=None,
- object row_group_ids=None):
- """
- Create a subset of the fragment (viewing a subset of the row groups).
-
- Subset can be specified by either a filter predicate (with optional
- schema) or by a list of row group IDs. Note that when using a filter,
- the resulting fragment can be empty (viewing no row groups).
-
- Parameters
- ----------
- filter : Expression, default None
- Only include the row groups which satisfy this predicate (using
- the Parquet RowGroup statistics).
- schema : Schema, default None
- Schema to use when filtering row groups. Defaults to the
- Fragment's physical schema
- row_group_ids : list of ints
- The row group IDs to include in the subset. Can only be specified
- if `filter` is None.
-
- Returns
- -------
- ParquetFileFragment
- """
- cdef:
- CExpression c_filter
- vector[int] c_row_group_ids
- shared_ptr[CFragment] c_fragment
-
- if filter is not None and row_group_ids is not None:
- raise ValueError(
- "Cannot specify both 'filter' and 'row_group_ids'."
- )
-
- if filter is not None:
- schema = schema or self.physical_schema
- c_filter = _bind(filter, schema)
- with nogil:
- c_fragment = move(GetResultValue(
- self.parquet_file_fragment.SubsetWithFilter(
- move(c_filter))))
- elif row_group_ids is not None:
- c_row_group_ids = [
- <int> row_group for row_group in sorted(set(row_group_ids))
- ]
- with nogil:
- c_fragment = move(GetResultValue(
- self.parquet_file_fragment.SubsetWithIds(
- move(c_row_group_ids))))
- else:
- raise ValueError(
- "Need to specify one of 'filter' or 'row_group_ids'"
- )
-
- return Fragment.wrap(c_fragment)
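-
- # A minimal usage sketch for ParquetFileFragment (illustrative; the path and
- # column name are hypothetical):
- import pyarrow.dataset as ds
- from pyarrow import fs
- fmt = ds.ParquetFileFormat()
- frag = fmt.make_fragment("data/part-0.parquet", filesystem=fs.LocalFileSystem())
- frag.ensure_complete_metadata()
- per_row_group = frag.split_by_row_group(filter=ds.field("x") > 0)
- first_only = frag.subset(row_group_ids=[0])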
-
-
-cdef class ParquetReadOptions(_Weakrefable):
- """
- Parquet format specific options for reading.
-
- Parameters
- ----------
- dictionary_columns : list of string, default None
- Names of columns which should be dictionary encoded as
- they are read.
- """
-
- cdef public:
- set dictionary_columns
-
- # Also see _PARQUET_READ_OPTIONS
- def __init__(self, dictionary_columns=None):
- self.dictionary_columns = set(dictionary_columns or set())
-
- def equals(self, ParquetReadOptions other):
- return self.dictionary_columns == other.dictionary_columns
-
- def __eq__(self, other):
- try:
- return self.equals(other)
- except TypeError:
- return False
-
- def __repr__(self):
- return (f"<ParquetReadOptions"
- f" dictionary_columns={self.dictionary_columns}>")
-
-
-cdef class ParquetFileWriteOptions(FileWriteOptions):
-
- cdef:
- CParquetFileWriteOptions* parquet_options
- object _properties
-
- def update(self, **kwargs):
- arrow_fields = {
- "use_deprecated_int96_timestamps",
- "coerce_timestamps",
- "allow_truncated_timestamps",
- }
-
- setters = set()
- for name, value in kwargs.items():
- if name not in self._properties:
- raise TypeError("unexpected parquet write option: " + name)
- self._properties[name] = value
- if name in arrow_fields:
- setters.add(self._set_arrow_properties)
- else:
- setters.add(self._set_properties)
-
- for setter in setters:
- setter()
-
- def _set_properties(self):
- cdef CParquetFileWriteOptions* opts = self.parquet_options
-
- opts.writer_properties = _create_writer_properties(
- use_dictionary=self._properties["use_dictionary"],
- compression=self._properties["compression"],
- version=self._properties["version"],
- write_statistics=self._properties["write_statistics"],
- data_page_size=self._properties["data_page_size"],
- compression_level=self._properties["compression_level"],
- use_byte_stream_split=(
- self._properties["use_byte_stream_split"]
- ),
- data_page_version=self._properties["data_page_version"],
- )
-
- def _set_arrow_properties(self):
- cdef CParquetFileWriteOptions* opts = self.parquet_options
-
- opts.arrow_writer_properties = _create_arrow_writer_properties(
- use_deprecated_int96_timestamps=(
- self._properties["use_deprecated_int96_timestamps"]
- ),
- coerce_timestamps=self._properties["coerce_timestamps"],
- allow_truncated_timestamps=(
- self._properties["allow_truncated_timestamps"]
- ),
- writer_engine_version="V2",
- use_compliant_nested_type=(
- self._properties["use_compliant_nested_type"]
- )
- )
-
- cdef void init(self, const shared_ptr[CFileWriteOptions]& sp):
- FileWriteOptions.init(self, sp)
- self.parquet_options = <CParquetFileWriteOptions*> sp.get()
- self._properties = dict(
- use_dictionary=True,
- compression="snappy",
- version="1.0",
- write_statistics=None,
- data_page_size=None,
- compression_level=None,
- use_byte_stream_split=False,
- data_page_version="1.0",
- use_deprecated_int96_timestamps=False,
- coerce_timestamps=None,
- allow_truncated_timestamps=False,
- use_compliant_nested_type=False,
- )
- self._set_properties()
- self._set_arrow_properties()
-
-
-cdef set _PARQUET_READ_OPTIONS = {'dictionary_columns'}
-
-
-cdef class ParquetFileFormat(FileFormat):
-
- cdef:
- CParquetFileFormat* parquet_format
-
- def __init__(self, read_options=None,
- default_fragment_scan_options=None, **kwargs):
- cdef:
- shared_ptr[CParquetFileFormat] wrapped
- CParquetFileFormatReaderOptions* options
-
- # Read/scan options
- read_options_args = {option: kwargs[option] for option in kwargs
- if option in _PARQUET_READ_OPTIONS}
- scan_args = {option: kwargs[option] for option in kwargs
- if option not in _PARQUET_READ_OPTIONS}
- if read_options and read_options_args:
- duplicates = ', '.join(sorted(read_options_args))
- raise ValueError(f'If `read_options` is given, '
- f'cannot specify {duplicates}')
- if default_fragment_scan_options and scan_args:
- duplicates = ', '.join(sorted(scan_args))
- raise ValueError(f'If `default_fragment_scan_options` is given, '
- f'cannot specify {duplicates}')
-
- if read_options is None:
- read_options = ParquetReadOptions(**read_options_args)
- elif isinstance(read_options, dict):
- # For backwards compatibility
- duplicates = []
- for option, value in read_options.items():
- if option in _PARQUET_READ_OPTIONS:
- read_options_args[option] = value
- else:
- duplicates.append(option)
- scan_args[option] = value
- if duplicates:
- duplicates = ", ".join(duplicates)
- warnings.warn(f'The scan options {duplicates} should be '
- 'specified directly as keyword arguments')
- read_options = ParquetReadOptions(**read_options_args)
- elif not isinstance(read_options, ParquetReadOptions):
- raise TypeError('`read_options` must be either a dictionary or an '
- 'instance of ParquetReadOptions')
-
- if default_fragment_scan_options is None:
- default_fragment_scan_options = ParquetFragmentScanOptions(
- **scan_args)
- elif isinstance(default_fragment_scan_options, dict):
- default_fragment_scan_options = ParquetFragmentScanOptions(
- **default_fragment_scan_options)
- elif not isinstance(default_fragment_scan_options,
- ParquetFragmentScanOptions):
- raise TypeError('`default_fragment_scan_options` must be either a '
- 'dictionary or an instance of '
- 'ParquetFragmentScanOptions')
-
- wrapped = make_shared[CParquetFileFormat]()
- options = &(wrapped.get().reader_options)
- if read_options.dictionary_columns is not None:
- for column in read_options.dictionary_columns:
- options.dict_columns.insert(tobytes(column))
-
- self.init(<shared_ptr[CFileFormat]> wrapped)
- self.default_fragment_scan_options = default_fragment_scan_options
-
- cdef void init(self, const shared_ptr[CFileFormat]& sp):
- FileFormat.init(self, sp)
- self.parquet_format = <CParquetFileFormat*> sp.get()
-
- @property
- def read_options(self):
- cdef CParquetFileFormatReaderOptions* options
- options = &self.parquet_format.reader_options
- return ParquetReadOptions(
- dictionary_columns={frombytes(col)
- for col in options.dict_columns},
- )
-
- def make_write_options(self, **kwargs):
- opts = FileFormat.make_write_options(self)
- (<ParquetFileWriteOptions> opts).update(**kwargs)
- return opts
-
- cdef _set_default_fragment_scan_options(self, FragmentScanOptions options):
- if options.type_name == 'parquet':
- self.parquet_format.default_fragment_scan_options = options.wrapped
- else:
- super()._set_default_fragment_scan_options(options)
-
- def equals(self, ParquetFileFormat other):
- return (
- self.read_options.equals(other.read_options) and
- self.default_fragment_scan_options ==
- other.default_fragment_scan_options
- )
-
- def __reduce__(self):
- return ParquetFileFormat, (self.read_options,
- self.default_fragment_scan_options)
-
- def __repr__(self):
- return f"<ParquetFileFormat read_options={self.read_options}>"
-
- def make_fragment(self, file, filesystem=None,
- Expression partition_expression=None, row_groups=None):
- cdef:
- vector[int] c_row_groups
-
- if partition_expression is None:
- partition_expression = _true
-
- if row_groups is None:
- return super().make_fragment(file, filesystem,
- partition_expression)
-
- c_source = _make_file_source(file, filesystem)
- c_row_groups = [<int> row_group for row_group in set(row_groups)]
-
- c_fragment = <shared_ptr[CFragment]> GetResultValue(
- self.parquet_format.MakeFragment(move(c_source),
- partition_expression.unwrap(),
- <shared_ptr[CSchema]>nullptr,
- move(c_row_groups)))
- return Fragment.wrap(move(c_fragment))
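-
- # A minimal usage sketch for ParquetFileFormat options (illustrative; the
- # column name is hypothetical):
- import pyarrow.dataset as ds
- fmt = ds.ParquetFileFormat(
-     read_options=ds.ParquetReadOptions(dictionary_columns={"category"}))
- write_options = fmt.make_write_options(compression="snappy")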
-
-
-cdef class ParquetFragmentScanOptions(FragmentScanOptions):
- """Scan-specific options for Parquet fragments.
-
- Parameters
- ----------
- use_buffered_stream : bool, default False
- Read files through buffered input streams rather than loading entire
- row groups at once. This may be enabled to reduce memory overhead.
- Disabled by default.
- buffer_size : int, default 8192
- Size of buffered stream, if enabled. Default is 8KB.
- pre_buffer : bool, default False
- If enabled, pre-buffer the raw Parquet data instead of issuing one
- read per column chunk. This can improve performance on high-latency
- filesystems.
- enable_parallel_column_conversion : bool, default False
- EXPERIMENTAL: Parallelize conversion across columns. This option is
- ignored if a scan is already parallelized across input files to avoid
- thread contention. This option will be removed after support is added
- for simultaneous parallelization across files and columns.
- """
-
- cdef:
- CParquetFragmentScanOptions* parquet_options
-
- # Avoid mistakenly creating attributes
- __slots__ = ()
-
- def __init__(self, bint use_buffered_stream=False,
- buffer_size=8192,
- bint pre_buffer=False,
- bint enable_parallel_column_conversion=False):
- self.init(shared_ptr[CFragmentScanOptions](
- new CParquetFragmentScanOptions()))
- self.use_buffered_stream = use_buffered_stream
- self.buffer_size = buffer_size
- self.pre_buffer = pre_buffer
- self.enable_parallel_column_conversion = \
- enable_parallel_column_conversion
-
- cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp):
- FragmentScanOptions.init(self, sp)
- self.parquet_options = <CParquetFragmentScanOptions*> sp.get()
-
- cdef CReaderProperties* reader_properties(self):
- return self.parquet_options.reader_properties.get()
-
- cdef ArrowReaderProperties* arrow_reader_properties(self):
- return self.parquet_options.arrow_reader_properties.get()
-
- @property
- def use_buffered_stream(self):
- return self.reader_properties().is_buffered_stream_enabled()
-
- @use_buffered_stream.setter
- def use_buffered_stream(self, bint use_buffered_stream):
- if use_buffered_stream:
- self.reader_properties().enable_buffered_stream()
- else:
- self.reader_properties().disable_buffered_stream()
-
- @property
- def buffer_size(self):
- return self.reader_properties().buffer_size()
-
- @buffer_size.setter
- def buffer_size(self, buffer_size):
- if buffer_size <= 0:
- raise ValueError("Buffer size must be larger than zero")
- self.reader_properties().set_buffer_size(buffer_size)
-
- @property
- def pre_buffer(self):
- return self.arrow_reader_properties().pre_buffer()
-
- @pre_buffer.setter
- def pre_buffer(self, bint pre_buffer):
- self.arrow_reader_properties().set_pre_buffer(pre_buffer)
-
- @property
- def enable_parallel_column_conversion(self):
- return self.parquet_options.enable_parallel_column_conversion
-
- @enable_parallel_column_conversion.setter
- def enable_parallel_column_conversion(
- self, bint enable_parallel_column_conversion):
- self.parquet_options.enable_parallel_column_conversion = \
- enable_parallel_column_conversion
-
- def equals(self, ParquetFragmentScanOptions other):
- return (
- self.use_buffered_stream == other.use_buffered_stream and
- self.buffer_size == other.buffer_size and
- self.pre_buffer == other.pre_buffer and
- self.enable_parallel_column_conversion ==
- other.enable_parallel_column_conversion)
-
- def __reduce__(self):
- return ParquetFragmentScanOptions, (
- self.use_buffered_stream, self.buffer_size, self.pre_buffer,
- self.enable_parallel_column_conversion)
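-
- # A minimal usage sketch for ParquetFragmentScanOptions (illustrative; the
- # dataset path is hypothetical):
- import pyarrow.dataset as ds
- scan_opts = ds.ParquetFragmentScanOptions(
-     use_buffered_stream=True, buffer_size=64 * 1024, pre_buffer=True)
- fmt = ds.ParquetFileFormat(default_fragment_scan_options=scan_opts)
- dataset = ds.dataset("path/to/dataset", format=fmt)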
-
-
-cdef class IpcFileWriteOptions(FileWriteOptions):
-
- def __init__(self):
- _forbid_instantiation(self.__class__)
-
-
-cdef class IpcFileFormat(FileFormat):
-
- def __init__(self):
- self.init(shared_ptr[CFileFormat](new CIpcFileFormat()))
-
- def equals(self, IpcFileFormat other):
- return True
-
- @property
- def default_extname(self):
- return "feather"
-
- def __reduce__(self):
- return IpcFileFormat, tuple()
-
-
-cdef class CsvFileFormat(FileFormat):
- cdef:
- CCsvFileFormat* csv_format
-
- # Avoid mistakenly creating attributes
- __slots__ = ()
-
- def __init__(self, ParseOptions parse_options=None,
- default_fragment_scan_options=None,
- ConvertOptions convert_options=None,
- ReadOptions read_options=None):
- self.init(shared_ptr[CFileFormat](new CCsvFileFormat()))
- if parse_options is not None:
- self.parse_options = parse_options
- if convert_options is not None or read_options is not None:
- if default_fragment_scan_options:
- raise ValueError('If `default_fragment_scan_options` is '
- 'given, cannot specify convert_options '
- 'or read_options')
- self.default_fragment_scan_options = CsvFragmentScanOptions(
- convert_options=convert_options, read_options=read_options)
- elif isinstance(default_fragment_scan_options, dict):
- self.default_fragment_scan_options = CsvFragmentScanOptions(
- **default_fragment_scan_options)
- elif isinstance(default_fragment_scan_options, CsvFragmentScanOptions):
- self.default_fragment_scan_options = default_fragment_scan_options
- elif default_fragment_scan_options is not None:
- raise TypeError('`default_fragment_scan_options` must be either '
- 'a dictionary or an instance of '
- 'CsvFragmentScanOptions')
-
- cdef void init(self, const shared_ptr[CFileFormat]& sp):
- FileFormat.init(self, sp)
- self.csv_format = <CCsvFileFormat*> sp.get()
-
- def make_write_options(self):
- raise NotImplementedError("writing CSV datasets")
-
- @property
- def parse_options(self):
- return ParseOptions.wrap(self.csv_format.parse_options)
-
- @parse_options.setter
- def parse_options(self, ParseOptions parse_options not None):
- self.csv_format.parse_options = parse_options.options
-
- cdef _set_default_fragment_scan_options(self, FragmentScanOptions options):
- if options.type_name == 'csv':
- self.csv_format.default_fragment_scan_options = options.wrapped
- else:
- super()._set_default_fragment_scan_options(options)
-
- def equals(self, CsvFileFormat other):
- return (
- self.parse_options.equals(other.parse_options) and
- self.default_fragment_scan_options ==
- other.default_fragment_scan_options)
-
- def __reduce__(self):
- return CsvFileFormat, (self.parse_options,
- self.default_fragment_scan_options)
-
- def __repr__(self):
- return f"<CsvFileFormat parse_options={self.parse_options}>"
-
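The constructor above accepts either explicit convert/read options or a pre-built CsvFragmentScanOptions, but not both. A minimal sketch with parse options only, assuming semicolon-delimited files under a hypothetical path:

    import pyarrow.dataset as ds
    from pyarrow import csv

    # Parse options live on the format itself (see the parse_options property).
    fmt = ds.CsvFileFormat(parse_options=csv.ParseOptions(delimiter=";"))
    dataset = ds.dataset("logs/", format=fmt)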
-
-cdef class CsvFragmentScanOptions(FragmentScanOptions):
- """Scan-specific options for CSV fragments."""
-
- cdef:
- CCsvFragmentScanOptions* csv_options
-
- # Avoid mistakenly creating attributes
- __slots__ = ()
-
- def __init__(self, ConvertOptions convert_options=None,
- ReadOptions read_options=None):
- self.init(shared_ptr[CFragmentScanOptions](
- new CCsvFragmentScanOptions()))
- if convert_options is not None:
- self.convert_options = convert_options
- if read_options is not None:
- self.read_options = read_options
-
- cdef void init(self, const shared_ptr[CFragmentScanOptions]& sp):
- FragmentScanOptions.init(self, sp)
- self.csv_options = <CCsvFragmentScanOptions*> sp.get()
-
- @property
- def convert_options(self):
- return ConvertOptions.wrap(self.csv_options.convert_options)
-
- @convert_options.setter
- def convert_options(self, ConvertOptions convert_options not None):
- self.csv_options.convert_options = convert_options.options
-
- @property
- def read_options(self):
- return ReadOptions.wrap(self.csv_options.read_options)
-
- @read_options.setter
- def read_options(self, ReadOptions read_options not None):
- self.csv_options.read_options = read_options.options
-
- def equals(self, CsvFragmentScanOptions other):
- return (
- other and
- self.convert_options.equals(other.convert_options) and
- self.read_options.equals(other.read_options))
-
- def __reduce__(self):
- return CsvFragmentScanOptions, (self.convert_options,
- self.read_options)
-
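Convert and read options, by contrast, live on CsvFragmentScanOptions and can be installed as the format's default fragment scan options. A minimal sketch, assuming the class is re-exported from pyarrow.dataset; the column name, block size, and path are hypothetical:

    import pyarrow as pa
    import pyarrow.dataset as ds
    from pyarrow import csv

    scan_opts = ds.CsvFragmentScanOptions(
        convert_options=csv.ConvertOptions(column_types={"id": pa.int64()}),
        read_options=csv.ReadOptions(block_size=1 << 20))
    fmt = ds.CsvFileFormat(default_fragment_scan_options=scan_opts)
    dataset = ds.dataset("logs/", format=fmt)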
-
-cdef class Partitioning(_Weakrefable):
-
- cdef:
- shared_ptr[CPartitioning] wrapped
- CPartitioning* partitioning
-
- def __init__(self):
- _forbid_instantiation(self.__class__)
-
- cdef init(self, const shared_ptr[CPartitioning]& sp):
- self.wrapped = sp
- self.partitioning = sp.get()
-
- @staticmethod
- cdef wrap(const shared_ptr[CPartitioning]& sp):
- type_name = frombytes(sp.get().type_name())
-
- classes = {
- 'schema': DirectoryPartitioning,
- 'hive': HivePartitioning,
- }
-
- class_ = classes.get(type_name, None)
- if class_ is None:
- raise TypeError(type_name)
-
- cdef Partitioning self = class_.__new__(class_)
- self.init(sp)
- return self
-
- cdef inline shared_ptr[CPartitioning] unwrap(self):
- return self.wrapped
-
- def parse(self, path):
- cdef CResult[CExpression] result
- result = self.partitioning.Parse(tobytes(path))
- return Expression.wrap(GetResultValue(result))
-
- @property
- def schema(self):
- """The arrow Schema attached to the partitioning."""
- return pyarrow_wrap_schema(self.partitioning.schema())
-
-
-cdef class PartitioningFactory(_Weakrefable):
-
- cdef:
- shared_ptr[CPartitioningFactory] wrapped
- CPartitioningFactory* factory
-
- def __init__(self):
- _forbid_instantiation(self.__class__)
-
- cdef init(self, const shared_ptr[CPartitioningFactory]& sp):
- self.wrapped = sp
- self.factory = sp.get()
-
- @staticmethod
- cdef wrap(const shared_ptr[CPartitioningFactory]& sp):
- cdef PartitioningFactory self = PartitioningFactory.__new__(
- PartitioningFactory
- )
- self.init(sp)
- return self
-
- cdef inline shared_ptr[CPartitioningFactory] unwrap(self):
- return self.wrapped
-
-
-cdef vector[shared_ptr[CArray]] _partitioning_dictionaries(
- Schema schema, dictionaries) except *:
- cdef:
- vector[shared_ptr[CArray]] c_dictionaries
-
- dictionaries = dictionaries or {}
-
- for field in schema:
- dictionary = dictionaries.get(field.name)
-
- if (isinstance(field.type, pa.DictionaryType) and
- dictionary is not None):
- c_dictionaries.push_back(pyarrow_unwrap_array(dictionary))
- else:
- c_dictionaries.push_back(<shared_ptr[CArray]> nullptr)
-
- return c_dictionaries
-
-
-cdef class DirectoryPartitioning(Partitioning):
- """
- A Partitioning based on a specified Schema.
-
- The DirectoryPartitioning expects one segment in the file path for each
- field in the schema (all fields are required to be present).
- For example, given schema<year:int16, month:int8>, the path "/2009/11" would
- be parsed to ("year"_ == 2009 and "month"_ == 11).
-
- Parameters
- ----------
- schema : Schema
- The schema that describes the partitions present in the file path.
- dictionaries : Dict[str, Array]
- If the type of any field of `schema` is a dictionary type, the
- corresponding entry of `dictionaries` must be an array containing
- every value which may be taken by the corresponding column or an
- error will be raised in parsing.
-
- Returns
- -------
- DirectoryPartitioning
-
- Examples
- --------
- >>> from pyarrow.dataset import DirectoryPartitioning
- >>> partitioning = DirectoryPartitioning(
- ... pa.schema([("year", pa.int16()), ("month", pa.int8())]))
- >>> print(partitioning.parse("/2009/11"))
- ((year == 2009:int16) and (month == 11:int8))
- """
-
- cdef:
- CDirectoryPartitioning* directory_partitioning
-
- def __init__(self, Schema schema not None, dictionaries=None):
- cdef:
- shared_ptr[CDirectoryPartitioning] c_partitioning
-
- c_partitioning = make_shared[CDirectoryPartitioning](
- pyarrow_unwrap_schema(schema),
- _partitioning_dictionaries(schema, dictionaries)
- )
- self.init(<shared_ptr[CPartitioning]> c_partitioning)
-
- cdef init(self, const shared_ptr[CPartitioning]& sp):
- Partitioning.init(self, sp)
- self.directory_partitioning = <CDirectoryPartitioning*> sp.get()
-
- @staticmethod
- def discover(field_names=None, infer_dictionary=False,
- max_partition_dictionary_size=0,
- schema=None):
- """
- Discover a DirectoryPartitioning.
-
- Parameters
- ----------
- field_names : list of str
- The names to associate with the values from the subdirectory names.
- If schema is given, will be populated from the schema.
- infer_dictionary : bool, default False
- When inferring a schema for partition fields, yield dictionary
- encoded types instead of plain types. This can be more efficient
- when materializing virtual columns, and Expressions parsed by the
- finished Partitioning will include dictionaries of all unique
- inspected values for each field.
- max_partition_dictionary_size : int, default 0
- Synonymous with infer_dictionary for backwards compatibility with
- 1.0: setting this to -1 or None is equivalent to passing
- infer_dictionary=True.
- schema : Schema, default None
- Use this schema instead of inferring a schema from partition
- values. Partition values will be validated against this schema
- before accumulation into the Partitioning's dictionary.
-
- Returns
- -------
- PartitioningFactory
- To be used in the FileSystemFactoryOptions.
- """
- cdef:
- CPartitioningFactoryOptions c_options
- vector[c_string] c_field_names
-
- if max_partition_dictionary_size in {-1, None}:
- infer_dictionary = True
- elif max_partition_dictionary_size != 0:
- raise NotImplemented("max_partition_dictionary_size must be "
- "0, -1, or None")
-
- if infer_dictionary:
- c_options.infer_dictionary = True
-
- if schema:
- c_options.schema = pyarrow_unwrap_schema(schema)
- c_field_names = [tobytes(f.name) for f in schema]
- elif not field_names:
- raise ValueError(
- "Neither field_names nor schema was passed; "
- "cannot infer field_names")
- else:
- c_field_names = [tobytes(s) for s in field_names]
- return PartitioningFactory.wrap(
- CDirectoryPartitioning.MakeFactory(c_field_names, c_options))
-
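The constructor and discover() above cover the two ways of obtaining a directory partitioning: an explicit schema, or inference from the observed path segments. A minimal sketch mirroring the docstring example; the data path is hypothetical:

    import pyarrow as pa
    import pyarrow.dataset as ds

    # Explicit schema: a path like /2009/11/... yields year=2009, month=11.
    part = ds.DirectoryPartitioning(
        pa.schema([("year", pa.int16()), ("month", pa.int8())]))

    # Or let the factory infer the field types from the directory names.
    factory = ds.DirectoryPartitioning.discover(["year", "month"])
    dataset = ds.dataset("sales/", format="parquet", partitioning=factory)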
-
-cdef class HivePartitioning(Partitioning):
- """
- A Partitioning for "/$key=$value/" nested directories as found in
- Apache Hive.
-
- Multi-level, directory-based partitioning scheme originating from
- Apache Hive with all data files stored in the leaf directories. Data is
- partitioned by static values of a particular column in the schema.
- Partition keys are represented in the form $key=$value in directory names.
- Field order is ignored, as are missing or unrecognized field names.
-
- For example, given schema<year:int16, month:int8, day:int8>, a possible
- path would be "/year=2009/month=11/day=15".
-
- Parameters
- ----------
- schema : Schema
- The schema that describes the partitions present in the file path.
- dictionaries : Dict[str, Array]
- If the type of any field of `schema` is a dictionary type, the
- corresponding entry of `dictionaries` must be an array containing
- every value which may be taken by the corresponding column or an
- error will be raised in parsing.
- null_fallback : str, default "__HIVE_DEFAULT_PARTITION__"
- If any field is None, this fallback will be used as a label.
-
- Returns
- -------
- HivePartitioning
-
- Examples
- --------
- >>> from pyarrow.dataset import HivePartitioning
- >>> partitioning = HivePartitioning(
- ... pa.schema([("year", pa.int16()), ("month", pa.int8())]))
- >>> print(partitioning.parse("/year=2009/month=11"))
- ((year == 2009:int16) and (month == 11:int8))
-
- """
-
- cdef:
- CHivePartitioning* hive_partitioning
-
- def __init__(self,
- Schema schema not None,
- dictionaries=None,
- null_fallback="__HIVE_DEFAULT_PARTITION__"):
-
- cdef:
- shared_ptr[CHivePartitioning] c_partitioning
- c_string c_null_fallback = tobytes(null_fallback)
-
- c_partitioning = make_shared[CHivePartitioning](
- pyarrow_unwrap_schema(schema),
- _partitioning_dictionaries(schema, dictionaries),
- c_null_fallback
- )
- self.init(<shared_ptr[CPartitioning]> c_partitioning)
-
- cdef init(self, const shared_ptr[CPartitioning]& sp):
- Partitioning.init(self, sp)
- self.hive_partitioning = <CHivePartitioning*> sp.get()
-
- @staticmethod
- def discover(infer_dictionary=False,
- max_partition_dictionary_size=0,
- null_fallback="__HIVE_DEFAULT_PARTITION__",
- schema=None):
- """
- Discover a HivePartitioning.
-
- Parameters
- ----------
- infer_dictionary : bool, default False
- When inferring a schema for partition fields, yield dictionary
- encoded types instead of plain. This can be more efficient when
- materializing virtual columns, and Expressions parsed by the
- finished Partitioning will include dictionaries of all unique
- inspected values for each field.
- max_partition_dictionary_size : int, default 0
- Synonymous with infer_dictionary for backwards compatibility with
- 1.0: setting this to -1 or None is equivalent to passing
- infer_dictionary=True.
- null_fallback : str, default "__HIVE_DEFAULT_PARTITION__"
- When inferring a schema for partition fields, this value will be
- replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__
- for compatibility with Spark.
- schema : Schema, default None
- Use this schema instead of inferring a schema from partition
- values. Partition values will be validated against this schema
- before accumulation into the Partitioning's dictionary.
-
- Returns
- -------
- PartitioningFactory
- To be used in the FileSystemFactoryOptions.
- """
- cdef:
- CHivePartitioningFactoryOptions c_options
-
- if max_partition_dictionary_size in {-1, None}:
- infer_dictionary = True
- elif max_partition_dictionary_size != 0:
- raise NotImplemented("max_partition_dictionary_size must be "
- "0, -1, or None")
-
- if infer_dictionary:
- c_options.infer_dictionary = True
-
- c_options.null_fallback = tobytes(null_fallback)
-
- if schema:
- c_options.schema = pyarrow_unwrap_schema(schema)
-
- return PartitioningFactory.wrap(
- CHivePartitioning.MakeFactory(c_options))
-
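Because the key names are embedded in the directory names, HivePartitioning.discover() needs no field list. A minimal sketch; the path is hypothetical and the defaults documented above apply:

    import pyarrow.dataset as ds

    # Directories like /year=2009/month=11/...; field types are inferred and
    # the __HIVE_DEFAULT_PARTITION__ label is read back as null.
    dataset = ds.dataset(
        "warehouse/", format="parquet",
        partitioning=ds.HivePartitioning.discover(infer_dictionary=True))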
-
-cdef class DatasetFactory(_Weakrefable):
- """
- DatasetFactory is used to create a Dataset, inspect the Schema
- of the fragments contained in it, and declare a partitioning.
- """
-
- cdef:
- shared_ptr[CDatasetFactory] wrapped
- CDatasetFactory* factory
-
- def __init__(self, list children):
- _forbid_instantiation(self.__class__)
-
- cdef init(self, const shared_ptr[CDatasetFactory]& sp):
- self.wrapped = sp
- self.factory = sp.get()
-
- @staticmethod
- cdef wrap(const shared_ptr[CDatasetFactory]& sp):
- cdef DatasetFactory self = \
- DatasetFactory.__new__(DatasetFactory)
- self.init(sp)
- return self
-
- cdef inline shared_ptr[CDatasetFactory] unwrap(self) nogil:
- return self.wrapped
-
- @property
- def root_partition(self):
- return Expression.wrap(self.factory.root_partition())
-
- @root_partition.setter
- def root_partition(self, Expression expr):
- check_status(self.factory.SetRootPartition(expr.unwrap()))
-
- def inspect_schemas(self):
- cdef CResult[vector[shared_ptr[CSchema]]] result
- cdef CInspectOptions options
- with nogil:
- result = self.factory.InspectSchemas(options)
-
- schemas = []
- for s in GetResultValue(result):
- schemas.append(pyarrow_wrap_schema(s))
- return schemas
-
- def inspect(self):
- """
- Inspect all data fragments and return a common Schema.
-
- Returns
- -------
- Schema
- """
- cdef:
- CInspectOptions options
- CResult[shared_ptr[CSchema]] result
- with nogil:
- result = self.factory.Inspect(options)
- return pyarrow_wrap_schema(GetResultValue(result))
-
- def finish(self, Schema schema=None):
- """
- Create a Dataset using the inspected schema or an explicit schema
- (if given).
-
- Parameters
- ----------
- schema : Schema, default None
- The schema to conform the source to. If None, the inspected
- schema is used.
-
- Returns
- -------
- Dataset
- """
- cdef:
- shared_ptr[CSchema] sp_schema
- CResult[shared_ptr[CDataset]] result
-
- if schema is not None:
- sp_schema = pyarrow_unwrap_schema(schema)
- with nogil:
- result = self.factory.FinishWithSchema(sp_schema)
- else:
- with nogil:
- result = self.factory.Finish()
-
- return Dataset.wrap(GetResultValue(result))
-
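A factory is typically driven in two steps: inspect the fragments for a common schema, then finish() to materialize the Dataset. A minimal sketch, assuming `factory` is an already-constructed DatasetFactory such as the FileSystemDatasetFactory defined further below:

    # `factory` is assumed to be an existing DatasetFactory instance.
    schema = factory.inspect()        # unified schema across all fragments
    dataset = factory.finish(schema)  # or factory.finish() to reuse it
    table = dataset.to_table()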
-
-cdef class FileSystemFactoryOptions(_Weakrefable):
- """
- Influences the discovery of filesystem paths.
-
- Parameters
- ----------
- partition_base_dir : str, optional
- For the purposes of applying the partitioning, paths will be
- stripped of the partition_base_dir. Files not matching the
- partition_base_dir prefix will be skipped for partitioning discovery.
- The ignored files will still be part of the Dataset, but will not
- have partition information.
- partitioning : Partitioning/PartitioningFactory, optional
- Apply the Partitioning to every discovered Fragment. See Partitioning or
- PartitioningFactory documentation.
- exclude_invalid_files : bool, optional (default True)
- If True, invalid files will be excluded (file format-specific check).
- This will incur IO for each file in a serial and single-threaded
- fashion. Disabling this feature will skip the IO, but unsupported
- files may be present in the Dataset (resulting in an error at scan
- time).
- selector_ignore_prefixes : list, optional
- When discovering from a Selector (and not from an explicit file list),
- ignore files and directories matching any of these prefixes.
- By default this is ['.', '_'].
- """
-
- cdef:
- CFileSystemFactoryOptions options
-
- __slots__ = () # avoid mistakenly creating attributes
-
- def __init__(self, partition_base_dir=None, partitioning=None,
- exclude_invalid_files=None,
- list selector_ignore_prefixes=None):
- if isinstance(partitioning, PartitioningFactory):
- self.partitioning_factory = partitioning
- elif isinstance(partitioning, Partitioning):
- self.partitioning = partitioning
-
- if partition_base_dir is not None:
- self.partition_base_dir = partition_base_dir
- if exclude_invalid_files is not None:
- self.exclude_invalid_files = exclude_invalid_files
- if selector_ignore_prefixes is not None:
- self.selector_ignore_prefixes = selector_ignore_prefixes
-
- cdef inline CFileSystemFactoryOptions unwrap(self):
- return self.options
-
- @property
- def partitioning(self):
- """Partitioning to apply to discovered files.
-
- NOTE: setting this property will overwrite partitioning_factory.
- """
- c_partitioning = self.options.partitioning.partitioning()
- if c_partitioning.get() == nullptr:
- return None
- return Partitioning.wrap(c_partitioning)
-
- @partitioning.setter
- def partitioning(self, Partitioning value):
- self.options.partitioning = (<Partitioning> value).unwrap()
-
- @property
- def partitioning_factory(self):
- """PartitioningFactory to apply to discovered files and
- discover a Partitioning.
-
- NOTE: setting this property will overwrite partitioning.
- """
- c_factory = self.options.partitioning.factory()
- if c_factory.get() == nullptr:
- return None
- return PartitioningFactory.wrap(c_factory)
-
- @partitioning_factory.setter
- def partitioning_factory(self, PartitioningFactory value):
- self.options.partitioning = (<PartitioningFactory> value).unwrap()
-
- @property
- def partition_base_dir(self):
- """
- Base directory to strip paths before applying the partitioning.
- """
- return frombytes(self.options.partition_base_dir)
-
- @partition_base_dir.setter
- def partition_base_dir(self, value):
- self.options.partition_base_dir = tobytes(value)
-
- @property
- def exclude_invalid_files(self):
- """Whether to exclude invalid files."""
- return self.options.exclude_invalid_files
-
- @exclude_invalid_files.setter
- def exclude_invalid_files(self, bint value):
- self.options.exclude_invalid_files = value
-
- @property
- def selector_ignore_prefixes(self):
- """
- List of prefixes. Files matching one of those prefixes will be
- ignored by the discovery process.
- """
- return [frombytes(p) for p in self.options.selector_ignore_prefixes]
-
- @selector_ignore_prefixes.setter
- def selector_ignore_prefixes(self, values):
- self.options.selector_ignore_prefixes = [tobytes(v) for v in values]
-
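A minimal sketch of the options above, assuming a hypothetical base directory laid out with Hive-style subdirectories:

    import pyarrow.dataset as ds

    opts = ds.FileSystemFactoryOptions(
        partition_base_dir="data/",                  # stripped before partitioning
        exclude_invalid_files=False,                 # skip the per-file validity IO
        selector_ignore_prefixes=[".", "_", "tmp"])  # ignore temp/hidden entries
    opts.partitioning_factory = ds.HivePartitioning.discover()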
-
-cdef class FileSystemDatasetFactory(DatasetFactory):
- """
- Create a DatasetFactory from a list of paths with schema inspection.
-
- Parameters
- ----------
- filesystem : pyarrow.fs.FileSystem
- Filesystem to discover.
- paths_or_selector : pyarrow.fs.FileSelector or list of path-like
- Either a FileSelector object or a list of path-like objects.
- format : FileFormat
- Currently only ParquetFileFormat and IpcFileFormat are supported.
- options : FileSystemFactoryOptions, optional
- Various flags influencing the discovery of filesystem paths.
- """
-
- cdef:
- CFileSystemDatasetFactory* filesystem_factory
-
- def __init__(self, FileSystem filesystem not None, paths_or_selector,
- FileFormat format not None,
- FileSystemFactoryOptions options=None):
- cdef:
- vector[c_string] paths
- CFileSelector c_selector
- CResult[shared_ptr[CDatasetFactory]] result
- shared_ptr[CFileSystem] c_filesystem
- shared_ptr[CFileFormat] c_format
- CFileSystemFactoryOptions c_options
-
- options = options or FileSystemFactoryOptions()
- c_options = options.unwrap()
- c_filesystem = filesystem.unwrap()
- c_format = format.unwrap()
-
- if isinstance(paths_or_selector, FileSelector):
- with nogil:
- c_selector = (<FileSelector> paths_or_selector).selector
- result = CFileSystemDatasetFactory.MakeFromSelector(
- c_filesystem,
- c_selector,
- c_format,
- c_options
- )
- elif isinstance(paths_or_selector, (list, tuple)):
- paths = [tobytes(s) for s in paths_or_selector]
- with nogil:
- result = CFileSystemDatasetFactory.MakeFromPaths(
- c_filesystem,
- paths,
- c_format,
- c_options
- )
- else:
- raise TypeError('Must pass either paths or a FileSelector, but '
- 'passed {}'.format(type(paths_or_selector)))
-
- self.init(GetResultValue(result))
-
- cdef init(self, shared_ptr[CDatasetFactory]& sp):
- DatasetFactory.init(self, sp)
- self.filesystem_factory = <CFileSystemDatasetFactory*> sp.get()
-
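Putting the pieces together, the factory combines a filesystem, a FileSelector (or explicit path list), a file format, and the options above. A minimal sketch with a hypothetical local directory:

    import pyarrow.dataset as ds
    from pyarrow.fs import FileSelector, LocalFileSystem

    factory = ds.FileSystemDatasetFactory(
        LocalFileSystem(),
        FileSelector("data/", recursive=True),  # or a list of file paths
        ds.ParquetFileFormat(),
        ds.FileSystemFactoryOptions(partition_base_dir="data/"))
    dataset = factory.finish()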
-
-cdef class UnionDatasetFactory(DatasetFactory):
- """
- Combines multiple DatasetFactory objects into one, providing a way to
- inspect/discover the combined Dataset's expected schema before
- materialization.
-
- Parameters
- ----------
- factories : list of DatasetFactory
- """
-
- cdef:
- CUnionDatasetFactory* union_factory
-
- def __init__(self, list factories):
- cdef:
- DatasetFactory factory
- vector[shared_ptr[CDatasetFactory]] c_factories
- for factory in factories:
- c_factories.push_back(factory.unwrap())
- self.init(GetResultValue(CUnionDatasetFactory.Make(c_factories)))
-
- cdef init(self, const shared_ptr[CDatasetFactory]& sp):
- DatasetFactory.init(self, sp)
- self.union_factory = <CUnionDatasetFactory*> sp.get()
-
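UnionDatasetFactory forwards inspection and finish() across its children, so sources discovered separately (for example with different formats) can be combined before materialization. A minimal sketch, reusing two hypothetical factories built as above and assuming the class is re-exported from pyarrow.dataset:

    import pyarrow.dataset as ds

    # `parquet_factory` and `csv_factory` are assumed to be DatasetFactory
    # instances created elsewhere, e.g. two FileSystemDatasetFactory objects.
    union = ds.UnionDatasetFactory([parquet_factory, csv_factory])
    print(union.inspect())   # common schema across both children
    dataset = union.finish()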
-
-cdef class ParquetFactoryOptions(_Weakrefable):
- """
- Influences the discovery of a parquet dataset.
-
- Parameters
- ----------
- partition_base_dir : str, optional
- For the purposes of applying the partitioning, paths will be
- stripped of the partition_base_dir. Files not matching the
- partition_base_dir prefix will be skipped for partitioning discovery.
- The ignored files will still be part of the Dataset, but will not
- have partition information.
- partitioning : Partitioning, PartitioningFactory, optional
- The partitioning scheme applied to fragments, see ``Partitioning``.
- validate_column_chunk_paths : bool, default False
- Assert that all ColumnChunk paths are consistent. The parquet spec
- allows for ColumnChunk data to be stored in multiple files, but
- ParquetDatasetFactory supports only a single file with all ColumnChunk
- data. If this flag is set construction of a ParquetDatasetFactory will
- raise an error if ColumnChunk data is not resident in a single file.
- """
-
- cdef:
- CParquetFactoryOptions options
-
- __slots__ = () # avoid mistakenly creating attributes
-
- def __init__(self, partition_base_dir=None, partitioning=None,
- validate_column_chunk_paths=False):
- if isinstance(partitioning, PartitioningFactory):
- self.partitioning_factory = partitioning
- elif isinstance(partitioning, Partitioning):
- self.partitioning = partitioning
-
- if partition_base_dir is not None:
- self.partition_base_dir = partition_base_dir
-
- self.options.validate_column_chunk_paths = validate_column_chunk_paths
-
- cdef inline CParquetFactoryOptions unwrap(self):
- return self.options
-
- @property
- def partitioning(self):
- """Partitioning to apply to discovered files.
-
- NOTE: setting this property will overwrite partitioning_factory.
- """
- c_partitioning = self.options.partitioning.partitioning()
- if c_partitioning.get() == nullptr:
... 78503 lines suppressed ...