You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by pc...@apache.org on 2018/11/08 05:06:55 UTC
[arrow] branch master updated: ARROW-3602: [Gandiva] [Python]
Initial Gandiva Cython bindings
This is an automated email from the ASF dual-hosted git repository.
pcmoritz pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8e9cb87 ARROW-3602: [Gandiva] [Python] Initial Gandiva Cython bindings
8e9cb87 is described below
commit 8e9cb870de0ecb126a0d7403f34e3a1ce119d618
Author: Philipp Moritz <pc...@gmail.com>
AuthorDate: Wed Nov 7 21:06:45 2018 -0800
ARROW-3602: [Gandiva] [Python] Initial Gandiva Cython bindings
This is an initial Cython wrapper for Gandiva.
Feedback is appreciated (the API is experimental right now and will most likely change in the future).
Author: Philipp Moritz <pc...@gmail.com>
Closes #2822 from pcmoritz/gandiva-cython and squashes the following commits:
6ff0d9402 <Philipp Moritz> Merge branch 'master' into gandiva-cython
6ea00628b <Philipp Moritz> use cython instantiation
4a366bb42 <Philipp Moritz> Merge branch 'master' into gandiva-cython
73bc203fd <Philipp Moritz> use strings instead of bytes for function names
f589de37d <Philipp Moritz> fix travis
69b10cb3d <Philipp Moritz> don't build gandiva in mac os build
20b75a6bb <Philipp Moritz> update
e1d074bc5 <Philipp Moritz> build gandiva tests
562b763e4 <Philipp Moritz> put back llvm
9b9d3ab22 <Philipp Moritz> don't build gandiva tests in python build
1ee601933 <Philipp Moritz> don't run gandiva tests on python build
646f36014 <Philipp Moritz> Merge branch 'gandiva-cython' of github.com:pcmoritz/arrow into gandiva-cython
40bb0c701 <Philipp Moritz> use gandiva files
cd282a348 <Philipp Moritz> Merge branch 'master' into gandiva-cython
829f7a2cf <Philipp Moritz> fix
f726d1745 <Philipp Moritz> remove compiler error
99f93f167 <Philipp Moritz> Merge branch 'master' into gandiva-cython
cfec265e7 <Philipp Moritz> use clang
600888443 <Philipp Moritz> install llvm 6.0 (?)
5abd24897 <Philipp Moritz> build gandiva
9ee2c5868 <Philipp Moritz> add gandiva flag
038084655 <Philipp Moritz> fix pytest include
27434d596 <Philipp Moritz> add ganvida pytest flags
8374cdb6f <Philipp Moritz> make gandiva optional for tests
5ceb22163 <Philipp Moritz> allow gandiva failure
021b301c0 <Philipp Moritz> lint
276536c8a <Philipp Moritz> Merge branch 'master' into gandiva-cython
b41599496 <Philipp Moritz> remove gandiva cython bindings from wheels
7b75dec5e <Philipp Moritz> linting
dc2a64870 <Philipp Moritz> add filter
0ff33c18d <Philipp Moritz> add test
b4571eed2 <Philipp Moritz> build gandiva for wheels
92c8dff70 <Philipp Moritz> linting
ae5305af1 <Philipp Moritz> linting
97568d90b <Philipp Moritz> whitespace
335be395d <Philipp Moritz> update FindGandiva.cmake
33786d260 <Philipp Moritz> memory pool handling
26be86632 <Philipp Moritz> fix array creation
aab17705c <Philipp Moritz> fixes
7a8c9483e <Philipp Moritz> port gandiva cython wrappers to in-tree gandiva
---
.travis.yml | 2 +
ci/travis_script_python.sh | 8 ++
cpp/CMakeLists.txt | 4 +
cpp/cmake_modules/FindGandiva.cmake | 96 ++++++++++++++++
cpp/src/gandiva/CMakeLists.txt | 52 +++++----
cpp/src/gandiva/tests/generate_data.h | 1 -
python/CMakeLists.txt | 24 ++++
python/pyarrow/gandiva.pyx | 204 +++++++++++++++++++++++++++++++++
python/pyarrow/includes/libgandiva.pxd | 107 +++++++++++++++++
python/pyarrow/tests/conftest.py | 8 ++
python/pyarrow/tests/test_gandiva.py | 100 ++++++++++++++++
python/setup.py | 9 ++
12 files changed, 590 insertions(+), 25 deletions(-)
diff --git a/.travis.yml b/.travis.yml
index f6f499c..20bf4ae 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -78,6 +78,7 @@ matrix:
- ARROW_BUILD_WARNING_LEVEL=CHECKIN
- ARROW_TRAVIS_PYTHON_JVM=1
- ARROW_TRAVIS_JAVA_BUILD_ONLY=1
+ - ARROW_TRAVIS_PYTHON_GANDIVA=1
# ARROW-2999 Benchmarks are disabled in Travis CI for the time being
# - ARROW_TRAVIS_PYTHON_BENCHMARKS=1
- MATRIX_EVAL="CC=gcc-4.9 && CXX=g++-4.9"
@@ -85,6 +86,7 @@ matrix:
# (ARROW_CI_CPP_AFFECTED implies ARROW_CI_PYTHON_AFFECTED)
- if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi
- $TRAVIS_BUILD_DIR/ci/travis_install_linux.sh
+ - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh
# If either C++ or Python changed, we must install the C++ libraries
- git submodule update --init
- $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh
diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh
index 4d48adb..608e1ce 100755
--- a/ci/travis_script_python.sh
+++ b/ci/travis_script_python.sh
@@ -96,6 +96,10 @@ if [ $ARROW_TRAVIS_COVERAGE == "1" ]; then
CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GENERATE_COVERAGE=ON"
fi
+# Enable the Gandiva C++ build (without its googletest suite) when requested.
+# The variable is quoted so that an unset or empty value compares as a
+# non-match instead of making `[` fail with "unary operator expected".
+if [ "$ARROW_TRAVIS_PYTHON_GANDIVA" == "1" ]; then
+  CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_GANDIVA=ON -DARROW_GANDIVA_BUILD_TESTS=OFF"
+fi
+
cmake -GNinja \
$CMAKE_COMMON_FLAGS \
-DARROW_BUILD_TESTS=on \
@@ -136,6 +140,9 @@ export PYARROW_BUILD_TYPE=$ARROW_BUILD_TYPE
export PYARROW_WITH_PARQUET=1
export PYARROW_WITH_PLASMA=1
export PYARROW_WITH_ORC=1
+# Build the pyarrow Gandiva extension only when requested.  Quoted so an
+# unset or empty variable is a clean non-match rather than a `[` syntax error.
+if [ "$ARROW_TRAVIS_PYTHON_GANDIVA" == "1" ]; then
+  export PYARROW_WITH_GANDIVA=1
+fi
python setup.py develop
@@ -201,6 +208,7 @@ if [ "$ARROW_TRAVIS_PYTHON_BENCHMARKS" == "1" ] && [ "$PYTHON_VERSION" == "3.6"
export PYARROW_WITH_PARQUET=1
export PYARROW_WITH_PLASMA=1
export PYARROW_WITH_ORC=0
+ export PYARROW_WITH_GANDIVA=0
pushd $ARROW_PYTHON_DIR
# Workaround for https://github.com/airspeed-velocity/asv/issues/631
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index b0960f8..13b556e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -317,6 +317,10 @@ Always OFF if building binaries"
"Build the Gandiva JNI wrappers"
ON)
+ option(ARROW_GANDIVA_BUILD_TESTS
+ "Build the Gandiva googletest unit tests"
+ ON)
+
endif()
diff --git a/cpp/cmake_modules/FindGandiva.cmake b/cpp/cmake_modules/FindGandiva.cmake
new file mode 100644
index 0000000..5559c09
--- /dev/null
+++ b/cpp/cmake_modules/FindGandiva.cmake
@@ -0,0 +1,96 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# - Find GANDIVA (gandiva/expression_registry.h, libgandiva.a, libgandiva.so)
+# This module defines
+# GANDIVA_INCLUDE_DIR, directory containing headers
+# GANDIVA_LIBS, directory containing gandiva libraries
+# GANDIVA_STATIC_LIB, path to libgandiva.a
+# GANDIVA_SHARED_LIB, path to libgandiva's shared library
+# GANDIVA_SHARED_IMP_LIB, path to libgandiva's import library (MSVC only;
+# not yet set by this module)
+# GANDIVA_FOUND, whether gandiva has been found
+
+include(FindPkgConfig)
+
+# Two discovery modes: pkg-config when ARROW_HOME is not set in the
+# environment, otherwise a fixed search under $ARROW_HOME.
+if ("$ENV{ARROW_HOME}" STREQUAL "")
+ pkg_check_modules(GANDIVA gandiva)
+ if (GANDIVA_FOUND)
+ # Read the version variables exported by gandiva.pc and report them.
+ pkg_get_variable(GANDIVA_SO_VERSION gandiva so_version)
+ set(GANDIVA_ABI_VERSION ${GANDIVA_SO_VERSION})
+ message(STATUS "Gandiva SO and ABI version: ${GANDIVA_SO_VERSION}")
+ pkg_get_variable(GANDIVA_FULL_SO_VERSION gandiva full_so_version)
+ message(STATUS "Gandiva full SO version: ${GANDIVA_FULL_SO_VERSION}")
+ set(GANDIVA_INCLUDE_DIR ${GANDIVA_INCLUDE_DIRS})
+ set(GANDIVA_LIBS ${GANDIVA_LIBRARY_DIRS})
+ set(GANDIVA_SEARCH_LIB_PATH ${GANDIVA_LIBRARY_DIRS})
+ endif()
+else()
+ # ARROW_HOME is set: look only under its include/ and lib/ directories.
+ set(GANDIVA_HOME "$ENV{ARROW_HOME}")
+
+ set(GANDIVA_SEARCH_HEADER_PATHS
+ ${GANDIVA_HOME}/include
+ )
+
+ set(GANDIVA_SEARCH_LIB_PATH
+ ${GANDIVA_HOME}/lib
+ )
+
+ find_path(GANDIVA_INCLUDE_DIR gandiva/expression_registry.h PATHS
+ ${GANDIVA_SEARCH_HEADER_PATHS}
+ # make sure we don't accidentally pick up a different version
+ NO_DEFAULT_PATH
+ )
+endif()
+
+# Locate the library itself, restricted to the search path chosen above
+# (NO_DEFAULT_PATH), then redefine GANDIVA_LIBS as the directory holding it.
+find_library(GANDIVA_LIB_PATH NAMES gandiva
+ PATHS
+ ${GANDIVA_SEARCH_LIB_PATH}
+ NO_DEFAULT_PATH)
+get_filename_component(GANDIVA_LIBS ${GANDIVA_LIB_PATH} DIRECTORY)
+
+# Both the header directory and the library directory must be known before
+# the static/shared library paths are derived.
+if (GANDIVA_INCLUDE_DIR AND GANDIVA_LIBS)
+ set(GANDIVA_FOUND TRUE)
+ set(GANDIVA_LIB_NAME gandiva)
+
+ set(GANDIVA_STATIC_LIB ${GANDIVA_LIBS}/lib${GANDIVA_LIB_NAME}.a)
+
+ set(GANDIVA_SHARED_LIB ${GANDIVA_LIBS}/lib${GANDIVA_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX})
+endif()
+
+# Report the outcome; a failure is fatal only when the package was REQUIRED.
+if (GANDIVA_FOUND)
+ if (NOT Gandiva_FIND_QUIETLY)
+ message(STATUS "Found the Gandiva core library: ${GANDIVA_LIB_PATH}")
+ endif ()
+else ()
+ if (NOT Gandiva_FIND_QUIETLY)
+ # NOTE(review): GANDIVA_SEARCH_HEADER_PATHS is only assigned in the
+ # ARROW_HOME branch, so the header location in this message is empty
+ # when discovery went through pkg-config.
+ set(GANDIVA_ERR_MSG "Could not find the Gandiva library. Looked for headers")
+ set(GANDIVA_ERR_MSG "${GANDIVA_ERR_MSG} in ${GANDIVA_SEARCH_HEADER_PATHS}, and for libs")
+ set(GANDIVA_ERR_MSG "${GANDIVA_ERR_MSG} in ${GANDIVA_SEARCH_LIB_PATH}")
+ if (Gandiva_FIND_REQUIRED)
+ message(FATAL_ERROR "${GANDIVA_ERR_MSG}")
+ else (Gandiva_FIND_REQUIRED)
+ message(STATUS "${GANDIVA_ERR_MSG}")
+ endif (Gandiva_FIND_REQUIRED)
+ endif ()
+ set(GANDIVA_FOUND FALSE)
+endif ()
+
+mark_as_advanced(
+ GANDIVA_INCLUDE_DIR
+ GANDIVA_STATIC_LIB
+ GANDIVA_SHARED_LIB
+)
diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt
index 0aeb24b..6c227ba 100644
--- a/cpp/src/gandiva/CMakeLists.txt
+++ b/cpp/src/gandiva/CMakeLists.txt
@@ -120,34 +120,38 @@ install(
FILES "${CMAKE_CURRENT_BINARY_DIR}/gandiva.pc"
DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/")
-#args: label test-file src-files
-add_gandiva_unit_test(bitmap_accumulator_test.cc bitmap_accumulator.cc)
-add_gandiva_unit_test(engine_llvm_test.cc engine.cc llvm_types.cc configuration.cc
- gdv_function_stubs.cc context_helper.cc to_date_holder.cc date_utils.cc
- exported_funcs_registry.cc ${BC_FILE_PATH_CC})
-add_gandiva_unit_test(function_signature_test.cc function_signature.cc)
-add_gandiva_unit_test(function_registry_test.cc function_registry.cc function_signature.cc)
-add_gandiva_unit_test(llvm_types_test.cc llvm_types.cc)
-add_gandiva_unit_test(llvm_generator_test.cc llvm_generator.cc regex_util.cc engine.cc
- llvm_types.cc expr_decomposer.cc function_registry.cc annotator.cc
- bitmap_accumulator.cc configuration.cc function_signature.cc like_holder.cc
- to_date_holder.cc date_utils.cc regex_util.cc gdv_function_stubs.cc context_helper.cc
- exported_funcs_registry.cc ${BC_FILE_PATH_CC})
-add_gandiva_unit_test(annotator_test.cc annotator.cc function_signature.cc)
-add_gandiva_unit_test(tree_expr_test.cc tree_expr_builder.cc expr_decomposer.cc annotator.cc function_registry.cc function_signature.cc like_holder.cc regex_util.cc to_date_holder.cc date_utils.cc)
-add_gandiva_unit_test(expr_decomposer_test.cc expr_decomposer.cc tree_expr_builder.cc annotator.cc function_registry.cc function_signature.cc like_holder.cc regex_util.cc to_date_holder.cc date_utils.cc)
-add_gandiva_unit_test(expression_registry_test.cc llvm_types.cc expression_registry.cc function_signature.cc function_registry.cc)
-add_gandiva_unit_test(selection_vector_test.cc selection_vector.cc)
-add_gandiva_unit_test(lru_cache_test.cc)
-add_gandiva_unit_test(to_date_holder_test.cc to_date_holder.cc date_utils.cc)
-add_gandiva_unit_test(simple_arena_test.cc)
+if (ARROW_GANDIVA_BUILD_TESTS)
+ #args: label test-file src-files
+ add_gandiva_unit_test(bitmap_accumulator_test.cc bitmap_accumulator.cc)
+ add_gandiva_unit_test(engine_llvm_test.cc engine.cc llvm_types.cc configuration.cc
+ gdv_function_stubs.cc context_helper.cc to_date_holder.cc date_utils.cc
+ exported_funcs_registry.cc ${BC_FILE_PATH_CC})
+ add_gandiva_unit_test(function_signature_test.cc function_signature.cc)
+ add_gandiva_unit_test(function_registry_test.cc function_registry.cc function_signature.cc)
+ add_gandiva_unit_test(llvm_types_test.cc llvm_types.cc)
+ add_gandiva_unit_test(llvm_generator_test.cc llvm_generator.cc regex_util.cc engine.cc
+ llvm_types.cc expr_decomposer.cc function_registry.cc annotator.cc
+ bitmap_accumulator.cc configuration.cc function_signature.cc like_holder.cc
+ to_date_holder.cc date_utils.cc regex_util.cc gdv_function_stubs.cc context_helper.cc
+ exported_funcs_registry.cc ${BC_FILE_PATH_CC})
+ add_gandiva_unit_test(annotator_test.cc annotator.cc function_signature.cc)
+ add_gandiva_unit_test(tree_expr_test.cc tree_expr_builder.cc expr_decomposer.cc annotator.cc function_registry.cc function_signature.cc like_holder.cc regex_util.cc to_date_holder.cc date_utils.cc)
+ add_gandiva_unit_test(expr_decomposer_test.cc expr_decomposer.cc tree_expr_builder.cc annotator.cc function_registry.cc function_signature.cc like_holder.cc regex_util.cc to_date_holder.cc date_utils.cc)
+ add_gandiva_unit_test(expression_registry_test.cc llvm_types.cc expression_registry.cc function_signature.cc function_registry.cc)
+ add_gandiva_unit_test(selection_vector_test.cc selection_vector.cc)
+ add_gandiva_unit_test(lru_cache_test.cc)
+ add_gandiva_unit_test(to_date_holder_test.cc to_date_holder.cc date_utils.cc)
+ add_gandiva_unit_test(simple_arena_test.cc)
+endif()
if (ARROW_GANDIVA_JAVA)
add_subdirectory(jni)
endif()
add_subdirectory(precompiled)
-include(CTest)
-enable_testing()
+if (ARROW_GANDIVA_BUILD_TESTS)
+ include(CTest)
+ enable_testing()
-add_subdirectory(tests)
+ add_subdirectory(tests)
+endif()
diff --git a/cpp/src/gandiva/tests/generate_data.h b/cpp/src/gandiva/tests/generate_data.h
index 884c211..01665b8 100644
--- a/cpp/src/gandiva/tests/generate_data.h
+++ b/cpp/src/gandiva/tests/generate_data.h
@@ -67,7 +67,6 @@ class BoundedInt32DataGenerator : public Int32DataGenerator {
protected:
uint32_t upperBound_;
- Random random_;
};
class Int64DataGenerator : public DataGenerator<int64_t> {
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 46d9b54..234186f 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -488,6 +488,30 @@ if (PYARROW_BUILD_ORC)
_orc)
endif()
+## Gandiva
+# Optional extension: only configured when PYARROW_BUILD_GANDIVA is set.
+if (PYARROW_BUILD_GANDIVA)
+ find_package(Gandiva)
+
+ # find_package is not called with REQUIRED, so fail explicitly here.
+ if(NOT GANDIVA_FOUND)
+ message(FATAL_ERROR "Unable to locate Gandiva libraries")
+ endif()
+
+ include_directories(SYSTEM ${GANDIVA_INCLUDE_DIR})
+
+ # NOTE(review): the bundled library is versioned with the ARROW_* ABI/SO
+ # versions rather than Gandiva-specific ones -- presumably because the two
+ # are built together; confirm.
+ if (PYARROW_BUNDLE_ARROW_CPP)
+ bundle_arrow_lib(GANDIVA_SHARED_LIB
+ ABI_VERSION ${ARROW_ABI_VERSION}
+ SO_VERSION ${ARROW_SO_VERSION})
+ endif()
+
+ # Link against the Gandiva shared library.
+ set(LINK_LIBS
+ ${LINK_LIBS}
+ ${GANDIVA_SHARED_LIB})
+
+ # Register the `gandiva` Cython module for compilation.
+ set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} gandiva)
+endif()
+
+
############################################################
# Setup and build Cython modules
############################################################
diff --git a/python/pyarrow/gandiva.pyx b/python/pyarrow/gandiva.pyx
new file mode 100644
index 0000000..7bc462f
--- /dev/null
+++ b/python/pyarrow/gandiva.pyx
@@ -0,0 +1,204 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+
+from libcpp cimport bool as c_bool, nullptr
+from libcpp.memory cimport shared_ptr, unique_ptr, make_shared
+from libcpp.string cimport string as c_string
+from libcpp.vector cimport vector as c_vector
+from libc.stdint cimport int64_t, uint8_t, uintptr_t
+
+from pyarrow.includes.libarrow cimport *
+from pyarrow.compat import frombytes
+from pyarrow.lib cimport check_status, pyarrow_wrap_array
+
+from pyarrow.includes.libgandiva cimport (CCondition, CExpression,
+ CNode, CProjector, CFilter,
+ CSelectionVector,
+ TreeExprBuilder_MakeExpression,
+ TreeExprBuilder_MakeFunction,
+ TreeExprBuilder_MakeLiteral,
+ TreeExprBuilder_MakeField,
+ TreeExprBuilder_MakeIf,
+ TreeExprBuilder_MakeCondition,
+ SelectionVector_MakeInt32,
+ Projector_Make,
+ Filter_Make)
+
+from pyarrow.lib cimport (Array, DataType, Field, MemoryPool,
+ RecordBatch, Schema)
+
+cdef class Node:
+    """Python handle for a Gandiva expression-tree node.
+
+    Holds a ``shared_ptr[CNode]``.  Instances are built through the
+    ``cdef`` factory ``create``; calling the constructor from Python always
+    raises ``TypeError``.
+    """
+    cdef:
+        shared_ptr[CNode] node
+
+    def __init__(self):
+        # The message used to end with "API directly", which contradicted the
+        # "do not call ... directly" part; the wording now matches Condition.
+        raise TypeError("Do not call {}'s constructor directly, use the "
+                        "TreeExprBuilder API instead"
+                        .format(self.__class__.__name__))
+
+    @staticmethod
+    cdef create(shared_ptr[CNode] node):
+        # __new__ bypasses __init__, which unconditionally raises.
+        cdef Node self = Node.__new__(Node)
+        self.node = node
+        return self
+
+cdef class Expression:
+ # Python handle around a shared_ptr[CExpression].  Unlike the sibling
+ # wrappers it defines no raising __init__; the handle is filled in
+ # afterwards through init().
+ cdef:
+ shared_ptr[CExpression] expression
+
+ cdef void init(self, shared_ptr[CExpression] expression):
+ # Store the C++ expression handle on this wrapper.
+ self.expression = expression
+
+cdef class Condition:
+ # Python handle around a shared_ptr[CCondition].  Built via create();
+ # the Python-level constructor always raises TypeError.
+ cdef:
+ shared_ptr[CCondition] condition
+
+ def __init__(self):
+ raise TypeError("Do not call {}'s constructor directly, use the "
+ "TreeExprBuilder API instead"
+ .format(self.__class__.__name__))
+
+ @staticmethod
+ cdef create(shared_ptr[CCondition] condition):
+ # __new__ bypasses __init__, which unconditionally raises.
+ cdef Condition self = Condition.__new__(Condition)
+ self.condition = condition
+ return self
+
+cdef class SelectionVector:
+ # Python handle around a shared_ptr[CSelectionVector].  Built via
+ # create(); the Python-level constructor always raises TypeError.
+ cdef:
+ shared_ptr[CSelectionVector] selection_vector
+
+ def __init__(self):
+ raise TypeError("Do not call {}'s constructor directly."
+ .format(self.__class__.__name__))
+
+ @staticmethod
+ cdef create(shared_ptr[CSelectionVector] selection_vector):
+ # __new__ bypasses __init__, which unconditionally raises.
+ cdef SelectionVector self = SelectionVector.__new__(SelectionVector)
+ self.selection_vector = selection_vector
+ return self
+
+ def to_array(self):
+ # Wrap the result of the C++ ToArray() call as a pyarrow Array.
+ cdef shared_ptr[CArray] result = self.selection_vector.get().ToArray()
+ return pyarrow_wrap_array(result)
+
+cdef class Projector:
+ # Python handle around a shared_ptr[CProjector] plus the MemoryPool that
+ # is passed to every Evaluate call.  Built via create(); the Python-level
+ # constructor always raises TypeError.
+ cdef:
+ shared_ptr[CProjector] projector
+ MemoryPool pool
+
+ def __init__(self):
+ raise TypeError("Do not call {}'s constructor directly, use "
+ "make_projector instead"
+ .format(self.__class__.__name__))
+
+ @staticmethod
+ cdef create(shared_ptr[CProjector] projector, MemoryPool pool):
+ # __new__ bypasses __init__, which unconditionally raises.
+ cdef Projector self = Projector.__new__(Projector)
+ self.projector = projector
+ self.pool = pool
+ return self
+
+ def evaluate(self, RecordBatch batch):
+ # Run the projector on `batch`, using the pool stored on this object,
+ # and return a Python list with one wrapped pyarrow Array per entry of
+ # the C++ result vector.  A failing status is handed to check_status.
+ cdef vector[shared_ptr[CArray]] results
+ check_status(self.projector.get().Evaluate(
+ batch.sp_batch.get()[0], self.pool.pool, &results))
+ cdef shared_ptr[CArray] result
+ arrays = []
+ for result in results:
+ arrays.append(pyarrow_wrap_array(result))
+ return arrays
+
+cdef class Filter:
+ # Python handle around a shared_ptr[CFilter].  Built via create(); the
+ # Python-level constructor always raises TypeError.
+ cdef:
+ shared_ptr[CFilter] filter
+
+ def __init__(self):
+ raise TypeError("Do not call {}'s constructor directly, use "
+ "make_filter instead"
+ .format(self.__class__.__name__))
+
+ @staticmethod
+ cdef create(shared_ptr[CFilter] filter):
+ # __new__ bypasses __init__, which unconditionally raises.
+ cdef Filter self = Filter.__new__(Filter)
+ self.filter = filter
+ return self
+
+ def evaluate(self, RecordBatch batch, MemoryPool pool):
+ # Create a selection vector with batch.num_rows slots from `pool`,
+ # evaluate the filter into it, and return it wrapped as a
+ # SelectionVector.  Both C++ statuses go through check_status.
+ cdef shared_ptr[CSelectionVector] selection
+ check_status(SelectionVector_MakeInt32(
+ batch.num_rows, pool.pool, &selection))
+ check_status(self.filter.get().Evaluate(
+ batch.sp_batch.get()[0], selection))
+ return SelectionVector.create(selection)
+
+cdef class TreeExprBuilder:
+ # Stateless factory for the wrapper objects (Node, Expression, Condition);
+ # each method forwards to the matching TreeExprBuilder_Make* C++ function.
+
+ def make_literal(self, value):
+ # Build a literal node from `value`.
+ # NOTE(review): only a single MakeLiteral overload appears to be
+ # bound, so `value` is converted to one fixed C type -- confirm which
+ # literal types are intended to be supported.
+ cdef shared_ptr[CNode] r = TreeExprBuilder_MakeLiteral(value)
+ return Node.create(r)
+
+ def make_expression(self, Node root_node, Field return_field):
+ # Pair a root node with its result field and return an Expression.
+ cdef shared_ptr[CExpression] r = TreeExprBuilder_MakeExpression(
+ root_node.node, return_field.sp_field)
+ cdef Expression expression = Expression()
+ expression.init(r)
+ return expression
+
+ def make_function(self, name, children, DataType return_type):
+ # Build a function-call node.  `name` is a str and is encoded to bytes
+ # for the C++ call; `children` is an iterable of Node objects.
+ cdef c_vector[shared_ptr[CNode]] c_children
+ cdef Node child
+ for child in children:
+ c_children.push_back(child.node)
+ cdef shared_ptr[CNode] r = TreeExprBuilder_MakeFunction(
+ name.encode(), c_children, return_type.sp_type)
+ return Node.create(r)
+
+ def make_field(self, Field field):
+ # Build a node that refers to `field`.
+ cdef shared_ptr[CNode] r = TreeExprBuilder_MakeField(field.sp_field)
+ return Node.create(r)
+
+ def make_if(self, Node condition, Node this_node,
+ Node else_node, DataType return_type):
+ # Build an if/else node from a condition node and the two branches.
+ cdef shared_ptr[CNode] r = TreeExprBuilder_MakeIf(
+ condition.node, this_node.node, else_node.node,
+ return_type.sp_type)
+ return Node.create(r)
+
+ def make_condition(self, Node condition):
+ # Wrap a node as a Condition (the input type of make_filter).
+ cdef shared_ptr[CCondition] r = TreeExprBuilder_MakeCondition(
+ condition.node)
+ return Condition.create(r)
+
+cpdef make_projector(Schema schema, children, MemoryPool pool):
+ # Build a Projector for `schema` from an iterable of Expression objects.
+ # `pool` is not used during construction; it is stored on the returned
+ # Projector.  A failing status from Projector_Make goes to check_status.
+ cdef c_vector[shared_ptr[CExpression]] c_children
+ cdef Expression child
+ for child in children:
+ c_children.push_back(child.expression)
+ cdef shared_ptr[CProjector] result
+ check_status(Projector_Make(schema.sp_schema, c_children,
+ &result))
+ return Projector.create(result, pool)
+
+cpdef make_filter(Schema schema, Condition condition):
+ # Build a Filter for `schema` from a Condition.  A failing status from
+ # Filter_Make goes to check_status.
+ cdef shared_ptr[CFilter] result
+ check_status(Filter_Make(schema.sp_schema, condition.condition, &result))
+ return Filter.create(result)
diff --git a/python/pyarrow/includes/libgandiva.pxd b/python/pyarrow/includes/libgandiva.pxd
new file mode 100644
index 0000000..b1e45af
--- /dev/null
+++ b/python/pyarrow/includes/libgandiva.pxd
@@ -0,0 +1,107 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# distutils: language = c++
+
+from pyarrow.includes.common cimport *
+from pyarrow.includes.libarrow cimport *
+
+# Opaque node/expression handles and the vector aliases built on them.
+cdef extern from "gandiva/gandiva_aliases.h" namespace "gandiva" nogil:
+
+ cdef cppclass CNode" gandiva::Node":
+ pass
+
+ cdef cppclass CExpression" gandiva::Expression":
+ pass
+
+ ctypedef vector[shared_ptr[CNode]] CNodeVector" gandiva::NodeVector"
+
+ ctypedef vector[shared_ptr[CExpression]] \
+ CExpressionVector" gandiva::ExpressionVector"
+
+# Selection vector class plus its static MakeInt32 factory, exposed as a
+# free function through its fully qualified C++ name.
+cdef extern from "gandiva/selection_vector.h" namespace "gandiva" nogil:
+
+ cdef cppclass CSelectionVector" gandiva::SelectionVector":
+
+ shared_ptr[CArray] ToArray()
+
+ cdef CStatus SelectionVector_MakeInt32\
+ "gandiva::SelectionVector::MakeInt32"(
+ int max_slots, CMemoryPool* pool,
+ shared_ptr[CSelectionVector]* selection_vector)
+
+# Opaque condition handle.
+cdef extern from "gandiva/condition.h" namespace "gandiva" nogil:
+
+ cdef cppclass CCondition" gandiva::Condition":
+ pass
+
+# Alias for a vector of Arrow arrays.
+cdef extern from "gandiva/arrow.h" namespace "gandiva" nogil:
+
+ ctypedef vector[shared_ptr[CArray]] CArrayVector" gandiva::ArrayVector"
+
+
+# Static factory methods of gandiva::TreeExprBuilder, exposed as free
+# functions through their fully qualified C++ names.
+cdef extern from "gandiva/tree_expr_builder.h" namespace "gandiva" nogil:
+
+    cdef shared_ptr[CNode] TreeExprBuilder_MakeLiteral \
+        "gandiva::TreeExprBuilder::MakeLiteral"(double value)
+
+    cdef shared_ptr[CExpression] TreeExprBuilder_MakeExpression\
+        "gandiva::TreeExprBuilder::MakeExpression"(
+            shared_ptr[CNode] root_node, shared_ptr[CField] result_field)
+
+    cdef shared_ptr[CNode] TreeExprBuilder_MakeFunction \
+        "gandiva::TreeExprBuilder::MakeFunction"(
+            const c_string& name, const CNodeVector& children,
+            shared_ptr[CDataType] return_type)
+
+    cdef shared_ptr[CNode] TreeExprBuilder_MakeField \
+        "gandiva::TreeExprBuilder::MakeField"(shared_ptr[CField] field)
+
+    cdef shared_ptr[CNode] TreeExprBuilder_MakeIf \
+        "gandiva::TreeExprBuilder::MakeIf"(
+            shared_ptr[CNode] condition, shared_ptr[CNode] this_node,
+            shared_ptr[CNode] else_node, shared_ptr[CDataType] return_type)
+
+    cdef shared_ptr[CCondition] TreeExprBuilder_MakeCondition \
+        "gandiva::TreeExprBuilder::MakeCondition"(
+            shared_ptr[CNode] condition)
+
+# Projector class and its static Make factory.  The factory is declared here,
+# under the projector header and after CProjector, rather than under
+# tree_expr_builder.h; this mirrors how Filter_Make sits next to CFilter.
+cdef extern from "gandiva/projector.h" namespace "gandiva" nogil:
+
+    cdef cppclass CProjector" gandiva::Projector":
+
+        # NOTE(review): `output` looks like an out-parameter, so the `const`
+        # qualifier is suspicious -- confirm against gandiva/projector.h.
+        CStatus Evaluate(
+            const CRecordBatch& batch, CMemoryPool* pool,
+            const CArrayVector* output)
+
+    cdef CStatus Projector_Make \
+        "gandiva::Projector::Make"(
+            shared_ptr[CSchema] schema, const CExpressionVector& children,
+            shared_ptr[CProjector]* projector)
+
+# Filter class and its static Make factory, exposed as a free function
+# through its fully qualified C++ name.
+cdef extern from "gandiva/filter.h" namespace "gandiva" nogil:
+
+ cdef cppclass CFilter" gandiva::Filter":
+
+ CStatus Evaluate(
+ const CRecordBatch& batch,
+ shared_ptr[CSelectionVector] out_selection)
+
+ cdef CStatus Filter_Make \
+ "gandiva::Filter::Make"(
+ shared_ptr[CSchema] schema, shared_ptr[CCondition] condition,
+ shared_ptr[CFilter]* filter)
diff --git a/python/pyarrow/tests/conftest.py b/python/pyarrow/tests/conftest.py
index 68266c8..6cdedbb 100644
--- a/python/pyarrow/tests/conftest.py
+++ b/python/pyarrow/tests/conftest.py
@@ -24,6 +24,7 @@ except ImportError:
groups = [
+ 'gandiva',
'hdfs',
'large_memory',
'orc',
@@ -35,6 +36,7 @@ groups = [
defaults = {
+ 'gandiva': False,
'hdfs': False,
'large_memory': False,
'orc': False,
@@ -45,6 +47,12 @@ defaults = {
}
try:
+ import pyarrow.gandiva # noqa
+ defaults['gandiva'] = True
+except ImportError:
+ pass
+
+try:
import pyarrow.orc # noqa
defaults['orc'] = True
except ImportError:
diff --git a/python/pyarrow/tests/test_gandiva.py b/python/pyarrow/tests/test_gandiva.py
new file mode 100644
index 0000000..f5874e4
--- /dev/null
+++ b/python/pyarrow/tests/test_gandiva.py
@@ -0,0 +1,100 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import pytest
+
+import pyarrow as pa
+import pandas as pd
+
+
+@pytest.mark.gandiva
+def test_tree_exp_builder():
+    """Project ``a if a > b else b`` over two int32 columns."""
+    import pyarrow.gandiva as gandiva
+
+    # Expression tree: if greater_than(a, b) then a else b -> 'res'.
+    field_a = pa.field('a', pa.int32())
+    field_b = pa.field('b', pa.int32())
+    tree = gandiva.TreeExprBuilder()
+    node_a = tree.make_field(field_a)
+    node_b = tree.make_field(field_b)
+    a_gt_b = tree.make_function("greater_than", [node_a, node_b],
+                                pa.bool_())
+    pick = tree.make_if(a_gt_b, node_a, node_b, pa.int32())
+    expr = tree.make_expression(pick, pa.field('res', pa.int32()))
+
+    projector = gandiva.make_projector(
+        pa.schema([field_a, field_b]), [expr], pa.default_memory_pool())
+
+    # Input columns and the expected element-wise result.
+    lhs = pa.array([10, 12, -20, 5], type=pa.int32())
+    rhs = pa.array([5, 15, 15, 17], type=pa.int32())
+    expected = pa.array([10, 15, 15, 17], type=pa.int32())
+    batch = pa.RecordBatch.from_arrays([lhs, rhs], names=['a', 'b'])
+
+    # Exactly one output array is expected (single expression).
+    result, = projector.evaluate(batch)
+    assert result.equals(expected)
+
+
+@pytest.mark.gandiva
+def test_table():
+    """Project the sum of two float64 columns taken from a pandas Table."""
+    import pyarrow.gandiva as gandiva
+
+    df = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0]})
+    table = pa.Table.from_pandas(df)
+
+    builder = gandiva.TreeExprBuilder()
+    node_a = builder.make_field(table.schema.field_by_name("a"))
+    node_b = builder.make_field(table.schema.field_by_name("b"))
+
+    # Named sum_node rather than `sum` so the builtin is not shadowed.
+    sum_node = builder.make_function("add", [node_a, node_b], pa.float64())
+
+    field_result = pa.field("c", pa.float64())
+    expr = builder.make_expression(sum_node, field_result)
+
+    projector = gandiva.make_projector(
+        table.schema, [expr], pa.default_memory_pool())
+
+    # TODO: Add .evaluate function which can take Tables instead of
+    # RecordBatches
+    r, = projector.evaluate(table.to_batches()[0])
+
+    e = pa.Array.from_pandas(df["a"] + df["b"])
+    assert r.equals(e)
+
+
+@pytest.mark.gandiva
+def test_filter():
+    """Filter ``a < 1000`` over 0.0..9999.0 and check the selected indices."""
+    import pyarrow.gandiva as gandiva
+
+    df = pd.DataFrame({"a": [1.0 * i for i in range(10000)]})
+    table = pa.Table.from_pandas(df)
+
+    builder = gandiva.TreeExprBuilder()
+    node_a = builder.make_field(table.schema.field_by_name("a"))
+    thousand = builder.make_literal(1000.0)
+    cond = builder.make_function("less_than", [node_a, thousand], pa.bool_())
+    condition = builder.make_condition(cond)
+
+    # Named row_filter rather than `filter` so the builtin is not shadowed.
+    row_filter = gandiva.make_filter(table.schema, condition)
+    result = row_filter.evaluate(table.to_batches()[0],
+                                 pa.default_memory_pool())
+    assert result.to_array().equals(pa.array(range(1000), type=pa.uint32()))
diff --git a/python/setup.py b/python/setup.py
index 359960a..e6a8871 100755
--- a/python/setup.py
+++ b/python/setup.py
@@ -106,6 +106,7 @@ class build_ext(_build_ext):
('with-tensorflow', None,
'build pyarrow with TensorFlow support'),
('with-orc', None, 'build the ORC extension'),
+ ('with-gandiva', None, 'build the Gandiva extension'),
('generate-coverage', None,
'enable Cython code coverage'),
('bundle-boost', None,
@@ -147,6 +148,8 @@ class build_ext(_build_ext):
os.environ.get('PYARROW_WITH_TENSORFLOW', '0'))
self.with_orc = strtobool(
os.environ.get('PYARROW_WITH_ORC', '0'))
+ self.with_gandiva = strtobool(
+ os.environ.get('PYARROW_WITH_GANDIVA', '0'))
self.generate_coverage = strtobool(
os.environ.get('PYARROW_GENERATE_COVERAGE', '0'))
self.bundle_arrow_cpp = strtobool(
@@ -155,6 +158,7 @@ class build_ext(_build_ext):
os.environ.get('PYARROW_BUNDLE_BOOST', '0'))
CYTHON_MODULE_NAMES = [
+ 'gandiva',
'lib',
'_csv',
'_cuda',
@@ -214,6 +218,9 @@ class build_ext(_build_ext):
if self.with_orc:
cmake_options.append('-DPYARROW_BUILD_ORC=on')
+ if self.with_gandiva:
+ cmake_options.append('-DPYARROW_BUILD_GANDIVA=on')
+
if len(self.cmake_cxxflags) > 0:
cmake_options.append('-DPYARROW_CXXFLAGS={0}'
.format(self.cmake_cxxflags))
@@ -373,6 +380,8 @@ class build_ext(_build_ext):
return True
if name == '_cuda' and not self.with_cuda:
return True
+ if name == 'gandiva' and not self.with_gandiva:
+ return True
return False
def _get_build_dir(self):