You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/02 16:41:31 UTC
[arrow] branch master updated: ARROW-3890: [Python] Handle NumPy
binary arrays with UTF-8 validation when converting to StringArray
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 67b9215 ARROW-3890: [Python] Handle NumPy binary arrays with UTF-8 validation when converting to StringArray
67b9215 is described below
commit 67b92151d3a39b1908acc949c7192aa4fe77229c
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun Dec 2 10:41:24 2018 -0600
ARROW-3890: [Python] Handle NumPy binary arrays with UTF-8 validation when converting to StringArray
I'm not sure whether all compilers will be smart enough to do loop unswitching here (i.e. hoisting the loop-invariant `is_binary_type` branch out of the per-element loop). If it ends up being a bottleneck, I suggest rewriting it in a follow-up patch.
The BinaryArray overflow issue (ChunkedArray is not being produced) is still present here. We will need to address that in ARROW-2970.
This patch also includes symbol export macros specific to the arrow_python shared library. These are needed so that global data members in arrow.dll can be accessed in arrow_python.dll.
Author: Wes McKinney <we...@apache.org>
Closes #3063 from wesm/ARROW-3890 and squashes the following commits:
dac4995f9 <Wes McKinney> Windows needs arrow.lib in addition to arrow_python.lib now because of the new export flags
91dbea8bb <Wes McKinney> Add libarrow_python-specific visibility macros so that global data members from arrow.dll can be accessed correctly in arrow_python.dll
062c3836f <Wes McKinney> Clarify comment
cfbd30bbe <Wes McKinney> Handle case where user passes UTF-8 encoded numpy.str_ type array to pyarrow.array with type=pyarrow.string()
---
cpp/src/arrow/python/CMakeLists.txt | 3 +-
cpp/src/arrow/python/arrow_to_pandas.h | 12 ++---
cpp/src/arrow/python/benchmark.h | 4 +-
cpp/src/arrow/python/common.h | 18 +++----
cpp/src/arrow/python/config.h | 6 +--
cpp/src/arrow/python/decimal.h | 16 +++----
cpp/src/arrow/python/deserialize.h | 12 ++---
cpp/src/arrow/python/helpers.h | 30 ++++++------
cpp/src/arrow/python/inference.cc | 6 +--
cpp/src/arrow/python/inference.h | 12 ++---
cpp/src/arrow/python/init.h | 4 +-
cpp/src/arrow/python/io.h | 8 ++--
cpp/src/arrow/python/numpy_convert.h | 20 ++++----
cpp/src/arrow/python/numpy_to_arrow.cc | 28 +++++++++--
cpp/src/arrow/python/numpy_to_arrow.h | 6 +--
cpp/src/arrow/python/pyarrow.h | 62 +++++++++++++------------
cpp/src/arrow/python/python_to_arrow.h | 6 +--
cpp/src/arrow/python/serialize.h | 10 ++--
cpp/src/arrow/python/{config.h => visibility.h} | 39 +++++++---------
python/pyarrow/__init__.py | 2 +-
python/pyarrow/tests/test_array.py | 27 +++++++++++
21 files changed, 189 insertions(+), 142 deletions(-)
diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt
index ff63eb0..7f4603a 100644
--- a/cpp/src/arrow/python/CMakeLists.txt
+++ b/cpp/src/arrow/python/CMakeLists.txt
@@ -76,7 +76,7 @@ ADD_ARROW_LIB(arrow_python
foreach(LIB_TARGET ${ARROW_PYTHON_LIBRARIES})
target_compile_definitions(${LIB_TARGET}
- PRIVATE ARROW_EXPORTING)
+ PRIVATE ARROW_PYTHON_EXPORTING)
endforeach()
if (ARROW_BUILD_STATIC AND MSVC)
@@ -112,6 +112,7 @@ install(FILES
pyarrow.h
serialize.h
type_traits.h
+ visibility.h
DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python")
# pkg-config support
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 138b010..753bf48 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -27,7 +27,7 @@
#include <string>
#include <unordered_set>
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
namespace arrow {
@@ -57,16 +57,16 @@ struct PandasOptions {
use_threads(false) {}
};
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr<Array>& arr,
PyObject* py_ref, PyObject** out);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ConvertChunkedArrayToPandas(PandasOptions options,
const std::shared_ptr<ChunkedArray>& col,
PyObject* py_ref, PyObject** out);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column>& col,
PyObject* py_ref, PyObject** out);
@@ -76,7 +76,7 @@ Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column
// BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x.
//
// tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>& table,
MemoryPool* pool, PyObject** out);
@@ -84,7 +84,7 @@ Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>&
///
/// Explicitly name columns that should be a categorical
/// This option is only used on conversions that are applied to a table.
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ConvertTableToPandas(PandasOptions options,
const std::unordered_set<std::string>& categorical_columns,
const std::shared_ptr<Table>& table, MemoryPool* pool,
diff --git a/cpp/src/arrow/python/benchmark.h b/cpp/src/arrow/python/benchmark.h
index f88b6b4..caaff32 100644
--- a/cpp/src/arrow/python/benchmark.h
+++ b/cpp/src/arrow/python/benchmark.h
@@ -20,7 +20,7 @@
#include "arrow/python/platform.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
namespace arrow {
namespace py {
@@ -29,7 +29,7 @@ namespace benchmark {
// Micro-benchmark routines for use from ASV
// Run PandasObjectIsNull() once over every object in *list*
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
void Benchmark_PandasObjectIsNull(PyObject* list);
} // namespace benchmark
diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h
index 5779ef0..6587bd3 100644
--- a/cpp/src/arrow/python/common.h
+++ b/cpp/src/arrow/python/common.h
@@ -26,8 +26,8 @@
#include "arrow/python/config.h"
#include "arrow/buffer.h"
+#include "arrow/python/visibility.h"
#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
namespace arrow {
@@ -35,7 +35,7 @@ class MemoryPool;
namespace py {
-ARROW_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError);
+ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError);
// Catch a pending Python exception and return the corresponding Status.
// If no exception is pending, Status::OK() is returned.
@@ -47,14 +47,14 @@ inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) {
}
}
-ARROW_EXPORT Status PassPyError();
+ARROW_PYTHON_EXPORT Status PassPyError();
// TODO(wesm): We can just let errors pass through. To be explored later
#define RETURN_IF_PYERROR() ARROW_RETURN_NOT_OK(CheckPyError());
#define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE));
-class ARROW_EXPORT PyAcquireGIL {
+class ARROW_PYTHON_EXPORT PyAcquireGIL {
public:
PyAcquireGIL() : acquired_gil_(false) { acquire(); }
@@ -85,7 +85,7 @@ class ARROW_EXPORT PyAcquireGIL {
// A RAII primitive that DECREFs the underlying PyObject* when it
// goes out of scope.
-class ARROW_EXPORT OwnedRef {
+class ARROW_PYTHON_EXPORT OwnedRef {
public:
OwnedRef() : obj_(NULLPTR) {}
OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {}
@@ -126,7 +126,7 @@ class ARROW_EXPORT OwnedRef {
// Same as OwnedRef, but ensures the GIL is taken when it goes out of scope.
// This is for situations where the GIL is not always known to be held
// (e.g. if it is released in the middle of a function for performance reasons)
-class ARROW_EXPORT OwnedRefNoGIL : public OwnedRef {
+class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef {
public:
OwnedRefNoGIL() : OwnedRef() {}
OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {}
@@ -226,10 +226,10 @@ struct PyBytesView {
};
// Return the common PyArrow memory pool
-ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool);
-ARROW_EXPORT MemoryPool* get_memory_pool();
+ARROW_PYTHON_EXPORT void set_default_memory_pool(MemoryPool* pool);
+ARROW_PYTHON_EXPORT MemoryPool* get_memory_pool();
-class ARROW_EXPORT PyBuffer : public Buffer {
+class ARROW_PYTHON_EXPORT PyBuffer : public Buffer {
public:
/// While memoryview objects support multi-dimensional buffers, PyBuffer only supports
/// one-dimensional byte buffers.
diff --git a/cpp/src/arrow/python/config.h b/cpp/src/arrow/python/config.h
index c2b089d..5649ffe 100644
--- a/cpp/src/arrow/python/config.h
+++ b/cpp/src/arrow/python/config.h
@@ -21,7 +21,7 @@
#include "arrow/python/platform.h"
#include "arrow/python/numpy_interop.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
#if PY_MAJOR_VERSION >= 3
#define PyString_Check PyUnicode_Check
@@ -30,10 +30,10 @@
namespace arrow {
namespace py {
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
extern PyObject* numpy_nan;
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
void set_numpy_nan(PyObject* obj);
} // namespace py
diff --git a/cpp/src/arrow/python/decimal.h b/cpp/src/arrow/python/decimal.h
index dd382d1..8072795 100644
--- a/cpp/src/arrow/python/decimal.h
+++ b/cpp/src/arrow/python/decimal.h
@@ -20,8 +20,8 @@
#include <string>
+#include "arrow/python/visibility.h"
#include "arrow/type.h"
-#include "arrow/util/visibility.h"
namespace arrow {
@@ -38,21 +38,21 @@ class OwnedRef;
namespace internal {
// \brief Import the Python Decimal type
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ImportDecimalType(OwnedRef* decimal_type);
// \brief Convert a Python Decimal object to a C++ string
// \param[in] python_decimal A Python decimal.Decimal instance
// \param[out] The string representation of the Python Decimal instance
// \return The status of the operation
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status PythonDecimalToString(PyObject* python_decimal, std::string* out);
// \brief Convert a C++ std::string to a Python Decimal instance
// \param[in] decimal_constructor The decimal type object
// \param[in] decimal_string A decimal string
// \return An instance of decimal.Decimal
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
PyObject* DecimalFromString(PyObject* decimal_constructor,
const std::string& decimal_string);
@@ -61,21 +61,21 @@ PyObject* DecimalFromString(PyObject* decimal_constructor,
// \param[in] arrow_type An instance of arrow::DecimalType
// \param[out] out A pointer to a Decimal128
// \return The status of the operation
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
Decimal128* out);
// \brief Check whether obj is an instance of Decimal
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool PyDecimal_Check(PyObject* obj);
// \brief Check whether obj is nan. This function will abort the program if the argument
// is not a Decimal instance
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool PyDecimal_ISNAN(PyObject* obj);
// \brief Helper class to track and update the precision and scale of a decimal
-class ARROW_EXPORT DecimalMetadata {
+class ARROW_PYTHON_EXPORT DecimalMetadata {
public:
DecimalMetadata();
DecimalMetadata(int32_t precision, int32_t scale);
diff --git a/cpp/src/arrow/python/deserialize.h b/cpp/src/arrow/python/deserialize.h
index 754765a..b9c4984 100644
--- a/cpp/src/arrow/python/deserialize.h
+++ b/cpp/src/arrow/python/deserialize.h
@@ -23,8 +23,8 @@
#include <vector>
#include "arrow/python/serialize.h"
+#include "arrow/python/visibility.h"
#include "arrow/status.h"
-#include "arrow/util/visibility.h"
namespace arrow {
@@ -43,7 +43,7 @@ namespace py {
/// \param[in] src a RandomAccessFile
/// \param[out] out the reconstructed data
/// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out);
/// \brief Reconstruct SerializedPyObject from representation produced by
@@ -56,7 +56,7 @@ Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out);
/// num_tensors * 2 + num_buffers in length
/// \param[out] out the reconstructed object
/// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status GetSerializedFromComponents(int num_tensors, int num_ndarrays, int num_buffers,
PyObject* data, SerializedPyObject* out);
@@ -72,7 +72,7 @@ Status GetSerializedFromComponents(int num_tensors, int num_ndarrays, int num_bu
/// \param[out] out The returned object
/// \return Status
/// This acquires the GIL
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status DeserializeObject(PyObject* context, const SerializedPyObject& object,
PyObject* base, PyObject** out);
@@ -80,10 +80,10 @@ Status DeserializeObject(PyObject* context, const SerializedPyObject& object,
/// \param[in] object Object to deserialize
/// \param[out] out The deserialized tensor
/// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status DeserializeNdarray(const SerializedPyObject& object, std::shared_ptr<Tensor>* out);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status NdarrayFromBuffer(std::shared_ptr<Buffer> src, std::shared_ptr<Tensor>* out);
} // namespace py
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 4a7c8f1..2d44fee 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -27,9 +27,9 @@
#include <numpy/halffloat.h>
+#include "arrow/python/visibility.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
namespace arrow {
@@ -40,20 +40,20 @@ class OwnedRef;
// \brief Get an arrow DataType instance from Arrow's Type::type enum
// \param[in] type One of the values of Arrow's Type::type enum
// \return A shared pointer to DataType
-ARROW_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
+ARROW_PYTHON_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
// \brief Construct a np.float16 object from a npy_half value.
-ARROW_EXPORT PyObject* PyHalf_FromHalf(npy_half value);
+ARROW_PYTHON_EXPORT PyObject* PyHalf_FromHalf(npy_half value);
// \brief Convert a Python object to a npy_half value.
-ARROW_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out);
+ARROW_PYTHON_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out);
namespace internal {
// \brief Import a Python module
// \param[in] module_name The name of the module
// \param[out] ref The OwnedRef containing the module PyObject*
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ImportModule(const std::string& module_name, OwnedRef* ref);
// \brief Import an object from a Python module
@@ -61,7 +61,7 @@ Status ImportModule(const std::string& module_name, OwnedRef* ref);
// \param[in] name The name of the object to import
// \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c
// module
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref);
// \brief Check whether obj is an integer, independent of Python versions.
@@ -74,11 +74,11 @@ inline bool IsPyInteger(PyObject* obj) {
}
// \brief Use pandas missing value semantics to check if a value is null
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool PandasObjectIsNull(PyObject* obj);
// \brief Check whether obj is a floating-point NaN
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool PyFloat_IsNaN(PyObject* obj);
inline bool IsPyBinary(PyObject* obj) {
@@ -93,19 +93,19 @@ template <typename Int>
Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = "");
// \brief Convert a Python unicode string to a std::string
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status PyUnicode_AsStdString(PyObject* obj, std::string* out);
// \brief Convert a Python bytes object to a std::string
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
std::string PyBytes_AsStdString(PyObject* obj);
// \brief Call str() on the given object and return the result as a std::string
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status PyObject_StdStringStr(PyObject* obj, std::string* out);
// \brief Return the repr() of the given object (always succeeds)
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
std::string PyObject_StdStringRepr(PyObject* obj);
// \brief Cast the given size to int32_t, with error checking
@@ -121,12 +121,12 @@ inline Status CastSize(Py_ssize_t size, int32_t* out,
// \brief Print the Python object's __str__ form along with the passed error
// message
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status InvalidValue(PyObject* obj, const std::string& why);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status IntegerScalarToDoubleSafe(PyObject* obj, double* result);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status IntegerScalarToFloat32Safe(PyObject* obj, float* result);
} // namespace internal
diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc
index e619a64..0f1d85e 100644
--- a/cpp/src/arrow/python/inference.cc
+++ b/cpp/src/arrow/python/inference.cc
@@ -583,13 +583,13 @@ Status InferArrowTypeAndSize(PyObject* obj, int64_t* size,
return Status::OK();
}
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool IsPyBool(PyObject* obj) { return internal::PyBoolScalar_Check(obj); }
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool IsPyInt(PyObject* obj) { return internal::PyIntScalar_Check(obj); }
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool IsPyFloat(PyObject* obj) { return internal::PyFloatScalar_Check(obj); }
} // namespace py
diff --git a/cpp/src/arrow/python/inference.h b/cpp/src/arrow/python/inference.h
index 2cffa17..f2e2305 100644
--- a/cpp/src/arrow/python/inference.h
+++ b/cpp/src/arrow/python/inference.h
@@ -27,9 +27,9 @@
#include <ostream>
#include <string>
+#include "arrow/python/visibility.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
#include "arrow/python/common.h"
@@ -41,23 +41,23 @@ class Status;
namespace py {
// These three functions take a sequence input, not arbitrary iterables
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
arrow::Status InferArrowType(PyObject* obj, std::shared_ptr<arrow::DataType>* out_type);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
arrow::Status InferArrowTypeAndSize(PyObject* obj, int64_t* size,
std::shared_ptr<arrow::DataType>* out_type);
/// Checks whether the passed Python object is a boolean scalar
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool IsPyBool(PyObject* obj);
/// Checks whether the passed Python object is an integer scalar
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool IsPyInt(PyObject* obj);
/// Checks whether the passed Python object is a float scalar
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool IsPyFloat(PyObject* obj);
} // namespace py
diff --git a/cpp/src/arrow/python/init.h b/cpp/src/arrow/python/init.h
index 1daa5a3..34d19b2 100644
--- a/cpp/src/arrow/python/init.h
+++ b/cpp/src/arrow/python/init.h
@@ -19,10 +19,10 @@
#define ARROW_PYTHON_INIT_H
#include "arrow/python/platform.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
extern "C" {
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
int arrow_init_numpy();
}
diff --git a/cpp/src/arrow/python/io.h b/cpp/src/arrow/python/io.h
index 73d96f5..d3b7c99 100644
--- a/cpp/src/arrow/python/io.h
+++ b/cpp/src/arrow/python/io.h
@@ -22,7 +22,7 @@
#include "arrow/io/interfaces.h"
#include "arrow/io/memory.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
#include "arrow/python/config.h"
@@ -36,7 +36,7 @@ namespace py {
class ARROW_NO_EXPORT PythonFile;
-class ARROW_EXPORT PyReadableFile : public io::RandomAccessFile {
+class ARROW_PYTHON_EXPORT PyReadableFile : public io::RandomAccessFile {
public:
explicit PyReadableFile(PyObject* file);
~PyReadableFile() override;
@@ -64,7 +64,7 @@ class ARROW_EXPORT PyReadableFile : public io::RandomAccessFile {
std::unique_ptr<PythonFile> file_;
};
-class ARROW_EXPORT PyOutputStream : public io::OutputStream {
+class ARROW_PYTHON_EXPORT PyOutputStream : public io::OutputStream {
public:
explicit PyOutputStream(PyObject* file);
~PyOutputStream() override;
@@ -87,7 +87,7 @@ class ARROW_EXPORT PyOutputStream : public io::OutputStream {
// Keeping the reference in a Python wrapper would be incorrect as
// the Python wrapper can get destroyed even though the wrapped C++
// buffer is still alive (ARROW-2270).
-class ARROW_EXPORT PyForeignBuffer : public Buffer {
+class ARROW_PYTHON_EXPORT PyForeignBuffer : public Buffer {
public:
static Status Make(const uint8_t* data, int64_t size, PyObject* base,
std::shared_ptr<Buffer>* out);
diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h
index dfdb1ac..dce5fe5 100644
--- a/cpp/src/arrow/python/numpy_convert.h
+++ b/cpp/src/arrow/python/numpy_convert.h
@@ -27,7 +27,7 @@
#include <string>
#include "arrow/buffer.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
namespace arrow {
@@ -38,7 +38,7 @@ class Tensor;
namespace py {
-class ARROW_EXPORT NumPyBuffer : public Buffer {
+class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer {
public:
explicit NumPyBuffer(PyObject* arr);
virtual ~NumPyBuffer();
@@ -48,25 +48,25 @@ class ARROW_EXPORT NumPyBuffer : public Buffer {
};
// Handle misbehaved types like LONGLONG and ULONGLONG
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
int cast_npy_type_compat(int type_num);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
bool is_contiguous(PyObject* array);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out);
Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out);
Status GetNumPyType(const DataType& type, int* type_num);
-ARROW_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
- std::shared_ptr<Tensor>* out);
+ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+ std::shared_ptr<Tensor>* out);
-ARROW_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
- PyObject** out);
+ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
+ PyObject* base, PyObject** out);
} // namespace py
} // namespace arrow
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 37141d7..f9a5ea1 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -41,6 +41,8 @@
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
#include "arrow/util/macros.h"
+#include "arrow/util/string.h"
+#include "arrow/util/utf8.h"
#include "arrow/visitor_inline.h"
#include "arrow/compute/context.h"
@@ -634,30 +636,48 @@ Status AppendUTF32(const char* data, int itemsize, int byteorder,
} // namespace
Status NumPyConverter::Visit(const StringType& type) {
+ util::InitializeUTF8();
+
StringBuilder builder(pool_);
- auto data = reinterpret_cast<const char*>(PyArray_DATA(arr_));
+ auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
- char numpy_byteorder = PyArray_DESCR(arr_)->byteorder;
+ char numpy_byteorder = dtype_->byteorder;
// For Python C API, -1 is little-endian, 1 is big-endian
int byteorder = numpy_byteorder == '>' ? 1 : -1;
PyAcquireGIL gil_lock;
+ const bool is_binary_type = dtype_->type_num == NPY_STRING;
+
+ auto AppendNonNullValue = [&](const uint8_t* data) {
+ if (is_binary_type) {
+ if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) {
+ return builder.Append(data, itemsize_);
+ } else {
+ std::stringstream ss;
+ ss << "Encountered non-UTF8 binary value: " << HexEncode(data, itemsize_);
+ return Status::Invalid(ss.str());
+ }
+ } else {
+ return AppendUTF32(reinterpret_cast<const char*>(data), itemsize_, byteorder,
+ &builder);
+ }
+ };
if (mask_ != nullptr) {
Ndarray1DIndexer<uint8_t> mask_values(mask_);
for (int64_t i = 0; i < length_; ++i) {
if (mask_values[i]) {
RETURN_NOT_OK(builder.AppendNull());
} else {
- RETURN_NOT_OK(AppendUTF32(data, itemsize_, byteorder, &builder));
+ RETURN_NOT_OK(AppendNonNullValue(data));
}
data += stride_;
}
} else {
for (int64_t i = 0; i < length_; ++i) {
- RETURN_NOT_OK(AppendUTF32(data, itemsize_, byteorder, &builder));
+ RETURN_NOT_OK(AppendNonNullValue(data));
data += stride_;
}
}
diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h
index 5e1c088..4edc766 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.h
+++ b/cpp/src/arrow/python/numpy_to_arrow.h
@@ -25,7 +25,7 @@
#include <memory>
#include "arrow/compute/kernels/cast.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
namespace arrow {
@@ -48,7 +48,7 @@ namespace py {
/// \param[in] type a specific type to cast to, may be null
/// \param[in] cast_options casting options
/// \param[out] out a ChunkedArray, to accommodate chunked output
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
const std::shared_ptr<DataType>& type,
const compute::CastOptions& cast_options,
@@ -64,7 +64,7 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
/// whether values are null
/// \param[in] type a specific type to cast to, may be null
/// \param[out] out a ChunkedArray, to accommodate chunked output
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
const std::shared_ptr<DataType>& type,
std::shared_ptr<ChunkedArray>* out);
diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h
index e637627..a5a3910 100644
--- a/cpp/src/arrow/python/pyarrow.h
+++ b/cpp/src/arrow/python/pyarrow.h
@@ -22,7 +22,7 @@
#include <memory>
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
namespace arrow {
@@ -39,44 +39,46 @@ class Tensor;
namespace py {
-ARROW_EXPORT int import_pyarrow();
+ARROW_PYTHON_EXPORT int import_pyarrow();
-ARROW_EXPORT bool is_buffer(PyObject* buffer);
-ARROW_EXPORT Status unwrap_buffer(PyObject* buffer, std::shared_ptr<Buffer>* out);
-ARROW_EXPORT PyObject* wrap_buffer(const std::shared_ptr<Buffer>& buffer);
+ARROW_PYTHON_EXPORT bool is_buffer(PyObject* buffer);
+ARROW_PYTHON_EXPORT Status unwrap_buffer(PyObject* buffer, std::shared_ptr<Buffer>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_buffer(const std::shared_ptr<Buffer>& buffer);
-ARROW_EXPORT bool is_data_type(PyObject* data_type);
-ARROW_EXPORT Status unwrap_data_type(PyObject* data_type, std::shared_ptr<DataType>* out);
-ARROW_EXPORT PyObject* wrap_data_type(const std::shared_ptr<DataType>& type);
+ARROW_PYTHON_EXPORT bool is_data_type(PyObject* data_type);
+ARROW_PYTHON_EXPORT Status unwrap_data_type(PyObject* data_type,
+ std::shared_ptr<DataType>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_data_type(const std::shared_ptr<DataType>& type);
-ARROW_EXPORT bool is_field(PyObject* field);
-ARROW_EXPORT Status unwrap_field(PyObject* field, std::shared_ptr<Field>* out);
-ARROW_EXPORT PyObject* wrap_field(const std::shared_ptr<Field>& field);
+ARROW_PYTHON_EXPORT bool is_field(PyObject* field);
+ARROW_PYTHON_EXPORT Status unwrap_field(PyObject* field, std::shared_ptr<Field>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_field(const std::shared_ptr<Field>& field);
-ARROW_EXPORT bool is_schema(PyObject* schema);
-ARROW_EXPORT Status unwrap_schema(PyObject* schema, std::shared_ptr<Schema>* out);
-ARROW_EXPORT PyObject* wrap_schema(const std::shared_ptr<Schema>& schema);
+ARROW_PYTHON_EXPORT bool is_schema(PyObject* schema);
+ARROW_PYTHON_EXPORT Status unwrap_schema(PyObject* schema, std::shared_ptr<Schema>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_schema(const std::shared_ptr<Schema>& schema);
-ARROW_EXPORT bool is_array(PyObject* array);
-ARROW_EXPORT Status unwrap_array(PyObject* array, std::shared_ptr<Array>* out);
-ARROW_EXPORT PyObject* wrap_array(const std::shared_ptr<Array>& array);
+ARROW_PYTHON_EXPORT bool is_array(PyObject* array);
+ARROW_PYTHON_EXPORT Status unwrap_array(PyObject* array, std::shared_ptr<Array>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_array(const std::shared_ptr<Array>& array);
-ARROW_EXPORT bool is_tensor(PyObject* tensor);
-ARROW_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr<Tensor>* out);
-ARROW_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor);
+ARROW_PYTHON_EXPORT bool is_tensor(PyObject* tensor);
+ARROW_PYTHON_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr<Tensor>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor);
-ARROW_EXPORT bool is_column(PyObject* column);
-ARROW_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out);
-ARROW_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column);
+ARROW_PYTHON_EXPORT bool is_column(PyObject* column);
+ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column);
-ARROW_EXPORT bool is_table(PyObject* table);
-ARROW_EXPORT Status unwrap_table(PyObject* table, std::shared_ptr<Table>* out);
-ARROW_EXPORT PyObject* wrap_table(const std::shared_ptr<Table>& table);
+ARROW_PYTHON_EXPORT bool is_table(PyObject* table);
+ARROW_PYTHON_EXPORT Status unwrap_table(PyObject* table, std::shared_ptr<Table>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_table(const std::shared_ptr<Table>& table);
-ARROW_EXPORT bool is_record_batch(PyObject* batch);
-ARROW_EXPORT Status unwrap_record_batch(PyObject* batch,
- std::shared_ptr<RecordBatch>* out);
-ARROW_EXPORT PyObject* wrap_record_batch(const std::shared_ptr<RecordBatch>& batch);
+ARROW_PYTHON_EXPORT bool is_record_batch(PyObject* batch);
+ARROW_PYTHON_EXPORT Status unwrap_record_batch(PyObject* batch,
+ std::shared_ptr<RecordBatch>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_record_batch(
+ const std::shared_ptr<RecordBatch>& batch);
} // namespace py
} // namespace arrow
diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h
index d133089..f9d9756 100644
--- a/cpp/src/arrow/python/python_to_arrow.h
+++ b/cpp/src/arrow/python/python_to_arrow.h
@@ -26,9 +26,9 @@
#include <cstdint>
#include <memory>
+#include "arrow/python/visibility.h"
#include "arrow/type.h"
#include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
#include "arrow/python/common.h"
@@ -68,12 +68,12 @@ struct PyConversionOptions {
/// \param[in] options various conversion options
/// \param[out] out a ChunkedArray containing one or more chunks
/// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ConvertPySequence(PyObject* obj, PyObject* mask,
const PyConversionOptions& options,
std::shared_ptr<ChunkedArray>* out);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options,
std::shared_ptr<ChunkedArray>* out);
diff --git a/cpp/src/arrow/python/serialize.h b/cpp/src/arrow/python/serialize.h
index 2759d0c..9a9cc65 100644
--- a/cpp/src/arrow/python/serialize.h
+++ b/cpp/src/arrow/python/serialize.h
@@ -21,8 +21,8 @@
#include <memory>
#include <vector>
+#include "arrow/python/visibility.h"
#include "arrow/status.h"
-#include "arrow/util/visibility.h"
// Forward declaring PyObject, see
// https://mail.python.org/pipermail/python-dev/2003-August/037601.html
@@ -47,7 +47,7 @@ class OutputStream;
namespace py {
-struct ARROW_EXPORT SerializedPyObject {
+struct ARROW_PYTHON_EXPORT SerializedPyObject {
std::shared_ptr<RecordBatch> batch;
std::vector<std::shared_ptr<Tensor>> tensors;
std::vector<std::shared_ptr<Tensor>> ndarrays;
@@ -86,14 +86,14 @@ struct ARROW_EXPORT SerializedPyObject {
/// \return Status
///
/// Release GIL before calling
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out);
/// \brief Serialize an Arrow Tensor as a SerializedPyObject.
/// \param[in] tensor Tensor to be serialized
/// \param[out] out The serialized representation
/// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status SerializeTensor(std::shared_ptr<Tensor> tensor, py::SerializedPyObject* out);
/// \brief Write the Tensor metadata header to an OutputStream.
@@ -102,7 +102,7 @@ Status SerializeTensor(std::shared_ptr<Tensor> tensor, py::SerializedPyObject* o
/// \param[in] tensor_num_bytes The lengh of the Tensor data in bytes
/// \param[in] dst The OutputStream to write the Tensor header to
/// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
Status WriteNdarrayHeader(std::shared_ptr<DataType> dtype,
const std::vector<int64_t>& shape, int64_t tensor_num_bytes,
io::OutputStream* dst);
diff --git a/cpp/src/arrow/python/config.h b/cpp/src/arrow/python/visibility.h
similarity index 60%
copy from cpp/src/arrow/python/config.h
copy to cpp/src/arrow/python/visibility.h
index c2b089d..c0b343c 100644
--- a/cpp/src/arrow/python/config.h
+++ b/cpp/src/arrow/python/visibility.h
@@ -15,28 +15,25 @@
// specific language governing permissions and limitations
// under the License.
-#ifndef ARROW_PYTHON_CONFIG_H
-#define ARROW_PYTHON_CONFIG_H
+#pragma once
-#include "arrow/python/platform.h"
-
-#include "arrow/python/numpy_interop.h"
-#include "arrow/util/visibility.h"
-
-#if PY_MAJOR_VERSION >= 3
-#define PyString_Check PyUnicode_Check
+#if defined(_WIN32) || defined(__CYGWIN__) // Windows
+#if defined(_MSC_VER)
+#pragma warning(disable : 4251)
+#else
+#pragma GCC diagnostic ignored "-Wattributes"
#endif
-namespace arrow {
-namespace py {
-
-ARROW_EXPORT
-extern PyObject* numpy_nan;
-
-ARROW_EXPORT
-void set_numpy_nan(PyObject* obj);
-
-} // namespace py
-} // namespace arrow
+#ifdef ARROW_STATIC
+#define ARROW_PYTHON_EXPORT
+#elif defined(ARROW_PYTHON_EXPORTING)
+#define ARROW_PYTHON_EXPORT __declspec(dllexport)
+#else
+#define ARROW_PYTHON_EXPORT __declspec(dllimport)
+#endif
-#endif // ARROW_PYTHON_CONFIG_H
+#else // Not Windows
+#ifndef ARROW_PYTHON_EXPORT
+#define ARROW_PYTHON_EXPORT __attribute__((visibility("default")))
+#endif
+#endif // Non-Windows
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 12c2285..63ed53e 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -192,7 +192,7 @@ def get_libraries():
Return list of library names to include in the `libraries` argument for C
or Cython extensions using pyarrow
"""
- return ['arrow_python']
+ return ['arrow', 'arrow_python']
def get_library_dirs():
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 1350ad6..f9bd06e 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
@@ -1221,3 +1222,29 @@ def test_nested_dictionary_array():
dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b'])
dict_arr2 = pa.DictionaryArray.from_arrays([0, 1, 2, 1, 0], dict_arr)
assert dict_arr2.to_pylist() == ['a', 'b', 'a', 'b', 'a']
+
+
+def test_array_from_numpy_str_utf8():
+ # ARROW-3890 -- in Python 3, NPY_UNICODE arrays are produced, but in Python
+ # 2 they are NPY_STRING (binary), so we must do UTF-8 validation
+ vec = np.array(["toto", "tata"])
+ vec2 = np.array(["toto", "tata"], dtype=object)
+
+ arr = pa.array(vec, pa.string())
+ arr2 = pa.array(vec2, pa.string())
+ expected = pa.array([u"toto", u"tata"])
+ assert arr.equals(expected)
+ assert arr2.equals(expected)
+
+ # with mask, separate code path
+ mask = np.array([False, False], dtype=bool)
+ arr = pa.array(vec, pa.string(), mask=mask)
+ assert arr.equals(expected)
+
+ # UTF8 validation failures
+ vec = np.array([(u'mañana').encode('utf-16-le')])
+ with pytest.raises(ValueError):
+ pa.array(vec, pa.string())
+
+ with pytest.raises(ValueError):
+ pa.array(vec, pa.string(), mask=np.array([False]))