You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/02 16:41:31 UTC

[arrow] branch master updated: ARROW-3890: [Python] Handle NumPy binary arrays with UTF-8 validation when converting to StringArray

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 67b9215  ARROW-3890: [Python] Handle NumPy binary arrays with UTF-8 validation when converting to StringArray
67b9215 is described below

commit 67b92151d3a39b1908acc949c7192aa4fe77229c
Author: Wes McKinney <we...@apache.org>
AuthorDate: Sun Dec 2 10:41:24 2018 -0600

    ARROW-3890: [Python] Handle NumPy binary arrays with UTF-8 validation when converting to StringArray
    
    I'm not sure whether all compilers will be smart enough to do loop unswitching here. If it ends up being a bottleneck, I suggest rewriting it in a follow-up patch.
    
    The BinaryArray overflow issue (ChunkedArray is not being produced) is still present here. We will need to address that in ARROW-2970.
    
    This patch also includes symbol export macros specific to the arrow_python shared library. These are needed so that global data members in arrow.dll can be accessed in arrow_python.dll.
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #3063 from wesm/ARROW-3890 and squashes the following commits:
    
    dac4995f9 <Wes McKinney> Windows needs arrow.lib in addition to arrow_python.lib now because of the new export flags
    91dbea8bb <Wes McKinney> Add libarrow_python-specific visibility macros so that global data members from arrow.dll can be accessed correctly in arrow_python.dll
    062c3836f <Wes McKinney> Clarify comment
    cfbd30bbe <Wes McKinney> Handle case where user passes UTF-8 encoded numpy.str_ type array to pyarrow.array with type=pyarrow.string()
---
 cpp/src/arrow/python/CMakeLists.txt             |  3 +-
 cpp/src/arrow/python/arrow_to_pandas.h          | 12 ++---
 cpp/src/arrow/python/benchmark.h                |  4 +-
 cpp/src/arrow/python/common.h                   | 18 +++----
 cpp/src/arrow/python/config.h                   |  6 +--
 cpp/src/arrow/python/decimal.h                  | 16 +++----
 cpp/src/arrow/python/deserialize.h              | 12 ++---
 cpp/src/arrow/python/helpers.h                  | 30 ++++++------
 cpp/src/arrow/python/inference.cc               |  6 +--
 cpp/src/arrow/python/inference.h                | 12 ++---
 cpp/src/arrow/python/init.h                     |  4 +-
 cpp/src/arrow/python/io.h                       |  8 ++--
 cpp/src/arrow/python/numpy_convert.h            | 20 ++++----
 cpp/src/arrow/python/numpy_to_arrow.cc          | 28 +++++++++--
 cpp/src/arrow/python/numpy_to_arrow.h           |  6 +--
 cpp/src/arrow/python/pyarrow.h                  | 62 +++++++++++++------------
 cpp/src/arrow/python/python_to_arrow.h          |  6 +--
 cpp/src/arrow/python/serialize.h                | 10 ++--
 cpp/src/arrow/python/{config.h => visibility.h} | 39 +++++++---------
 python/pyarrow/__init__.py                      |  2 +-
 python/pyarrow/tests/test_array.py              | 27 +++++++++++
 21 files changed, 189 insertions(+), 142 deletions(-)

diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt
index ff63eb0..7f4603a 100644
--- a/cpp/src/arrow/python/CMakeLists.txt
+++ b/cpp/src/arrow/python/CMakeLists.txt
@@ -76,7 +76,7 @@ ADD_ARROW_LIB(arrow_python
 
 foreach(LIB_TARGET ${ARROW_PYTHON_LIBRARIES})
   target_compile_definitions(${LIB_TARGET}
-    PRIVATE ARROW_EXPORTING)
+    PRIVATE ARROW_PYTHON_EXPORTING)
 endforeach()
 
 if (ARROW_BUILD_STATIC AND MSVC)
@@ -112,6 +112,7 @@ install(FILES
   pyarrow.h
   serialize.h
   type_traits.h
+  visibility.h
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python")
 
 # pkg-config support
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 138b010..753bf48 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -27,7 +27,7 @@
 #include <string>
 #include <unordered_set>
 
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
 
 namespace arrow {
 
@@ -57,16 +57,16 @@ struct PandasOptions {
         use_threads(false) {}
 };
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr<Array>& arr,
                             PyObject* py_ref, PyObject** out);
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertChunkedArrayToPandas(PandasOptions options,
                                    const std::shared_ptr<ChunkedArray>& col,
                                    PyObject* py_ref, PyObject** out);
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column>& col,
                              PyObject* py_ref, PyObject** out);
 
@@ -76,7 +76,7 @@ Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column
 // BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x.
 //
 // tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2])
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>& table,
                             MemoryPool* pool, PyObject** out);
 
@@ -84,7 +84,7 @@ Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>&
 ///
 /// Explicitly name columns that should be a categorical
 /// This option is only used on conversions that are applied to a table.
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertTableToPandas(PandasOptions options,
                             const std::unordered_set<std::string>& categorical_columns,
                             const std::shared_ptr<Table>& table, MemoryPool* pool,
diff --git a/cpp/src/arrow/python/benchmark.h b/cpp/src/arrow/python/benchmark.h
index f88b6b4..caaff32 100644
--- a/cpp/src/arrow/python/benchmark.h
+++ b/cpp/src/arrow/python/benchmark.h
@@ -20,7 +20,7 @@
 
 #include "arrow/python/platform.h"
 
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
 
 namespace arrow {
 namespace py {
@@ -29,7 +29,7 @@ namespace benchmark {
 // Micro-benchmark routines for use from ASV
 
 // Run PandasObjectIsNull() once over every object in *list*
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 void Benchmark_PandasObjectIsNull(PyObject* list);
 
 }  // namespace benchmark
diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h
index 5779ef0..6587bd3 100644
--- a/cpp/src/arrow/python/common.h
+++ b/cpp/src/arrow/python/common.h
@@ -26,8 +26,8 @@
 #include "arrow/python/config.h"
 
 #include "arrow/buffer.h"
+#include "arrow/python/visibility.h"
 #include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
 
 namespace arrow {
 
@@ -35,7 +35,7 @@ class MemoryPool;
 
 namespace py {
 
-ARROW_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError);
+ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError);
 
 // Catch a pending Python exception and return the corresponding Status.
 // If no exception is pending, Status::OK() is returned.
@@ -47,14 +47,14 @@ inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) {
   }
 }
 
-ARROW_EXPORT Status PassPyError();
+ARROW_PYTHON_EXPORT Status PassPyError();
 
 // TODO(wesm): We can just let errors pass through. To be explored later
 #define RETURN_IF_PYERROR() ARROW_RETURN_NOT_OK(CheckPyError());
 
 #define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE));
 
-class ARROW_EXPORT PyAcquireGIL {
+class ARROW_PYTHON_EXPORT PyAcquireGIL {
  public:
   PyAcquireGIL() : acquired_gil_(false) { acquire(); }
 
@@ -85,7 +85,7 @@ class ARROW_EXPORT PyAcquireGIL {
 
 // A RAII primitive that DECREFs the underlying PyObject* when it
 // goes out of scope.
-class ARROW_EXPORT OwnedRef {
+class ARROW_PYTHON_EXPORT OwnedRef {
  public:
   OwnedRef() : obj_(NULLPTR) {}
   OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {}
@@ -126,7 +126,7 @@ class ARROW_EXPORT OwnedRef {
 // Same as OwnedRef, but ensures the GIL is taken when it goes out of scope.
 // This is for situations where the GIL is not always known to be held
 // (e.g. if it is released in the middle of a function for performance reasons)
-class ARROW_EXPORT OwnedRefNoGIL : public OwnedRef {
+class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef {
  public:
   OwnedRefNoGIL() : OwnedRef() {}
   OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {}
@@ -226,10 +226,10 @@ struct PyBytesView {
 };
 
 // Return the common PyArrow memory pool
-ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool);
-ARROW_EXPORT MemoryPool* get_memory_pool();
+ARROW_PYTHON_EXPORT void set_default_memory_pool(MemoryPool* pool);
+ARROW_PYTHON_EXPORT MemoryPool* get_memory_pool();
 
-class ARROW_EXPORT PyBuffer : public Buffer {
+class ARROW_PYTHON_EXPORT PyBuffer : public Buffer {
  public:
   /// While memoryview objects support multi-dimensional buffers, PyBuffer only supports
   /// one-dimensional byte buffers.
diff --git a/cpp/src/arrow/python/config.h b/cpp/src/arrow/python/config.h
index c2b089d..5649ffe 100644
--- a/cpp/src/arrow/python/config.h
+++ b/cpp/src/arrow/python/config.h
@@ -21,7 +21,7 @@
 #include "arrow/python/platform.h"
 
 #include "arrow/python/numpy_interop.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
 
 #if PY_MAJOR_VERSION >= 3
 #define PyString_Check PyUnicode_Check
@@ -30,10 +30,10 @@
 namespace arrow {
 namespace py {
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 extern PyObject* numpy_nan;
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 void set_numpy_nan(PyObject* obj);
 
 }  // namespace py
diff --git a/cpp/src/arrow/python/decimal.h b/cpp/src/arrow/python/decimal.h
index dd382d1..8072795 100644
--- a/cpp/src/arrow/python/decimal.h
+++ b/cpp/src/arrow/python/decimal.h
@@ -20,8 +20,8 @@
 
 #include <string>
 
+#include "arrow/python/visibility.h"
 #include "arrow/type.h"
-#include "arrow/util/visibility.h"
 
 namespace arrow {
 
@@ -38,21 +38,21 @@ class OwnedRef;
 namespace internal {
 
 // \brief Import the Python Decimal type
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ImportDecimalType(OwnedRef* decimal_type);
 
 // \brief Convert a Python Decimal object to a C++ string
 // \param[in] python_decimal A Python decimal.Decimal instance
 // \param[out] The string representation of the Python Decimal instance
 // \return The status of the operation
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status PythonDecimalToString(PyObject* python_decimal, std::string* out);
 
 // \brief Convert a C++ std::string to a Python Decimal instance
 // \param[in] decimal_constructor The decimal type object
 // \param[in] decimal_string A decimal string
 // \return An instance of decimal.Decimal
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 PyObject* DecimalFromString(PyObject* decimal_constructor,
                             const std::string& decimal_string);
 
@@ -61,21 +61,21 @@ PyObject* DecimalFromString(PyObject* decimal_constructor,
 // \param[in] arrow_type An instance of arrow::DecimalType
 // \param[out] out A pointer to a Decimal128
 // \return The status of the operation
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type,
                                 Decimal128* out);
 
 // \brief Check whether obj is an instance of Decimal
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool PyDecimal_Check(PyObject* obj);
 
 // \brief Check whether obj is nan. This function will abort the program if the argument
 // is not a Decimal instance
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool PyDecimal_ISNAN(PyObject* obj);
 
 // \brief Helper class to track and update the precision and scale of a decimal
-class ARROW_EXPORT DecimalMetadata {
+class ARROW_PYTHON_EXPORT DecimalMetadata {
  public:
   DecimalMetadata();
   DecimalMetadata(int32_t precision, int32_t scale);
diff --git a/cpp/src/arrow/python/deserialize.h b/cpp/src/arrow/python/deserialize.h
index 754765a..b9c4984 100644
--- a/cpp/src/arrow/python/deserialize.h
+++ b/cpp/src/arrow/python/deserialize.h
@@ -23,8 +23,8 @@
 #include <vector>
 
 #include "arrow/python/serialize.h"
+#include "arrow/python/visibility.h"
 #include "arrow/status.h"
-#include "arrow/util/visibility.h"
 
 namespace arrow {
 
@@ -43,7 +43,7 @@ namespace py {
 /// \param[in] src a RandomAccessFile
 /// \param[out] out the reconstructed data
 /// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out);
 
 /// \brief Reconstruct SerializedPyObject from representation produced by
@@ -56,7 +56,7 @@ Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out);
 /// num_tensors * 2 + num_buffers in length
 /// \param[out] out the reconstructed object
 /// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status GetSerializedFromComponents(int num_tensors, int num_ndarrays, int num_buffers,
                                    PyObject* data, SerializedPyObject* out);
 
@@ -72,7 +72,7 @@ Status GetSerializedFromComponents(int num_tensors, int num_ndarrays, int num_bu
 /// \param[out] out The returned object
 /// \return Status
 /// This acquires the GIL
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status DeserializeObject(PyObject* context, const SerializedPyObject& object,
                          PyObject* base, PyObject** out);
 
@@ -80,10 +80,10 @@ Status DeserializeObject(PyObject* context, const SerializedPyObject& object,
 /// \param[in] object Object to deserialize
 /// \param[out] out The deserialized tensor
 /// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status DeserializeNdarray(const SerializedPyObject& object, std::shared_ptr<Tensor>* out);
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status NdarrayFromBuffer(std::shared_ptr<Buffer> src, std::shared_ptr<Tensor>* out);
 
 }  // namespace py
diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h
index 4a7c8f1..2d44fee 100644
--- a/cpp/src/arrow/python/helpers.h
+++ b/cpp/src/arrow/python/helpers.h
@@ -27,9 +27,9 @@
 
 #include <numpy/halffloat.h>
 
+#include "arrow/python/visibility.h"
 #include "arrow/type.h"
 #include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
 
 namespace arrow {
 
@@ -40,20 +40,20 @@ class OwnedRef;
 // \brief Get an arrow DataType instance from Arrow's Type::type enum
 // \param[in] type One of the values of Arrow's Type::type enum
 // \return A shared pointer to DataType
-ARROW_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
+ARROW_PYTHON_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);
 
 // \brief Construct a np.float16 object from a npy_half value.
-ARROW_EXPORT PyObject* PyHalf_FromHalf(npy_half value);
+ARROW_PYTHON_EXPORT PyObject* PyHalf_FromHalf(npy_half value);
 
 // \brief Convert a Python object to a npy_half value.
-ARROW_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out);
+ARROW_PYTHON_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out);
 
 namespace internal {
 
 // \brief Import a Python module
 // \param[in] module_name The name of the module
 // \param[out] ref The OwnedRef containing the module PyObject*
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ImportModule(const std::string& module_name, OwnedRef* ref);
 
 // \brief Import an object from a Python module
@@ -61,7 +61,7 @@ Status ImportModule(const std::string& module_name, OwnedRef* ref);
 // \param[in] name The name of the object to import
 // \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c
 // module
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref);
 
 // \brief Check whether obj is an integer, independent of Python versions.
@@ -74,11 +74,11 @@ inline bool IsPyInteger(PyObject* obj) {
 }
 
 // \brief Use pandas missing value semantics to check if a value is null
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool PandasObjectIsNull(PyObject* obj);
 
 // \brief Check whether obj is a floating-point NaN
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool PyFloat_IsNaN(PyObject* obj);
 
 inline bool IsPyBinary(PyObject* obj) {
@@ -93,19 +93,19 @@ template <typename Int>
 Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = "");
 
 // \brief Convert a Python unicode string to a std::string
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status PyUnicode_AsStdString(PyObject* obj, std::string* out);
 
 // \brief Convert a Python bytes object to a std::string
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 std::string PyBytes_AsStdString(PyObject* obj);
 
 // \brief Call str() on the given object and return the result as a std::string
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status PyObject_StdStringStr(PyObject* obj, std::string* out);
 
 // \brief Return the repr() of the given object (always succeeds)
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 std::string PyObject_StdStringRepr(PyObject* obj);
 
 // \brief Cast the given size to int32_t, with error checking
@@ -121,12 +121,12 @@ inline Status CastSize(Py_ssize_t size, int32_t* out,
 
 // \brief Print the Python object's __str__ form along with the passed error
 // message
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status InvalidValue(PyObject* obj, const std::string& why);
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status IntegerScalarToDoubleSafe(PyObject* obj, double* result);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status IntegerScalarToFloat32Safe(PyObject* obj, float* result);
 
 }  // namespace internal
diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc
index e619a64..0f1d85e 100644
--- a/cpp/src/arrow/python/inference.cc
+++ b/cpp/src/arrow/python/inference.cc
@@ -583,13 +583,13 @@ Status InferArrowTypeAndSize(PyObject* obj, int64_t* size,
   return Status::OK();
 }
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool IsPyBool(PyObject* obj) { return internal::PyBoolScalar_Check(obj); }
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool IsPyInt(PyObject* obj) { return internal::PyIntScalar_Check(obj); }
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool IsPyFloat(PyObject* obj) { return internal::PyFloatScalar_Check(obj); }
 
 }  // namespace py
diff --git a/cpp/src/arrow/python/inference.h b/cpp/src/arrow/python/inference.h
index 2cffa17..f2e2305 100644
--- a/cpp/src/arrow/python/inference.h
+++ b/cpp/src/arrow/python/inference.h
@@ -27,9 +27,9 @@
 #include <ostream>
 #include <string>
 
+#include "arrow/python/visibility.h"
 #include "arrow/type.h"
 #include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
 
 #include "arrow/python/common.h"
 
@@ -41,23 +41,23 @@ class Status;
 namespace py {
 
 // These three functions take a sequence input, not arbitrary iterables
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 arrow::Status InferArrowType(PyObject* obj, std::shared_ptr<arrow::DataType>* out_type);
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 arrow::Status InferArrowTypeAndSize(PyObject* obj, int64_t* size,
                                     std::shared_ptr<arrow::DataType>* out_type);
 
 /// Checks whether the passed Python object is a boolean scalar
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool IsPyBool(PyObject* obj);
 
 /// Checks whether the passed Python object is an integer scalar
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool IsPyInt(PyObject* obj);
 
 /// Checks whether the passed Python object is a float scalar
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool IsPyFloat(PyObject* obj);
 
 }  // namespace py
diff --git a/cpp/src/arrow/python/init.h b/cpp/src/arrow/python/init.h
index 1daa5a3..34d19b2 100644
--- a/cpp/src/arrow/python/init.h
+++ b/cpp/src/arrow/python/init.h
@@ -19,10 +19,10 @@
 #define ARROW_PYTHON_INIT_H
 
 #include "arrow/python/platform.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
 
 extern "C" {
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 int arrow_init_numpy();
 }
 
diff --git a/cpp/src/arrow/python/io.h b/cpp/src/arrow/python/io.h
index 73d96f5..d3b7c99 100644
--- a/cpp/src/arrow/python/io.h
+++ b/cpp/src/arrow/python/io.h
@@ -22,7 +22,7 @@
 
 #include "arrow/io/interfaces.h"
 #include "arrow/io/memory.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
 
 #include "arrow/python/config.h"
 
@@ -36,7 +36,7 @@ namespace py {
 
 class ARROW_NO_EXPORT PythonFile;
 
-class ARROW_EXPORT PyReadableFile : public io::RandomAccessFile {
+class ARROW_PYTHON_EXPORT PyReadableFile : public io::RandomAccessFile {
  public:
   explicit PyReadableFile(PyObject* file);
   ~PyReadableFile() override;
@@ -64,7 +64,7 @@ class ARROW_EXPORT PyReadableFile : public io::RandomAccessFile {
   std::unique_ptr<PythonFile> file_;
 };
 
-class ARROW_EXPORT PyOutputStream : public io::OutputStream {
+class ARROW_PYTHON_EXPORT PyOutputStream : public io::OutputStream {
  public:
   explicit PyOutputStream(PyObject* file);
   ~PyOutputStream() override;
@@ -87,7 +87,7 @@ class ARROW_EXPORT PyOutputStream : public io::OutputStream {
 // Keeping the reference in a Python wrapper would be incorrect as
 // the Python wrapper can get destroyed even though the wrapped C++
 // buffer is still alive (ARROW-2270).
-class ARROW_EXPORT PyForeignBuffer : public Buffer {
+class ARROW_PYTHON_EXPORT PyForeignBuffer : public Buffer {
  public:
   static Status Make(const uint8_t* data, int64_t size, PyObject* base,
                      std::shared_ptr<Buffer>* out);
diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h
index dfdb1ac..dce5fe5 100644
--- a/cpp/src/arrow/python/numpy_convert.h
+++ b/cpp/src/arrow/python/numpy_convert.h
@@ -27,7 +27,7 @@
 #include <string>
 
 #include "arrow/buffer.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
 
 namespace arrow {
 
@@ -38,7 +38,7 @@ class Tensor;
 
 namespace py {
 
-class ARROW_EXPORT NumPyBuffer : public Buffer {
+class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer {
  public:
   explicit NumPyBuffer(PyObject* arr);
   virtual ~NumPyBuffer();
@@ -48,25 +48,25 @@ class ARROW_EXPORT NumPyBuffer : public Buffer {
 };
 
 // Handle misbehaved types like LONGLONG and ULONGLONG
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 int cast_npy_type_compat(int type_num);
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 bool is_contiguous(PyObject* array);
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr<DataType>* out);
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr<DataType>* out);
 
 Status GetTensorType(PyObject* dtype, std::shared_ptr<DataType>* out);
 Status GetNumPyType(const DataType& type, int* type_num);
 
-ARROW_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
-                                    std::shared_ptr<Tensor>* out);
+ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao,
+                                           std::shared_ptr<Tensor>* out);
 
-ARROW_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor, PyObject* base,
-                                    PyObject** out);
+ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr<Tensor>& tensor,
+                                           PyObject* base, PyObject** out);
 
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index 37141d7..f9a5ea1 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -41,6 +41,8 @@
 #include "arrow/util/checked_cast.h"
 #include "arrow/util/logging.h"
 #include "arrow/util/macros.h"
+#include "arrow/util/string.h"
+#include "arrow/util/utf8.h"
 #include "arrow/visitor_inline.h"
 
 #include "arrow/compute/context.h"
@@ -634,30 +636,48 @@ Status AppendUTF32(const char* data, int itemsize, int byteorder,
 }  // namespace
 
 Status NumPyConverter::Visit(const StringType& type) {
+  util::InitializeUTF8();
+
   StringBuilder builder(pool_);
 
-  auto data = reinterpret_cast<const char*>(PyArray_DATA(arr_));
+  auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
 
-  char numpy_byteorder = PyArray_DESCR(arr_)->byteorder;
+  char numpy_byteorder = dtype_->byteorder;
 
   // For Python C API, -1 is little-endian, 1 is big-endian
   int byteorder = numpy_byteorder == '>' ? 1 : -1;
 
   PyAcquireGIL gil_lock;
 
+  const bool is_binary_type = dtype_->type_num == NPY_STRING;
+
+  auto AppendNonNullValue = [&](const uint8_t* data) {
+    if (is_binary_type) {
+      if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) {
+        return builder.Append(data, itemsize_);
+      } else {
+        std::stringstream ss;
+        ss << "Encountered non-UTF8 binary value: " << HexEncode(data, itemsize_);
+        return Status::Invalid(ss.str());
+      }
+    } else {
+      return AppendUTF32(reinterpret_cast<const char*>(data), itemsize_, byteorder,
+                         &builder);
+    }
+  };
   if (mask_ != nullptr) {
     Ndarray1DIndexer<uint8_t> mask_values(mask_);
     for (int64_t i = 0; i < length_; ++i) {
       if (mask_values[i]) {
         RETURN_NOT_OK(builder.AppendNull());
       } else {
-        RETURN_NOT_OK(AppendUTF32(data, itemsize_, byteorder, &builder));
+        RETURN_NOT_OK(AppendNonNullValue(data));
       }
       data += stride_;
     }
   } else {
     for (int64_t i = 0; i < length_; ++i) {
-      RETURN_NOT_OK(AppendUTF32(data, itemsize_, byteorder, &builder));
+      RETURN_NOT_OK(AppendNonNullValue(data));
       data += stride_;
     }
   }
diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h
index 5e1c088..4edc766 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.h
+++ b/cpp/src/arrow/python/numpy_to_arrow.h
@@ -25,7 +25,7 @@
 #include <memory>
 
 #include "arrow/compute/kernels/cast.h"
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
 
 namespace arrow {
 
@@ -48,7 +48,7 @@ namespace py {
 /// \param[in] type a specific type to cast to, may be null
 /// \param[in] cast_options casting options
 /// \param[out] out a ChunkedArray, to accommodate chunked output
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                       const std::shared_ptr<DataType>& type,
                       const compute::CastOptions& cast_options,
@@ -64,7 +64,7 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
 /// whether values are null
 /// \param[in] type a specific type to cast to, may be null
 /// \param[out] out a ChunkedArray, to accommodate chunked output
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
                       const std::shared_ptr<DataType>& type,
                       std::shared_ptr<ChunkedArray>* out);
diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h
index e637627..a5a3910 100644
--- a/cpp/src/arrow/python/pyarrow.h
+++ b/cpp/src/arrow/python/pyarrow.h
@@ -22,7 +22,7 @@
 
 #include <memory>
 
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
 
 namespace arrow {
 
@@ -39,44 +39,46 @@ class Tensor;
 
 namespace py {
 
-ARROW_EXPORT int import_pyarrow();
+ARROW_PYTHON_EXPORT int import_pyarrow();
 
-ARROW_EXPORT bool is_buffer(PyObject* buffer);
-ARROW_EXPORT Status unwrap_buffer(PyObject* buffer, std::shared_ptr<Buffer>* out);
-ARROW_EXPORT PyObject* wrap_buffer(const std::shared_ptr<Buffer>& buffer);
+ARROW_PYTHON_EXPORT bool is_buffer(PyObject* buffer);
+ARROW_PYTHON_EXPORT Status unwrap_buffer(PyObject* buffer, std::shared_ptr<Buffer>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_buffer(const std::shared_ptr<Buffer>& buffer);
 
-ARROW_EXPORT bool is_data_type(PyObject* data_type);
-ARROW_EXPORT Status unwrap_data_type(PyObject* data_type, std::shared_ptr<DataType>* out);
-ARROW_EXPORT PyObject* wrap_data_type(const std::shared_ptr<DataType>& type);
+ARROW_PYTHON_EXPORT bool is_data_type(PyObject* data_type);
+ARROW_PYTHON_EXPORT Status unwrap_data_type(PyObject* data_type,
+                                            std::shared_ptr<DataType>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_data_type(const std::shared_ptr<DataType>& type);
 
-ARROW_EXPORT bool is_field(PyObject* field);
-ARROW_EXPORT Status unwrap_field(PyObject* field, std::shared_ptr<Field>* out);
-ARROW_EXPORT PyObject* wrap_field(const std::shared_ptr<Field>& field);
+ARROW_PYTHON_EXPORT bool is_field(PyObject* field);
+ARROW_PYTHON_EXPORT Status unwrap_field(PyObject* field, std::shared_ptr<Field>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_field(const std::shared_ptr<Field>& field);
 
-ARROW_EXPORT bool is_schema(PyObject* schema);
-ARROW_EXPORT Status unwrap_schema(PyObject* schema, std::shared_ptr<Schema>* out);
-ARROW_EXPORT PyObject* wrap_schema(const std::shared_ptr<Schema>& schema);
+ARROW_PYTHON_EXPORT bool is_schema(PyObject* schema);
+ARROW_PYTHON_EXPORT Status unwrap_schema(PyObject* schema, std::shared_ptr<Schema>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_schema(const std::shared_ptr<Schema>& schema);
 
-ARROW_EXPORT bool is_array(PyObject* array);
-ARROW_EXPORT Status unwrap_array(PyObject* array, std::shared_ptr<Array>* out);
-ARROW_EXPORT PyObject* wrap_array(const std::shared_ptr<Array>& array);
+ARROW_PYTHON_EXPORT bool is_array(PyObject* array);
+ARROW_PYTHON_EXPORT Status unwrap_array(PyObject* array, std::shared_ptr<Array>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_array(const std::shared_ptr<Array>& array);
 
-ARROW_EXPORT bool is_tensor(PyObject* tensor);
-ARROW_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr<Tensor>* out);
-ARROW_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor);
+ARROW_PYTHON_EXPORT bool is_tensor(PyObject* tensor);
+ARROW_PYTHON_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr<Tensor>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor);
 
-ARROW_EXPORT bool is_column(PyObject* column);
-ARROW_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out);
-ARROW_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column);
+ARROW_PYTHON_EXPORT bool is_column(PyObject* column);
+ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column);
 
-ARROW_EXPORT bool is_table(PyObject* table);
-ARROW_EXPORT Status unwrap_table(PyObject* table, std::shared_ptr<Table>* out);
-ARROW_EXPORT PyObject* wrap_table(const std::shared_ptr<Table>& table);
+ARROW_PYTHON_EXPORT bool is_table(PyObject* table);
+ARROW_PYTHON_EXPORT Status unwrap_table(PyObject* table, std::shared_ptr<Table>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_table(const std::shared_ptr<Table>& table);
 
-ARROW_EXPORT bool is_record_batch(PyObject* batch);
-ARROW_EXPORT Status unwrap_record_batch(PyObject* batch,
-                                        std::shared_ptr<RecordBatch>* out);
-ARROW_EXPORT PyObject* wrap_record_batch(const std::shared_ptr<RecordBatch>& batch);
+ARROW_PYTHON_EXPORT bool is_record_batch(PyObject* batch);
+ARROW_PYTHON_EXPORT Status unwrap_record_batch(PyObject* batch,
+                                               std::shared_ptr<RecordBatch>* out);
+ARROW_PYTHON_EXPORT PyObject* wrap_record_batch(
+    const std::shared_ptr<RecordBatch>& batch);
 
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h
index d133089..f9d9756 100644
--- a/cpp/src/arrow/python/python_to_arrow.h
+++ b/cpp/src/arrow/python/python_to_arrow.h
@@ -26,9 +26,9 @@
 #include <cstdint>
 #include <memory>
 
+#include "arrow/python/visibility.h"
 #include "arrow/type.h"
 #include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
 
 #include "arrow/python/common.h"
 
@@ -68,12 +68,12 @@ struct PyConversionOptions {
 /// \param[in] options various conversion options
 /// \param[out] out a ChunkedArray containing one or more chunks
 /// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertPySequence(PyObject* obj, PyObject* mask,
                          const PyConversionOptions& options,
                          std::shared_ptr<ChunkedArray>* out);
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options,
                          std::shared_ptr<ChunkedArray>* out);
 
diff --git a/cpp/src/arrow/python/serialize.h b/cpp/src/arrow/python/serialize.h
index 2759d0c..9a9cc65 100644
--- a/cpp/src/arrow/python/serialize.h
+++ b/cpp/src/arrow/python/serialize.h
@@ -21,8 +21,8 @@
 #include <memory>
 #include <vector>
 
+#include "arrow/python/visibility.h"
 #include "arrow/status.h"
-#include "arrow/util/visibility.h"
 
 // Forward declaring PyObject, see
 // https://mail.python.org/pipermail/python-dev/2003-August/037601.html
@@ -47,7 +47,7 @@ class OutputStream;
 
 namespace py {
 
-struct ARROW_EXPORT SerializedPyObject {
+struct ARROW_PYTHON_EXPORT SerializedPyObject {
   std::shared_ptr<RecordBatch> batch;
   std::vector<std::shared_ptr<Tensor>> tensors;
   std::vector<std::shared_ptr<Tensor>> ndarrays;
@@ -86,14 +86,14 @@ struct ARROW_EXPORT SerializedPyObject {
 /// \return Status
 ///
 /// Release GIL before calling
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out);
 
 /// \brief Serialize an Arrow Tensor as a SerializedPyObject.
 /// \param[in] tensor Tensor to be serialized
 /// \param[out] out The serialized representation
 /// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status SerializeTensor(std::shared_ptr<Tensor> tensor, py::SerializedPyObject* out);
 
 /// \brief Write the Tensor metadata header to an OutputStream.
@@ -102,7 +102,7 @@ Status SerializeTensor(std::shared_ptr<Tensor> tensor, py::SerializedPyObject* o
 /// \param[in] tensor_num_bytes The lengh of the Tensor data in bytes
 /// \param[in] dst The OutputStream to write the Tensor header to
 /// \return Status
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status WriteNdarrayHeader(std::shared_ptr<DataType> dtype,
                           const std::vector<int64_t>& shape, int64_t tensor_num_bytes,
                           io::OutputStream* dst);
diff --git a/cpp/src/arrow/python/config.h b/cpp/src/arrow/python/visibility.h
similarity index 60%
copy from cpp/src/arrow/python/config.h
copy to cpp/src/arrow/python/visibility.h
index c2b089d..c0b343c 100644
--- a/cpp/src/arrow/python/config.h
+++ b/cpp/src/arrow/python/visibility.h
@@ -15,28 +15,25 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef ARROW_PYTHON_CONFIG_H
-#define ARROW_PYTHON_CONFIG_H
+#pragma once
 
-#include "arrow/python/platform.h"
-
-#include "arrow/python/numpy_interop.h"
-#include "arrow/util/visibility.h"
-
-#if PY_MAJOR_VERSION >= 3
-#define PyString_Check PyUnicode_Check
+#if defined(_WIN32) || defined(__CYGWIN__)  // Windows
+#if defined(_MSC_VER)
+#pragma warning(disable : 4251)
+#else
+#pragma GCC diagnostic ignored "-Wattributes"
 #endif
 
-namespace arrow {
-namespace py {
-
-ARROW_EXPORT
-extern PyObject* numpy_nan;
-
-ARROW_EXPORT
-void set_numpy_nan(PyObject* obj);
-
-}  // namespace py
-}  // namespace arrow
+#ifdef ARROW_STATIC
+#define ARROW_PYTHON_EXPORT
+#elif defined(ARROW_PYTHON_EXPORTING)
+#define ARROW_PYTHON_EXPORT __declspec(dllexport)
+#else
+#define ARROW_PYTHON_EXPORT __declspec(dllimport)
+#endif
 
-#endif  // ARROW_PYTHON_CONFIG_H
+#else  // Not Windows
+#ifndef ARROW_PYTHON_EXPORT
+#define ARROW_PYTHON_EXPORT __attribute__((visibility("default")))
+#endif
+#endif  // Non-Windows
diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index 12c2285..63ed53e 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -192,7 +192,7 @@ def get_libraries():
     Return list of library names to include in the `libraries` argument for C
     or Cython extensions using pyarrow
     """
-    return ['arrow_python']
+    return ['arrow', 'arrow_python']
 
 
 def get_library_dirs():
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 1350ad6..f9bd06e 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -1221,3 +1222,29 @@ def test_nested_dictionary_array():
     dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b'])
     dict_arr2 = pa.DictionaryArray.from_arrays([0, 1, 2, 1, 0], dict_arr)
     assert dict_arr2.to_pylist() == ['a', 'b', 'a', 'b', 'a']
+
+
+def test_array_from_numpy_str_utf8():
+    # ARROW-3890 -- in Python 3, NPY_UNICODE arrays are produced, but in Python
+    # 2 they are NPY_STRING (binary), so we must do UTF-8 validation
+    vec = np.array(["toto", "tata"])
+    vec2 = np.array(["toto", "tata"], dtype=object)
+
+    arr = pa.array(vec, pa.string())
+    arr2 = pa.array(vec2, pa.string())
+    expected = pa.array([u"toto", u"tata"])
+    assert arr.equals(expected)
+    assert arr2.equals(expected)
+
+    # with mask, separate code path
+    mask = np.array([False, False], dtype=bool)
+    arr = pa.array(vec, pa.string(), mask=mask)
+    assert arr.equals(expected)
+
+    # UTF8 validation failures
+    vec = np.array([(u'mañana').encode('utf-16-le')])
+    with pytest.raises(ValueError):
+        pa.array(vec, pa.string())
+
+    with pytest.raises(ValueError):
+        pa.array(vec, pa.string(), mask=np.array([False]))