You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/12 22:14:49 UTC
[arrow] branch master updated: ARROW-4324: [Python] Triage broken type inference logic in presence of a mix of NumPy dtype-having objects and other scalar values

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 25b4a468 ARROW-4324: [Python] Triage broken type inference logic in presence of a mix of NumPy dtype-having objects and other scalar values
25b4a468 is described below

commit 25b4a46805a3be01c83e53a92524d4d7b021c74d
Author: Wes McKinney <we...@apache.org>
AuthorDate: Wed Jun 12 17:14:40 2019 -0500

    ARROW-4324: [Python] Triage broken type inference logic in presence of a mix of NumPy dtype-having objects and other scalar values
    
    While investigating the innocuous bug report from ARROW-4324, I stumbled on a pile of hacks and flawed design around type inference:
    
    ```
    test_list = [np.dtype('int32').type(10), np.dtype('float32').type(0.5)]
    test_array = pa.array(test_list)
    
    # Expected
    # test_array
    # <pyarrow.lib.DoubleArray object at 0x7f009963bf48>
    # [
    #   10,
    #   0.5
    # ]
    
    # Got
    # test_array
    # <pyarrow.lib.Int32Array object at 0x7f009963bf48>
    # [
    #   10,
    #   0
    # ]
    ```
    
    It turns out there are several issues:
    
    * There was a kludge around handling the `numpy.nan` value which is a PyFloat, not a NumPy float64 scalar
    * Type inference assumed "NaN is null", which should not be hard coded, so I added a flag to switch between pandas semantics and non-pandas
    * Mixing NumPy scalar values and non-NumPy scalars (like our evil friend numpy.nan) caused the output type to be simply incorrect. For example `[np.float16(1.5), 2.5]` would yield `pa.float16()` output type. Yuck
    
    I inserted some hacks to force what I believe to be the correct behavior and fixed a couple of unit tests that actually exhibited buggy behavior before (see within). I don't have time to do the "right thing" right now, which is to more or less rewrite the hot path of `arrow/python/inference.cc`, so at least this gets the unit tests asserting what is correct so that refactoring will be more productive later.
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #4527 from wesm/ARROW-4324 and squashes the following commits:
    
    e396958b0 <Wes McKinney> Add unit test for passing pandas Series with from_pandas=False
    754468a5d <Wes McKinney> Set from_pandas to None by default in pyarrow.array so that user wishes can be respected
    e1b839339 <Wes McKinney> Remove outdated unit test, add Python unit test that shows behavior from ARROW-2240 that's been changed
    4bc8c8193 <Wes McKinney> Triage type inference logic in presence of a mix of NumPy dtype-having objects and other typed values, pending more serious refactor in ARROW-5564
---
 cpp/src/arrow/python/arrow_to_pandas.cc      |  14 +--
 cpp/src/arrow/python/inference.cc            | 134 +++++++++++++++++++--------
 cpp/src/arrow/python/inference.h             |   6 +-
 cpp/src/arrow/python/numpy-internal.h        |   9 ++
 cpp/src/arrow/python/python-test.cc          |  14 ---
 cpp/src/arrow/python/python_to_arrow.cc      |   2 +-
 python/pyarrow/array.pxi                     |  41 +++++---
 python/pyarrow/tests/test_convert_builtin.py |  42 +++++++--
 python/pyarrow/tests/test_pandas.py          |   7 ++
 9 files changed, 182 insertions(+), 87 deletions(-)

diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index d556664..fa35a6e 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -173,19 +173,11 @@ inline void set_numpy_metadata(int type, DataType* datatype, PyArray_Descr* out)
   }
 }
 
-static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
-  if (type == NPY_DATETIME) {
-    // It is not safe to mutate the result of DescrFromType
-    return PyArray_DescrNewFromType(type);
-  } else {
-    return PyArray_DescrFromType(type);
-  }
-}
 static inline PyObject* NewArray1DFromType(DataType* arrow_type, int type, int64_t length,
                                            void* data) {
   npy_intp dims[1] = {length};
 
-  PyArray_Descr* descr = GetSafeNumPyDtype(type);
+  PyArray_Descr* descr = internal::GetSafeNumPyDtype(type);
   if (descr == nullptr) {
     // Error occurred, trust error state is set
     return nullptr;
@@ -244,7 +236,7 @@ class PandasBlock {
   Status AllocateNDArray(int npy_type, int ndim = 2) {
     PyAcquireGIL lock;
 
-    PyArray_Descr* descr = GetSafeNumPyDtype(npy_type);
+    PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type);
 
     PyObject* block_arr;
     if (ndim == 2) {
@@ -1220,7 +1212,7 @@ class CategoricalBlock : public PandasBlock {
 
     PyAcquireGIL lock;
 
-    PyArray_Descr* descr = GetSafeNumPyDtype(npy_type);
+    PyArray_Descr* descr = internal::GetSafeNumPyDtype(npy_type);
     if (descr == nullptr) {
       // Error occurred, trust error state is set
       return Status::OK();
diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc
index 6cf8bed..4ec4d9d 100644
--- a/cpp/src/arrow/python/inference.cc
+++ b/cpp/src/arrow/python/inference.cc
@@ -42,19 +42,27 @@ namespace py {
 
 #define _NUMPY_UNIFY_NOOP(DTYPE) \
   case NPY_##DTYPE:              \
-    return NOOP;
+    return OK;
 
 #define _NUMPY_UNIFY_PROMOTE(DTYPE) \
   case NPY_##DTYPE:                 \
-    return PROMOTE;
-
-// Form a consensus NumPy dtype to use for Arrow conversion for a collection of dtype
-// objects observed one at a time
+    current_type_num_ = dtype;      \
+    current_dtype_ = descr;         \
+    return OK;
+
+#define _NUMPY_UNIFY_PROMOTE_TO(DTYPE, NEW_TYPE)               \
+  case NPY_##DTYPE:                                            \
+    current_type_num_ = NPY_##NEW_TYPE;                        \
+    current_dtype_ = PyArray_DescrFromType(current_type_num_); \
+    return OK;
+
+// Form a consensus NumPy dtype to use for Arrow conversion for a
+// collection of dtype objects observed one at a time
 class NumPyDtypeUnifier {
  public:
-  enum Action { NOOP, PROMOTE, INVALID };
+  enum Action { OK, INVALID };
 
-  NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(NULLPTR) {}
+  NumPyDtypeUnifier() : current_type_num_(-1), current_dtype_(nullptr) {}
 
   Status InvalidMix(int new_dtype) {
     return Status::Invalid("Cannot mix NumPy dtypes ",
@@ -97,7 +105,7 @@ class NumPyDtypeUnifier {
       _NUMPY_UNIFY_PROMOTE(INT64);
       _NUMPY_UNIFY_NOOP(UINT8);
       _NUMPY_UNIFY_NOOP(UINT16);
-      _NUMPY_UNIFY_PROMOTE(FLOAT32);
+      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
       _NUMPY_UNIFY_PROMOTE(FLOAT64);
       default:
         return INVALID;
@@ -113,7 +121,7 @@ class NumPyDtypeUnifier {
       _NUMPY_UNIFY_NOOP(UINT8);
       _NUMPY_UNIFY_NOOP(UINT16);
       _NUMPY_UNIFY_NOOP(UINT32);
-      _NUMPY_UNIFY_PROMOTE(FLOAT32);
+      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
       _NUMPY_UNIFY_PROMOTE(FLOAT64);
       default:
         return INVALID;
@@ -149,7 +157,7 @@ class NumPyDtypeUnifier {
       _NUMPY_UNIFY_NOOP(UINT8);
       _NUMPY_UNIFY_NOOP(UINT16);
       _NUMPY_UNIFY_PROMOTE(UINT64);
-      _NUMPY_UNIFY_PROMOTE(FLOAT32);
+      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
       _NUMPY_UNIFY_PROMOTE(FLOAT64);
       default:
         return INVALID;
@@ -161,7 +169,7 @@ class NumPyDtypeUnifier {
       _NUMPY_UNIFY_NOOP(UINT8);
       _NUMPY_UNIFY_NOOP(UINT16);
       _NUMPY_UNIFY_NOOP(UINT32);
-      _NUMPY_UNIFY_PROMOTE(FLOAT32);
+      _NUMPY_UNIFY_PROMOTE_TO(FLOAT32, FLOAT64);
       _NUMPY_UNIFY_PROMOTE(FLOAT64);
       default:
         return INVALID;
@@ -210,12 +218,11 @@ class NumPyDtypeUnifier {
 
   int Observe_DATETIME(PyArray_Descr* dtype_obj) {
     // TODO: check that units are all the same
-    // current_dtype_ = dtype_obj->type_num;
-    return NOOP;
+    return OK;
   }
 
   Status Observe(PyArray_Descr* descr) {
-    const int dtype = fix_numpy_type_num(descr->type_num);
+    int dtype = fix_numpy_type_num(descr->type_num);
 
     if (current_type_num_ == -1) {
       current_dtype_ = descr;
@@ -230,7 +237,7 @@ class NumPyDtypeUnifier {
     action = Observe_##DTYPE(descr, dtype); \
     break;
 
-    int action = NOOP;
+    int action = OK;
     switch (current_type_num_) {
       OBSERVE_CASE(BOOL);
       OBSERVE_CASE(INT8);
@@ -253,9 +260,6 @@ class NumPyDtypeUnifier {
 
     if (action == INVALID) {
       return InvalidMix(dtype);
-    } else if (action == PROMOTE) {
-      current_type_num_ = dtype;
-      current_dtype_ = descr;
     }
     return Status::OK();
   }
@@ -264,6 +268,8 @@ class NumPyDtypeUnifier {
 
   PyArray_Descr* current_dtype() const { return current_dtype_; }
 
+  int current_type_num() const { return current_type_num_; }
+
  private:
   int current_type_num_;
   PyArray_Descr* current_dtype_;
@@ -278,8 +284,10 @@ class TypeInferrer {
   // early with long sequences that may have problems up front
   // \param make_unions permit mixed-type data by creating union types (not yet
   // implemented)
-  explicit TypeInferrer(int64_t validate_interval = 100, bool make_unions = false)
-      : validate_interval_(validate_interval),
+  explicit TypeInferrer(bool pandas_null_sentinels = false,
+                        int64_t validate_interval = 100, bool make_unions = false)
+      : pandas_null_sentinels_(pandas_null_sentinels),
+        validate_interval_(validate_interval),
         make_unions_(make_unions),
         total_count_(0),
         none_count_(0),
@@ -297,6 +305,7 @@ class TypeInferrer {
         decimal_count_(0),
         list_count_(0),
         struct_count_(0),
+        numpy_dtype_count_(0),
         max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
                               std::numeric_limits<int32_t>::min()),
         decimal_type_() {
@@ -311,12 +320,12 @@ class TypeInferrer {
   Status Visit(PyObject* obj, bool* keep_going) {
     ++total_count_;
 
-    if (obj == Py_None || internal::PyFloat_IsNaN(obj)) {
+    if (obj == Py_None || (pandas_null_sentinels_ && internal::PyFloat_IsNaN(obj))) {
       ++none_count_;
     } else if (PyBool_Check(obj)) {
       ++bool_count_;
       *keep_going = make_unions_;
-    } else if (internal::PyFloatScalar_Check(obj)) {
+    } else if (PyFloat_Check(obj)) {
       ++float_count_;
       *keep_going = make_unions_;
     } else if (internal::IsPyInteger(obj)) {
@@ -367,7 +376,7 @@ class TypeInferrer {
     });
   }
 
-  Status GetType(std::shared_ptr<DataType>* out) const {
+  Status GetType(std::shared_ptr<DataType>* out) {
     // TODO(wesm): handling forming unions
     if (make_unions_) {
       return Status::NotImplemented("Creating union types not yet supported");
@@ -375,11 +384,48 @@ class TypeInferrer {
 
     RETURN_NOT_OK(Validate());
 
-    if (numpy_unifier_.current_dtype() != nullptr) {
-      std::shared_ptr<DataType> type;
-      RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), &type));
-      *out = type;
-    } else if (list_count_) {
+    if (numpy_dtype_count_ > 0) {
+      // All NumPy scalars and Nones/nulls
+      if (numpy_dtype_count_ + none_count_ == total_count_) {
+        std::shared_ptr<DataType> type;
+        RETURN_NOT_OK(NumPyDtypeToArrow(numpy_unifier_.current_dtype(), &type));
+        *out = type;
+        return Status::OK();
+      }
+
+      // The "bad path": data contains a mix of NumPy scalars and
+      // other kinds of scalars. Note this can happen innocuously
+      // because numpy.nan is not a NumPy scalar (it's a built-in
+      // PyFloat)
+
+      // TODO(ARROW-5564): Merge together type unification so this
+      // hack is not necessary
+      switch (numpy_unifier_.current_type_num()) {
+        case NPY_BOOL:
+          bool_count_ += numpy_dtype_count_;
+          break;
+        case NPY_INT8:
+        case NPY_INT16:
+        case NPY_INT32:
+        case NPY_INT64:
+        case NPY_UINT8:
+        case NPY_UINT16:
+        case NPY_UINT32:
+        case NPY_UINT64:
+          int_count_ += numpy_dtype_count_;
+          break;
+        case NPY_FLOAT32:
+        case NPY_FLOAT64:
+          float_count_ += numpy_dtype_count_;
+          break;
+        case NPY_DATETIME:
+          return Status::Invalid(
+              "numpy.datetime64 scalars cannot be mixed "
+              "with other Python scalar values currently");
+      }
+    }
+
+    if (list_count_) {
       std::shared_ptr<DataType> value_type;
       RETURN_NOT_OK(list_inferrer_->GetType(&value_type));
       *out = list(value_type);
@@ -439,13 +485,15 @@ class TypeInferrer {
   Status VisitDType(PyArray_Descr* dtype, bool* keep_going) {
     // Continue visiting dtypes for now.
     // TODO(wesm): devise approach for unions
+    ++numpy_dtype_count_;
     *keep_going = true;
     return numpy_unifier_.Observe(dtype);
   }
 
   Status VisitList(PyObject* obj, bool* keep_going /* unused */) {
     if (!list_inferrer_) {
-      list_inferrer_.reset(new TypeInferrer(validate_interval_, make_unions_));
+      list_inferrer_.reset(
+          new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
     }
     ++list_count_;
     return list_inferrer_->VisitSequence(obj);
@@ -458,9 +506,15 @@ class TypeInferrer {
     }
     // Not an object array: infer child Arrow type from dtype
     if (!list_inferrer_) {
-      list_inferrer_.reset(new TypeInferrer(validate_interval_, make_unions_));
+      list_inferrer_.reset(
+          new TypeInferrer(pandas_null_sentinels_, validate_interval_, make_unions_));
     }
     ++list_count_;
+
+    // XXX(wesm): In ARROW-4324 I added accounting to check whether
+    // all of the non-null values have NumPy dtypes, but the
+    // total_count was not being properly incremented here
+    ++(*list_inferrer_).total_count_;
     return list_inferrer_->VisitDType(dtype, keep_going);
   }
 
@@ -484,7 +538,8 @@ class TypeInferrer {
       if (it == struct_inferrers_.end()) {
         it = struct_inferrers_
                  .insert(
-                     std::make_pair(key, TypeInferrer(validate_interval_, make_unions_)))
+                     std::make_pair(key, TypeInferrer(pandas_null_sentinels_,
+                                                      validate_interval_, make_unions_)))
                  .first;
       }
       TypeInferrer* visitor = &it->second;
@@ -503,9 +558,9 @@ class TypeInferrer {
     return Status::OK();
   }
 
-  Status GetStructType(std::shared_ptr<DataType>* out) const {
+  Status GetStructType(std::shared_ptr<DataType>* out) {
     std::vector<std::shared_ptr<Field>> fields;
-    for (const auto& it : struct_inferrers_) {
+    for (auto&& it : struct_inferrers_) {
       std::shared_ptr<DataType> field_type;
       RETURN_NOT_OK(it.second.GetType(&field_type));
       fields.emplace_back(field(it.first, field_type));
@@ -515,6 +570,7 @@ class TypeInferrer {
   }
 
  private:
+  bool pandas_null_sentinels_;
   int64_t validate_interval_;
   bool make_unions_;
   int64_t total_count_;
@@ -532,8 +588,9 @@ class TypeInferrer {
   int64_t unicode_count_;
   int64_t decimal_count_;
   int64_t list_count_;
-  std::unique_ptr<TypeInferrer> list_inferrer_;
   int64_t struct_count_;
+  int64_t numpy_dtype_count_;
+  std::unique_ptr<TypeInferrer> list_inferrer_;
   std::map<std::string, TypeInferrer> struct_inferrers_;
 
   // If we observe a strongly-typed value in e.g. a NumPy array, we can store
@@ -548,9 +605,10 @@ class TypeInferrer {
 };
 
 // Non-exhaustive type inference
-Status InferArrowType(PyObject* obj, std::shared_ptr<DataType>* out_type) {
+Status InferArrowType(PyObject* obj, bool pandas_null_sentinels,
+                      std::shared_ptr<DataType>* out_type) {
   PyDateTime_IMPORT;
-  TypeInferrer inferrer;
+  TypeInferrer inferrer(pandas_null_sentinels);
   RETURN_NOT_OK(inferrer.VisitSequence(obj));
   RETURN_NOT_OK(inferrer.GetType(out_type));
   if (*out_type == nullptr) {
@@ -560,7 +618,7 @@ Status InferArrowType(PyObject* obj, std::shared_ptr<DataType>* out_type) {
   return Status::OK();
 }
 
-Status InferArrowTypeAndSize(PyObject* obj, int64_t* size,
+Status InferArrowTypeAndSize(PyObject* obj, bool pandas_null_sentinels, int64_t* size,
                              std::shared_ptr<DataType>* out_type) {
   if (!PySequence_Check(obj)) {
     return Status::TypeError("Object is not a sequence");
@@ -572,7 +630,7 @@ Status InferArrowTypeAndSize(PyObject* obj, int64_t* size,
     *out_type = null();
     return Status::OK();
   }
-  RETURN_NOT_OK(InferArrowType(obj, out_type));
+  RETURN_NOT_OK(InferArrowType(obj, pandas_null_sentinels, out_type));
 
   return Status::OK();
 }
diff --git a/cpp/src/arrow/python/inference.h b/cpp/src/arrow/python/inference.h
index 8790250..746e922 100644
--- a/cpp/src/arrow/python/inference.h
+++ b/cpp/src/arrow/python/inference.h
@@ -40,10 +40,12 @@ namespace py {
 
 // These three functions take a sequence input, not arbitrary iterables
 ARROW_PYTHON_EXPORT
-arrow::Status InferArrowType(PyObject* obj, std::shared_ptr<arrow::DataType>* out_type);
+arrow::Status InferArrowType(PyObject* obj, bool pandas_null_sentinels,
+                             std::shared_ptr<arrow::DataType>* out_type);
 
 ARROW_PYTHON_EXPORT
-arrow::Status InferArrowTypeAndSize(PyObject* obj, int64_t* size,
+arrow::Status InferArrowTypeAndSize(PyObject* obj, bool pandas_null_sentinels,
+                                    int64_t* size,
                                     std::shared_ptr<arrow::DataType>* out_type);
 
 /// Checks whether the passed Python object is a boolean scalar
diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h
index 19bcde0..da826f5 100644
--- a/cpp/src/arrow/python/numpy-internal.h
+++ b/cpp/src/arrow/python/numpy-internal.h
@@ -171,6 +171,15 @@ inline bool PyBoolScalar_Check(PyObject* obj) {
   return PyBool_Check(obj) || PyArray_IsScalar(obj, Bool);
 }
 
+static inline PyArray_Descr* GetSafeNumPyDtype(int type) {
+  if (type == NPY_DATETIME) {
+    // It is not safe to mutate the result of DescrFromType
+    return PyArray_DescrNewFromType(type);
+  } else {
+    return PyArray_DescrFromType(type);
+  }
+}
+
 }  // namespace internal
 
 }  // namespace py
diff --git a/cpp/src/arrow/python/python-test.cc b/cpp/src/arrow/python/python-test.cc
index ad1ae01..03e4970 100644
--- a/cpp/src/arrow/python/python-test.cc
+++ b/cpp/src/arrow/python/python-test.cc
@@ -442,19 +442,5 @@ TEST_F(DecimalTest, UpdateWithNaN) {
   ASSERT_EQ(std::numeric_limits<int32_t>::min(), metadata.scale());
 }
 
-TEST(PythonTest, ConstructStringArrayWithLeadingZeros) {
-  PyAcquireGIL lock;
-
-  OwnedRef list_ref(PyList_New(2));
-  PyObject* list = list_ref.obj();
-  std::string str("str");
-
-  ASSERT_EQ(0, PyList_SetItem(list, 0, PyFloat_FromDouble(NAN)));
-  ASSERT_EQ(0, PyList_SetItem(list, 1, PyUnicode_FromString(str.c_str())));
-
-  std::shared_ptr<ChunkedArray> out;
-  ASSERT_OK(ConvertPySequence(list, {}, &out));
-}
-
 }  // namespace py
 }  // namespace arrow
diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc
index 2d07f0e..2d1d5d2 100644
--- a/cpp/src/arrow/python/python_to_arrow.cc
+++ b/cpp/src/arrow/python/python_to_arrow.cc
@@ -972,7 +972,7 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask,
   bool strict_conversions = false;
 
   if (options.type == nullptr) {
-    RETURN_NOT_OK(InferArrowType(seq, &real_type));
+    RETURN_NOT_OK(InferArrowType(seq, options.from_pandas, &real_type));
   } else {
     real_type = options.type;
     strict_conversions = true;
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index e504806..ae6104f 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -84,7 +84,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type,
         return pyarrow_wrap_array(chunked_out.get().chunk(0))
 
 
-def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
+def array(object obj, type=None, mask=None, size=None, from_pandas=None,
           bint safe=True, MemoryPool memory_pool=None):
     """
     Create pyarrow.Array instance from a Python object
@@ -105,11 +105,13 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
         will be treated as a "max size", but will involve an initial allocation
         of size followed by a resize to the actual size (so if you know the
         exact size specifying it correctly will give you better performance).
-    from_pandas : boolean, default False
-        Use pandas's semantics for inferring nulls from values in ndarray-like
-        data. If passed, the mask tasks precendence, but if a value is unmasked
-        (not-null), but still null according to pandas semantics, then it is
-        null
+    from_pandas : boolean, default None
+        Use pandas's semantics for inferring nulls from values in
+        ndarray-like data. If passed, the mask takes precedence, but
+        if a value is unmasked (not-null), but still null according to
+        pandas semantics, then it is null. Defaults to False if not
+        passed explicitly by user, or True if a pandas object is
+        passed in
     safe : boolean, default True
         Check for overflows or other unsafe conversions
     memory_pool : pyarrow.MemoryPool, optional
@@ -147,14 +149,26 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
     array : pyarrow.Array or pyarrow.ChunkedArray (if object data
     overflowed binary storage)
     """
+    cdef:
+        CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+        bint is_pandas_object = False
+        bint c_from_pandas
+
     type = ensure_type(type, allow_none=True)
-    cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool)
+
+    if from_pandas is None:
+        c_from_pandas = False
+    else:
+        c_from_pandas = from_pandas
 
     if _is_array_like(obj):
         if mask is not None:
-            mask = get_series_values(mask)
+            # out argument unused
+            mask = get_series_values(mask, &is_pandas_object)
 
-        values = get_series_values(obj)
+        values = get_series_values(obj, &is_pandas_object)
+        if is_pandas_object and from_pandas is None:
+            c_from_pandas = True
 
         if pandas_api.is_categorical(values):
             return DictionaryArray.from_arrays(
@@ -166,11 +180,11 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
             if pandas_api.have_pandas:
                 values, type = pandas_api.compat.get_datetimetz_type(
                     values, obj.dtype, type)
-            return _ndarray_to_array(values, mask, type, from_pandas, safe,
+            return _ndarray_to_array(values, mask, type, c_from_pandas, safe,
                                      pool)
     else:
         # ConvertPySequence does strict conversion if type is explicitly passed
-        return _sequence_to_array(obj, mask, size, type, pool, from_pandas)
+        return _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
 
 
 def asarray(values, type=None):
@@ -1463,13 +1477,16 @@ cdef dict _array_classes = {
 }
 
 
-cdef object get_series_values(object obj):
+cdef object get_series_values(object obj, bint* is_series):
     if pandas_api.is_series(obj):
         result = obj.values
+        is_series[0] = True
     elif isinstance(obj, np.ndarray):
         result = obj
+        is_series[0] = False
     else:
         result = pandas_api.make_series(obj).values
+        is_series[0] = False
 
     return result
 
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 17ae1e3..5eebfbd 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -290,12 +290,8 @@ def test_sequence_numpy_integer(seq, np_scalar_pa_type):
 def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
     np_scalar, pa_type = np_scalar_pa_type
     expected = [np_scalar(1), None, np_scalar(3), None]
-    if np_scalar != np.uint64:
-        expected += [np_scalar(np.iinfo(np_scalar).min),
-                     np_scalar(np.iinfo(np_scalar).max)]
-    else:
-        # max(uint64) is too large for the inferred int64 type
-        expected += [0, np.iinfo(np.int64).max]
+    expected += [np_scalar(np.iinfo(np_scalar).min),
+                 np_scalar(np.iinfo(np_scalar).max)]
     arr = pa.array(seq(expected))
     assert len(arr) == 6
     assert arr.null_count == 2
@@ -303,6 +299,14 @@ def test_sequence_numpy_integer_inferred(seq, np_scalar_pa_type):
     assert arr.to_pylist() == expected
 
 
+def test_numpy_scalars_mixed_type():
+    # ARROW-4324
+    data = [np.int32(10), np.float32(0.5)]
+    arr = pa.array(data)
+    expected = pa.array([10, 0.5], type='float64')
+    assert arr.equals(expected)
+
+
 @pytest.mark.xfail(reason="Type inference for uint64 not implemented",
                    raises=pa.ArrowException)
 def test_uint64_max_convert():
@@ -435,9 +439,13 @@ def test_mixed_sequence_errors():
 
 
 @parametrize_with_iterable_types
-@pytest.mark.parametrize("np_scalar", [np.float16, np.float32, np.float64])
+@pytest.mark.parametrize("np_scalar,pa_type", [
+    (np.float16, pa.float16()),
+    (np.float32, pa.float32()),
+    (np.float64, pa.float64())
+])
 @pytest.mark.parametrize("from_pandas", [True, False])
-def test_sequence_numpy_double(seq, np_scalar, from_pandas):
+def test_sequence_numpy_double(seq, np_scalar, pa_type, from_pandas):
     data = [np_scalar(1.5), np_scalar(1), None, np_scalar(2.5), None, np.nan]
     arr = pa.array(seq(data), from_pandas=from_pandas)
     assert len(arr) == 6
@@ -445,7 +453,12 @@ def test_sequence_numpy_double(seq, np_scalar, from_pandas):
         assert arr.null_count == 3
     else:
         assert arr.null_count == 2
-    assert arr.type == pa.float64()
+    if from_pandas:
+        # The NaN is skipped in type inference, otherwise it forces a
+        # float64 promotion
+        assert arr.type == pa_type
+    else:
+        assert arr.type == pa.float64()
 
     assert arr.to_pylist()[:4] == data[:4]
     if from_pandas:
@@ -475,6 +488,17 @@ def test_ndarray_nested_numpy_double(from_pandas, inner_seq):
                                 [[1., 2.], [1., 2., 3.], [np.nan], None])
 
 
+def test_array_ignore_nan_from_pandas():
+    # See ARROW-4324, this reverts logic that was introduced in
+    # ARROW-2240
+    with pytest.raises(ValueError):
+        pa.array([np.nan, 'str'])
+
+    arr = pa.array([np.nan, 'str'], from_pandas=True)
+    expected = pa.array([None, 'str'])
+    assert arr.equals(expected)
+
+
 def test_nested_ndarray_different_dtypes():
     data = [
         np.array([1, 2, 3], dtype='int64'),
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 3b5bd57..46e4f65 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -593,6 +593,13 @@ class TestConvertPrimitiveTypes(object):
         expected = pd.Series([False, True, True, None, True])
         _check_array_roundtrip(s, expected=expected, type=pa.bool_())
 
+    def test_series_from_pandas_false_respected(self):
+        # Check that explicit from_pandas=False is respected
+        s = pd.Series([0.0, np.nan])
+        arr = pa.array(s, from_pandas=False)
+        assert arr.null_count == 0
+        assert np.isnan(arr[1].as_py())
+
     def test_integer_no_nulls(self):
         data = OrderedDict()
         fields = []