You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/01/10 04:07:22 UTC
[arrow] branch master updated: ARROW-3428: [Python] Fix from_pandas conversion from float to bool

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2b361fb  ARROW-3428: [Python] Fix from_pandas conversion from float to bool
2b361fb is described below

commit 2b361fb2e5b4321a6cdcbdbf457181702fd97eaa
Author: Bryan Cutler <cu...@gmail.com>
AuthorDate: Wed Jan 9 22:07:14 2019 -0600

    ARROW-3428: [Python] Fix from_pandas conversion from float to bool
    
    When `from_pandas` converts data to boolean, the values are read into a `uint8_t` and then checked. When the values are floating point numbers, not all bits are checked which can cause incorrect results.
    
    Author: Bryan Cutler <cu...@gmail.com>
    
    Closes #2698 from BryanCutler/python-from_pandas-float-to-bool-ARROW-3428 and squashes the following commits:
    
    f3d472626 <Bryan Cutler> added test with fix that passes, but fails other tests
---
 cpp/src/arrow/compute/kernels/cast-test.cc  | 19 +++++++++
 cpp/src/arrow/python/numpy_to_arrow.cc      | 66 +++++++++++++----------------
 cpp/src/arrow/python/type_traits.h          |  1 +
 python/pyarrow/tests/test_convert_pandas.py | 39 ++++++++++++++---
 4 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc
index 781e0af..c3a0df5 100644
--- a/cpp/src/arrow/compute/kernels/cast-test.cc
+++ b/cpp/src/arrow/compute/kernels/cast-test.cc
@@ -138,6 +138,25 @@ TEST_F(TestCast, SameTypeZeroCopy) {
   AssertBufferSame(*arr, *result, 1);
 }
 
+TEST_F(TestCast, FromBoolean) {
+  CastOptions options;
+
+  vector<bool> is_valid(20, true);
+  is_valid[3] = false;
+
+  vector<bool> v1(is_valid.size(), true);
+  vector<int32_t> e1(is_valid.size(), 1);
+  for (size_t i = 0; i < v1.size(); ++i) {
+    if (i % 3 == 1) {
+      v1[i] = false;
+      e1[i] = 0;
+    }
+  }
+
+  CheckCase<BooleanType, bool, Int32Type, int32_t>(boolean(), v1, is_valid, int32(), e1,
+                                                   options);
+}
+
 TEST_F(TestCast, ToBoolean) {
   CastOptions options;
   for (auto type : kNumericTypes) {
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index aa28b6e..aada6bf 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -63,6 +63,7 @@ namespace arrow {
 
 using internal::checked_cast;
 using internal::CopyBitmap;
+using internal::GenerateBitsUnrolled;
 
 namespace py {
 
@@ -246,6 +247,11 @@ class NumPyConverter {
     return Status::OK();
   }
 
+  // Called before ConvertData to ensure Numpy input buffer is in expected
+  // Arrow layout
+  template <typename ArrowType>
+  Status PrepareInputData(std::shared_ptr<Buffer>* data);
+
   // ----------------------------------------------------------------------
   // Traditional visitor conversion for non-object arrays
 
@@ -407,14 +413,32 @@ Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* po
 }  // namespace
 
 template <typename ArrowType>
-inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
+inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
   if (is_strided()) {
     RETURN_NOT_OK(CopyStridedArray<ArrowType>(arr_, length_, pool_, data));
+  } else if (dtype_->type_num == NPY_BOOL) {
+    int64_t nbytes = BitUtil::BytesForBits(length_);
+    std::shared_ptr<Buffer> buffer;
+    RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer));
+
+    Ndarray1DIndexer<uint8_t> values(arr_);
+    int64_t i = 0;
+    const auto generate = [&values, &i]() -> bool { return values[i++] > 0; };
+    GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate);
+
+    *data = buffer;
   } else {
     // Can zero-copy
     *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
   }
 
+  return Status::OK();
+}
+
+template <typename ArrowType>
+inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
+  RETURN_NOT_OK(PrepareInputData<ArrowType>(data));
+
   std::shared_ptr<DataType> input_type;
   RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
 
@@ -427,37 +451,11 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
 }
 
 template <>
-inline Status NumPyConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
-  int64_t nbytes = BitUtil::BytesForBits(length_);
-  std::shared_ptr<Buffer> buffer;
-  RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer));
-
-  Ndarray1DIndexer<uint8_t> values(arr_);
-
-  uint8_t* bitmap = buffer->mutable_data();
-
-  memset(bitmap, 0, nbytes);
-  for (int i = 0; i < length_; ++i) {
-    if (values[i] > 0) {
-      BitUtil::SetBit(bitmap, i);
-    }
-  }
-
-  *data = buffer;
-  return Status::OK();
-}
-
-template <>
 inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
-  if (is_strided()) {
-    RETURN_NOT_OK(CopyStridedArray<Date32Type>(arr_, length_, pool_, data));
-  } else {
-    // Can zero-copy
-    *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
-  }
-
   std::shared_ptr<DataType> input_type;
 
+  RETURN_NOT_OK(PrepareInputData<Date32Type>(data));
+
   auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
   if (dtype_->type_num == NPY_DATETIME) {
     // If we have inbound datetime64[D] data, this needs to be downcasted
@@ -489,17 +487,11 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
 
 template <>
 inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* data) {
-  if (is_strided()) {
-    RETURN_NOT_OK(CopyStridedArray<Date64Type>(arr_, length_, pool_, data));
-  } else {
-    // Can zero-copy
-    *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
-  }
-
   constexpr int64_t kMillisecondsInDay = 86400000;
-
   std::shared_ptr<DataType> input_type;
 
+  RETURN_NOT_OK(PrepareInputData<Date64Type>(data));
+
   auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
   if (dtype_->type_num == NPY_DATETIME) {
     // If we have inbound datetime64[D] data, this needs to be downcasted
diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h
index d90517a..bc71ec4 100644
--- a/cpp/src/arrow/python/type_traits.h
+++ b/cpp/src/arrow/python/type_traits.h
@@ -149,6 +149,7 @@ template <>
 struct arrow_traits<Type::BOOL> {
   static constexpr int npy_type = NPY_BOOL;
   static constexpr bool supports_nulls = false;
+  typedef typename npy_traits<NPY_BOOL>::value_type T;
 };
 
 #define INT_DECL(TYPE)                                     \
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 3e89f5e..cd7f499 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -113,13 +113,13 @@ def _check_array_roundtrip(values, expected=None, mask=None,
     else:
         assert arr.null_count == (mask | values_nulls).sum()
 
-    if mask is None:
-        tm.assert_series_equal(pd.Series(result), pd.Series(values),
-                               check_names=False)
-    else:
-        expected = pd.Series(np.ma.masked_array(values, mask=mask))
-        tm.assert_series_equal(pd.Series(result), expected,
-                               check_names=False)
+    if expected is None:
+        if mask is None:
+            expected = pd.Series(values)
+        else:
+            expected = pd.Series(np.ma.masked_array(values, mask=mask))
+
+    tm.assert_series_equal(pd.Series(result), expected, check_names=False)
 
 
 def _check_array_from_pandas_roundtrip(np_array, type=None):
@@ -559,6 +559,11 @@ class TestConvertPrimitiveTypes(object):
         assert table[0].to_pylist() == [1, 2, None]
         tm.assert_frame_equal(df, table.to_pandas())
 
+    def test_float_nulls_to_boolean(self):
+        s = pd.Series([0.0, 1.0, 2.0, None, -3.0])
+        expected = pd.Series([False, True, True, None, True])
+        _check_array_roundtrip(s, expected=expected, type=pa.bool_())
+
     def test_integer_no_nulls(self):
         data = OrderedDict()
         fields = []
@@ -672,6 +677,26 @@ class TestConvertPrimitiveTypes(object):
 
         tm.assert_frame_equal(result, ex_frame)
 
+    def test_boolean_to_int(self):
+        # test from dtype=bool
+        s = pd.Series([True, True, False, True, True] * 2)
+        expected = pd.Series([1, 1, 0, 1, 1] * 2)
+        _check_array_roundtrip(s, expected=expected, type=pa.int64())
+
+    def test_boolean_objects_to_int(self):
+        # test from dtype=object
+        s = pd.Series([True, True, False, True, True] * 2, dtype=object)
+        expected = pd.Series([1, 1, 0, 1, 1] * 2)
+        expected_msg = 'Expected integer, got bool'
+        with pytest.raises(pa.ArrowTypeError, match=expected_msg):
+            _check_array_roundtrip(s, expected=expected, type=pa.int64())
+
+    def test_boolean_nulls_to_float(self):
+        # test from dtype=object
+        s = pd.Series([True, True, False, None, True] * 2)
+        expected = pd.Series([1.0, 1.0, 0.0, None, 1.0] * 2)
+        _check_array_roundtrip(s, expected=expected, type=pa.float64())
+
     def test_float_object_nulls(self):
         arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
         df = pd.DataFrame({'floats': arr})