You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/01/10 04:07:22 UTC
[arrow] branch master updated: ARROW-3428: [Python] Fix from_pandas
conversion from float to bool
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2b361fb ARROW-3428: [Python] Fix from_pandas conversion from float to bool
2b361fb is described below
commit 2b361fb2e5b4321a6cdcbdbf457181702fd97eaa
Author: Bryan Cutler <cu...@gmail.com>
AuthorDate: Wed Jan 9 22:07:14 2019 -0600
ARROW-3428: [Python] Fix from_pandas conversion from float to bool
When `from_pandas` converts data to boolean, the values are read into a `uint8_t` and then checked. When the values are floating point numbers, not all bits are checked which can cause incorrect results.
Author: Bryan Cutler <cu...@gmail.com>
Closes #2698 from BryanCutler/python-from_pandas-float-to-bool-ARROW-3428 and squashes the following commits:
f3d472626 <Bryan Cutler> added test with fix that passes, but fails other tests
---
cpp/src/arrow/compute/kernels/cast-test.cc | 19 +++++++++
cpp/src/arrow/python/numpy_to_arrow.cc | 66 +++++++++++++----------------
cpp/src/arrow/python/type_traits.h | 1 +
python/pyarrow/tests/test_convert_pandas.py | 39 ++++++++++++++---
4 files changed, 81 insertions(+), 44 deletions(-)
diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc
index 781e0af..c3a0df5 100644
--- a/cpp/src/arrow/compute/kernels/cast-test.cc
+++ b/cpp/src/arrow/compute/kernels/cast-test.cc
@@ -138,6 +138,25 @@ TEST_F(TestCast, SameTypeZeroCopy) {
AssertBufferSame(*arr, *result, 1);
}
+TEST_F(TestCast, FromBoolean) {
+ CastOptions options;
+
+ vector<bool> is_valid(20, true);
+ is_valid[3] = false;
+
+ vector<bool> v1(is_valid.size(), true);
+ vector<int32_t> e1(is_valid.size(), 1);
+ for (size_t i = 0; i < v1.size(); ++i) {
+ if (i % 3 == 1) {
+ v1[i] = false;
+ e1[i] = 0;
+ }
+ }
+
+ CheckCase<BooleanType, bool, Int32Type, int32_t>(boolean(), v1, is_valid, int32(), e1,
+ options);
+}
+
TEST_F(TestCast, ToBoolean) {
CastOptions options;
for (auto type : kNumericTypes) {
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index aa28b6e..aada6bf 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -63,6 +63,7 @@ namespace arrow {
using internal::checked_cast;
using internal::CopyBitmap;
+using internal::GenerateBitsUnrolled;
namespace py {
@@ -246,6 +247,11 @@ class NumPyConverter {
return Status::OK();
}
+ // Called before ConvertData to ensure Numpy input buffer is in expected
+ // Arrow layout
+ template <typename ArrowType>
+ Status PrepareInputData(std::shared_ptr<Buffer>* data);
+
// ----------------------------------------------------------------------
// Traditional visitor conversion for non-object arrays
@@ -407,14 +413,32 @@ Status CopyStridedArray(PyArrayObject* arr, const int64_t length, MemoryPool* po
} // namespace
template <typename ArrowType>
-inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
+inline Status NumPyConverter::PrepareInputData(std::shared_ptr<Buffer>* data) {
if (is_strided()) {
RETURN_NOT_OK(CopyStridedArray<ArrowType>(arr_, length_, pool_, data));
+ } else if (dtype_->type_num == NPY_BOOL) {
+ int64_t nbytes = BitUtil::BytesForBits(length_);
+ std::shared_ptr<Buffer> buffer;
+ RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer));
+
+ Ndarray1DIndexer<uint8_t> values(arr_);
+ int64_t i = 0;
+ const auto generate = [&values, &i]() -> bool { return values[i++] > 0; };
+ GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate);
+
+ *data = buffer;
} else {
// Can zero-copy
*data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
}
+ return Status::OK();
+}
+
+template <typename ArrowType>
+inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
+ RETURN_NOT_OK(PrepareInputData<ArrowType>(data));
+
std::shared_ptr<DataType> input_type;
RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast<PyObject*>(dtype_), &input_type));
@@ -427,37 +451,11 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr<Buffer>* data) {
}
template <>
-inline Status NumPyConverter::ConvertData<BooleanType>(std::shared_ptr<Buffer>* data) {
- int64_t nbytes = BitUtil::BytesForBits(length_);
- std::shared_ptr<Buffer> buffer;
- RETURN_NOT_OK(AllocateBuffer(pool_, nbytes, &buffer));
-
- Ndarray1DIndexer<uint8_t> values(arr_);
-
- uint8_t* bitmap = buffer->mutable_data();
-
- memset(bitmap, 0, nbytes);
- for (int i = 0; i < length_; ++i) {
- if (values[i] > 0) {
- BitUtil::SetBit(bitmap, i);
- }
- }
-
- *data = buffer;
- return Status::OK();
-}
-
-template <>
inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* data) {
- if (is_strided()) {
- RETURN_NOT_OK(CopyStridedArray<Date32Type>(arr_, length_, pool_, data));
- } else {
- // Can zero-copy
- *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
- }
-
std::shared_ptr<DataType> input_type;
+ RETURN_NOT_OK(PrepareInputData<Date32Type>(data));
+
auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
if (dtype_->type_num == NPY_DATETIME) {
// If we have inbound datetime64[D] data, this needs to be downcasted
@@ -489,17 +487,11 @@ inline Status NumPyConverter::ConvertData<Date32Type>(std::shared_ptr<Buffer>* d
template <>
inline Status NumPyConverter::ConvertData<Date64Type>(std::shared_ptr<Buffer>* data) {
- if (is_strided()) {
- RETURN_NOT_OK(CopyStridedArray<Date64Type>(arr_, length_, pool_, data));
- } else {
- // Can zero-copy
- *data = std::make_shared<NumPyBuffer>(reinterpret_cast<PyObject*>(arr_));
- }
-
constexpr int64_t kMillisecondsInDay = 86400000;
-
std::shared_ptr<DataType> input_type;
+ RETURN_NOT_OK(PrepareInputData<Date64Type>(data));
+
auto date_dtype = reinterpret_cast<PyArray_DatetimeDTypeMetaData*>(dtype_->c_metadata);
if (dtype_->type_num == NPY_DATETIME) {
// If we have inbound datetime64[D] data, this needs to be downcasted
diff --git a/cpp/src/arrow/python/type_traits.h b/cpp/src/arrow/python/type_traits.h
index d90517a..bc71ec4 100644
--- a/cpp/src/arrow/python/type_traits.h
+++ b/cpp/src/arrow/python/type_traits.h
@@ -149,6 +149,7 @@ template <>
struct arrow_traits<Type::BOOL> {
static constexpr int npy_type = NPY_BOOL;
static constexpr bool supports_nulls = false;
+ typedef typename npy_traits<NPY_BOOL>::value_type T;
};
#define INT_DECL(TYPE) \
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 3e89f5e..cd7f499 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -113,13 +113,13 @@ def _check_array_roundtrip(values, expected=None, mask=None,
else:
assert arr.null_count == (mask | values_nulls).sum()
- if mask is None:
- tm.assert_series_equal(pd.Series(result), pd.Series(values),
- check_names=False)
- else:
- expected = pd.Series(np.ma.masked_array(values, mask=mask))
- tm.assert_series_equal(pd.Series(result), expected,
- check_names=False)
+ if expected is None:
+ if mask is None:
+ expected = pd.Series(values)
+ else:
+ expected = pd.Series(np.ma.masked_array(values, mask=mask))
+
+ tm.assert_series_equal(pd.Series(result), expected, check_names=False)
def _check_array_from_pandas_roundtrip(np_array, type=None):
@@ -559,6 +559,11 @@ class TestConvertPrimitiveTypes(object):
assert table[0].to_pylist() == [1, 2, None]
tm.assert_frame_equal(df, table.to_pandas())
+ def test_float_nulls_to_boolean(self):
+ s = pd.Series([0.0, 1.0, 2.0, None, -3.0])
+ expected = pd.Series([False, True, True, None, True])
+ _check_array_roundtrip(s, expected=expected, type=pa.bool_())
+
def test_integer_no_nulls(self):
data = OrderedDict()
fields = []
@@ -672,6 +677,26 @@ class TestConvertPrimitiveTypes(object):
tm.assert_frame_equal(result, ex_frame)
+ def test_boolean_to_int(self):
+ # test from dtype=bool
+ s = pd.Series([True, True, False, True, True] * 2)
+ expected = pd.Series([1, 1, 0, 1, 1] * 2)
+ _check_array_roundtrip(s, expected=expected, type=pa.int64())
+
+ def test_boolean_objects_to_int(self):
+ # test from dtype=object
+ s = pd.Series([True, True, False, True, True] * 2, dtype=object)
+ expected = pd.Series([1, 1, 0, 1, 1] * 2)
+ expected_msg = 'Expected integer, got bool'
+ with pytest.raises(pa.ArrowTypeError, match=expected_msg):
+ _check_array_roundtrip(s, expected=expected, type=pa.int64())
+
+ def test_boolean_nulls_to_float(self):
+ # test from dtype=object
+ s = pd.Series([True, True, False, None, True] * 2)
+ expected = pd.Series([1.0, 1.0, 0.0, None, 1.0] * 2)
+ _check_array_roundtrip(s, expected=expected, type=pa.float64())
+
def test_float_object_nulls(self):
arr = np.array([None, 1.5, np.float64(3.5)] * 5, dtype=object)
df = pd.DataFrame({'floats': arr})