You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/05/07 18:55:27 UTC
[arrow] branch master updated: ARROW-2285: [C++/Python] Can't convert Numpy string arrays
This is an automated email from the ASF dual-hosted git repository.
uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0ecbbf4 ARROW-2285: [C++/Python] Can't convert Numpy string arrays
0ecbbf4 is described below
commit 0ecbbf4ec4c6cf2a919f05ddf2281d7a1c7b8407
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Mon May 7 20:55:20 2018 +0200
ARROW-2285: [C++/Python] Can't convert Numpy string arrays
Author: Krisztián Szűcs <sz...@gmail.com>
Closes #1998 from kszucs/ARROW-2285 and squashes the following commits:
32ae6ff3 <Krisztián Szűcs> match on both utf8 and utf-8 error msg
aa2bb6dc <Krisztián Szűcs> fix raise assertion
e15ed680 <Krisztián Szűcs> test convert unicode array
448ee49b <Krisztián Szűcs> convert numpy string array to fixed sized binary
---
cpp/src/arrow/python/numpy_to_arrow.cc | 28 +++++++++++++++++++++++++---
python/pyarrow/tests/test_convert_pandas.py | 26 ++++++++++++++++++++++++--
2 files changed, 49 insertions(+), 5 deletions(-)
diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index e160e90..2d4d91c 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -366,9 +366,7 @@ class NumPyConverter {
Status Visit(const StructType& type);
- Status Visit(const FixedSizeBinaryType& type) {
- return TypeNotImplemented(type.ToString());
- }
+ Status Visit(const FixedSizeBinaryType& type);
Status Visit(const Decimal128Type& type) { return TypeNotImplemented(type.ToString()); }
@@ -1485,6 +1483,30 @@ Status NumPyConverter::Visit(const BinaryType& type) {
return PushArray(result->data());
}
+Status NumPyConverter::Visit(const FixedSizeBinaryType& type) {
+ auto byte_width = type.byte_width();
+
+ if (itemsize_ != byte_width) {
+ std::stringstream ss;
+ ss << "Got bytestring of length " << itemsize_ << " (expected " << byte_width << ")";
+ return Status::Invalid(ss.str());
+ }
+
+ FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_);
+ auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+
+ if (mask_ != nullptr) {
+ Ndarray1DIndexer<uint8_t> mask_values(mask_);
+ RETURN_NOT_OK(builder.AppendValues(data, length_, mask_values.data()));
+ } else {
+ RETURN_NOT_OK(builder.AppendValues(data, length_));
+ }
+
+ std::shared_ptr<Array> result;
+ RETURN_NOT_OK(builder.Finish(&result));
+ return PushArray(result->data());
+}
+
namespace {
// NumPy unicode is UCS4/UTF32 always
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index e3cc836..bdb84c7 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1250,10 +1250,32 @@ class TestConvertStringLikeTypes(object):
def test_array_of_bytes_to_strings_bad_data(self):
with pytest.raises(
pa.lib.ArrowInvalid,
- message="Unknown error: 'utf-8' codec can't decode byte 0x80 "
- "in position 0: invalid start byte"):
+ match=("'(utf8|utf-8)' codec can't decode byte 0x80 "
+ "in position 0: invalid start byte")):
pa.array(np.array([b'\x80\x81'], dtype=object), pa.string())
+ def test_numpy_string_array_to_fixed_size_binary(self):
+ arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3')
+
+ converted = pa.array(arr, type=pa.binary(3))
+ expected = pa.array(list(arr), type=pa.binary(3))
+ assert converted.equals(expected)
+
+ mask = np.array([True, False, True])
+ converted = pa.array(arr, type=pa.binary(3), mask=mask)
+ expected = pa.array([b'foo', None, b'baz'], type=pa.binary(3))
+ assert converted.equals(expected)
+
+ with pytest.raises(pa.lib.ArrowInvalid,
+ match='Got bytestring of length 3 \(expected 4\)'):
+ arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3')
+ pa.array(arr, type=pa.binary(4))
+
+ with pytest.raises(pa.lib.ArrowInvalid,
+ match='Got bytestring of length 12 \(expected 3\)'):
+ arr = np.array([b'foo', b'bar', b'baz'], dtype='|U3')
+ pa.array(arr, type=pa.binary(3))
+
class TestConvertDecimalTypes(object):
"""
--
To stop receiving notification emails like this one, please contact
uwe@apache.org.