You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/05/07 18:55:27 UTC

[arrow] branch master updated: ARROW-2285: [C++/Python] Can't convert Numpy string arrays

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0ecbbf4  ARROW-2285: [C++/Python] Can't convert Numpy string arrays
0ecbbf4 is described below

commit 0ecbbf4ec4c6cf2a919f05ddf2281d7a1c7b8407
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Mon May 7 20:55:20 2018 +0200

    ARROW-2285: [C++/Python] Can't convert Numpy string arrays
    
    Author: Krisztián Szűcs <sz...@gmail.com>
    
    Closes #1998 from kszucs/ARROW-2285 and squashes the following commits:
    
    32ae6ff3 <Krisztián Szűcs> match on both utf8 and utf-8 error msg
    aa2bb6dc <Krisztián Szűcs> fix raise assertion
    e15ed680 <Krisztián Szűcs> test convert unicode array
    448ee49b <Krisztián Szűcs> convert numpy string array to fixed sized binary
---
 cpp/src/arrow/python/numpy_to_arrow.cc      | 28 +++++++++++++++++++++++++---
 python/pyarrow/tests/test_convert_pandas.py | 26 ++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc
index e160e90..2d4d91c 100644
--- a/cpp/src/arrow/python/numpy_to_arrow.cc
+++ b/cpp/src/arrow/python/numpy_to_arrow.cc
@@ -366,9 +366,7 @@ class NumPyConverter {
 
   Status Visit(const StructType& type);
 
-  Status Visit(const FixedSizeBinaryType& type) {
-    return TypeNotImplemented(type.ToString());
-  }
+  Status Visit(const FixedSizeBinaryType& type);
 
   Status Visit(const Decimal128Type& type) { return TypeNotImplemented(type.ToString()); }
 
@@ -1485,6 +1483,30 @@ Status NumPyConverter::Visit(const BinaryType& type) {
   return PushArray(result->data());
 }
 
+Status NumPyConverter::Visit(const FixedSizeBinaryType& type) {
+  auto byte_width = type.byte_width();
+
+  if (itemsize_ != byte_width) {
+    std::stringstream ss;
+    ss << "Got bytestring of length " << itemsize_ << " (expected " << byte_width << ")";
+    return Status::Invalid(ss.str());
+  }
+
+  FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_);
+  auto data = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_));
+
+  if (mask_ != nullptr) {
+    Ndarray1DIndexer<uint8_t> mask_values(mask_);
+    RETURN_NOT_OK(builder.AppendValues(data, length_, mask_values.data()));
+  } else {
+    RETURN_NOT_OK(builder.AppendValues(data, length_));
+  }
+
+  std::shared_ptr<Array> result;
+  RETURN_NOT_OK(builder.Finish(&result));
+  return PushArray(result->data());
+}
+
 namespace {
 
 // NumPy unicode is UCS4/UTF32 always
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index e3cc836..bdb84c7 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1250,10 +1250,32 @@ class TestConvertStringLikeTypes(object):
     def test_array_of_bytes_to_strings_bad_data(self):
         with pytest.raises(
                 pa.lib.ArrowInvalid,
-                message="Unknown error: 'utf-8' codec can't decode byte 0x80 "
-                "in position 0: invalid start byte"):
+                match=("'(utf8|utf-8)' codec can't decode byte 0x80 "
+                       "in position 0: invalid start byte")):
             pa.array(np.array([b'\x80\x81'], dtype=object), pa.string())
 
+    def test_numpy_string_array_to_fixed_size_binary(self):
+        arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3')
+
+        converted = pa.array(arr, type=pa.binary(3))
+        expected = pa.array(list(arr), type=pa.binary(3))
+        assert converted.equals(expected)
+
+        mask = np.array([True, False, True])
+        converted = pa.array(arr, type=pa.binary(3), mask=mask)
+        expected = pa.array([b'foo', None, b'baz'], type=pa.binary(3))
+        assert converted.equals(expected)
+
+        with pytest.raises(pa.lib.ArrowInvalid,
+                           match='Got bytestring of length 3 \(expected 4\)'):
+            arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3')
+            pa.array(arr, type=pa.binary(4))
+
+        with pytest.raises(pa.lib.ArrowInvalid,
+                           match='Got bytestring of length 12 \(expected 3\)'):
+            arr = np.array([b'foo', b'bar', b'baz'], dtype='|U3')
+            pa.array(arr, type=pa.binary(3))
+
 
 class TestConvertDecimalTypes(object):
     """

-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.