You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/08/22 19:08:12 UTC

[arrow] branch master updated: ARROW-2965: [Python] Guard against overflow when serializing Numpy uint64 scalar

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 23045d2  ARROW-2965: [Python] Guard against overflow when serializing Numpy uint64 scalar
23045d2 is described below

commit 23045d2b5be37df5cd990f5932576eb701e14b18
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed Aug 22 15:08:00 2018 -0400

    ARROW-2965: [Python] Guard against overflow when serializing Numpy uint64 scalar
    
    Author: Antoine Pitrou <an...@python.org>
    
    Closes #2463 from pitrou/ARROW-2965-serialize-uint64-overflow and squashes the following commits:
    
    c2aad390 <Antoine Pitrou> Remove unused AppendUInt64
    f4f05de4 <Antoine Pitrou> ARROW-2965:  Guard against overflow when serializing Numpy uint64 scalar
---
 cpp/src/arrow/python/serialize.cc          | 62 +++++++++++++++++-------------
 python/pyarrow/tests/test_serialization.py | 22 ++++++++++-
 2 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc
index 509950d..56565ac 100644
--- a/cpp/src/arrow/python/serialize.cc
+++ b/cpp/src/arrow/python/serialize.cc
@@ -116,12 +116,6 @@ class SequenceBuilder {
     return AppendPrimitive(data, &int_tag_, &ints_);
   }
 
-  /// Appending an uint64_t to the sequence
-  Status AppendUInt64(const uint64_t data) {
-    // TODO(wesm): Bounds check
-    return AppendPrimitive(static_cast<int64_t>(data), &int_tag_, &ints_);
-  }
-
   /// Append a list of bytes to the sequence
   Status AppendBytes(const uint8_t* data, int32_t length) {
     RETURN_NOT_OK(Update(bytes_.length(), &bytes_tag_));
@@ -435,6 +429,25 @@ Status SerializeSequences(PyObject* context, std::vector<PyObject*> sequences,
                           int32_t recursion_depth, std::shared_ptr<Array>* out,
                           SerializedPyObject* blobs_out);
 
+template <typename NumpyScalarObject>
+Status AppendIntegerScalar(PyObject* obj, SequenceBuilder* builder) {
+  int64_t value = reinterpret_cast<NumpyScalarObject*>(obj)->obval;
+  return builder->AppendInt64(value);
+}
+
+// Append a potentially 64-bit wide unsigned Numpy scalar.
+// Must check for overflow as we reinterpret it as signed int64.
+template <typename NumpyScalarObject>
+Status AppendLargeUnsignedScalar(PyObject* obj, SequenceBuilder* builder) {
+  constexpr uint64_t max_value = std::numeric_limits<int64_t>::max();
+
+  uint64_t value = reinterpret_cast<NumpyScalarObject*>(obj)->obval;
+  if (value > max_value) {
+    return Status::Invalid("cannot serialize Numpy uint64 scalar >= 2**63");
+  }
+  return builder->AppendInt64(static_cast<int64_t>(value));
+}
+
 Status AppendScalar(PyObject* obj, SequenceBuilder* builder) {
   if (PyArray_IsScalar(obj, Bool)) {
     return builder->AppendBool(reinterpret_cast<PyBoolScalarObject*>(obj)->obval != 0);
@@ -445,35 +458,32 @@ Status AppendScalar(PyObject* obj, SequenceBuilder* builder) {
   } else if (PyArray_IsScalar(obj, Double)) {
     return builder->AppendDouble(reinterpret_cast<PyDoubleScalarObject*>(obj)->obval);
   }
-  int64_t value = 0;
   if (PyArray_IsScalar(obj, Byte)) {
-    value = reinterpret_cast<PyByteScalarObject*>(obj)->obval;
-  } else if (PyArray_IsScalar(obj, UByte)) {
-    value = reinterpret_cast<PyUByteScalarObject*>(obj)->obval;
+    return AppendIntegerScalar<PyByteScalarObject>(obj, builder);
   } else if (PyArray_IsScalar(obj, Short)) {
-    value = reinterpret_cast<PyShortScalarObject*>(obj)->obval;
-  } else if (PyArray_IsScalar(obj, UShort)) {
-    value = reinterpret_cast<PyUShortScalarObject*>(obj)->obval;
+    return AppendIntegerScalar<PyShortScalarObject>(obj, builder);
   } else if (PyArray_IsScalar(obj, Int)) {
-    value = reinterpret_cast<PyIntScalarObject*>(obj)->obval;
-  } else if (PyArray_IsScalar(obj, UInt)) {
-    value = reinterpret_cast<PyUIntScalarObject*>(obj)->obval;
+    return AppendIntegerScalar<PyIntScalarObject>(obj, builder);
   } else if (PyArray_IsScalar(obj, Long)) {
-    value = reinterpret_cast<PyLongScalarObject*>(obj)->obval;
-  } else if (PyArray_IsScalar(obj, ULong)) {
-    value = reinterpret_cast<PyULongScalarObject*>(obj)->obval;
+    return AppendIntegerScalar<PyLongScalarObject>(obj, builder);
   } else if (PyArray_IsScalar(obj, LongLong)) {
-    value = reinterpret_cast<PyLongLongScalarObject*>(obj)->obval;
+    return AppendIntegerScalar<PyLongLongScalarObject>(obj, builder);
   } else if (PyArray_IsScalar(obj, Int64)) {
-    value = reinterpret_cast<PyInt64ScalarObject*>(obj)->obval;
+    return AppendIntegerScalar<PyInt64ScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, UByte)) {
+    return AppendIntegerScalar<PyUByteScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, UShort)) {
+    return AppendIntegerScalar<PyUShortScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, UInt)) {
+    return AppendIntegerScalar<PyUIntScalarObject>(obj, builder);
+  } else if (PyArray_IsScalar(obj, ULong)) {
+    return AppendLargeUnsignedScalar<PyULongScalarObject>(obj, builder);
   } else if (PyArray_IsScalar(obj, ULongLong)) {
-    value = reinterpret_cast<PyULongLongScalarObject*>(obj)->obval;
+    return AppendLargeUnsignedScalar<PyULongLongScalarObject>(obj, builder);
   } else if (PyArray_IsScalar(obj, UInt64)) {
-    value = reinterpret_cast<PyUInt64ScalarObject*>(obj)->obval;
-  } else {
-    DCHECK(false) << "scalar type not recognized";
+    return AppendLargeUnsignedScalar<PyUInt64ScalarObject>(obj, builder);
   }
-  return builder->AppendInt64(value);
+  return Status::NotImplemented("Numpy scalar type not recognized");
 }
 
 Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder,
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 53dd5c0..4bac300 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -115,7 +115,8 @@ PRIMITIVE_OBJECTS = [
     {True: "hello", False: "world"}, {"hello": "world", 1: 42, 2.5: 45},
     {"hello": set([2, 3]), "world": set([42.0]), "this": None},
     np.int8(3), np.int32(4), np.int64(5),
-    np.uint8(3), np.uint32(4), np.uint64(5), np.float16(1.9), np.float32(1.9),
+    np.uint8(3), np.uint32(4), np.uint64(5),
+    np.float16(1.9), np.float32(1.9),
     np.float64(1.9), np.zeros([8, 20]),
     np.random.normal(size=[17, 10]), np.array(["hi", 3]),
     np.array(["hi", 3], dtype=object),
@@ -288,6 +289,25 @@ def test_primitive_serialization(large_buffer):
         serialization_roundtrip(obj, large_buffer)
 
 
+def test_integer_limits(large_buffer):
+    # Check that Numpy scalars can be represented up to their limit values
+    # (except np.uint64 which is limited to 2**63 - 1)
+    for dt in [np.int8, np.int16, np.int32, np.int64,
+               np.uint8, np.uint16, np.uint32, np.uint64]:
+        scal = dt(np.iinfo(dt).min)
+        serialization_roundtrip(scal, large_buffer)
+        if dt is not np.uint64:
+            scal = dt(np.iinfo(dt).max)
+            serialization_roundtrip(scal, large_buffer)
+        else:
+            scal = dt(2**63 - 1)
+            serialization_roundtrip(scal, large_buffer)
+            for v in (2**63, 2**64 - 1):
+                scal = dt(v)
+                with pytest.raises(pa.ArrowInvalid):
+                    pa.serialize(scal)
+
+
 def test_serialize_to_buffer():
     for nthreads in [1, 4]:
         for value in COMPLEX_OBJECTS: