You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/08/22 19:08:12 UTC
[arrow] branch master updated: ARROW-2965: [Python] Guard against
overflow when serializing Numpy uint64 scalar
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 23045d2 ARROW-2965: [Python] Guard against overflow when serializing Numpy uint64 scalar
23045d2 is described below
commit 23045d2b5be37df5cd990f5932576eb701e14b18
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed Aug 22 15:08:00 2018 -0400
ARROW-2965: [Python] Guard against overflow when serializing Numpy uint64 scalar
Author: Antoine Pitrou <an...@python.org>
Closes #2463 from pitrou/ARROW-2965-serialize-uint64-overflow and squashes the following commits:
c2aad390 <Antoine Pitrou> Remove unused AppendUInt64
f4f05de4 <Antoine Pitrou> ARROW-2965: Guard against overflow when serializing Numpy uint64 scalar
---
cpp/src/arrow/python/serialize.cc | 62 +++++++++++++++++-------------
python/pyarrow/tests/test_serialization.py | 22 ++++++++++-
2 files changed, 57 insertions(+), 27 deletions(-)
diff --git a/cpp/src/arrow/python/serialize.cc b/cpp/src/arrow/python/serialize.cc
index 509950d..56565ac 100644
--- a/cpp/src/arrow/python/serialize.cc
+++ b/cpp/src/arrow/python/serialize.cc
@@ -116,12 +116,6 @@ class SequenceBuilder {
return AppendPrimitive(data, &int_tag_, &ints_);
}
- /// Appending an uint64_t to the sequence
- Status AppendUInt64(const uint64_t data) {
- // TODO(wesm): Bounds check
- return AppendPrimitive(static_cast<int64_t>(data), &int_tag_, &ints_);
- }
-
/// Append a list of bytes to the sequence
Status AppendBytes(const uint8_t* data, int32_t length) {
RETURN_NOT_OK(Update(bytes_.length(), &bytes_tag_));
@@ -435,6 +429,25 @@ Status SerializeSequences(PyObject* context, std::vector<PyObject*> sequences,
int32_t recursion_depth, std::shared_ptr<Array>* out,
SerializedPyObject* blobs_out);
+template <typename NumpyScalarObject>
+Status AppendIntegerScalar(PyObject* obj, SequenceBuilder* builder) {
+ int64_t value = reinterpret_cast<NumpyScalarObject*>(obj)->obval;
+ return builder->AppendInt64(value);
+}
+
+// Append a potentially 64-bit wide unsigned Numpy scalar.
+// Must check for overflow as we reinterpret it as signed int64.
+template <typename NumpyScalarObject>
+Status AppendLargeUnsignedScalar(PyObject* obj, SequenceBuilder* builder) {
+ constexpr uint64_t max_value = std::numeric_limits<int64_t>::max();
+
+ uint64_t value = reinterpret_cast<NumpyScalarObject*>(obj)->obval;
+ if (value > max_value) {
+ return Status::Invalid("cannot serialize Numpy uint64 scalar >= 2**63");
+ }
+ return builder->AppendInt64(static_cast<int64_t>(value));
+}
+
Status AppendScalar(PyObject* obj, SequenceBuilder* builder) {
if (PyArray_IsScalar(obj, Bool)) {
return builder->AppendBool(reinterpret_cast<PyBoolScalarObject*>(obj)->obval != 0);
@@ -445,35 +458,32 @@ Status AppendScalar(PyObject* obj, SequenceBuilder* builder) {
} else if (PyArray_IsScalar(obj, Double)) {
return builder->AppendDouble(reinterpret_cast<PyDoubleScalarObject*>(obj)->obval);
}
- int64_t value = 0;
if (PyArray_IsScalar(obj, Byte)) {
- value = reinterpret_cast<PyByteScalarObject*>(obj)->obval;
- } else if (PyArray_IsScalar(obj, UByte)) {
- value = reinterpret_cast<PyUByteScalarObject*>(obj)->obval;
+ return AppendIntegerScalar<PyByteScalarObject>(obj, builder);
} else if (PyArray_IsScalar(obj, Short)) {
- value = reinterpret_cast<PyShortScalarObject*>(obj)->obval;
- } else if (PyArray_IsScalar(obj, UShort)) {
- value = reinterpret_cast<PyUShortScalarObject*>(obj)->obval;
+ return AppendIntegerScalar<PyShortScalarObject>(obj, builder);
} else if (PyArray_IsScalar(obj, Int)) {
- value = reinterpret_cast<PyIntScalarObject*>(obj)->obval;
- } else if (PyArray_IsScalar(obj, UInt)) {
- value = reinterpret_cast<PyUIntScalarObject*>(obj)->obval;
+ return AppendIntegerScalar<PyIntScalarObject>(obj, builder);
} else if (PyArray_IsScalar(obj, Long)) {
- value = reinterpret_cast<PyLongScalarObject*>(obj)->obval;
- } else if (PyArray_IsScalar(obj, ULong)) {
- value = reinterpret_cast<PyULongScalarObject*>(obj)->obval;
+ return AppendIntegerScalar<PyLongScalarObject>(obj, builder);
} else if (PyArray_IsScalar(obj, LongLong)) {
- value = reinterpret_cast<PyLongLongScalarObject*>(obj)->obval;
+ return AppendIntegerScalar<PyLongLongScalarObject>(obj, builder);
} else if (PyArray_IsScalar(obj, Int64)) {
- value = reinterpret_cast<PyInt64ScalarObject*>(obj)->obval;
+ return AppendIntegerScalar<PyInt64ScalarObject>(obj, builder);
+ } else if (PyArray_IsScalar(obj, UByte)) {
+ return AppendIntegerScalar<PyUByteScalarObject>(obj, builder);
+ } else if (PyArray_IsScalar(obj, UShort)) {
+ return AppendIntegerScalar<PyUShortScalarObject>(obj, builder);
+ } else if (PyArray_IsScalar(obj, UInt)) {
+ return AppendIntegerScalar<PyUIntScalarObject>(obj, builder);
+ } else if (PyArray_IsScalar(obj, ULong)) {
+ return AppendLargeUnsignedScalar<PyULongScalarObject>(obj, builder);
} else if (PyArray_IsScalar(obj, ULongLong)) {
- value = reinterpret_cast<PyULongLongScalarObject*>(obj)->obval;
+ return AppendLargeUnsignedScalar<PyULongLongScalarObject>(obj, builder);
} else if (PyArray_IsScalar(obj, UInt64)) {
- value = reinterpret_cast<PyUInt64ScalarObject*>(obj)->obval;
- } else {
- DCHECK(false) << "scalar type not recognized";
+ return AppendLargeUnsignedScalar<PyUInt64ScalarObject>(obj, builder);
}
- return builder->AppendInt64(value);
+ return Status::NotImplemented("Numpy scalar type not recognized");
}
Status Append(PyObject* context, PyObject* elem, SequenceBuilder* builder,
diff --git a/python/pyarrow/tests/test_serialization.py b/python/pyarrow/tests/test_serialization.py
index 53dd5c0..4bac300 100644
--- a/python/pyarrow/tests/test_serialization.py
+++ b/python/pyarrow/tests/test_serialization.py
@@ -115,7 +115,8 @@ PRIMITIVE_OBJECTS = [
{True: "hello", False: "world"}, {"hello": "world", 1: 42, 2.5: 45},
{"hello": set([2, 3]), "world": set([42.0]), "this": None},
np.int8(3), np.int32(4), np.int64(5),
- np.uint8(3), np.uint32(4), np.uint64(5), np.float16(1.9), np.float32(1.9),
+ np.uint8(3), np.uint32(4), np.uint64(5),
+ np.float16(1.9), np.float32(1.9),
np.float64(1.9), np.zeros([8, 20]),
np.random.normal(size=[17, 10]), np.array(["hi", 3]),
np.array(["hi", 3], dtype=object),
@@ -288,6 +289,25 @@ def test_primitive_serialization(large_buffer):
serialization_roundtrip(obj, large_buffer)
+def test_integer_limits(large_buffer):
+ # Check that Numpy scalars can be represented up to their limit values
+ # (except np.uint64 which is limited to 2**63 - 1)
+ for dt in [np.int8, np.int16, np.int32, np.int64,
+ np.uint8, np.uint16, np.uint32, np.uint64]:
+ scal = dt(np.iinfo(dt).min)
+ serialization_roundtrip(scal, large_buffer)
+ if dt is not np.uint64:
+ scal = dt(np.iinfo(dt).max)
+ serialization_roundtrip(scal, large_buffer)
+ else:
+ scal = dt(2**63 - 1)
+ serialization_roundtrip(scal, large_buffer)
+ for v in (2**63, 2**64 - 1):
+ scal = dt(v)
+ with pytest.raises(pa.ArrowInvalid):
+ pa.serialize(scal)
+
+
def test_serialize_to_buffer():
for nthreads in [1, 4]:
for value in COMPLEX_OBJECTS: