You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/11/27 22:16:18 UTC
[arrow] branch master updated: ARROW-3790: [C++] Fix erroneous safe
casting
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c98399d ARROW-3790: [C++] Fix erroneous safe casting
c98399d is described below
commit c98399d500e0d3db47987e826af88c7376eed7c4
Author: François Saint-Jacques <fs...@gmail.com>
AuthorDate: Tue Nov 27 16:16:09 2018 -0600
ARROW-3790: [C++] Fix erroneous safe casting
- Improves coverage of various type combinations under safe/unsafe
conditions
- Fixes proper bound detection at compile time
- Fixes type traits pattern matching
While reviewing this merge, I noted that python, Ruby/glib and R would use CastOptions differently.
- Python only exposes safe as a single boolean (which flips all options accordingly)
- Ruby exposes 2 out of 3 of the options (probably due to code rot)
- R is going to exposes all 3 modes.
What is the purpose of having the 3 options, could we just expose safe versus unsafe?
Author: François Saint-Jacques <fs...@gmail.com>
Closes #3022 from fsaintjacques/ARROW-3790-unsigned-cast and squashes the following commits:
a54392831 <François Saint-Jacques> ARROW-3790: Fix erroneous safe casting
---
cpp/src/arrow/compute/compute-test.cc | 115 ++++++++++++++++++++
cpp/src/arrow/compute/kernels/cast.cc | 197 ++++++++++++++++++++++++++++------
2 files changed, 277 insertions(+), 35 deletions(-)
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index ea91021..821569e 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -236,6 +236,76 @@ TEST_F(TestCast, ToIntDowncastSafe) {
// underflow
vector<int32_t> v6 = {0, 1000, 2000, -70000, 0};
CheckFails<Int32Type>(int32(), v6, is_valid, int16(), options);
+
+ vector<int32_t> v7 = {0, 1000, 2000, -70000, 0};
+ CheckFails<Int32Type>(int32(), v7, is_valid, uint8(), options);
+}
+
+template <typename O, typename I>
+std::vector<O> UnsafeVectorCast(const std::vector<I>& v) {
+ size_t n_elems = v.size();
+ std::vector<O> result(n_elems);
+
+ for (size_t i = 0; i < v.size(); i++) result[i] = static_cast<O>(v[i]);
+
+ return std::move(result);
+}
+
+TEST_F(TestCast, IntegerSignedToUnsigned) {
+ CastOptions options;
+ options.allow_int_overflow = false;
+
+ vector<bool> is_valid = {true, false, true, true, true};
+
+ vector<int32_t> v1 = {INT32_MIN, 100, -1, UINT16_MAX, INT32_MAX};
+
+ // Same width
+ CheckFails<Int32Type>(int32(), v1, is_valid, uint32(), options);
+ // Wider
+ CheckFails<Int32Type>(int32(), v1, is_valid, uint64(), options);
+ // Narrower
+ CheckFails<Int32Type>(int32(), v1, is_valid, uint16(), options);
+ // Fail because of overflow (instead of underflow).
+ vector<int32_t> over = {0, -11, 0, UINT16_MAX + 1, INT32_MAX};
+ CheckFails<Int32Type>(int32(), over, is_valid, uint16(), options);
+
+ options.allow_int_overflow = true;
+
+ CheckCase<Int32Type, int32_t, UInt32Type, uint32_t>(
+ int32(), v1, is_valid, uint32(), UnsafeVectorCast<uint32_t, int32_t>(v1), options);
+ CheckCase<Int32Type, int32_t, UInt64Type, uint64_t>(
+ int32(), v1, is_valid, uint64(), UnsafeVectorCast<uint64_t, int32_t>(v1), options);
+ CheckCase<Int32Type, int32_t, UInt16Type, uint16_t>(
+ int32(), v1, is_valid, uint16(), UnsafeVectorCast<uint16_t, int32_t>(v1), options);
+ CheckCase<Int32Type, int32_t, UInt16Type, uint16_t>(
+ int32(), over, is_valid, uint16(), UnsafeVectorCast<uint16_t, int32_t>(over),
+ options);
+}
+
+TEST_F(TestCast, IntegerUnsignedToSigned) {
+ CastOptions options;
+ options.allow_int_overflow = false;
+
+ vector<bool> is_valid = {true, true, true};
+
+ vector<uint32_t> v1 = {0, INT16_MAX + 1, UINT32_MAX};
+ vector<uint32_t> v2 = {0, INT16_MAX + 1, 2};
+ // Same width
+ CheckFails<UInt32Type>(uint32(), v1, is_valid, int32(), options);
+ // Narrower
+ CheckFails<UInt32Type>(uint32(), v1, is_valid, int16(), options);
+ CheckFails<UInt32Type>(uint32(), v2, is_valid, int16(), options);
+
+ options.allow_int_overflow = true;
+
+ CheckCase<UInt32Type, uint32_t, Int32Type, int32_t>(
+ uint32(), v1, is_valid, int32(), UnsafeVectorCast<int32_t, uint32_t>(v1), options);
+ CheckCase<UInt32Type, uint32_t, Int64Type, int64_t>(
+ uint32(), v1, is_valid, int64(), UnsafeVectorCast<int64_t, uint32_t>(v1), options);
+ CheckCase<UInt32Type, uint32_t, Int16Type, int16_t>(
+ uint32(), v1, is_valid, int16(), UnsafeVectorCast<int16_t, uint32_t>(v1), options);
+ CheckCase<UInt32Type, uint32_t, Int16Type, int16_t>(
+ uint32(), v2, is_valid, int16(), UnsafeVectorCast<int16_t, uint32_t>(v2), options);
}
TEST_F(TestCast, ToIntDowncastUnsafe) {
@@ -343,6 +413,21 @@ TEST_F(TestCast, FloatingPointToInt) {
options);
}
+TEST_F(TestCast, IntToFloatingPoint) {
+ auto options = CastOptions::Safe();
+
+ vector<bool> all_valid = {true, true, true, true, true};
+ vector<bool> all_invalid = {false, false, false, false, false};
+
+ vector<int64_t> v1 = {INT64_MIN, INT64_MIN + 1, 0, INT64_MAX - 1, INT64_MAX};
+ CheckFails<Int64Type>(int64(), v1, all_valid, float32(), options);
+
+ // While it's not safe to convert, all values are null.
+ CheckCase<Int64Type, int64_t, DoubleType, double>(int64(), v1, all_invalid, float64(),
+ UnsafeVectorCast<double, int64_t>(v1),
+ options);
+}
+
TEST_F(TestCast, TimestampToTimestamp) {
CastOptions options;
@@ -583,6 +668,36 @@ TEST_F(TestCast, TimeToCompatible) {
options);
}
+TEST_F(TestCast, PrimitiveZeroCopy) {
+ shared_ptr<Array> arr;
+
+ ArrayFromVector<UInt8Type, uint8_t>(uint8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
+ CheckZeroCopy(*arr, uint8());
+ ArrayFromVector<Int8Type, int8_t>(int8(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
+ CheckZeroCopy(*arr, int8());
+
+ ArrayFromVector<UInt16Type, uint16_t>(uint16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
+ CheckZeroCopy(*arr, uint16());
+ ArrayFromVector<Int16Type, int8_t>(int16(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
+ CheckZeroCopy(*arr, int16());
+
+ ArrayFromVector<UInt32Type, uint32_t>(uint32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
+ CheckZeroCopy(*arr, uint32());
+ ArrayFromVector<Int32Type, int8_t>(int32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
+ CheckZeroCopy(*arr, int32());
+
+ ArrayFromVector<UInt64Type, uint64_t>(uint64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
+ CheckZeroCopy(*arr, uint64());
+ ArrayFromVector<Int64Type, int8_t>(int64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
+ CheckZeroCopy(*arr, int64());
+
+ ArrayFromVector<FloatType, float>(float32(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
+ CheckZeroCopy(*arr, float32());
+
+ ArrayFromVector<DoubleType, double>(float64(), {1, 1, 1, 1}, {1, 2, 3, 4}, &arr);
+ CheckZeroCopy(*arr, float64());
+}
+
TEST_F(TestCast, DateToCompatible) {
CastOptions options;
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index 97bc141..cd45b2d 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -89,6 +89,8 @@ template <typename O, typename I>
struct is_zero_copy_cast<
O, I,
typename std::enable_if<std::is_same<I, O>::value &&
+ // Parametric types contains runtime data which
+ // differentiate them, it cannot be checked statically.
!std::is_base_of<ParametricType, O>::value>::type> {
static constexpr bool value = true;
};
@@ -161,23 +163,62 @@ struct CastFunctor<T, BooleanType, enable_if_number<T>> {
}
};
+// Number to Boolean
+template <typename I>
+struct CastFunctor<BooleanType, I,
+ typename std::enable_if<std::is_base_of<Number, I>::value &&
+ !std::is_same<BooleanType, I>::value>::type> {
+ void operator()(FunctionContext* ctx, const CastOptions& options,
+ const ArrayData& input, ArrayData* output) {
+ auto in_data = input.GetValues<typename I::c_type>(1);
+ const auto generate = [&in_data]() -> bool { return *in_data++ != 0; };
+ internal::GenerateBitsUnrolled(output->buffers[1]->mutable_data(), output->offset,
+ input.length, generate);
+ }
+};
+
// ----------------------------------------------------------------------
// Integers and Floating Point
+// Conversions pairs (<O, I>) are partitioned in 4 type traits:
+// - is_number_downcast
+// - is_integral_signed_to_unsigned
+// - is_integral_unsigned_to_signed
+// - is_float_truncate
+//
+// Each class has a different way of validation if the conversion is safe
+// (either with bounded intervals or with explicit C casts)
+
+template <typename O, typename I, typename Enable = void>
+struct is_number_downcast {
+ static constexpr bool value = false;
+};
+
template <typename O, typename I>
-struct is_numeric_cast {
+struct is_number_downcast<
+ O, I,
+ typename std::enable_if<std::is_base_of<Number, O>::value &&
+ std::is_base_of<Number, I>::value>::type> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
static constexpr bool value =
- (std::is_base_of<Number, O>::value && std::is_base_of<Number, I>::value) &&
- (!std::is_same<O, I>::value);
+ ((!std::is_same<O, I>::value) &&
+ // Both types are of the same sign-ness.
+ ((std::is_signed<O_T>::value == std::is_signed<I_T>::value) &&
+ // Both types are of the same integral-ness.
+ (std::is_floating_point<O_T>::value == std::is_floating_point<I_T>::value)) &&
+ // Smaller output size
+ (sizeof(O_T) < sizeof(I_T)));
};
template <typename O, typename I, typename Enable = void>
-struct is_integer_downcast {
+struct is_integral_signed_to_unsigned {
static constexpr bool value = false;
};
template <typename O, typename I>
-struct is_integer_downcast<
+struct is_integral_signed_to_unsigned<
O, I,
typename std::enable_if<std::is_base_of<Integer, O>::value &&
std::is_base_of<Integer, I>::value>::type> {
@@ -186,45 +227,92 @@ struct is_integer_downcast<
static constexpr bool value =
((!std::is_same<O, I>::value) &&
-
- // same size, but unsigned to signed
- ((sizeof(O_T) == sizeof(I_T) && std::is_signed<O_T>::value &&
- std::is_unsigned<I_T>::value) ||
-
- // Smaller output size
- (sizeof(O_T) < sizeof(I_T))));
+ ((std::is_unsigned<O_T>::value && std::is_signed<I_T>::value)));
};
template <typename O, typename I, typename Enable = void>
-struct is_float_truncate {
+struct is_integral_unsigned_to_signed {
static constexpr bool value = false;
};
template <typename O, typename I>
-struct is_float_truncate<
+struct is_integral_unsigned_to_signed<
O, I,
typename std::enable_if<std::is_base_of<Integer, O>::value &&
- std::is_base_of<FloatingPoint, I>::value>::type> {
- static constexpr bool value = true;
+ std::is_base_of<Integer, I>::value>::type> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
+ static constexpr bool value =
+ ((!std::is_same<O, I>::value) &&
+ ((std::is_signed<O_T>::value && std::is_unsigned<I_T>::value)));
};
+// This set of functions SafeMinimum/SafeMaximum would be simplified with
+// C++17 and `if constexpr`.
+
+// clang-format doesn't handle this construct properly. Thus the macro, but it
+// also improves readability.
+//
+// The effective return type of the function is always `I::c_type`, this is
+// just how enable_if works with functions.
+#define RET_TYPE(TRAIT) \
+ typename std::enable_if<TRAIT<O, I>::value, typename I::c_type>::type
+
template <typename O, typename I>
-struct CastFunctor<O, I,
- typename std::enable_if<std::is_same<BooleanType, O>::value &&
- std::is_base_of<Number, I>::value &&
- !std::is_same<O, I>::value>::type> {
- void operator()(FunctionContext* ctx, const CastOptions& options,
- const ArrayData& input, ArrayData* output) {
- auto in_data = input.GetValues<typename I::c_type>(1);
- const auto generate = [&in_data]() -> bool { return *in_data++ != 0; };
- internal::GenerateBitsUnrolled(output->buffers[1]->mutable_data(), output->offset,
- input.length, generate);
- }
-};
+constexpr RET_TYPE(is_number_downcast) SafeMinimum() {
+ using out_type = typename O::c_type;
+
+ return std::numeric_limits<out_type>::lowest();
+}
template <typename O, typename I>
-struct CastFunctor<O, I,
- typename std::enable_if<is_integer_downcast<O, I>::value>::type> {
+constexpr RET_TYPE(is_number_downcast) SafeMaximum() {
+ using out_type = typename O::c_type;
+
+ return std::numeric_limits<out_type>::max();
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_unsigned_to_signed) SafeMinimum() {
+ return 0;
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_unsigned_to_signed) SafeMaximum() {
+ using in_type = typename I::c_type;
+ using out_type = typename O::c_type;
+
+ // Equality is missing because in_type::max() > out_type::max() when types
+ // are of the same width.
+ return static_cast<in_type>(sizeof(in_type) < sizeof(out_type)
+ ? std::numeric_limits<in_type>::max()
+ : std::numeric_limits<out_type>::max());
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_signed_to_unsigned) SafeMinimum() {
+ return 0;
+}
+
+template <typename O, typename I>
+constexpr RET_TYPE(is_integral_signed_to_unsigned) SafeMaximum() {
+ using in_type = typename I::c_type;
+ using out_type = typename O::c_type;
+
+ return static_cast<in_type>(sizeof(in_type) <= sizeof(out_type)
+ ? std::numeric_limits<in_type>::max()
+ : std::numeric_limits<out_type>::max());
+}
+
+#undef RET_TYPE
+
+template <typename O, typename I>
+struct CastFunctor<
+ O, I,
+ typename std::enable_if<is_number_downcast<O, I>::value ||
+ is_integral_signed_to_unsigned<O, I>::value ||
+ is_integral_unsigned_to_signed<O, I>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
using in_type = typename I::c_type;
@@ -236,8 +324,8 @@ struct CastFunctor<O, I,
auto out_data = output->GetMutableValues<out_type>(1);
if (!options.allow_int_overflow) {
- constexpr in_type kMax = static_cast<in_type>(std::numeric_limits<out_type>::max());
- constexpr in_type kMin = static_cast<in_type>(std::numeric_limits<out_type>::min());
+ constexpr in_type kMax = SafeMaximum<O, I>();
+ constexpr in_type kMin = SafeMinimum<O, I>();
// Null count may be -1 if the input array had been sliced
if (input.null_count != 0) {
@@ -267,6 +355,22 @@ struct CastFunctor<O, I,
}
};
+// Float to Integer or Integer to Float
+template <typename O, typename I, typename Enable = void>
+struct is_float_truncate {
+ static constexpr bool value = false;
+};
+
+template <typename O, typename I>
+struct is_float_truncate<
+ O, I,
+ typename std::enable_if<(std::is_base_of<Integer, O>::value &&
+ std::is_base_of<FloatingPoint, I>::value) ||
+ (std::is_base_of<Integer, I>::value &&
+ std::is_base_of<FloatingPoint, O>::value)>::type> {
+ static constexpr bool value = true;
+};
+
template <typename O, typename I>
struct CastFunctor<O, I, typename std::enable_if<is_float_truncate<O, I>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
@@ -290,7 +394,8 @@ struct CastFunctor<O, I, typename std::enable_if<is_float_truncate<O, I>::value>
input.length);
for (int64_t i = 0; i < input.length; ++i) {
auto out_value = static_cast<out_type>(*in_data);
- if (ARROW_PREDICT_FALSE(static_cast<in_type>(out_value) != *in_data)) {
+ if (ARROW_PREDICT_FALSE(is_valid_reader.IsSet() &&
+ static_cast<in_type>(out_value) != *in_data)) {
ctx->SetStatus(Status::Invalid("Floating point value truncated"));
}
*out_data++ = out_value;
@@ -311,11 +416,31 @@ struct CastFunctor<O, I, typename std::enable_if<is_float_truncate<O, I>::value>
}
};
+// Leftover of Number combinations that are safe to cast.
+template <typename O, typename I, typename Enable = void>
+struct is_safe_numeric_cast {
+ static constexpr bool value = false;
+};
+
+template <typename O, typename I>
+struct is_safe_numeric_cast<
+ O, I,
+ typename std::enable_if<std::is_base_of<Number, O>::value &&
+ std::is_base_of<Number, I>::value>::type> {
+ using O_T = typename O::c_type;
+ using I_T = typename I::c_type;
+
+ static constexpr bool value =
+ (std::is_signed<O_T>::value == std::is_signed<I_T>::value) &&
+ (std::is_integral<O_T>::value == std::is_integral<I_T>::value) &&
+ (sizeof(O_T) >= sizeof(I_T)) && (!std::is_same<O, I>::value);
+};
+
template <typename O, typename I>
struct CastFunctor<O, I,
- typename std::enable_if<is_numeric_cast<O, I>::value &&
+ typename std::enable_if<is_safe_numeric_cast<O, I>::value &&
!is_float_truncate<O, I>::value &&
- !is_integer_downcast<O, I>::value>::type> {
+ !is_number_downcast<O, I>::value>::type> {
void operator()(FunctionContext* ctx, const CastOptions& options,
const ArrayData& input, ArrayData* output) {
using in_type = typename I::c_type;
@@ -324,6 +449,8 @@ struct CastFunctor<O, I,
const in_type* in_data = input.GetValues<in_type>(1);
auto out_data = output->GetMutableValues<out_type>(1);
for (int64_t i = 0; i < input.length; ++i) {
+ // Due to various checks done via type-trait, the cast is safe and bear
+ // no truncation.
*out_data++ = static_cast<out_type>(*in_data++);
}
}