You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by mi...@apache.org on 2018/08/22 23:20:25 UTC
[6/7] impala git commit: IMPALA-7412: width_bucket() function overflows too easily

IMPALA-7412: width_bucket() function overflows too easily

Running the tests of https://gerrit.cloudera.org/#/c/10859/
it turned out that the width_bucket() function overflows
very often.

A common problem is that the function tries to cast the
'num_buckets' parameter to the decimal determined by the
Frontend. When the Frontend determined the precision and
scale of this decimal it only considered the decimal
arguments and ignored everything else. Therefore the
determined precision and scale is often not suitable for
the 'num_buckets' parameter.

WidthBucketImpl() has three decimal arguments, all of them
have the same byte size, precision, and scale. So it is
possible to interpret them as plain integers and still
calculate the proper bucket.

I included the python test cases from IMPALA-7202 developed
by Taras Bobrovytsky.
I also extended the backend tests with new test cases.

For performance test I used the following query:

SELECT sum(width_bucket(cast(l_orderkey AS DECIMAL(30, 10)),
           0, 5500000, 1000000))
FROM tpch_parquet.lineitem;

The new implementation executed it in ~0.3 seconds.
The old implementation executed it in ~0.8 seconds.

Change-Id: I68262698144029ef7f54e027e586eaf105f36ab3
Reviewed-on: http://gerrit.cloudera.org:8080/11282
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/632e0e2e
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/632e0e2e
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/632e0e2e

Branch: refs/heads/master
Commit: 632e0e2e36463fc8dce7a50ddf2defe8dae25def
Parents: e27954a
Author: Zoltan Borok-Nagy <bo...@cloudera.com>
Authored: Wed Aug 8 17:17:37 2018 +0200
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Wed Aug 22 18:26:23 2018 +0000

----------------------------------------------------------------------
 be/src/exprs/expr-test.cc             |  51 +++++----
 be/src/exprs/math-functions-ir.cc     | 159 +++++++++++------------------
 be/src/util/bit-util.h                |  53 +++++++++-
 tests/query_test/test_decimal_fuzz.py |  63 +++++++++++-
 4 files changed, 201 insertions(+), 125 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/632e0e2e/be/src/exprs/expr-test.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index d0e3ff3..49d0ae2 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -5348,16 +5348,11 @@ TEST_F(ExprTest, MathFunctions) {
   // Test when min > max
   TestErrorString("width_bucket(22, 50, 5, 4)",
       "UDF ERROR: Lower bound cannot be greater than or equal to the upper bound\n");
-  // Test max - min will overflow during width_bucket evaluation
-  TestErrorString("width_bucket(11, -9, 99999999999999999999999999999999999999, 4000)",
-      "UDF ERROR: Overflow while evaluating the difference between min_range: -9 and "
-      "max_range: 99999999999999999999999999999999999999\n");
-  // If expr - min overflows during width_bucket evaluation, max - min will also
-  // overflow. Since we evaluate max - min before evaluating expr - min, we will never
-  // end up overflowing expr - min.
-  TestErrorString("width_bucket(1, -99999999999999999999999999999999999999, 9, 40)",
-      "UDF ERROR: Overflow while evaluating the difference between min_range: "
-      "-99999999999999999999999999999999999999 and max_range: 9\n");
+  // IMPALA-7412: Test max - min should not overflow anymore
+  TestValue("width_bucket(11, -9, 99999999999999999999999999999999999999, 4000)",
+      TYPE_BIGINT, 1);
+  TestValue("width_bucket(1, -99999999999999999999999999999999999999, 9, 40)",
+      TYPE_BIGINT, 40);
   // Test when dist_from_min * buckets cannot be stored in a int128_t (overflows)
   // and needs to be stored in a int256_t
   TestValue("width_bucket(8000000000000000000000000000000000000,"
@@ -5380,16 +5375,34 @@ TEST_F(ExprTest, MathFunctions) {
   // max and min value that would require int256_t for evalation
   TestValue("width_bucket(10000000000000000000000000000000000000, 1,"
             "99999999999999999999999999999999999999, 15)", TYPE_BIGINT, 2);
-  // IMPALA-7242/IMPALA-7243: check for overflow when converting IntVal to DecimalValue
-  TestErrorString("width_bucket(cast(-0.10 as decimal(37,30)), cast(-0.36028797018963968 "
+  // IMPALA-7412: These should not overflow anymore
+  TestValue("width_bucket(cast(-0.10 as decimal(37,30)), cast(-0.36028797018963968 "
       "as decimal(25,25)), cast(9151517.4969773200562764155787276999832"
-      "as decimal(38,31)), 1328180220)",
-      "UDF ERROR: Overflow while representing the num_buckets:1328180220 as a "
-      "DecimalVal\n");
-  TestErrorString("width_bucket(cast(9 as decimal(10,7)), cast(-60000 as decimal(11,6)), "
-      "cast(10 as decimal(7,5)), 249895273);",
-      "UDF ERROR: Overflow while representing the num_buckets:249895273 as a "
-      "DecimalVal\n");
+      "as decimal(38,31)), 1328180220)", TYPE_BIGINT, 38);
+  TestValue("width_bucket(cast(9 as decimal(10,7)), cast(-60000 as decimal(11,6)), "
+      "cast(10 as decimal(7,5)), 249895273);", TYPE_BIGINT, 249891109);
+  // max - min and expr - min needs bigger type than the underlying type of
+  // the deduced decimal. The calculation must succeed by using a bigger type.
+  TestValue("width_bucket(cast(0.9999 as decimal(35,35)), cast(-0.705408425140 as "
+      "decimal(23,23)), cast(0.999999999999999999999 as decimal(38,38)), 699997927)",
+      TYPE_BIGINT, 699956882ll);
+  // max - min needs bigger type, but expr - min and (expr - min) * num_buckets fits
+  // into deduced decimal
+  TestValue("width_bucket(cast(-0.7054084251 as decimal(23,23)), cast(-0.705408425140 "
+      "as decimal(23,23)), cast(0.999999999999999999999 as decimal(38,38)), 10)",
+      TYPE_BIGINT, 1);
+  // max - min fits into deduced decimal, (max - min) * num_buckets needs bigger type,
+  // but expr == min
+  TestValue("width_bucket(cast(1 as decimal(9,0)), cast(1 as decimal(9,0)), "
+      "cast(100000000 as decimal(9,0)), 100)", TYPE_BIGINT, 1);
+  // max - min fits into deduced decimal, (max - min) * num_buckets needs bigger type,
+  // but (expr - min) * num_buckets fits
+  TestValue("width_bucket(cast(2 as decimal(9,0)), cast(1 as decimal(9,0)), "
+      "cast(100000000 as decimal(9,0)), 100)", TYPE_BIGINT, 1);
+  // max - min fits into deduced decimal, but (expr - min) * num_buckets needs bigger type
+  TestValue("width_bucket(cast(100000000 as decimal(9,0)), cast(1 as decimal(9,0)), "
+      "cast(100000001 as decimal(9,0)), 100)", TYPE_BIGINT, 100);
+
   // Run twice to test deterministic behavior.
   for (uint32_t seed : {0, 1234}) {
     stringstream rand, random;

http://git-wip-us.apache.org/repos/asf/impala/blob/632e0e2e/be/src/exprs/math-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/math-functions-ir.cc b/be/src/exprs/math-functions-ir.cc
index 28c9e1e..efe429e 100644
--- a/be/src/exprs/math-functions-ir.cc
+++ b/be/src/exprs/math-functions-ir.cc
@@ -457,57 +457,32 @@ DoubleVal MathFunctions::FmodDouble(FunctionContext* ctx, const DoubleVal& a,
 
 // The bucket_number is evaluated using the following formula:
 //
-//   bucket_number = dist_from_min * num_buckets / range_size
+//   bucket_number = dist_from_min * num_buckets / range_size + 1
 //      where -
 //        dist_from_min = expr - min_range
 //        range_size = max_range - min_range
-//        buckets = number of buckets
+//        num_buckets = number of buckets
 //
-// The results of the above subtractions are stored in Decimal16Value to avoid an overflow
-// in the following cases:
-//   case 1:
-//      T1 is decimal8Value
-//         When evaluating this particular expression
-//            dist_from_min = expr - min_range
-//         If expr is a max positive value which can be represented in decimal8Value and
-//         min_range < 0 the resulting dist_from_min can be represented in decimal16Val
-//         without overflowing
-//   case 2:
-//      T1 is decimal16Value
-//         Subtracting a negative min_range from expr can result in an overflow in which
-//         case the function errors out. There is no decimal32Val to handle this. So
-//         storing it in decimal16Value.
-//   case 3:
-//      T1 is decimal4Value
-//         We can store the results in a decimal8Value. But this change hard codes to
-//         store the result in decimal16Val for now to be compatible with the other
-//         decimal*Vals
+// Since expr, min_range, and max_range are all decimals with the same
+// byte size, precision, and scale we can interpret them as plain integers
+// and still calculate the proper bucket.
 //
-// The result of this multiplication dist_from_min * buckets is stored as a int256_t
-// if storing it in a int128_t would overflow.
+// There are some possibilities of overflowing during the calculation:
+// range_size = max_range - min_range
+// dist_from_min = expr - min_range
+// dist_from_min * num_buckets
 //
-// To perform the division, range_size is scaled up. The scale and precision of the
-// numerator and denominator are adjusted to be the same. This avoids the need to compute
-// the resulting scale and precision.
+// For all the above cases we use a bigger integer type provided by the
+// BitUtil::DoubleWidth<> metafunction.
 template <class  T1>
 BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx,
     const T1& expr, const T1& min_range,
     const T1& max_range, const IntVal& num_buckets) {
-  // FE casts expr, min_range and max_range to be of the scale and precision
-  int input_scale = ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_SCALE, 1);
-  int input_precision = ctx->impl()->GetConstFnAttr(
-      FunctionContextImpl::ARG_TYPE_PRECISION, 1);
-
-  bool overflow = false;
-  Decimal16Value range_size = max_range.template Subtract<int128_t>(input_scale,
-      min_range, input_scale, input_precision, input_scale, false, &overflow);
-  if (UNLIKELY(overflow)) {
-    ostringstream error_msg;
-    error_msg << "Overflow while evaluating the difference between min_range: " <<
-        min_range.value() << " and max_range: " << max_range.value();
-    ctx->SetError(error_msg.str().c_str());
-    return BigIntVal::null();
-  }
+  auto expr_val = expr.value();
+  using ActualType = decltype(expr_val);
+  auto min_range_val = min_range.value();
+  auto max_range_val = max_range.value();
+  auto num_buckets_val = static_cast<ActualType>(num_buckets.val);
 
   if (UNLIKELY(num_buckets.val <= 0)) {
     ostringstream error_msg;
@@ -516,73 +491,61 @@ BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx,
     return BigIntVal::null();
   }
 
-  if (UNLIKELY(min_range >= max_range)) {
+  if (UNLIKELY(min_range_val >= max_range_val)) {
     ctx->SetError("Lower bound cannot be greater than or equal to the upper bound");
     return BigIntVal::null();
   }
 
-  if (UNLIKELY(expr < min_range)) return 0;
-
-  if (UNLIKELY(expr >= max_range)) {
-    BigIntVal result;
-    result.val = num_buckets.val;
-    ++result.val;
-    return result;
+  if (expr_val < min_range_val) return 0;
+  if (expr_val >= max_range_val) {
+    return BigIntVal(static_cast<int64_t>(num_buckets.val) + 1);
   }
 
-  Decimal16Value dist_from_min = expr.template Subtract<int128_t>(input_scale,
-      min_range, input_scale, input_precision, input_scale, false, &overflow);
-  DCHECK_EQ(overflow, false);
-
-  Decimal16Value buckets = Decimal16Value::FromInt(input_precision, input_scale,
-      num_buckets.val, &overflow);
-
-  if (UNLIKELY(overflow)) {
-    stringstream error_msg;
-    error_msg << "Overflow while representing the num_buckets:" << num_buckets.val
-        << " as a DecimalVal";
-    ctx->SetError(error_msg.str().c_str());
-    return BigIntVal::null();
-  }
-  bool needs_int256 = false;
-  // Check if dist_from_min * buckets would overflow and if there is a need to
-  // store the intermediate results in int256_t to avoid an overflows
-  // Check if scaling up range size overflows and if there is a need to store the
-  // intermediate results in int256_t to avoid the overflow
-  if (UNLIKELY(BitUtil::CountLeadingZeros(abs(buckets.value())) +
-      BitUtil::CountLeadingZeros(abs(dist_from_min.value())) <= 128 ||
-      BitUtil::CountLeadingZeros(range_size.value()) +
-      detail::MinLeadingZerosAfterScaling(BitUtil::CountLeadingZeros(
-      range_size.value()), input_scale) <= 128)) {
-    needs_int256 = true;
+  bool bigger_type_needed = false;
+  // It is likely that this if stmt can be evaluated during codegen because
+  // 'max_range' and 'min_range' are almost certainly constant expressions:
+  if (max_range_val >= 0 && min_range_val < 0) {
+    if (static_cast<UnsignedType<ActualType>>(max_range_val) +
+        static_cast<UnsignedType<ActualType>>(abs(min_range_val)) >=
+        static_cast<UnsignedType<ActualType>>(BitUtil::Max<ActualType>())) {
+      bigger_type_needed = true;
+    }
   }
 
-  int128_t result;
-  if (needs_int256) {
-    // resulting scale should be 2 * input_scale as per multiplication rules
-    int256_t x = ConvertToInt256(buckets.value()) * ConvertToInt256(
-        dist_from_min.value());
-
-    // Since "range_size" and "x" have different scales, the divison would require
-    // evaluating the resulting scale. To avoid this we scale up the denominator to
-    // match the scale of the numerator.
-    int256_t y = DecimalUtil::MultiplyByScale<int256_t>(ConvertToInt256(
-        range_size.value()), input_scale, false);
-    result = ConvertToInt128(x / y, DecimalUtil::MAX_UNSCALED_DECIMAL16,
-        &overflow);
-    DCHECK_EQ(overflow, false);
+  auto MultiplicationOverflows = [](ActualType lhs, ActualType rhs) {
+    DCHECK(lhs > 0 && rhs > 0);
+    using ActualType = decltype(lhs);
+    return BitUtil::CountLeadingZeros(lhs) + BitUtil::CountLeadingZeros(rhs) <=
+        BitUtil::UnsignedWidth<ActualType>() + 1;
+  };
+
+  // It is likely that this can be evaluated during codegen:
+  bool multiplication_can_overflow = bigger_type_needed ||  MultiplicationOverflows(
+      max_range_val - min_range_val, num_buckets_val);
+  // 'expr_val' is not likely to be a constant expression, so this almost certainly
+  // needs runtime evaluation if 'bigger_type_needed' is false and
+  // 'multiplication_can_overflow' is true:
+  bigger_type_needed = bigger_type_needed || (
+      multiplication_can_overflow &&
+      expr_val != min_range_val &&
+      MultiplicationOverflows(expr_val - min_range_val, num_buckets_val));
+
+  auto BucketFunc = [](auto element, auto min_rng, auto max_rng, auto buckets) {
+    auto range_size = max_rng - min_rng;
+    auto dist_from_min = element - min_rng;
+    auto ret = dist_from_min * buckets / range_size;
+    return BigIntVal(static_cast<int64_t>(ret) + 1);
+  };
+
+  if (bigger_type_needed) {
+    using BiggerType = typename DoubleWidth<ActualType>::type;
+
+    return BucketFunc(static_cast<BiggerType>(expr_val),
+        static_cast<BiggerType>(min_range_val), static_cast<BiggerType>(max_range_val),
+        static_cast<BiggerType>(num_buckets.val));
   } else {
-    // resulting scale should be 2 * input_scale as per multiplication rules
-    int128_t x = buckets.value() * dist_from_min.value();
-
-    // Since "range_size" and "x" have different scales, the divison would require
-    // evaluating the resulting scale. To avoid this we scale up the denominator to
-    // match the scale of the numerator.
-    int128_t y = DecimalUtil::MultiplyByScale<int128_t>(range_size.value(),
-        input_scale, false);
-    result = x / y; // NOLINT: clang-tidy thinks y may equal zero here.
+    return BucketFunc(expr_val, min_range_val, max_range_val, num_buckets.val);
   }
-  return (BigIntVal(abs(result) + 1));
 }
 
 BigIntVal MathFunctions::WidthBucket(FunctionContext* ctx, const DecimalVal& expr,

http://git-wip-us.apache.org/repos/asf/impala/blob/632e0e2e/be/src/util/bit-util.h
----------------------------------------------------------------------
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index 8a65509..d07dd9d 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -28,8 +28,7 @@
 #include <climits>
 #include <limits>
 #include <typeinfo>
-
-#include <boost/type_traits/make_unsigned.hpp>
+#include <type_traits>
 
 #include "common/compiler-util.h"
 #include "gutil/bits.h"
@@ -39,7 +38,40 @@
 
 namespace impala {
 
-using boost::make_unsigned;
+/// Nested 'type' corresponds to the unsigned version of T.
+template <typename T>
+struct MakeUnsigned {
+  using type = std::make_unsigned_t<T>;
+};
+
+template <>
+struct MakeUnsigned<int128_t> {
+  using type = __uint128_t;
+};
+
+template <typename T>
+using UnsignedType = typename MakeUnsigned<T>::type;
+
+// Doubles the width of integer types (e.g. int32_t -> int64_t).
+// Currently only works with a few signed types.
+// Feel free to extend it to other types as well.
+template <typename T>
+struct DoubleWidth {};
+
+template <>
+struct DoubleWidth<int32_t> {
+  using type = int64_t;
+};
+
+template <>
+struct DoubleWidth<int64_t> {
+  using type = int128_t;
+};
+
+template <>
+struct DoubleWidth<int128_t> {
+  using type = int256_t;
+};
 
 /// Utility class to do standard bit tricks
 /// TODO: is this in boost or something else like that?
@@ -59,6 +91,17 @@ class BitUtil {
         std::is_same<CVR_REMOVED, __int128>::value ? 127 : -1;
   }
 
+  /// Returns the max value that can be represented in T.
+  template<typename T, typename CVR_REMOVED = typename std::decay<T>::type,
+      typename std::enable_if<std::is_integral<CVR_REMOVED> {}||
+                              std::is_same<CVR_REMOVED, __int128> {}, int>::type = 0>
+  constexpr static inline CVR_REMOVED Max() {
+    return std::is_integral<CVR_REMOVED>::value ?
+        std::numeric_limits<CVR_REMOVED>::max() :
+        std::is_same<CVR_REMOVED, __int128>::value ?
+            static_cast<UnsignedType<CVR_REMOVED>>(-1) / 2 : -1;
+  }
+
   /// Return an integer signifying the sign of the value, returning +1 for
   /// positive integers (and zero), -1 for negative integers.
   /// The extra shift is to silence GCC warnings about full width shift on
@@ -168,7 +211,7 @@ class BitUtil {
   template<typename T>
   static inline int PopcountSigned(T v) {
     // Converting to same-width unsigned then extending preserves the bit pattern.
-    return BitUtil::Popcount(static_cast<typename make_unsigned<T>::type>(v));
+    return BitUtil::Popcount(static_cast<UnsignedType<T>>(v));
   }
 
   /// Returns the 'num_bits' least-significant bits of 'v'.
@@ -249,7 +292,7 @@ class BitUtil {
   template <typename T>
   constexpr static T ShiftRightLogical(T v, int shift) {
     // Conversion to unsigned ensures most significant bits always filled with 0's
-    return static_cast<typename make_unsigned<T>::type>(v) >> shift;
+    return static_cast<UnsignedType<T>>(v) >> shift;
   }
 
   /// Get an specific bit of a numeric type

http://git-wip-us.apache.org/repos/asf/impala/blob/632e0e2e/tests/query_test/test_decimal_fuzz.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_decimal_fuzz.py b/tests/query_test/test_decimal_fuzz.py
index a129e33..1433ec3 100644
--- a/tests/query_test/test_decimal_fuzz.py
+++ b/tests/query_test/test_decimal_fuzz.py
@@ -30,6 +30,9 @@ from tests.common.test_vector import ImpalaTestDimension, ImpalaTestMatrix
 
 class TestDecimalFuzz(ImpalaTestSuite):
 
+  # Impala's max precision for decimals is 38, so we should have the same in the tests
+  decimal.getcontext().prec = 38
+
   @classmethod
   def get_workload(cls):
     return 'functional-query'
@@ -200,7 +203,7 @@ class TestDecimalFuzz(ImpalaTestSuite):
         return True
     return False
 
-  def execute_one(self):
+  def execute_one_decimal_op(self):
     '''Executes a single query and compares the result to a result that we computed in
     Python.'''
     op = random.choice(['+', '-', '*', '/', '%'])
@@ -243,6 +246,60 @@ class TestDecimalFuzz(ImpalaTestSuite):
         expected_result = None
       assert self.result_equals(expected_result, result)
 
-  def test_fuzz(self, vector):
+  def test_decimal_ops(self, vector):
+    for _ in xrange(self.iterations):
+      self.execute_one_decimal_op()
+
+  def width_bucket(self, val, min_range, max_range, num_buckets):
+    # Multiplying the values by 10**40 guarantees that the numbers can be converted
+    # to int without losing information.
+    val_int = int(decimal.Decimal(val) * 10**40)
+    min_range_int = int(decimal.Decimal(min_range) * 10**40)
+    max_range_int = int(decimal.Decimal(max_range) * 10**40)
+
+    if min_range_int >= max_range_int:
+      return None
+    if val_int < min_range_int:
+      return 0
+    if val_int > max_range_int:
+      return num_buckets + 1
+
+    range_size = max_range_int - min_range_int
+    dist_from_min = val_int - min_range_int
+    return (num_buckets * dist_from_min) / range_size + 1
+
+  def execute_one_width_bucket(self):
+    val, val_prec, val_scale = self.get_decimal()
+    min_range, min_range_prec, min_range_scale = self.get_decimal()
+    max_range, max_range_prec, max_range_scale = self.get_decimal()
+    num_buckets = random.randint(1, 2147483647)
+
+    query = ('select width_bucket('
+        'cast({val} as decimal({val_prec},{val_scale})), '
+        'cast({min_range} as decimal({min_range_prec},{min_range_scale})), '
+        'cast({max_range} as decimal({max_range_prec},{max_range_scale})), '
+        '{num_buckets})')
+
+    query = query.format(val=val, val_prec=val_prec, val_scale=val_scale,
+        min_range=min_range, min_range_prec=min_range_prec,
+        min_range_scale=min_range_scale,
+        max_range=max_range, max_range_prec=max_range_prec,
+        max_range_scale=max_range_scale,
+        num_buckets=num_buckets)
+
+    expected_result = self.width_bucket(val, min_range, max_range, num_buckets)
+    if not expected_result:
+      return
+
+    try:
+      result = self.execute_scalar(query, query_options={'decimal_v2': 'true'})
+      assert int(result) == expected_result
+    except ImpalaBeeswaxException as e:
+      if "You need to wrap the arguments in a CAST" not in str(e):
+        # Sometimes the decimal inputs are incompatible with each other, so it's ok
+        # to ignore this error.
+        raise e
+
+  def test_width_bucket(self, vector):
     for _ in xrange(self.iterations):
-      self.execute_one()
+      self.execute_one_width_bucket()