You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/02/25 07:57:26 UTC

[impala] 12/14: IMPALA-4848: Add WIDTH_BUCKET() function

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 16276f4e1f55d973709f6c568a673aff6f308b7f
Author: aphadke <ap...@cloudera.com>
AuthorDate: Wed Feb 15 14:09:51 2017 -0800

    IMPALA-4848: Add WIDTH_BUCKET() function
    
    Syntax :
    width_bucket(expr decimal, min_val decimal, max_val decimal,
      num_buckets int)
    
    This function creates equiwidth histograms , where the histogram range
    is divided into num_buckets buckets having identical sizes. This
    function returns the bucket in which the expr value would fall. min_val
    and max_val are the minimum and maximum value of the histogram range
    respectively.
    
    -> This function returns NULL if expr is a NULL.
    -> This function returns 0 if expr < min_val
    -> This function returns num_buckets + 1 if expr > max_val
    
     E.g.
    [localhost:21000] > select width_bucket(8, 1, 20, 3);
    +---------------------------+
    | width_bucket(8, 1, 20, 3) |
    +---------------------------+
    | 2                         |
    +---------------------------+
    
    Change-Id: I081bc916b1bef7b929ca161a9aade3b54c6b858f
    Reviewed-on: http://gerrit.cloudera.org:8080/6023
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exprs/expr-test.cc                          |  65 +++++++++
 be/src/exprs/math-functions-ir.cc                  | 156 ++++++++++++++++++++-
 be/src/exprs/math-functions.h                      |  15 +-
 be/src/util/string-util.cc                         |   8 ++
 be/src/util/string-util.h                          |   4 +
 common/function-registry/impala_functions.py       |   3 +-
 .../main/java/org/apache/impala/analysis/Expr.java |   2 +-
 7 files changed, 246 insertions(+), 7 deletions(-)

diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index a4ddf15..f76f098 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -59,6 +59,7 @@
 #include "udf/udf-test-harness.h"
 #include "util/debug-util.h"
 #include "util/string-parser.h"
+#include "util/string-util.h"
 #include "util/test-info.h"
 #include "gen-cpp/ImpalaInternalService_types.h"
 
@@ -584,6 +585,17 @@ class ExprTest : public testing::Test {
     ASSERT_FALSE(status.ok()) << "stmt: " << stmt << "\nunexpected Status::OK.";
   }
 
+  // "Execute 'expr' and check that the returned error ends with 'error_string'"
+  void TestErrorString(const string& expr, const string& error_string) {
+    string stmt = "select " + expr;
+    vector<FieldSchema> result_types;
+    string result_row;
+    Status status = executor_->Exec(stmt, &result_types);
+    status = executor_->FetchResult(&result_row);
+    ASSERT_FALSE(status.ok());
+    ASSERT_TRUE(EndsWith(status.msg().msg(), error_string));
+  }
+
   template <typename T> void TestFixedPointComparisons(bool test_boundaries) {
     int64_t t_min = numeric_limits<T>::min();
     int64_t t_max = numeric_limits<T>::max();
@@ -5301,6 +5313,59 @@ TEST_F(ExprTest, MathFunctions) {
   TestValue("sqrt(2.0)", TYPE_DOUBLE, sqrt(2.0));
   TestValue("dsqrt(81.0)", TYPE_DOUBLE, 9);
 
+  TestValue("width_bucket(6.3, 2, 17, 2)", TYPE_BIGINT, 1);
+  TestValue("width_bucket(11, 6, 14, 3)", TYPE_BIGINT, 2);
+  TestValue("width_bucket(-1, -5, 5, 3)", TYPE_BIGINT, 2);
+  TestValue("width_bucket(1, -5, 5, 3)", TYPE_BIGINT, 2);
+  TestValue("width_bucket(3, 5, 20.1, 4)", TYPE_BIGINT, 0);
+  TestIsNull("width_bucket(NULL, 5, 20.1, 4)", TYPE_BIGINT);
+  TestIsNull("width_bucket(22, NULL, 20.1, 4)", TYPE_BIGINT);
+  TestIsNull("width_bucket(22, 5, NULL, 4)", TYPE_BIGINT);
+  TestIsNull("width_bucket(22, 5, 20.1, NULL)", TYPE_BIGINT);
+
+  TestValue("width_bucket(22, 5, 20.1, 4)", TYPE_BIGINT, 5);
+  // Test when the result (bucket number) is greater than the max value that can be
+  // stored in a IntVal
+  TestValue("width_bucket(22, 5, 20.1, 2147483647)", TYPE_BIGINT, 2147483648);
+  // Test when min and max of the bucket width range are equal.
+  TestErrorString("width_bucket(22, 5, 5, 4)",
+      "UDF ERROR: Lower bound cannot be greater than or equal to the upper bound\n");
+  // Test when min > max
+  TestErrorString("width_bucket(22, 50, 5, 4)",
+      "UDF ERROR: Lower bound cannot be greater than or equal to the upper bound\n");
+  // Test max - min will overflow during width_bucket evaluation
+  TestErrorString("width_bucket(11, -9, 99999999999999999999999999999999999999, 4000)",
+      "UDF ERROR: Overflow while evaluating the difference between min_range: -9 and "
+      "max_range: 99999999999999999999999999999999999999\n");
+  // If expr - min overflows during width_bucket evaluation, max - min will also
+  // overflow. Since we evaluate max - min before evaluating expr - min, we will never
+  // end up overflowing expr - min.
+  TestErrorString("width_bucket(1, -99999999999999999999999999999999999999, 9, 40)",
+      "UDF ERROR: Overflow while evaluating the difference between min_range: "
+      "-99999999999999999999999999999999999999 and max_range: 9\n");
+  // Test when dist_from_min * buckets cannot be stored in a int128_t (overflows)
+  // and needs to be stored in a int256_t
+  TestValue("width_bucket(8000000000000000000000000000000000000,"
+      "100000000000000000000000000000000000, 9000000000000000000000000000000000000,"
+      "900000)", TYPE_BIGINT, 798877);
+  // Test when range_size * GetScaleMultiplier(input_scale) cannot be stored in a
+  // int128_t (overflows) and needs to be stored in a int256_t
+  TestValue("width_bucket(100000000, 199999.77777777777777777777777777, 99999999999.99999"
+    ", 40)", TYPE_BIGINT, 1);
+  // Test with max values for expr and num_bucket when the width_bucket can be
+  // evaluated with int128_t. Incrementing one of them will require using int256_t for
+  // width_bucket evaluation
+  TestValue("width_bucket(9999999999999999999999999999999999999, 1,"
+            "99999999999999999999999999999999999999, 15)", TYPE_BIGINT, 2);
+  // Test with the smallest value of num_bucket for the given combination of expr,
+  // max and min value that would require int256_t for evalation
+  TestValue("width_bucket(9999999999999999999999999999999999999, 1,"
+            "99999999999999999999999999999999999999, 16)", TYPE_BIGINT, 2);
+  // Test with the smallest value of expr for the given combination of num_buckets,
+  // max and min value that would require int256_t for evalation
+  TestValue("width_bucket(10000000000000000000000000000000000000, 1,"
+            "99999999999999999999999999999999999999, 15)", TYPE_BIGINT, 2);
+
   // Run twice to test deterministic behavior.
   for (uint32_t seed : {0, 1234}) {
     stringstream rand, random;
diff --git a/be/src/exprs/math-functions-ir.cc b/be/src/exprs/math-functions-ir.cc
index 531bf24..6bfb672 100644
--- a/be/src/exprs/math-functions-ir.cc
+++ b/be/src/exprs/math-functions-ir.cc
@@ -15,21 +15,22 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "exprs/math-functions.h"
-
 #include <iomanip>
 #include <random>
 #include <sstream>
 #include <math.h>
-
+#include <stdint.h>
+#include <string>
 #include "exprs/anyval-util.h"
 #include "exprs/scalar-expr.h"
 #include "exprs/operators.h"
+#include "exprs/math-functions.h"
 #include "util/string-parser.h"
+#include "runtime/decimal-value.inline.h"
 #include "runtime/runtime-state.h"
 #include "runtime/string-value.inline.h"
 #include "thirdparty/pcg-cpp-0.98/include/pcg_random.hpp"
-
+#include "exprs/decimal-operators.h"
 #include "common/names.h"
 
 using std::uppercase;
@@ -431,6 +432,153 @@ DoubleVal MathFunctions::FmodDouble(FunctionContext* ctx, const DoubleVal& a,
   return DoubleVal(fmod(a.val, b.val));
 }
 
+// The bucket_number is evaluated using the following formula:
+//
+//   bucket_number = dist_from_min * num_buckets / range_size
+//      where -
+//        dist_from_min = expr - min_range
+//        range_size = max_range - min_range
+//        buckets = number of buckets
+//
+// The results of the above subtractions are stored in Decimal16Value to avoid an overflow
+// in the following cases:
+//   case 1:
+//      T1 is decimal8Value
+//         When evaluating this particular expression
+//            dist_from_min = expr - min_range
+//         If expr is a max positive value which can be represented in decimal8Value and
+//         min_range < 0 the resulting dist_from_min can be represented in decimal16Val
+//         without overflowing
+//   case 2:
+//      T1 is decimal16Value
+//         Subtracting a negative min_range from expr can result in an overflow in which
+//         case the function errors out. There is no decimal32Val to handle this. So
+//         storing it in decimal16Value.
+//   case 3:
+//      T1 is decimal4Value
+//         We can store the results in a decimal8Value. But this change hard codes to
+//         store the result in decimal16Val for now to be compatible with the other
+//         decimal*Vals
+//
+// The result of this multiplication dist_from_min * buckets is stored as a int256_t
+// if storing it in a int128_t would overflow.
+//
+// To perform the division, range_size is scaled up. The scale and precision of the
+// numerator and denominator are adjusted to be the same. This avoids the need to compute
+// the resulting scale and precision.
+template <class  T1>
+BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx,
+    const T1& expr, const T1& min_range,
+    const T1& max_range, const IntVal& num_buckets) {
+  // FE casts expr, min_range and max_range to be of the scale and precision
+  int input_scale = ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_SCALE, 1);
+  int input_precision = ctx->impl()->GetConstFnAttr(
+      FunctionContextImpl::ARG_TYPE_PRECISION, 1);
+
+  bool overflow = false;
+  Decimal16Value range_size = max_range.template Subtract<int128_t>(input_scale,
+      min_range, input_scale, input_precision, input_scale, false, &overflow);
+  if (UNLIKELY(overflow)) {
+    ostringstream error_msg;
+    error_msg << "Overflow while evaluating the difference between min_range: " <<
+        min_range.value() << " and max_range: " << max_range.value();
+    ctx->SetError(error_msg.str().c_str());
+    return BigIntVal::null();
+  }
+
+  if (UNLIKELY(num_buckets.val <= 0)) {
+    ostringstream error_msg;
+    error_msg << "Number of buckets should be greater than zero:" << num_buckets.val;
+    ctx->SetError(error_msg.str().c_str());
+    return BigIntVal::null();
+  }
+
+  if (UNLIKELY(min_range >= max_range)) {
+    ctx->SetError("Lower bound cannot be greater than or equal to the upper bound");
+    return BigIntVal::null();
+  }
+
+  if (UNLIKELY(expr < min_range)) return 0;
+
+  if (UNLIKELY(expr >= max_range)) {
+    BigIntVal result;
+    result.val = num_buckets.val;
+    ++result.val;
+    return result;
+  }
+
+  Decimal16Value dist_from_min = expr.template Subtract<int128_t>(input_scale,
+      min_range, input_scale, input_precision, input_scale, false, &overflow);
+  DCHECK_EQ(overflow, false);
+
+  Decimal16Value buckets = Decimal16Value::FromInt(input_precision, input_scale,
+      num_buckets.val, &overflow);
+
+  bool needs_int256 = false;
+  // Check if dist_from_min * buckets would overflow and if there is a need to
+  // store the intermediate results in int256_t to avoid an overflows
+  // Check if scaling up range size overflows and if there is a need to store the
+  // intermediate results in int256_t to avoid the overflow
+  if (UNLIKELY(BitUtil::CountLeadingZeros(abs(buckets.value())) +
+      BitUtil::CountLeadingZeros(abs(dist_from_min.value())) <= 128 ||
+      BitUtil::CountLeadingZeros(range_size.value()) +
+      detail::MinLeadingZerosAfterScaling(BitUtil::CountLeadingZeros(
+      range_size.value()), input_scale) <= 128)) {
+    needs_int256 = true;
+  }
+
+  int128_t result;
+  if (needs_int256) {
+    // resulting scale should be 2 * input_scale as per multiplication rules
+    int256_t x = ConvertToInt256(buckets.value()) * ConvertToInt256(
+        dist_from_min.value());
+
+    // Since "range_size" and "x" have different scales, the divison would require
+    // evaluating the resulting scale. To avoid this we scale up the denominator to
+    // match the scale of the numerator.
+    int256_t y = DecimalUtil::MultiplyByScale<int256_t>(ConvertToInt256(
+        range_size.value()), input_scale, false);
+    result = ConvertToInt128(x / y, DecimalUtil::MAX_UNSCALED_DECIMAL16,
+        &overflow);
+    DCHECK_EQ(overflow, false);
+  } else {
+    // resulting scale should be 2 * input_scale as per multiplication rules
+    int128_t x = buckets.value() * dist_from_min.value();
+
+    // Since "range_size" and "x" have different scales, the divison would require
+    // evaluating the resulting scale. To avoid this we scale up the denominator to
+    // match the scale of the numerator.
+    int128_t y = DecimalUtil::MultiplyByScale<int128_t>(range_size.value(),
+        input_scale, false);
+    result = x / y; // NOLINT: clang-tidy thinks y may equal zero here.
+  }
+  return (BigIntVal(abs(result) + 1));
+}
+
+BigIntVal MathFunctions::WidthBucket(FunctionContext* ctx, const DecimalVal& expr,
+    const DecimalVal& min_range, const DecimalVal& max_range,
+    const IntVal& num_buckets) {
+  if (expr.is_null || min_range.is_null || max_range.is_null || num_buckets.is_null) {
+    return BigIntVal::null();
+  }
+
+  int arg_size_type = ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_SIZE, 0);
+  switch (arg_size_type) {
+    case 4:
+      return WidthBucketImpl<Decimal4Value> (ctx, Decimal4Value(expr.val4),
+          Decimal4Value(min_range.val4), Decimal4Value(max_range.val4), num_buckets);
+    case 8:
+      return WidthBucketImpl<Decimal8Value> (ctx, Decimal8Value(expr.val8),
+          Decimal8Value(min_range.val8), Decimal8Value(max_range.val8), num_buckets);
+    case 16:
+      return WidthBucketImpl<Decimal16Value>(ctx, Decimal16Value(expr.val16),
+         Decimal16Value(min_range.val16), Decimal16Value(max_range.val16), num_buckets);
+    default:
+      DCHECK(false);
+      return BigIntVal::null();
+  }
+}
+
 template <typename T> T MathFunctions::Positive(FunctionContext* ctx, const T& val) {
   return val;
 }
diff --git a/be/src/exprs/math-functions.h b/be/src/exprs/math-functions.h
index 7063907..e9f5c89 100644
--- a/be/src/exprs/math-functions.h
+++ b/be/src/exprs/math-functions.h
@@ -15,7 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-
 #ifndef IMPALA_EXPRS_MATH_FUNCTIONS_H
 #define IMPALA_EXPRS_MATH_FUNCTIONS_H
 
@@ -112,6 +111,11 @@ class MathFunctions {
   template <bool ISLEAST> static DecimalVal LeastGreatest(
       FunctionContext*, int num_args, const DecimalVal* args);
 
+
+  static BigIntVal WidthBucket(FunctionContext* ctx, const DecimalVal& expr,
+      const DecimalVal& min_range, const DecimalVal& max_range,
+      const IntVal& num_buckets);
+
  private:
   static const int32_t MIN_BASE = 2;
   static const int32_t MAX_BASE = 36;
@@ -135,6 +139,15 @@ class MathFunctions {
   /// Returns false otherwise, indicating some other error condition.
   static bool HandleParseResult(int8_t dest_base, int64_t* num,
       StringParser::ParseResult parse_res);
+
+  /// This function creates equiwidth histograms , where the histogram range
+  /// is divided into num_buckets buckets having identical sizes. This function
+  /// returns the bucket in which the expr value would fall. min_val and
+  /// max_val are the minimum and maximum value of the histogram range
+  /// respectively.
+  template <typename T1>
+  static BigIntVal WidthBucketImpl(FunctionContext* ctx,const T1& expr,
+      const T1& min_range,const T1& max_range, const IntVal& num_buckets);
 };
 
 }
diff --git a/be/src/util/string-util.cc b/be/src/util/string-util.cc
index f8e284f..e442391 100644
--- a/be/src/util/string-util.cc
+++ b/be/src/util/string-util.cc
@@ -65,4 +65,12 @@ bool CommaSeparatedContains(const std::string& cs_list, const std::string& item)
   return false;
 }
 
+bool EndsWith(const std::string& full_string, const std::string& end) {
+  if (full_string.size() >= end.size()) {
+    return (full_string.compare(full_string.size() - end.size(), end.size(),
+        end) == 0);
+  }
+  return false;
+}
+
 }
diff --git a/be/src/util/string-util.h b/be/src/util/string-util.h
index 2811a3c..8db010c 100644
--- a/be/src/util/string-util.h
+++ b/be/src/util/string-util.h
@@ -40,6 +40,10 @@ Status TruncateUp(const std::string& str, int32_t max_length, std::string* resul
 /// Return true if the comma-separated string 'cs_list' contains 'item' as one of
 /// the comma-separated values.
 bool CommaSeparatedContains(const std::string& cs_list, const std::string& item);
+
+/// Return true if a given string 'full_str' ends with the characters in the
+/// 'end' string
+bool EndsWith(const std::string& full_string, const std::string& end);
 }
 
 #endif
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index b3387c6..1aba496 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -388,7 +388,8 @@ visible_functions = [
    '_ZN6impala13MathFunctions13LeastGreatestILb0EEEN10impala_udf9StringValEPNS2_15FunctionContextEiPKS3_'],
   [['greatest'], 'DECIMAL', ['DECIMAL', '...'],
    '_ZN6impala13MathFunctions13LeastGreatestILb0EEEN10impala_udf10DecimalValEPNS2_15FunctionContextEiPKS3_'],
-
+  [['width_bucket'], 'BIGINT', ['DECIMAL', 'DECIMAL', 'DECIMAL', 'INT'],
+    '_ZN6impala13MathFunctions11WidthBucketEPN10impala_udf15FunctionContextERKNS1_10DecimalValES6_S6_RKNS1_6IntValE'],
   # Decimal Functions
   # TODO: oracle has decimal support for transcendental functions (e.g. sin()) to very
   # high precisions. Do we need them? It's unclear if other databases do the same.
diff --git a/fe/src/main/java/org/apache/impala/analysis/Expr.java b/fe/src/main/java/org/apache/impala/analysis/Expr.java
index 5cdcb16..e85a4a4 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Expr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java
@@ -504,12 +504,12 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl
         ScalarType decimalType = (ScalarType) childType;
         result = decimalType.getMinResolutionDecimal();
       } else {
-        Preconditions.checkState(childType.isDecimal() || result.isDecimal());
         result = Type.getAssignmentCompatibleType(
             result, childType, false, strictDecimal);
       }
     }
     if (result != null && !result.isNull()) {
+      result = ((ScalarType)result).getMinResolutionDecimal();
       Preconditions.checkState(result.isDecimal() || result.isInvalid());
       Preconditions.checkState(!result.isWildcardDecimal());
     }