You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/02/25 07:57:26 UTC
[impala] 12/14: IMPALA-4848: Add WIDTH_BUCKET() function
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch 2.x
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 16276f4e1f55d973709f6c568a673aff6f308b7f
Author: aphadke <ap...@cloudera.com>
AuthorDate: Wed Feb 15 14:09:51 2017 -0800
IMPALA-4848: Add WIDTH_BUCKET() function
Syntax :
width_bucket(expr decimal, min_val decimal, max_val decimal,
num_buckets int)
This function creates equiwidth histograms , where the histogram range
is divided into num_buckets buckets having identical sizes. This
function returns the bucket in which the expr value would fall. min_val
and max_val are the minimum and maximum value of the histogram range
respectively.
-> This function returns NULL if expr is a NULL.
-> This function returns 0 if expr < min_val
-> This function returns num_buckets + 1 if expr > max_val
E.g.
[localhost:21000] > select width_bucket(8, 1, 20, 3);
+---------------------------+
| width_bucket(8, 1, 20, 3) |
+---------------------------+
| 2 |
+---------------------------+
Change-Id: I081bc916b1bef7b929ca161a9aade3b54c6b858f
Reviewed-on: http://gerrit.cloudera.org:8080/6023
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exprs/expr-test.cc | 65 +++++++++
be/src/exprs/math-functions-ir.cc | 156 ++++++++++++++++++++-
be/src/exprs/math-functions.h | 15 +-
be/src/util/string-util.cc | 8 ++
be/src/util/string-util.h | 4 +
common/function-registry/impala_functions.py | 3 +-
.../main/java/org/apache/impala/analysis/Expr.java | 2 +-
7 files changed, 246 insertions(+), 7 deletions(-)
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index a4ddf15..f76f098 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -59,6 +59,7 @@
#include "udf/udf-test-harness.h"
#include "util/debug-util.h"
#include "util/string-parser.h"
+#include "util/string-util.h"
#include "util/test-info.h"
#include "gen-cpp/ImpalaInternalService_types.h"
@@ -584,6 +585,17 @@ class ExprTest : public testing::Test {
ASSERT_FALSE(status.ok()) << "stmt: " << stmt << "\nunexpected Status::OK.";
}
+ // "Execute 'expr' and check that the returned error ends with 'error_string'"
+ void TestErrorString(const string& expr, const string& error_string) {
+ string stmt = "select " + expr;
+ vector<FieldSchema> result_types;
+ string result_row;
+ Status status = executor_->Exec(stmt, &result_types);
+ status = executor_->FetchResult(&result_row);
+ ASSERT_FALSE(status.ok());
+ ASSERT_TRUE(EndsWith(status.msg().msg(), error_string));
+ }
+
template <typename T> void TestFixedPointComparisons(bool test_boundaries) {
int64_t t_min = numeric_limits<T>::min();
int64_t t_max = numeric_limits<T>::max();
@@ -5301,6 +5313,59 @@ TEST_F(ExprTest, MathFunctions) {
TestValue("sqrt(2.0)", TYPE_DOUBLE, sqrt(2.0));
TestValue("dsqrt(81.0)", TYPE_DOUBLE, 9);
+ TestValue("width_bucket(6.3, 2, 17, 2)", TYPE_BIGINT, 1);
+ TestValue("width_bucket(11, 6, 14, 3)", TYPE_BIGINT, 2);
+ TestValue("width_bucket(-1, -5, 5, 3)", TYPE_BIGINT, 2);
+ TestValue("width_bucket(1, -5, 5, 3)", TYPE_BIGINT, 2);
+ TestValue("width_bucket(3, 5, 20.1, 4)", TYPE_BIGINT, 0);
+ TestIsNull("width_bucket(NULL, 5, 20.1, 4)", TYPE_BIGINT);
+ TestIsNull("width_bucket(22, NULL, 20.1, 4)", TYPE_BIGINT);
+ TestIsNull("width_bucket(22, 5, NULL, 4)", TYPE_BIGINT);
+ TestIsNull("width_bucket(22, 5, 20.1, NULL)", TYPE_BIGINT);
+
+ TestValue("width_bucket(22, 5, 20.1, 4)", TYPE_BIGINT, 5);
+ // Test when the result (bucket number) is greater than the max value that can be
+ // stored in a IntVal
+ TestValue("width_bucket(22, 5, 20.1, 2147483647)", TYPE_BIGINT, 2147483648);
+ // Test when min and max of the bucket width range are equal.
+ TestErrorString("width_bucket(22, 5, 5, 4)",
+ "UDF ERROR: Lower bound cannot be greater than or equal to the upper bound\n");
+ // Test when min > max
+ TestErrorString("width_bucket(22, 50, 5, 4)",
+ "UDF ERROR: Lower bound cannot be greater than or equal to the upper bound\n");
+ // Test max - min will overflow during width_bucket evaluation
+ TestErrorString("width_bucket(11, -9, 99999999999999999999999999999999999999, 4000)",
+ "UDF ERROR: Overflow while evaluating the difference between min_range: -9 and "
+ "max_range: 99999999999999999999999999999999999999\n");
+ // If expr - min overflows during width_bucket evaluation, max - min will also
+ // overflow. Since we evaluate max - min before evaluating expr - min, we will never
+ // end up overflowing expr - min.
+ TestErrorString("width_bucket(1, -99999999999999999999999999999999999999, 9, 40)",
+ "UDF ERROR: Overflow while evaluating the difference between min_range: "
+ "-99999999999999999999999999999999999999 and max_range: 9\n");
+ // Test when dist_from_min * buckets cannot be stored in a int128_t (overflows)
+ // and needs to be stored in a int256_t
+ TestValue("width_bucket(8000000000000000000000000000000000000,"
+ "100000000000000000000000000000000000, 9000000000000000000000000000000000000,"
+ "900000)", TYPE_BIGINT, 798877);
+ // Test when range_size * GetScaleMultiplier(input_scale) cannot be stored in a
+ // int128_t (overflows) and needs to be stored in a int256_t
+ TestValue("width_bucket(100000000, 199999.77777777777777777777777777, 99999999999.99999"
+ ", 40)", TYPE_BIGINT, 1);
+ // Test with max values for expr and num_bucket when the width_bucket can be
+ // evaluated with int128_t. Incrementing one of them will require using int256_t for
+ // width_bucket evaluation
+ TestValue("width_bucket(9999999999999999999999999999999999999, 1,"
+ "99999999999999999999999999999999999999, 15)", TYPE_BIGINT, 2);
+ // Test with the smallest value of num_bucket for the given combination of expr,
+ // max and min value that would require int256_t for evalation
+ TestValue("width_bucket(9999999999999999999999999999999999999, 1,"
+ "99999999999999999999999999999999999999, 16)", TYPE_BIGINT, 2);
+ // Test with the smallest value of expr for the given combination of num_buckets,
+ // max and min value that would require int256_t for evalation
+ TestValue("width_bucket(10000000000000000000000000000000000000, 1,"
+ "99999999999999999999999999999999999999, 15)", TYPE_BIGINT, 2);
+
// Run twice to test deterministic behavior.
for (uint32_t seed : {0, 1234}) {
stringstream rand, random;
diff --git a/be/src/exprs/math-functions-ir.cc b/be/src/exprs/math-functions-ir.cc
index 531bf24..6bfb672 100644
--- a/be/src/exprs/math-functions-ir.cc
+++ b/be/src/exprs/math-functions-ir.cc
@@ -15,21 +15,22 @@
// specific language governing permissions and limitations
// under the License.
-#include "exprs/math-functions.h"
-
#include <iomanip>
#include <random>
#include <sstream>
#include <math.h>
-
+#include <stdint.h>
+#include <string>
#include "exprs/anyval-util.h"
#include "exprs/scalar-expr.h"
#include "exprs/operators.h"
+#include "exprs/math-functions.h"
#include "util/string-parser.h"
+#include "runtime/decimal-value.inline.h"
#include "runtime/runtime-state.h"
#include "runtime/string-value.inline.h"
#include "thirdparty/pcg-cpp-0.98/include/pcg_random.hpp"
-
+#include "exprs/decimal-operators.h"
#include "common/names.h"
using std::uppercase;
@@ -431,6 +432,153 @@ DoubleVal MathFunctions::FmodDouble(FunctionContext* ctx, const DoubleVal& a,
return DoubleVal(fmod(a.val, b.val));
}
+// The bucket_number is evaluated using the following formula:
+//
+// bucket_number = dist_from_min * num_buckets / range_size
+// where -
+// dist_from_min = expr - min_range
+// range_size = max_range - min_range
+// buckets = number of buckets
+//
+// The results of the above subtractions are stored in Decimal16Value to avoid an overflow
+// in the following cases:
+// case 1:
+// T1 is decimal8Value
+// When evaluating this particular expression
+// dist_from_min = expr - min_range
+// If expr is a max positive value which can be represented in decimal8Value and
+// min_range < 0 the resulting dist_from_min can be represented in decimal16Val
+// without overflowing
+// case 2:
+// T1 is decimal16Value
+// Subtracting a negative min_range from expr can result in an overflow in which
+// case the function errors out. There is no decimal32Val to handle this. So
+// storing it in decimal16Value.
+// case 3:
+// T1 is decimal4Value
+// We can store the results in a decimal8Value. But this change hard codes to
+// store the result in decimal16Val for now to be compatible with the other
+// decimal*Vals
+//
+// The result of this multiplication dist_from_min * buckets is stored as a int256_t
+// if storing it in a int128_t would overflow.
+//
+// To perform the division, range_size is scaled up. The scale and precision of the
+// numerator and denominator are adjusted to be the same. This avoids the need to compute
+// the resulting scale and precision.
+template <class T1>
+BigIntVal MathFunctions::WidthBucketImpl(FunctionContext* ctx,
+ const T1& expr, const T1& min_range,
+ const T1& max_range, const IntVal& num_buckets) {
+ // FE casts expr, min_range and max_range to be of the scale and precision
+ int input_scale = ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_SCALE, 1);
+ int input_precision = ctx->impl()->GetConstFnAttr(
+ FunctionContextImpl::ARG_TYPE_PRECISION, 1);
+
+ bool overflow = false;
+ Decimal16Value range_size = max_range.template Subtract<int128_t>(input_scale,
+ min_range, input_scale, input_precision, input_scale, false, &overflow);
+ if (UNLIKELY(overflow)) {
+ ostringstream error_msg;
+ error_msg << "Overflow while evaluating the difference between min_range: " <<
+ min_range.value() << " and max_range: " << max_range.value();
+ ctx->SetError(error_msg.str().c_str());
+ return BigIntVal::null();
+ }
+
+ if (UNLIKELY(num_buckets.val <= 0)) {
+ ostringstream error_msg;
+ error_msg << "Number of buckets should be greater than zero:" << num_buckets.val;
+ ctx->SetError(error_msg.str().c_str());
+ return BigIntVal::null();
+ }
+
+ if (UNLIKELY(min_range >= max_range)) {
+ ctx->SetError("Lower bound cannot be greater than or equal to the upper bound");
+ return BigIntVal::null();
+ }
+
+ if (UNLIKELY(expr < min_range)) return 0;
+
+ if (UNLIKELY(expr >= max_range)) {
+ BigIntVal result;
+ result.val = num_buckets.val;
+ ++result.val;
+ return result;
+ }
+
+ Decimal16Value dist_from_min = expr.template Subtract<int128_t>(input_scale,
+ min_range, input_scale, input_precision, input_scale, false, &overflow);
+ DCHECK_EQ(overflow, false);
+
+ Decimal16Value buckets = Decimal16Value::FromInt(input_precision, input_scale,
+ num_buckets.val, &overflow);
+
+ bool needs_int256 = false;
+ // Check if dist_from_min * buckets would overflow and if there is a need to
+ // store the intermediate results in int256_t to avoid an overflows
+ // Check if scaling up range size overflows and if there is a need to store the
+ // intermediate results in int256_t to avoid the overflow
+ if (UNLIKELY(BitUtil::CountLeadingZeros(abs(buckets.value())) +
+ BitUtil::CountLeadingZeros(abs(dist_from_min.value())) <= 128 ||
+ BitUtil::CountLeadingZeros(range_size.value()) +
+ detail::MinLeadingZerosAfterScaling(BitUtil::CountLeadingZeros(
+ range_size.value()), input_scale) <= 128)) {
+ needs_int256 = true;
+ }
+
+ int128_t result;
+ if (needs_int256) {
+ // resulting scale should be 2 * input_scale as per multiplication rules
+ int256_t x = ConvertToInt256(buckets.value()) * ConvertToInt256(
+ dist_from_min.value());
+
+ // Since "range_size" and "x" have different scales, the divison would require
+ // evaluating the resulting scale. To avoid this we scale up the denominator to
+ // match the scale of the numerator.
+ int256_t y = DecimalUtil::MultiplyByScale<int256_t>(ConvertToInt256(
+ range_size.value()), input_scale, false);
+ result = ConvertToInt128(x / y, DecimalUtil::MAX_UNSCALED_DECIMAL16,
+ &overflow);
+ DCHECK_EQ(overflow, false);
+ } else {
+ // resulting scale should be 2 * input_scale as per multiplication rules
+ int128_t x = buckets.value() * dist_from_min.value();
+
+ // Since "range_size" and "x" have different scales, the divison would require
+ // evaluating the resulting scale. To avoid this we scale up the denominator to
+ // match the scale of the numerator.
+ int128_t y = DecimalUtil::MultiplyByScale<int128_t>(range_size.value(),
+ input_scale, false);
+ result = x / y; // NOLINT: clang-tidy thinks y may equal zero here.
+ }
+ return (BigIntVal(abs(result) + 1));
+}
+
+BigIntVal MathFunctions::WidthBucket(FunctionContext* ctx, const DecimalVal& expr,
+ const DecimalVal& min_range, const DecimalVal& max_range,
+ const IntVal& num_buckets) {
+ if (expr.is_null || min_range.is_null || max_range.is_null || num_buckets.is_null) {
+ return BigIntVal::null();
+ }
+
+ int arg_size_type = ctx->impl()->GetConstFnAttr(FunctionContextImpl::ARG_TYPE_SIZE, 0);
+ switch (arg_size_type) {
+ case 4:
+ return WidthBucketImpl<Decimal4Value> (ctx, Decimal4Value(expr.val4),
+ Decimal4Value(min_range.val4), Decimal4Value(max_range.val4), num_buckets);
+ case 8:
+ return WidthBucketImpl<Decimal8Value> (ctx, Decimal8Value(expr.val8),
+ Decimal8Value(min_range.val8), Decimal8Value(max_range.val8), num_buckets);
+ case 16:
+ return WidthBucketImpl<Decimal16Value>(ctx, Decimal16Value(expr.val16),
+ Decimal16Value(min_range.val16), Decimal16Value(max_range.val16), num_buckets);
+ default:
+ DCHECK(false);
+ return BigIntVal::null();
+ }
+}
+
template <typename T> T MathFunctions::Positive(FunctionContext* ctx, const T& val) {
return val;
}
diff --git a/be/src/exprs/math-functions.h b/be/src/exprs/math-functions.h
index 7063907..e9f5c89 100644
--- a/be/src/exprs/math-functions.h
+++ b/be/src/exprs/math-functions.h
@@ -15,7 +15,6 @@
// specific language governing permissions and limitations
// under the License.
-
#ifndef IMPALA_EXPRS_MATH_FUNCTIONS_H
#define IMPALA_EXPRS_MATH_FUNCTIONS_H
@@ -112,6 +111,11 @@ class MathFunctions {
template <bool ISLEAST> static DecimalVal LeastGreatest(
FunctionContext*, int num_args, const DecimalVal* args);
+
+ static BigIntVal WidthBucket(FunctionContext* ctx, const DecimalVal& expr,
+ const DecimalVal& min_range, const DecimalVal& max_range,
+ const IntVal& num_buckets);
+
private:
static const int32_t MIN_BASE = 2;
static const int32_t MAX_BASE = 36;
@@ -135,6 +139,15 @@ class MathFunctions {
/// Returns false otherwise, indicating some other error condition.
static bool HandleParseResult(int8_t dest_base, int64_t* num,
StringParser::ParseResult parse_res);
+
+ /// This function creates equiwidth histograms , where the histogram range
+ /// is divided into num_buckets buckets having identical sizes. This function
+ /// returns the bucket in which the expr value would fall. min_val and
+ /// max_val are the minimum and maximum value of the histogram range
+ /// respectively.
+ template <typename T1>
+ static BigIntVal WidthBucketImpl(FunctionContext* ctx,const T1& expr,
+ const T1& min_range,const T1& max_range, const IntVal& num_buckets);
};
}
diff --git a/be/src/util/string-util.cc b/be/src/util/string-util.cc
index f8e284f..e442391 100644
--- a/be/src/util/string-util.cc
+++ b/be/src/util/string-util.cc
@@ -65,4 +65,12 @@ bool CommaSeparatedContains(const std::string& cs_list, const std::string& item)
return false;
}
+bool EndsWith(const std::string& full_string, const std::string& end) {
+ if (full_string.size() >= end.size()) {
+ return (full_string.compare(full_string.size() - end.size(), end.size(),
+ end) == 0);
+ }
+ return false;
+}
+
}
diff --git a/be/src/util/string-util.h b/be/src/util/string-util.h
index 2811a3c..8db010c 100644
--- a/be/src/util/string-util.h
+++ b/be/src/util/string-util.h
@@ -40,6 +40,10 @@ Status TruncateUp(const std::string& str, int32_t max_length, std::string* resul
/// Return true if the comma-separated string 'cs_list' contains 'item' as one of
/// the comma-separated values.
bool CommaSeparatedContains(const std::string& cs_list, const std::string& item);
+
+/// Return true if a given string 'full_str' ends with the characters in the
+/// 'end' string
+bool EndsWith(const std::string& full_string, const std::string& end);
}
#endif
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index b3387c6..1aba496 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -388,7 +388,8 @@ visible_functions = [
'_ZN6impala13MathFunctions13LeastGreatestILb0EEEN10impala_udf9StringValEPNS2_15FunctionContextEiPKS3_'],
[['greatest'], 'DECIMAL', ['DECIMAL', '...'],
'_ZN6impala13MathFunctions13LeastGreatestILb0EEEN10impala_udf10DecimalValEPNS2_15FunctionContextEiPKS3_'],
-
+ [['width_bucket'], 'BIGINT', ['DECIMAL', 'DECIMAL', 'DECIMAL', 'INT'],
+ '_ZN6impala13MathFunctions11WidthBucketEPN10impala_udf15FunctionContextERKNS1_10DecimalValES6_S6_RKNS1_6IntValE'],
# Decimal Functions
# TODO: oracle has decimal support for transcendental functions (e.g. sin()) to very
# high precisions. Do we need them? It's unclear if other databases do the same.
diff --git a/fe/src/main/java/org/apache/impala/analysis/Expr.java b/fe/src/main/java/org/apache/impala/analysis/Expr.java
index 5cdcb16..e85a4a4 100644
--- a/fe/src/main/java/org/apache/impala/analysis/Expr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/Expr.java
@@ -504,12 +504,12 @@ abstract public class Expr extends TreeNode<Expr> implements ParseNode, Cloneabl
ScalarType decimalType = (ScalarType) childType;
result = decimalType.getMinResolutionDecimal();
} else {
- Preconditions.checkState(childType.isDecimal() || result.isDecimal());
result = Type.getAssignmentCompatibleType(
result, childType, false, strictDecimal);
}
}
if (result != null && !result.isNull()) {
+ result = ((ScalarType)result).getMinResolutionDecimal();
Preconditions.checkState(result.isDecimal() || result.isInvalid());
Preconditions.checkState(!result.isWildcardDecimal());
}