You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/08/25 21:41:53 UTC
[impala] 03/04: IMPALA-9962: Implement ds_kll_quantiles_as_string()
function
This is an automated email from the ASF dual-hosted git repository.
tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 41065845e927acef5a0ff95ef8fb32b2f86272d8
Author: Gabor Kaszab <ga...@cloudera.com>
AuthorDate: Thu Aug 6 08:46:47 2020 +0200
IMPALA-9962: Implement ds_kll_quantiles_as_string() function
This function is very similar to ds_kll_quantile() but this one can
receive any number of rank parameters and returns a comma separated
string that holds the results for all of the given ranks.
For more details about ds_kll_quantile() see IMPALA-9959.
Note, ds_kll_quantiles() should return an Array of floats as the result,
but for that we have to wait for complex type support. Until then, we
provide ds_kll_quantiles_as_string(), which can be deprecated once we
have array support. The tracking Jira for returning complex types from
functions is IMPALA-9520.
Change-Id: I76f6039977f4e14ded89a3ee4bc4e6ff855f5e7f
Reviewed-on: http://gerrit.cloudera.org:8080/16324
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exprs/aggregate-functions-ir.cc | 8 --
be/src/exprs/datasketches-common.cc | 8 ++
be/src/exprs/datasketches-common.h | 7 ++
be/src/exprs/datasketches-functions-ir.cc | 34 ++++++
be/src/exprs/datasketches-functions.h | 9 ++
common/function-registry/impala_functions.py | 2 +
.../queries/QueryTest/datasketches-kll.test | 119 +++++++++++++++++++++
7 files changed, 179 insertions(+), 8 deletions(-)
diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc
index 48b2191..d0afd51 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -1615,14 +1615,6 @@ BigIntVal AggregateFunctions::HllFinalize(FunctionContext* ctx, const StringVal&
return estimate;
}
-StringVal StringStreamToStringVal(FunctionContext* ctx, // Wraps the stream's text in a StringVal.
- const stringstream& str_stream) {
- string str = str_stream.str(); // Snapshot of everything written to the stream so far.
- StringVal dst(ctx, str.size()); // Destination of the same length, created through 'ctx'.
- memcpy(dst.ptr, str.c_str(), str.size()); // Copies the characters only, no NUL terminator.
- return dst;
-}
-
/// Auxiliary function that receives a hll_sketch and returns the serialized version of
/// it wrapped into a StringVal.
/// Introducing this function in the .cc to avoid including the whole DataSketches HLL
diff --git a/be/src/exprs/datasketches-common.cc b/be/src/exprs/datasketches-common.cc
index c9bdcaf..dacc0d7 100644
--- a/be/src/exprs/datasketches-common.cc
+++ b/be/src/exprs/datasketches-common.cc
@@ -26,6 +26,7 @@ namespace impala {
using datasketches::hll_sketch;
using datasketches::kll_sketch;
using impala_udf::StringVal;
+using std::stringstream;
void LogSketchDeserializationError(FunctionContext* ctx) {
ctx->SetError("Unable to deserialize sketch.");
@@ -50,5 +51,12 @@ template bool DeserializeDsSketch(const StringVal& serialized_sketch,
template bool DeserializeDsSketch(const StringVal& serialized_sketch,
kll_sketch<float>* sketch);
+/// Copies the text accumulated in 'str_stream' into a StringVal whose buffer is created
+/// through 'ctx'. The copy holds exactly the stream's characters (no NUL terminator).
+/// Per the StringVal(FunctionContext*, int) contract in udf.h, a failed allocation
+/// produces a NULL StringVal; in that case the NULL value is returned untouched instead
+/// of being written to.
+StringVal StringStreamToStringVal(FunctionContext* ctx, const stringstream& str_stream) {
+  const std::string str = str_stream.str();
+  StringVal dst(ctx, str.size());
+  // Never memcpy() into a buffer that could not be allocated.
+  if (dst.is_null) return dst;
+  memcpy(dst.ptr, str.data(), str.size());
+  return dst;
+}
+
}
diff --git a/be/src/exprs/datasketches-common.h b/be/src/exprs/datasketches-common.h
index 37a6458..b721e04 100644
--- a/be/src/exprs/datasketches-common.h
+++ b/be/src/exprs/datasketches-common.h
@@ -17,6 +17,8 @@
#pragma once
+#include <sstream>
+
#include "common/status.h"
#include "thirdparty/datasketches/hll.hpp"
#include "udf/udf.h"
@@ -45,5 +47,10 @@ void LogSketchDeserializationError(FunctionContext* ctx);
template<class T>
bool DeserializeDsSketch(const StringVal& serialized_sketch, T* sketch)
WARN_UNUSED_RESULT;
+
+/// Copies the characters accumulated in 'str_stream' into a newly created StringVal.
+/// The StringVal's buffer is allocated through 'ctx'; the stream itself is not modified.
+StringVal StringStreamToStringVal(FunctionContext* ctx,
+ const std::stringstream& str_stream);
}
diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index 3af1e9e..f9f91e2 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -81,5 +81,39 @@ DoubleVal DataSketchesFunctions::DsKllRank(FunctionContext* ctx,
return sketch.get_rank(probe_value.val);
}
+/// Returns the quantiles of the KLL sketch held in 'serialized_sketch' for the
+/// 'num_args' ranks in 'args', formatted as a comma separated string in the order the
+/// ranks were given.
+/// Returns NULL without raising an error if 'args' is nullptr or if 'serialized_sketch'
+/// is NULL or zero-length. Sets an error on 'ctx' and returns NULL if any rank is NULL
+/// or NaN, if the sketch cannot be deserialized, or if the DataSketches library throws
+/// while computing the quantiles.
+StringVal DataSketchesFunctions::DsKllQuantilesAsString(FunctionContext* ctx,
+    const StringVal& serialized_sketch, int num_args, const DoubleVal* args) {
+  DCHECK(num_args > 0);
+  if (args == nullptr) return StringVal::null();
+  if (serialized_sketch.is_null || serialized_sketch.len == 0) return StringVal::null();
+  // Validate every rank up front so that a bad argument is reported before any work is
+  // spent on deserializing the sketch.
+  for (int i = 0; i < num_args; ++i) {
+    if (args[i].is_null || std::isnan(args[i].val)) {
+      ctx->SetError("NULL or NaN provided in the input list.");
+      return StringVal::null();
+    }
+  }
+  datasketches::kll_sketch<float> sketch;
+  if (!DeserializeDsSketch(serialized_sketch, &sketch)) {
+    LogSketchDeserializationError(ctx);
+    return StringVal::null();
+  }
+  try {
+    // Heap-backed buffer instead of a variable-length array: VLAs are only a compiler
+    // extension in C++ and would place a caller-controlled number of doubles on the
+    // stack. Built inside the try block so an allocation failure is reported as a UDF
+    // error as well.
+    std::vector<double> quantiles_input(num_args);
+    for (int i = 0; i < num_args; ++i) quantiles_input[i] = args[i].val;
+    const std::vector<float> quantiles_results =
+        sketch.get_quantiles(quantiles_input.data(), num_args);
+    std::stringstream result_stream;
+    for (size_t i = 0; i < quantiles_results.size(); ++i) {
+      if (i > 0) result_stream << ",";
+      result_stream << quantiles_results[i];
+    }
+    return StringStreamToStringVal(ctx, result_stream);
+  } catch (const std::exception& e) {
+    ctx->SetError(Substitute("Error while getting quantiles from DataSketches KLL. "
+        "Message: $0", e.what()).c_str());
+    return StringVal::null();
+  }
+}
+
}
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index 7dbee46..817cd47 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -54,6 +54,15 @@ public:
/// 'serialized_sketch'. Note, this is an approximate calculation.
static DoubleVal DsKllRank(FunctionContext* ctx, const StringVal& serialized_sketch,
const FloatVal& probe_value);
+
+ /// 'serialized_sketch' is expected as a serialized Apache DataSketches KLL sketch. If
+ /// it is not, then the query fails. Similar to DsKllQuantile() but takes 'num_args'
+ /// ranks in 'args' and returns a comma separated string with the result for each rank,
+ /// in the given order. A NULL or NaN rank fails the query; a NULL or empty
+ /// 'serialized_sketch' yields NULL. Note, this function is meant to return an Array of
+ /// floats once complex types are supported. Tracking Jira is IMPALA-9520.
+ static StringVal DsKllQuantilesAsString(FunctionContext* ctx,
+ const StringVal& serialized_sketch, int num_args, const DoubleVal* args);
};
}
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 2ec4a62..092a6f0 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -939,6 +939,8 @@ visible_functions = [
'_ZN6impala21DataSketchesFunctions6DsKllNEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
[['ds_kll_rank'], 'DOUBLE', ['STRING', 'FLOAT'],
'_ZN6impala21DataSketchesFunctions9DsKllRankEPN10impala_udf15FunctionContextERKNS1_9StringValERKNS1_8FloatValE'],
+ [['ds_kll_quantiles_as_string'], 'STRING', ['STRING', 'DOUBLE', '...'],
+ '_ZN6impala21DataSketchesFunctions22DsKllQuantilesAsStringEPN10impala_udf15FunctionContextERKNS1_9StringValEiPKNS1_9DoubleValE'],
]
invisible_functions = [
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
index abe3426..e8b8c25 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
@@ -288,3 +288,122 @@ FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT
---- RESULTS
100.1999969482422,25000.099609375,50.90000152587891,NULL,50.5,NULL
====
+---- QUERY
+# Checks that ds_kll_quantiles_as_string() produces NULL for an empty dataset.
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(cast(f2 as float)), 0.5)
+from functional_parquet.emptytable;
+---- TYPES
+STRING
+---- RESULTS
+'NULL'
+====
+---- QUERY
+# Check that ds_kll_quantiles_as_string() returns null for a null input.
+select ds_kll_quantiles_as_string(c, 0.5) from functional_parquet.nulltable;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_kll_quantiles_as_string() returns error for strings that are not
+# serialized sketches.
+select
+ ds_kll_quantiles_as_string(date_string_col, 0.5)
+from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch
+====
+---- QUERY
+# Check error when rank is out of range
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, -1, 1, 0.5)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: Error while getting quantiles from DataSketches KLL. Message: Fraction cannot be less than zero or greater than 1.0
+====
+---- QUERY
+# Check error when rank is out of range
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 1.1, 1, 0.5)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: Error while getting quantiles from DataSketches KLL. Message: Fraction cannot be less than zero or greater than 1.0
+====
+---- QUERY
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 0.2, 0.5, 0.8, 1)
+from functional_parquet.alltypessmall;
+---- RESULTS
+'0,1.1,4.4,7.7,9.9'
+---- TYPES
+STRING
+====
+---- QUERY
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0.5, 1, 0, 0.2, 0.8)
+from functional_parquet.alltypessmall;
+---- RESULTS
+'4.4,9.9,0,1.1,7.7'
+---- TYPES
+STRING
+====
+---- QUERY
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0.5)
+from functional_parquet.alltypessmall;
+---- RESULTS
+'4.4'
+---- TYPES
+STRING
+====
+---- QUERY
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 0.2, NULL, 0.8, 1)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 0.2, 0.5, 0.8, NULL)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), NULL)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 0.5, cast('nan' as float), 1)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 0.5, 1, cast('nan' as float))
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col), cast('nan' as float))
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+ ds_kll_quantiles_as_string(ds_kll_sketch(float_col))
+from functional_parquet.alltypessmall;
+---- CATCH
+AnalysisException: No matching function with signature: ds_kll_quantiles_as_string(STRING)
+====