You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2020/08/06 20:12:54 UTC
[impala] 01/02: IMPALA-9963: Implement ds_kll_n() function
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 87aeb2ad78e2106f1d8df84d4d84975c7cde5b5a
Author: Gabor Kaszab <ga...@cloudera.com>
AuthorDate: Thu Jul 30 09:41:00 2020 +0200
IMPALA-9963: Implement ds_kll_n() function
This function receives a serialized Apache DataSketches KLL sketch
and returns how many input values were fed into this sketch.
Change-Id: I166e87a468e68e888ac15fca7429ac2552dbb781
Reviewed-on: http://gerrit.cloudera.org:8080/16259
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exprs/datasketches-common.h | 2 +-
be/src/exprs/datasketches-functions-ir.cc | 11 +++++++
be/src/exprs/datasketches-functions.h | 5 +++
common/function-registry/impala_functions.py | 2 ++
.../queries/QueryTest/datasketches-kll.test | 37 ++++++++++++++++++++++
5 files changed, 56 insertions(+), 1 deletion(-)
diff --git a/be/src/exprs/datasketches-common.h b/be/src/exprs/datasketches-common.h
index 7560692..37a6458 100644
--- a/be/src/exprs/datasketches-common.h
+++ b/be/src/exprs/datasketches-common.h
@@ -37,7 +37,7 @@ const int DS_SKETCH_CONFIG = 12;
/// Logs a common error message saying that sketch deserialization failed.
void LogSketchDeserializationError(FunctionContext* ctx);
-/// Receives a serialized DataSketches sketch (either Hll or KLL) in
+/// Receives a serialized DataSketches sketch (either Hll or KLL) in
/// 'serialized_sketch', deserializes it and puts the deserialized sketch into 'sketch'.
/// The outgoing 'sketch' will hold the same configs as 'serialized_sketch' regardless of
/// what was provided when it was constructed before this function call. Returns false if
diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index d2898bc..b76cbe9 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -59,5 +59,16 @@ FloatVal DataSketchesFunctions::DsKllQuantile(FunctionContext* ctx,
}
}
+BigIntVal DataSketchesFunctions::DsKllN(FunctionContext* ctx,
+ const StringVal& serialized_sketch) {
+ if (serialized_sketch.is_null || serialized_sketch.len == 0) return BigIntVal::null();
+ datasketches::kll_sketch<float> sketch;  // Filled in by DeserializeDsSketch() below.
+ if (!DeserializeDsSketch(serialized_sketch, &sketch)) {
+ LogSketchDeserializationError(ctx);  // Logs that sketch deserialization failed.
+ return BigIntVal::null();
+ }
+ return sketch.get_n();  // Number of input values fed into the sketch.
+}
+
}
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index 143fd69..bd6b76c 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -42,6 +42,11 @@ public:
/// of [0,1]. Otherwise this function returns error.
static FloatVal DsKllQuantile(FunctionContext* ctx, const StringVal& serialized_sketch,
const DoubleVal& rank);
+
+ /// 'serialized_sketch' is expected as a serialized Apache DataSketches KLL sketch. If
+ /// it is not, then the query fails. A NULL or empty 'serialized_sketch' yields NULL.
+ /// Otherwise returns the number of input values fed to 'serialized_sketch'.
+ static BigIntVal DsKllN(FunctionContext* ctx, const StringVal& serialized_sketch);
};
}
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 8398785..fbed357 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -935,6 +935,8 @@ visible_functions = [
'_ZN6impala21DataSketchesFunctions13DsHllEstimateEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
[['ds_kll_quantile'], 'FLOAT', ['STRING', 'DOUBLE'],
'_ZN6impala21DataSketchesFunctions13DsKllQuantileEPN10impala_udf15FunctionContextERKNS1_9StringValERKNS1_9DoubleValE'],
+ [['ds_kll_n'], 'BIGINT', ['STRING'],
+ '_ZN6impala21DataSketchesFunctions6DsKllNEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
]
invisible_functions = [
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
index b7b734b..ee240bf 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
@@ -144,3 +144,40 @@ FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT
---- RESULTS
100.1999969482422,25000.099609375,50.90000152587891,NULL,50.5,NULL
====
+---- QUERY
+# Check that ds_kll_n() returns null for an empty sketch.
+select ds_kll_n(ds_kll_sketch(cast(f2 as float))) from functional_parquet.emptytable;
+---- RESULTS
+NULL
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Check that ds_kll_n() returns null for a null input.
+select ds_kll_n(c) from functional_parquet.nulltable;
+---- RESULTS
+NULL
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Check that ds_kll_n() returns error for strings that are not serialized sketches.
+select ds_kll_n(date_string_col) from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch
+====
+---- QUERY
+select ds_kll_n(float_sketch) from sketch_store where year=2009 and month=1;
+---- RESULTS
+25
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Check that ds_kll_n() works on sketches created by Hive.
+select ds_kll_n(f) from kll_sketches_from_hive;
+---- RESULTS
+6
+---- TYPES
+BIGINT
+====