You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2020/08/06 20:12:54 UTC

[impala] 01/02: IMPALA-9963: Implement ds_kll_n() function

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 87aeb2ad78e2106f1d8df84d4d84975c7cde5b5a
Author: Gabor Kaszab <ga...@cloudera.com>
AuthorDate: Thu Jul 30 09:41:00 2020 +0200

    IMPALA-9963: Implement ds_kll_n() function
    
    This function receives a serialized Apache DataSketches KLL sketch
    and returns how many input values were fed into this sketch.
    
    Change-Id: I166e87a468e68e888ac15fca7429ac2552dbb781
    Reviewed-on: http://gerrit.cloudera.org:8080/16259
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exprs/datasketches-common.h                 |  2 +-
 be/src/exprs/datasketches-functions-ir.cc          | 11 +++++++
 be/src/exprs/datasketches-functions.h              |  5 +++
 common/function-registry/impala_functions.py       |  2 ++
 .../queries/QueryTest/datasketches-kll.test        | 37 ++++++++++++++++++++++
 5 files changed, 56 insertions(+), 1 deletion(-)

diff --git a/be/src/exprs/datasketches-common.h b/be/src/exprs/datasketches-common.h
index 7560692..37a6458 100644
--- a/be/src/exprs/datasketches-common.h
+++ b/be/src/exprs/datasketches-common.h
@@ -37,7 +37,7 @@ const int DS_SKETCH_CONFIG = 12;
 /// Logs a common error message saying that sketch deserialization failed.
 void LogSketchDeserializationError(FunctionContext* ctx);
 
-/// Receives a serialized DataSketches sketch  (either Hll or KLL) in
+/// Receives a serialized DataSketches sketch (either Hll or KLL) in
 /// 'serialized_sketch', deserializes it and puts the deserialized sketch into 'sketch'.
 /// The outgoing 'sketch' will hold the same configs as 'serialized_sketch' regardless of
 /// what was provided when it was constructed before this function call. Returns false if
diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index d2898bc..b76cbe9 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -59,5 +59,16 @@ FloatVal DataSketchesFunctions::DsKllQuantile(FunctionContext* ctx,
   }
 }
 
+BigIntVal DataSketchesFunctions::DsKllN(FunctionContext* ctx,
+    const StringVal& serialized_sketch) {
+  if (serialized_sketch.is_null || serialized_sketch.len == 0) return BigIntVal::null();
+  datasketches::kll_sketch<float> sketch;
+  if (!DeserializeDsSketch(serialized_sketch, &sketch)) {  // Deserialization failed.
+    LogSketchDeserializationError(ctx);
+    return BigIntVal::null();
+  }
+  return sketch.get_n();  // Number of input values fed into the sketch.
+}
+
 }
 
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index 143fd69..bd6b76c 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -42,6 +42,11 @@ public:
   /// of [0,1]. Otherwise this function returns error.
   static FloatVal DsKllQuantile(FunctionContext* ctx, const StringVal& serialized_sketch,
       const DoubleVal& rank);
+
+  /// 'serialized_sketch' is expected as a serialized Apache DataSketches KLL sketch. If
+  /// it is not, then the query fails. Returns NULL if the input is NULL or empty,
+  /// otherwise returns the number of input values fed to 'serialized_sketch'.
+  static BigIntVal DsKllN(FunctionContext* ctx, const StringVal& serialized_sketch);
 };
 
 }
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 8398785..fbed357 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -935,6 +935,8 @@ visible_functions = [
       '_ZN6impala21DataSketchesFunctions13DsHllEstimateEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
   [['ds_kll_quantile'], 'FLOAT', ['STRING', 'DOUBLE'],
       '_ZN6impala21DataSketchesFunctions13DsKllQuantileEPN10impala_udf15FunctionContextERKNS1_9StringValERKNS1_9DoubleValE'],
+  [['ds_kll_n'], 'BIGINT', ['STRING'],
+      '_ZN6impala21DataSketchesFunctions6DsKllNEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
 ]
 
 invisible_functions = [
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
index b7b734b..ee240bf 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
@@ -144,3 +144,40 @@ FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT
 ---- RESULTS
 100.1999969482422,25000.099609375,50.90000152587891,NULL,50.5,NULL
 ====
+---- QUERY
+# Check that ds_kll_n() returns null for an empty sketch.
+select ds_kll_n(ds_kll_sketch(cast(f2 as float))) from functional_parquet.emptytable;
+---- RESULTS
+NULL
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Check that ds_kll_n() returns null for a null input.
+select ds_kll_n(c) from functional_parquet.nulltable;
+---- RESULTS
+NULL
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Check that ds_kll_n() returns an error for strings that are not serialized sketches.
+select ds_kll_n(date_string_col) from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch
+====
+---- QUERY
+select ds_kll_n(float_sketch) from sketch_store where year=2009 and month=1;
+---- RESULTS
+25
+---- TYPES
+BIGINT
+====
+---- QUERY
+# Check that ds_kll_n() works on sketches created by Hive.
+select ds_kll_n(f) from kll_sketches_from_hive;
+---- RESULTS
+6
+---- TYPES
+BIGINT
+====