You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2021/05/26 23:51:37 UTC
[impala] 01/03: IMPALA-10688: Implement ds_cpc_stringify() function
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 67de4a48b08208b3ebe907b05b303c91b68a20d7
Author: Fucun Chu <ch...@hotmail.com>
AuthorDate: Fri Apr 30 16:03:49 2021 +0800
IMPALA-10688: Implement ds_cpc_stringify() function
This function receives a string that is a serialized Apache
DataSketches CPC sketch and returns its stringified format.
The stringified format contains the following data and looks like this example:
select ds_cpc_stringify(ds_cpc_sketch(float_col)) from
functional_parquet.alltypestiny;
+--------------------------------------------+
| ds_cpc_stringify(ds_cpc_sketch(float_col)) |
+--------------------------------------------+
| ### CPC sketch summary: |
| lg_k : 11 |
| seed hash : 93cc |
| C : 2 |
| flavor : 1 |
| merged : true |
| intresting col : 0 |
| table entries : 2 |
| window : not allocated |
| ### End sketch summary |
| |
+--------------------------------------------+
Change-Id: I8c9d089bfada6bebd078d8f388d2e146c79e5285
Reviewed-on: http://gerrit.cloudera.org:8080/17373
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Reviewed-by: Gabor Kaszab <ga...@cloudera.com>
---
be/src/exprs/datasketches-functions-ir.cc | 14 ++++++++
be/src/exprs/datasketches-functions.h | 6 ++++
common/function-registry/impala_functions.py | 2 ++
.../queries/QueryTest/datasketches-cpc.test | 37 ++++++++++++++++++++++
4 files changed, 59 insertions(+)
diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index c754d2f..f741edc 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -120,6 +120,20 @@ BigIntVal DataSketchesFunctions::DsCpcEstimate(
return sketch.get_estimate();
}
+StringVal DataSketchesFunctions::DsCpcStringify(
+ FunctionContext* ctx, const StringVal& serialized_sketch) {
+ if (serialized_sketch.is_null || serialized_sketch.len == 0) return StringVal::null();
+ datasketches::cpc_sketch sketch(DS_CPC_SKETCH_CONFIG);
+ if (!DeserializeDsSketch(serialized_sketch, &sketch)) {
+ LogSketchDeserializationError(ctx);
+ return StringVal::null();
+ }
+ string str = sketch.to_string();
+ StringVal dst(ctx, str.size());
+ memcpy(dst.ptr, str.c_str(), str.size());
+ return dst;
+}
+
BigIntVal DataSketchesFunctions::DsThetaEstimate(
FunctionContext* ctx, const StringVal& serialized_sketch) {
if (serialized_sketch.is_null || serialized_sketch.len == 0) return 0;
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index 1218c19..2477189 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -70,6 +70,12 @@ public:
static BigIntVal DsCpcEstimate(
FunctionContext* ctx, const StringVal& serialized_sketch);
+ /// 'serialized_sketch' is expected as a serialized Apache DataSketches CPC sketch. If
+ /// it is not, then the query fails. This function returns the stringified format of
+ /// an Apache DataSketches CPC sketch.
+ static StringVal DsCpcStringify(
+ FunctionContext* ctx, const StringVal& serialized_sketch);
+
/// 'serialized_sketch' is expected as a serialized Apache DataSketches Theta sketch.
/// If it is not, then the query fails. Otherwise, returns the count(distinct) estimate
/// from the sketch.
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 4871914..7c36995 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -1009,6 +1009,8 @@ visible_functions = [
'_ZN6impala21DataSketchesFunctions14DsHllStringifyEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
[['ds_cpc_estimate'], 'BIGINT', ['STRING'],
'_ZN6impala21DataSketchesFunctions13DsCpcEstimateEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
+ [['ds_cpc_stringify'], 'STRING', ['STRING'],
+ '_ZN6impala21DataSketchesFunctions14DsCpcStringifyEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
[['ds_theta_estimate'], 'BIGINT', ['STRING'],
'_ZN6impala21DataSketchesFunctions15DsThetaEstimateEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
[['ds_theta_exclude'], 'STRING', ['STRING', 'STRING'],
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-cpc.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-cpc.test
index b05eb58..4c1ce48 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-cpc.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-cpc.test
@@ -245,4 +245,41 @@ from cpc_sketches_impala_hive;
BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
---- RESULTS
5,7,6,6,7,8,8,6,NULL
+====
+---- QUERY
+# Check that ds_cpc_stringify() returns null for an empty sketch.
+select ds_cpc_stringify(ds_cpc_sketch(cast(f2 as float))) from functional_parquet.emptytable;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_cpc_stringify() returns null for a null input.
+select ds_cpc_stringify(c) from functional_parquet.nulltable;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_cpc_stringify() returns error for strings that are not serialized sketches.
+select ds_cpc_stringify(date_string_col) from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch.
+====
+---- QUERY
+# Check that ds_cpc_stringify() works on sketches created by Hive.
+select ds_cpc_stringify(f) from cpc_sketches_from_hive;
+---- RESULTS
+row_regex: .*### CPC sketch summary:.*lg_k.*seed hash.*C.*flavor.*### End sketch summary.*
+---- TYPES
+STRING
+====
+---- QUERY
+select ds_cpc_stringify(ds_cpc_sketch(float_col)) from functional_parquet.alltypestiny;
+---- RESULTS
+row_regex: .*### CPC sketch summary:.*lg_k.*seed hash.*C.*flavor.*### End sketch summary.*
+---- TYPES
+STRING
====
\ No newline at end of file