You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2021/05/26 23:51:37 UTC

[impala] 01/03: IMPALA-10688: Implement ds_cpc_stringify() function

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 67de4a48b08208b3ebe907b05b303c91b68a20d7
Author: Fucun Chu <ch...@hotmail.com>
AuthorDate: Fri Apr 30 16:03:49 2021 +0800

    IMPALA-10688: Implement ds_cpc_stringify() function
    
    This function receives a string that is a serialized Apache
    DataSketches CPC sketch and returns its stringified format.
    
    The stringified format looks like the following and contains this data:
    
    select ds_cpc_stringify(ds_cpc_sketch(float_col)) from
    functional_parquet.alltypestiny;
    +--------------------------------------------+
    | ds_cpc_stringify(ds_cpc_sketch(float_col)) |
    +--------------------------------------------+
    | ### CPC sketch summary:                    |
    |    lg_k           : 11                     |
    |    seed hash      : 93cc                   |
    |    C              : 2                      |
    |    flavor         : 1                      |
    |    merged         : true                   |
    |    intresting col : 0                      |
    |    table entries  : 2                      |
    |    window         : not allocated          |
    | ### End sketch summary                     |
    |                                            |
    +--------------------------------------------+
    
    Change-Id: I8c9d089bfada6bebd078d8f388d2e146c79e5285
    Reviewed-on: http://gerrit.cloudera.org:8080/17373
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-by: Gabor Kaszab <ga...@cloudera.com>
---
 be/src/exprs/datasketches-functions-ir.cc          | 14 ++++++++
 be/src/exprs/datasketches-functions.h              |  6 ++++
 common/function-registry/impala_functions.py       |  2 ++
 .../queries/QueryTest/datasketches-cpc.test        | 37 ++++++++++++++++++++++
 4 files changed, 59 insertions(+)

diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index c754d2f..f741edc 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -120,6 +120,20 @@ BigIntVal DataSketchesFunctions::DsCpcEstimate(
   return sketch.get_estimate();
 }
 
+StringVal DataSketchesFunctions::DsCpcStringify(
+    FunctionContext* ctx, const StringVal& serialized_sketch) {
+  if (serialized_sketch.is_null || serialized_sketch.len == 0) return StringVal::null();
+  datasketches::cpc_sketch sketch(DS_CPC_SKETCH_CONFIG);
+  if (!DeserializeDsSketch(serialized_sketch, &sketch)) {
+    LogSketchDeserializationError(ctx);
+    return StringVal::null();
+  }
+  string str = sketch.to_string();
+  StringVal dst(ctx, str.size());
+  memcpy(dst.ptr, str.c_str(), str.size());
+  return dst;
+}
+
 BigIntVal DataSketchesFunctions::DsThetaEstimate(
     FunctionContext* ctx, const StringVal& serialized_sketch) {
   if (serialized_sketch.is_null || serialized_sketch.len == 0) return 0;
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index 1218c19..2477189 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -70,6 +70,12 @@ public:
   static BigIntVal DsCpcEstimate(
       FunctionContext* ctx, const StringVal& serialized_sketch);
 
+  /// 'serialized_sketch' is expected as a serialized Apache DataSketches CPC sketch. If
+  /// it is not, then the query fails. Returns NULL for a NULL or empty input; otherwise
+  /// returns the stringified format of the Apache DataSketches CPC sketch.
+  static StringVal DsCpcStringify(
+      FunctionContext* ctx, const StringVal& serialized_sketch);
+
   /// 'serialized_sketch' is expected as a serialized Apache DataSketches Theta sketch.
   /// If it is not, then the query fails. Otherwise, returns the count(distinct) estimate
   /// from the sketch.
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 4871914..7c36995 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -1009,6 +1009,8 @@ visible_functions = [
       '_ZN6impala21DataSketchesFunctions14DsHllStringifyEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
   [['ds_cpc_estimate'], 'BIGINT', ['STRING'],
       '_ZN6impala21DataSketchesFunctions13DsCpcEstimateEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
+  [['ds_cpc_stringify'], 'STRING', ['STRING'],
+     '_ZN6impala21DataSketchesFunctions14DsCpcStringifyEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
   [['ds_theta_estimate'], 'BIGINT', ['STRING'],
      '_ZN6impala21DataSketchesFunctions15DsThetaEstimateEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
   [['ds_theta_exclude'], 'STRING', ['STRING', 'STRING'],
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-cpc.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-cpc.test
index b05eb58..4c1ce48 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-cpc.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-cpc.test
@@ -245,4 +245,41 @@ from cpc_sketches_impala_hive;
 BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
 ---- RESULTS
 5,7,6,6,7,8,8,6,NULL
+====
+---- QUERY
+# Check that ds_cpc_stringify() returns null for an empty sketch.
+select ds_cpc_stringify(ds_cpc_sketch(cast(f2 as float))) from functional_parquet.emptytable;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_cpc_stringify() returns null for a null input.
+select ds_cpc_stringify(c) from functional_parquet.nulltable;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_cpc_stringify() returns error for strings that are not serialized sketches.
+select ds_cpc_stringify(date_string_col) from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch.
+====
+---- QUERY
+# Check that ds_cpc_stringify() works on sketches created by Hive.
+select ds_cpc_stringify(f) from cpc_sketches_from_hive;
+---- RESULTS
+row_regex: .*### CPC sketch summary:.*lg_k.*seed hash.*C.*flavor.*### End sketch summary.*
+---- TYPES
+STRING
+====
+---- QUERY
+select ds_cpc_stringify(ds_cpc_sketch(float_col)) from functional_parquet.alltypestiny;
+---- RESULTS
+row_regex: .*### CPC sketch summary:.*lg_k.*seed hash.*C.*flavor.*### End sketch summary.*
+---- TYPES
+STRING
 ====
\ No newline at end of file