You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2020/08/25 21:41:53 UTC

[impala] 03/04: IMPALA-9962: Implement ds_kll_quantiles_as_string() function

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 41065845e927acef5a0ff95ef8fb32b2f86272d8
Author: Gabor Kaszab <ga...@cloudera.com>
AuthorDate: Thu Aug 6 08:46:47 2020 +0200

    IMPALA-9962: Implement ds_kll_quantiles_as_string() function
    
    This function is very similar to ds_kll_quantile() but this one can
    receive any number of rank parameters and returns a comma separated
    string that holds the results for all of the given ranks.
    For more details about ds_kll_quantile() see IMPALA-9959.
    
    Note, ds_kll_quantiles() should return an Array of floats as the result
    but for that we have to wait for complex type support. Until then, we
    provide ds_kll_quantiles_as_string() that can be deprecated once we
    have array support. Tracking Jira for returning complex types from
    functions is IMPALA-9520.
    
    Change-Id: I76f6039977f4e14ded89a3ee4bc4e6ff855f5e7f
    Reviewed-on: http://gerrit.cloudera.org:8080/16324
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exprs/aggregate-functions-ir.cc             |   8 --
 be/src/exprs/datasketches-common.cc                |   8 ++
 be/src/exprs/datasketches-common.h                 |   7 ++
 be/src/exprs/datasketches-functions-ir.cc          |  34 ++++++
 be/src/exprs/datasketches-functions.h              |   9 ++
 common/function-registry/impala_functions.py       |   2 +
 .../queries/QueryTest/datasketches-kll.test        | 119 +++++++++++++++++++++
 7 files changed, 179 insertions(+), 8 deletions(-)

diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc
index 48b2191..d0afd51 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -1615,14 +1615,6 @@ BigIntVal AggregateFunctions::HllFinalize(FunctionContext* ctx, const StringVal&
   return estimate;
 }
 
-StringVal StringStreamToStringVal(FunctionContext* ctx,
-    const stringstream& str_stream) {
-  string str = str_stream.str();
-  StringVal dst(ctx, str.size());
-  memcpy(dst.ptr, str.c_str(), str.size());
-  return dst;
-}
-
 /// Auxiliary function that receives a hll_sketch and returns the serialized version of
 /// it wrapped into a StringVal.
 /// Introducing this function in the .cc to avoid including the whole DataSketches HLL
diff --git a/be/src/exprs/datasketches-common.cc b/be/src/exprs/datasketches-common.cc
index c9bdcaf..dacc0d7 100644
--- a/be/src/exprs/datasketches-common.cc
+++ b/be/src/exprs/datasketches-common.cc
@@ -26,6 +26,7 @@ namespace impala {
 using datasketches::hll_sketch;
 using datasketches::kll_sketch;
 using impala_udf::StringVal;
+using std::stringstream;
 
 void LogSketchDeserializationError(FunctionContext* ctx) {
   ctx->SetError("Unable to deserialize sketch.");
@@ -50,5 +51,12 @@ template bool DeserializeDsSketch(const StringVal& serialized_sketch,
 template bool DeserializeDsSketch(const StringVal& serialized_sketch,
     kll_sketch<float>* sketch);
 
+StringVal StringStreamToStringVal(FunctionContext* ctx, const stringstream& str_stream) {
+  const string str = str_stream.str();
+  StringVal dst(ctx, str.size());  // NULL, with no buffer, if the allocation failed.
+  if (!dst.is_null) memcpy(dst.ptr, str.data(), str.size());
+  return dst;
+}
+
 }
 
diff --git a/be/src/exprs/datasketches-common.h b/be/src/exprs/datasketches-common.h
index 37a6458..b721e04 100644
--- a/be/src/exprs/datasketches-common.h
+++ b/be/src/exprs/datasketches-common.h
@@ -17,6 +17,8 @@
 
 #pragma once
 
+#include <sstream>
+
 #include "common/status.h"
 #include "thirdparty/datasketches/hll.hpp"
 #include "udf/udf.h"
@@ -45,5 +47,10 @@ void LogSketchDeserializationError(FunctionContext* ctx);
 template<class T>
 bool DeserializeDsSketch(const StringVal& serialized_sketch, T* sketch)
     WARN_UNUSED_RESULT;
+
+/// Helper function that copies the content of 'str_stream' into a newly allocated
+/// StringVal. Uses 'ctx' for memory allocation.
+StringVal StringStreamToStringVal(FunctionContext* ctx,
+    const std::stringstream& str_stream);
 }
 
diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index 3af1e9e..f9f91e2 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -81,5 +81,39 @@ DoubleVal DataSketchesFunctions::DsKllRank(FunctionContext* ctx,
   return sketch.get_rank(probe_value.val);
 }
 
+// Returns the quantiles for the ranks in 'args' as a comma separated string. The ranks
+// are staged in a std::vector as variable-length arrays are not standard C++.
+StringVal DataSketchesFunctions::DsKllQuantilesAsString(FunctionContext* ctx,
+    const StringVal& serialized_sketch, int num_args, const DoubleVal* args) {
+  DCHECK(num_args > 0);
+  if (args == nullptr || num_args <= 0) return StringVal::null();
+  if (serialized_sketch.is_null || serialized_sketch.len == 0) return StringVal::null();
+  std::vector<double> ranks(num_args);
+  for (int i = 0; i < num_args; ++i) {
+    if (args[i].is_null || std::isnan(args[i].val)) {
+      ctx->SetError("NULL or NaN provided in the input list.");
+      return StringVal::null();
+    }
+    ranks[i] = args[i].val;
+  }
+  datasketches::kll_sketch<float> sketch;
+  if (!DeserializeDsSketch(serialized_sketch, &sketch)) {
+    LogSketchDeserializationError(ctx);
+    return StringVal::null();
+  }
+  try {
+    const std::vector<float> quantiles = sketch.get_quantiles(ranks.data(), num_args);
+    std::stringstream result_stream;
+    for (size_t i = 0; i < quantiles.size(); ++i) {
+      result_stream << (i > 0 ? "," : "") << quantiles[i];
+    }
+    return StringStreamToStringVal(ctx, result_stream);
+  } catch(const std::exception& e) {
+    ctx->SetError(Substitute("Error while getting quantiles from DataSketches KLL. "
+        "Message: $0", e.what()).c_str());
+    return StringVal::null();
+  }
+}
+
 }
 
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index 7dbee46..817cd47 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -54,6 +54,15 @@ public:
   /// 'serialized_sketch'. Note, this is an approximate calculation.
   static DoubleVal DsKllRank(FunctionContext* ctx, const StringVal& serialized_sketch,
       const FloatVal& probe_value);
+
+  /// 'serialized_sketch' is expected as a serialized Apache DataSketches KLL sketch. If
+  /// it is not, then the query fails. This function is similar to DsKllQuantile() but
+  /// accepts multiple ranks in 'args' and returns a comma separated string holding the
+  /// results for all of them. A NULL or NaN rank is reported as an error.
+  /// Note, this should return an array of floats, but that has to wait for complex type
+  /// support in function return values. Tracking Jira is IMPALA-9520.
+  static StringVal DsKllQuantilesAsString(FunctionContext* ctx,
+      const StringVal& serialized_sketch, int num_args, const DoubleVal* args);
 };
 
 }
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index 2ec4a62..092a6f0 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -939,6 +939,8 @@ visible_functions = [
       '_ZN6impala21DataSketchesFunctions6DsKllNEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
   [['ds_kll_rank'], 'DOUBLE', ['STRING', 'FLOAT'],
       '_ZN6impala21DataSketchesFunctions9DsKllRankEPN10impala_udf15FunctionContextERKNS1_9StringValERKNS1_8FloatValE'],
+  [['ds_kll_quantiles_as_string'], 'STRING', ['STRING', 'DOUBLE', '...'],
+      '_ZN6impala21DataSketchesFunctions22DsKllQuantilesAsStringEPN10impala_udf15FunctionContextERKNS1_9StringValEiPKNS1_9DoubleValE'],
 ]
 
 invisible_functions = [
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
index abe3426..e8b8c25 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-kll.test
@@ -288,3 +288,122 @@ FLOAT,FLOAT,FLOAT,FLOAT,FLOAT,FLOAT
 ---- RESULTS
 100.1999969482422,25000.099609375,50.90000152587891,NULL,50.5,NULL
 ====
+---- QUERY
+# Checks that ds_kll_quantiles_as_string() produces NULL for an empty dataset.
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(cast(f2 as float)), 0.5)
+from functional_parquet.emptytable;
+---- TYPES
+STRING
+---- RESULTS
+'NULL'
+====
+---- QUERY
+# Check that ds_kll_quantiles_as_string() returns null for a null input.
+select ds_kll_quantiles_as_string(c, 0.5) from functional_parquet.nulltable;
+---- RESULTS
+'NULL'
+---- TYPES
+STRING
+====
+---- QUERY
+# Check that ds_kll_quantiles_as_string() returns error for strings that are not
+# serialized sketches.
+select
+    ds_kll_quantiles_as_string(date_string_col, 0.5)
+from functional_parquet.alltypestiny;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch
+====
+---- QUERY
+# Check error when rank is out of range
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, -1, 1, 0.5)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: Error while getting quantiles from DataSketches KLL. Message: Fraction cannot be less than zero or greater than 1.0
+====
+---- QUERY
+# Check error when rank is out of range
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 1.1, 1, 0.5)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: Error while getting quantiles from DataSketches KLL. Message: Fraction cannot be less than zero or greater than 1.0
+====
+---- QUERY
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 0.2, 0.5, 0.8, 1)
+from functional_parquet.alltypessmall;
+---- RESULTS
+'0,1.1,4.4,7.7,9.9'
+---- TYPES
+STRING
+====
+---- QUERY
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0.5, 1, 0, 0.2, 0.8)
+from functional_parquet.alltypessmall;
+---- RESULTS
+'4.4,9.9,0,1.1,7.7'
+---- TYPES
+STRING
+====
+---- QUERY
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0.5)
+from functional_parquet.alltypessmall;
+---- RESULTS
+'4.4'
+---- TYPES
+STRING
+====
+---- QUERY
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 0.2, NULL, 0.8, 1)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 0.2, 0.5, 0.8, NULL)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), NULL)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 0.5, cast('nan' as float), 1)
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), 0, 0.5, 1, cast('nan' as float))
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col), cast('nan' as float))
+from functional_parquet.alltypessmall;
+---- CATCH
+UDF ERROR: NULL or NaN provided in the input list.
+====
+---- QUERY
+select
+    ds_kll_quantiles_as_string(ds_kll_sketch(float_col))
+from functional_parquet.alltypessmall;
+---- CATCH
+AnalysisException: No matching function with signature: ds_kll_quantiles_as_string(STRING)
+====