You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by as...@apache.org on 2021/03/18 16:29:47 UTC

[impala] 01/05: IMPALA-10558: Implement ds_theta_exclude() function

This is an automated email from the ASF dual-hosted git repository.

asherman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3e8250153108954870409fa27e35b33d608c5cb7
Author: Fucun Chu <ch...@hotmail.com>
AuthorDate: Mon Feb 22 22:12:59 2021 +0800

    IMPALA-10558: Implement ds_theta_exclude() function
    
    This function receives two strings that are serialized Apache
    DataSketches Theta sketches and computes the a-not-b set operation
    on them. The two sketches may come from the same or different columns.
    
    Example:
    select ds_theta_estimate(ds_theta_exclude(sketch1, sketch2))
    from sketch_tbl;
    +-------------------------------------------------------+
    | ds_theta_estimate(ds_theta_exclude(sketch1, sketch2)) |
    +-------------------------------------------------------+
    | 5                                                     |
    +-------------------------------------------------------+
    
    Change-Id: I05119fd8c652c07ff248a99e44b0da3541e46ca3
    Reviewed-on: http://gerrit.cloudera.org:8080/17153
    Reviewed-by: Gabor Kaszab <ga...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exprs/datasketches-common.cc                |  19 ++++
 be/src/exprs/datasketches-functions-ir.cc          |  37 ++++++++
 be/src/exprs/datasketches-functions.h              |   8 ++
 common/function-registry/impala_functions.py       |   2 +
 .../queries/QueryTest/datasketches-theta.test      | 103 +++++++++++++++++++++
 5 files changed, 169 insertions(+)

diff --git a/be/src/exprs/datasketches-common.cc b/be/src/exprs/datasketches-common.cc
index ae1c8ec..c80bd5b 100644
--- a/be/src/exprs/datasketches-common.cc
+++ b/be/src/exprs/datasketches-common.cc
@@ -20,11 +20,13 @@
 #include "common/logging.h"
 #include "udf/udf-internal.h"
 #include "thirdparty/datasketches/kll_sketch.hpp"
+#include "thirdparty/datasketches/theta_sketch.hpp"
 
 namespace impala {
 
 using datasketches::hll_sketch;
 using datasketches::kll_sketch;
+using datasketches::theta_sketch;
 using impala_udf::StringVal;
 using std::stringstream;
 using std::vector;
@@ -47,6 +49,23 @@ bool DeserializeDsSketch(const StringVal& serialized_sketch, T* sketch) {
   }
 }
 
+// Theta specialization of DeserializeDsSketch(); false on null/empty input or a throw.
+template <>
+bool DeserializeDsSketch(
+    const StringVal& serialized_sketch, theta_sketch::unique_ptr* sketch) {
+  DCHECK(sketch != nullptr);
+  if (serialized_sketch.is_null || serialized_sketch.len == 0) return false;
+  try {
+    *sketch =
+        theta_sketch::deserialize((void*)serialized_sketch.ptr, serialized_sketch.len);
+    return true;
+  } catch (const std::exception&) {
+    // Deserialization throws, for example, when the input string is not a
+    // serialized sketch.
+    return false;
+  }
+}
+
 template bool DeserializeDsSketch(const StringVal& serialized_sketch,
     hll_sketch* sketch);
 template bool DeserializeDsSketch(const StringVal& serialized_sketch,
diff --git a/be/src/exprs/datasketches-functions-ir.cc b/be/src/exprs/datasketches-functions-ir.cc
index df390bd..bf0eb1a 100644
--- a/be/src/exprs/datasketches-functions-ir.cc
+++ b/be/src/exprs/datasketches-functions-ir.cc
@@ -21,6 +21,7 @@
 #include "gutil/strings/substitute.h"
 #include "thirdparty/datasketches/hll.hpp"
 #include "thirdparty/datasketches/theta_sketch.hpp"
+#include "thirdparty/datasketches/theta_a_not_b.hpp"
 #include "thirdparty/datasketches/kll_sketch.hpp"
 #include "udf/udf-internal.h"
 
@@ -122,6 +123,42 @@ BigIntVal DataSketchesFunctions::DsThetaEstimate(
   }
 }
 
+StringVal DataSketchesFunctions::DsThetaExclude(FunctionContext* ctx,
+    const StringVal& first_serialized_sketch, const StringVal& second_serialized_sketch) {
+  datasketches::theta_a_not_b a_not_b;
+  // Deserialize both inputs; a NULL or empty input is skipped and its pointer stays unset.
+  datasketches::theta_sketch::unique_ptr first_sketch_ptr;
+  if (!first_serialized_sketch.is_null && first_serialized_sketch.len > 0) {
+    if (!DeserializeDsSketch(first_serialized_sketch, &first_sketch_ptr)) {
+      LogSketchDeserializationError(ctx);
+      return StringVal::null();
+    }
+  }
+  datasketches::theta_sketch::unique_ptr second_sketch_ptr;
+  if (!second_serialized_sketch.is_null && second_serialized_sketch.len > 0) {
+    if (!DeserializeDsSketch(second_serialized_sketch, &second_sketch_ptr)) {
+      LogSketchDeserializationError(ctx);
+      return StringVal::null();
+    }
+  }
+  // Note: A and B refer to the two input sketches in the order A-not-B.
+  // If A is NULL or empty, return NULL.
+  // If A is set but B is NULL or empty, return a copy of the serialized A.
+  // Otherwise, return the serialized result of A-not-B.
+  if (first_sketch_ptr) {
+    if (!second_sketch_ptr) {
+      return StringVal::CopyFrom(
+          ctx, first_serialized_sketch.ptr, first_serialized_sketch.len);
+    }
+    // Both A and B were deserialized: compute A-not-B and serialize the result.
+    auto result = a_not_b.compute(*first_sketch_ptr, *second_sketch_ptr);
+    std::stringstream serialized_input;
+    result.serialize(serialized_input);
+    return StringStreamToStringVal(ctx, serialized_input);
+  }
+  return StringVal::null();
+}
+
 FloatVal DataSketchesFunctions::DsKllQuantile(FunctionContext* ctx,
     const StringVal& serialized_sketch, const DoubleVal& rank) {
   if (serialized_sketch.is_null || serialized_sketch.len == 0) return FloatVal::null();
diff --git a/be/src/exprs/datasketches-functions.h b/be/src/exprs/datasketches-functions.h
index 3a8036e..26f276c 100644
--- a/be/src/exprs/datasketches-functions.h
+++ b/be/src/exprs/datasketches-functions.h
@@ -70,6 +70,14 @@ public:
   static BigIntVal DsThetaEstimate(
       FunctionContext* ctx, const StringVal& serialized_sketch);
 
+  /// 'first_serialized_sketch' and 'second_serialized_sketch' are both expected to be
+  /// serialized Apache DataSketches Theta sketches. If they are not, then the query
+  /// fails. Computes the a-not-b set operation on two sketches that may come from the
+  /// same or different columns.
+  static StringVal DsThetaExclude(FunctionContext* ctx,
+      const StringVal& first_serialized_sketch,
+      const StringVal& second_serialized_sketch);
+
   /// 'serialized_sketch' is expected as a serialized Apache DataSketches KLL sketch. If
   /// it is not, then the query fails. 'rank' is used to identify which item (estimate)
   /// to return from the sketched dataset. E.g. 0.1 means the item where 10% of the
diff --git a/common/function-registry/impala_functions.py b/common/function-registry/impala_functions.py
index ee62062..ee9f87f 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -1005,6 +1005,8 @@ visible_functions = [
       '_ZN6impala21DataSketchesFunctions14DsHllStringifyEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
   [['ds_theta_estimate'], 'BIGINT', ['STRING'],
      '_ZN6impala21DataSketchesFunctions15DsThetaEstimateEPN10impala_udf15FunctionContextERKNS1_9StringValE'],
+  [['ds_theta_exclude'], 'STRING', ['STRING', 'STRING'],
+     '_ZN6impala21DataSketchesFunctions14DsThetaExcludeEPN10impala_udf15FunctionContextERKNS1_9StringValES6_'],
   [['ds_kll_quantile'], 'FLOAT', ['STRING', 'DOUBLE'],
       '_ZN6impala21DataSketchesFunctions13DsKllQuantileEPN10impala_udf15FunctionContextERKNS1_9StringValERKNS1_9DoubleValE'],
   [['ds_kll_n'], 'BIGINT', ['STRING'],
diff --git a/testdata/workloads/functional-query/queries/QueryTest/datasketches-theta.test b/testdata/workloads/functional-query/queries/QueryTest/datasketches-theta.test
index fe7da72..34a3a0e 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/datasketches-theta.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/datasketches-theta.test
@@ -304,4 +304,107 @@ from theta_sketches_impala_hive;
 BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
 ---- RESULTS
 5,7,6,6,7,4,4,3,0
+====
+---- QUERY
+# A and B refer to the two input sketches in the order A-not-B.
+# Checks that ds_theta_exclude() returns NULL when both A and B are NULL.
+select ds_theta_exclude(null_str, some_nulls) from functional_parquet.nullrows
+where id='b';
+---- TYPES
+STRING
+---- RESULTS
+'NULL'
+====
+---- QUERY
+# Checks that ds_theta_exclude() returns NULL when A is NULL and B is a valid sketch.
+select ds_theta_exclude(null_str, sketch) from (
+select null null_str, ds_theta_sketch(some_nulls) sketch from functional_parquet.nullrows
+where id='a'
+) t;
+---- TYPES
+STRING
+---- RESULTS
+'NULL'
+====
+---- QUERY
+# Check that ds_theta_exclude() returns an empty sketch when A is empty and B is NULL.
+select ds_theta_estimate(ds_theta_exclude(ds_theta_sketch(f2), null))
+from functional_parquet.emptytable;
+---- TYPES
+BIGINT
+---- RESULTS
+0
+====
+---- QUERY
+# Check that ds_theta_exclude() returns NULL when A is NULL and B is empty.
+select ds_theta_exclude(null, ds_theta_sketch(f2)) from functional_parquet.emptytable;
+---- TYPES
+STRING
+---- RESULTS
+'NULL'
+====
+---- QUERY
+# Check that ds_theta_exclude() returns an empty sketch when A and B are both empty.
+select ds_theta_estimate(ds_theta_exclude(ds_theta_sketch(field), ds_theta_sketch(f2)))
+from functional_parquet.emptytable;
+---- TYPES
+BIGINT
+---- RESULTS
+0
+====
+---- QUERY
+# ds_theta_exclude() returns an error if it receives an invalid serialized sketch.
+select ds_theta_exclude(null, date_string_col) from functional_parquet.alltypestiny
+where id=1;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch.
+====
+---- QUERY
+# ds_theta_exclude() returns an error if it receives an invalid serialized sketch.
+select ds_theta_exclude(date_string_col, null) from functional_parquet.alltypestiny
+where id=1;
+---- CATCH
+UDF ERROR: Unable to deserialize sketch.
+====
+---- QUERY
+# Get the same sketches from Impala and Hive and put them into the same table (different
+# column prefix). The estimated count of values in sketch A but not in sketch B is 0.
+create table theta_sketches_impala_hive2 (
+i_ti string,i_i string,i_bi string,i_f string,i_d string,i_s string,i_c string,i_v string,
+i_nc string,
+h_ti string,h_i string,h_bi string,h_f string,h_d string,h_s string,h_c string,h_v string,
+h_nc string) stored as parquet;
+insert overwrite theta_sketches_impala_hive2 select
+i.ti i_ti, i.i i_i, i.bi i_bi, i.f i_f, i.d i_d, i.s i_s, i.c i_c, i.v i_v,i.nc i_nc,
+h.ti h_ti, h.i h_i, h.bi h_bi, h.f h_f, h.d h_d, h.s h_s, h.c h_c, h.v h_v,h.nc h_nc
+from theta_sketches_from_impala i, theta_sketches_from_hive h;
+select
+    ds_theta_estimate(ds_theta_exclude(i_ti, h_ti)) as ti,
+    ds_theta_estimate(ds_theta_exclude(i_i, h_i)) as i,
+    ds_theta_estimate(ds_theta_exclude(i_bi, h_bi)) as bi,
+    ds_theta_estimate(ds_theta_exclude(i_f, h_f)) as f,
+    ds_theta_estimate(ds_theta_exclude(i_d, h_d)) as d,
+    ds_theta_estimate(ds_theta_exclude(i_s, h_s)) as s,
+    ds_theta_estimate(ds_theta_exclude(i_c, h_c)) as c,
+    ds_theta_estimate(ds_theta_exclude(i_v, h_v)) as v,
+    ds_theta_estimate(ds_theta_exclude(i_nc, h_nc)) as nc
+from theta_sketches_impala_hive2;
+---- TYPES
+BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT,BIGINT
+---- RESULTS
+0,0,0,0,0,0,0,0,0
+====
+---- QUERY
+# Check that the result of an a-not-b is a non-empty sketch.
+create table sketch_input (id1 int, id2 int);
+insert into table sketch_input values
+  (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), (10, 20);
+create table sketch_intermediate (sketch1 string, sketch2 string) stored as parquet;
+insert overwrite sketch_intermediate
+  select ds_theta_sketch(id1), ds_theta_sketch(id2) from sketch_input;
+select ds_theta_estimate(ds_theta_exclude(sketch1, sketch2)) from sketch_intermediate;
+---- TYPES
+BIGINT
+---- RESULTS
+5
 ====
\ No newline at end of file