You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by tm...@apache.org on 2020/02/26 23:55:33 UTC

[impala] 03/05: IMPALA-8759: Use double precision for HLL finalize function

This is an automated email from the ASF dual-hosted git repository.

tmarshall pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 322483987600fb3cf84da21a47cadccb6989820b
Author: wzhou-code <wz...@cloudera.com>
AuthorDate: Wed Feb 5 16:35:53 2020 -0800

    IMPALA-8759: Use double precision for HLL finalize function
    
    The current HLL finalize function uses the single-precision data type
    float32 to calculate the estimate. It's not accurate for larger
    cardinalities beyond 1,000,000 since float32 only has 6~7 decimal
    digits of precision.
    This patch changes the single-precision data type to the double-precision
    type in the HLL finalize function.
    
    Testing:
     - Passed all exhaustive tests.
     - Did benchmark for queries with NDV functions. The performance
       impact is negligible.
       See the following spreadsheet for the benchmark:
       https://docs.google.com/spreadsheets/d/1DIVOEs5C4MJL1b7O4MA_jkaM3Y-JSMFREjXCUHJ3eHc/edit#gid=0
    
    Change-Id: I0c5a5229b682070b0bc14da287db5231159dbb3d
    Reviewed-on: http://gerrit.cloudera.org:8080/15167
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/exprs/aggregate-functions-ir.cc | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc
index 1450171..699e80e 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -1482,24 +1482,24 @@ uint64_t AggregateFunctions::HllFinalEstimate(const uint8_t* buckets) {
   DCHECK(buckets != NULL);
 
   // Empirical constants for the algorithm.
-  float alpha = 0;
+  double alpha = 0;
   if (HLL_LEN == 16) {
-    alpha = 0.673f;
+    alpha = 0.673;
   } else if (HLL_LEN == 32) {
-    alpha = 0.697f;
+    alpha = 0.697;
   } else if (HLL_LEN == 64) {
-    alpha = 0.709f;
+    alpha = 0.709;
   } else {
-    alpha = 0.7213f / (1 + 1.079f / HLL_LEN);
+    alpha = 0.7213 / (1 + 1.079 / HLL_LEN);
   }
 
-  float harmonic_mean = 0;
+  double harmonic_mean = 0;
   int num_zero_registers = 0;
   for (int i = 0; i < HLL_LEN; ++i) {
-    harmonic_mean += ldexp(1.0f, -buckets[i]);
+    harmonic_mean += ldexp(1.0, -buckets[i]);
     if (buckets[i] == 0) ++num_zero_registers;
   }
-  harmonic_mean = 1.0f / harmonic_mean;
+  harmonic_mean = 1.0 / harmonic_mean;
   int64_t estimate = alpha * HLL_LEN * HLL_LEN * harmonic_mean;
   // Adjust for Hll bias based on Hll++ algorithm
   if (estimate <= 5 * HLL_LEN) {
@@ -1510,7 +1510,7 @@ uint64_t AggregateFunctions::HllFinalEstimate(const uint8_t* buckets) {
 
   // Estimated cardinality is too low. Hll is too inaccurate here, instead use
   // linear counting.
-  int64_t h = HLL_LEN * log(static_cast<float>(HLL_LEN) / num_zero_registers);
+  int64_t h = HLL_LEN * log(static_cast<double>(HLL_LEN) / num_zero_registers);
 
   return (h <= HllThreshold(HLL_PRECISION)) ? h : estimate;
 }