You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by tm...@apache.org on 2020/02/26 23:55:33 UTC
[impala] 03/05: IMPALA-8759: Use double precision for HLL finalize
function
This is an automated email from the ASF dual-hosted git repository.
tmarshall pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 322483987600fb3cf84da21a47cadccb6989820b
Author: wzhou-code <wz...@cloudera.com>
AuthorDate: Wed Feb 5 16:35:53 2020 -0800
IMPALA-8759: Use double precision for HLL finalize function
The current HLL finalize function uses the single-precision data type
float32 to calculate the estimate. It's not accurate for larger
cardinalities beyond 1,000,000, since float32 only has 6–7 decimal
digits of precision.
This patch changes the single-precision data type to the double-precision
type for the HLL finalize function.
Testing:
- Passed all exhaustive tests.
- Did benchmark for queries with NDV functions. The performance
impact is negligible.
See the following spreadsheet for the benchmark:
https://docs.google.com/spreadsheets/d/1DIVOEs5C4MJL1b7O4MA_jkaM3Y-JSMFREjXCUHJ3eHc/edit#gid=0
Change-Id: I0c5a5229b682070b0bc14da287db5231159dbb3d
Reviewed-on: http://gerrit.cloudera.org:8080/15167
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exprs/aggregate-functions-ir.cc | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc
index 1450171..699e80e 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -1482,24 +1482,24 @@ uint64_t AggregateFunctions::HllFinalEstimate(const uint8_t* buckets) {
DCHECK(buckets != NULL);
// Empirical constants for the algorithm.
- float alpha = 0;
+ double alpha = 0;
if (HLL_LEN == 16) {
- alpha = 0.673f;
+ alpha = 0.673;
} else if (HLL_LEN == 32) {
- alpha = 0.697f;
+ alpha = 0.697;
} else if (HLL_LEN == 64) {
- alpha = 0.709f;
+ alpha = 0.709;
} else {
- alpha = 0.7213f / (1 + 1.079f / HLL_LEN);
+ alpha = 0.7213 / (1 + 1.079 / HLL_LEN);
}
- float harmonic_mean = 0;
+ double harmonic_mean = 0;
int num_zero_registers = 0;
for (int i = 0; i < HLL_LEN; ++i) {
- harmonic_mean += ldexp(1.0f, -buckets[i]);
+ harmonic_mean += ldexp(1.0, -buckets[i]);
if (buckets[i] == 0) ++num_zero_registers;
}
- harmonic_mean = 1.0f / harmonic_mean;
+ harmonic_mean = 1.0 / harmonic_mean;
int64_t estimate = alpha * HLL_LEN * HLL_LEN * harmonic_mean;
// Adjust for Hll bias based on Hll++ algorithm
if (estimate <= 5 * HLL_LEN) {
@@ -1510,7 +1510,7 @@ uint64_t AggregateFunctions::HllFinalEstimate(const uint8_t* buckets) {
// Estimated cardinality is too low. Hll is too inaccurate here, instead use
// linear counting.
- int64_t h = HLL_LEN * log(static_cast<float>(HLL_LEN) / num_zero_registers);
+ int64_t h = HLL_LEN * log(static_cast<double>(HLL_LEN) / num_zero_registers);
return (h <= HllThreshold(HLL_PRECISION)) ? h : estimate;
}