You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ph...@apache.org on 2018/01/23 00:42:46 UTC
[3/5] impala git commit: IMPALA-6422: Use ldexp() instead of powf() in HLL.

IMPALA-6422: Use ldexp() instead of powf() in HLL.

Using ldexp() to compute a floating point power of two is
over 10x faster than powf().

This change is particularly helpful for speeding up
COMPUTE STATS TABLESAMPLE which has many calls to
HllFinalEstimate() where floating point power of two
computations are relevant.

Testing:
- core/hdfs run passed

Change-Id: I517614d3f9cf1cf56b15a173c3cfe76e0f2e0382
Reviewed-on: http://gerrit.cloudera.org:8080/9078
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/38a9ab78
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/38a9ab78
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/38a9ab78

Branch: refs/heads/2.x
Commit: 38a9ab78b95e2229a3dc97f766833e9ef9d7330d
Parents: 4afabd4
Author: Alex Behm <al...@cloudera.com>
Authored: Thu Jan 18 19:06:30 2018 -0800
Committer: Philip Zeyliger <ph...@cloudera.com>
Committed: Mon Jan 22 16:41:16 2018 -0800

----------------------------------------------------------------------
 be/src/exec/incr-stats-util.cc         |  3 +--
 be/src/exprs/aggregate-functions-ir.cc | 13 +++++--------
 be/src/exprs/aggregate-functions.h     |  6 +++---
 3 files changed, 9 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/38a9ab78/be/src/exec/incr-stats-util.cc
----------------------------------------------------------------------
diff --git a/be/src/exec/incr-stats-util.cc b/be/src/exec/incr-stats-util.cc
index 6f5e2b6..f0bb73f 100644
--- a/be/src/exec/incr-stats-util.cc
+++ b/be/src/exec/incr-stats-util.cc
@@ -165,8 +165,7 @@ struct PerColumnStats {
   // avg_width contain valid values.
   void Finalize() {
     ndv_estimate = AggregateFunctions::HllFinalEstimate(
-        reinterpret_cast<const uint8_t*>(intermediate_ndv.data()),
-        intermediate_ndv.size());
+        reinterpret_cast<const uint8_t*>(intermediate_ndv.data()));
     avg_width = num_rows == 0 ? 0 : avg_width / num_rows;
   }
 

http://git-wip-us.apache.org/repos/asf/impala/blob/38a9ab78/be/src/exprs/aggregate-functions-ir.cc
----------------------------------------------------------------------
diff --git a/be/src/exprs/aggregate-functions-ir.cc b/be/src/exprs/aggregate-functions-ir.cc
index a8fbe57..9af563e 100644
--- a/be/src/exprs/aggregate-functions-ir.cc
+++ b/be/src/exprs/aggregate-functions-ir.cc
@@ -1467,10 +1467,8 @@ void AggregateFunctions::HllMerge(
   }
 }
 
-uint64_t AggregateFunctions::HllFinalEstimate(const uint8_t* buckets,
-    int32_t num_buckets) {
+uint64_t AggregateFunctions::HllFinalEstimate(const uint8_t* buckets) {
   DCHECK(buckets != NULL);
-  DCHECK_EQ(num_buckets, HLL_LEN);
 
   // Empirical constants for the algorithm.
   float alpha = 0;
@@ -1486,9 +1484,8 @@ uint64_t AggregateFunctions::HllFinalEstimate(const uint8_t* buckets,
 
   float harmonic_mean = 0;
   int num_zero_registers = 0;
-  // TODO: Consider improving this loop (e.g. replacing 'if' with arithmetic op).
-  for (int i = 0; i < num_buckets; ++i) {
-    harmonic_mean += powf(2.0f, -buckets[i]);
+  for (int i = 0; i < HLL_LEN; ++i) {
+    harmonic_mean += ldexp(1.0f, -buckets[i]);
     if (buckets[i] == 0) ++num_zero_registers;
   }
   harmonic_mean = 1.0f / harmonic_mean;
@@ -1509,7 +1506,7 @@ uint64_t AggregateFunctions::HllFinalEstimate(const uint8_t* buckets,
 
 BigIntVal AggregateFunctions::HllFinalize(FunctionContext* ctx, const StringVal& src) {
   if (UNLIKELY(src.is_null)) return BigIntVal::null();
-  uint64_t estimate = HllFinalEstimate(src.ptr, src.len);
+  uint64_t estimate = HllFinalEstimate(src.ptr);
   return estimate;
 }
 
@@ -1620,7 +1617,7 @@ BigIntVal AggregateFunctions::SampledNdvFinalize(FunctionContext* ctx,
       counts[pidx] = merged_count;
       StringVal hll = StringVal(state->buckets[bucket_idx].hll, HLL_LEN);
       HllMerge(ctx, hll, &merged_hll);
-      ndvs[pidx] = HllFinalEstimate(merged_hll.ptr, HLL_LEN);
+      ndvs[pidx] = HllFinalEstimate(merged_hll.ptr);
       ++pidx;
     }
     min_count = std::min(min_count, state->buckets[i].row_count);

http://git-wip-us.apache.org/repos/asf/impala/blob/38a9ab78/be/src/exprs/aggregate-functions.h
----------------------------------------------------------------------
diff --git a/be/src/exprs/aggregate-functions.h b/be/src/exprs/aggregate-functions.h
index 81884c1..243e9f7 100644
--- a/be/src/exprs/aggregate-functions.h
+++ b/be/src/exprs/aggregate-functions.h
@@ -200,9 +200,9 @@ class AggregateFunctions {
   static void HllMerge(FunctionContext*, const StringVal& src, StringVal* dst);
   static BigIntVal HllFinalize(FunctionContext*, const StringVal& src);
 
-  /// Utility method to compute the final result of an HLL estimation from num_buckets
-  /// estimates.
-  static uint64_t HllFinalEstimate(const uint8_t* buckets, int32_t num_buckets);
+  /// Utility method to compute the final result of an HLL estimation.
+  /// Assumes HLL_LEN number of buckets.
+  static uint64_t HllFinalEstimate(const uint8_t* buckets);
 
   /// Estimates the number of distinct values (NDV) based on a sample of data and the
   /// corresponding sampling rate. The main idea of this function is to collect several