You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2023/02/24 21:08:34 UTC

[datasketches-java] 01/01: check lgK range, handle > 12

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch hll_rel_err
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit 6d30c7bb2942ebce4f4a305e6df5c642b1ccf02b
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri Feb 24 13:08:27 2023 -0800

    check lgK range, handle > 12
---
 .../apache/datasketches/hll/AbstractHllArray.java  |  2 +-
 .../org/apache/datasketches/hll/BaseHllSketch.java | 10 ++++++-
 .../org/apache/datasketches/hll/HllEstimators.java | 34 ++++------------------
 .../org/apache/datasketches/hll/HllSketch.java     |  4 +--
 4 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/hll/AbstractHllArray.java b/src/main/java/org/apache/datasketches/hll/AbstractHllArray.java
index d662e892..c60698eb 100644
--- a/src/main/java/org/apache/datasketches/hll/AbstractHllArray.java
+++ b/src/main/java/org/apache/datasketches/hll/AbstractHllArray.java
@@ -122,7 +122,7 @@ abstract class AbstractHllArray extends HllSketchImpl {
    * <p>The paper Fig 2 defines</p>
    * <pre>Z := ( sum[j=1,m](2^(-M[j])) )^(-1).</pre>
    * But the HIP estimator requires a computation of the probability defined above.
-   * We accomplish both by redefing Z as
+   * We accomplish both by redefining Z as
    * <pre>Z := ( m + sum[j=1,m](2^(-M[j] - 1)) )^(-1).</pre>
    * They are mathematically equivalent since:
    * <pre>m + sum[j=1,m](2^(-M[j] - 1)) == m + sum[j=1,m](2^(-M[j])) - m == sum[j=1,m](2^(-M[j])).</pre>
diff --git a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
index a0296d43..52b69f39 100644
--- a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
+++ b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
@@ -21,6 +21,8 @@ package org.apache.datasketches.hll;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.apache.datasketches.hash.MurmurHash3.hash;
+import static org.apache.datasketches.hll.HllUtil.HLL_HIP_RSE_FACTOR;
+import static org.apache.datasketches.hll.HllUtil.HLL_NON_HIP_RSE_FACTOR;
 import static org.apache.datasketches.hll.HllUtil.KEY_BITS_26;
 import static org.apache.datasketches.hll.HllUtil.KEY_MASK_26;
 
@@ -118,8 +120,14 @@ abstract class BaseHllSketch {
    * <a href="{@docRoot}/resources/dictionary.html#numStdDev">Number of Standard Deviations</a>
    * @return the current (approximate) RelativeError
    */
-  public double getRelErr(final boolean upperBound, final boolean unioned,
+  public static double getRelErr(final boolean upperBound, final boolean unioned,
       final int lgConfigK, final int numStdDev) {
+    HllUtil.checkLgK(lgConfigK);
+    if (lgConfigK > 12) {
+      final double rseFactor = unioned ? HLL_NON_HIP_RSE_FACTOR : HLL_HIP_RSE_FACTOR;
+      final int configK = 1 << lgConfigK;
+      return (upperBound ? -1.0 : 1.0) * (numStdDev * rseFactor) / Math.sqrt(configK);
+    }
     return RelativeErrorTables.getRelErr(upperBound, unioned, lgConfigK, numStdDev);
   }
 
diff --git a/src/main/java/org/apache/datasketches/hll/HllEstimators.java b/src/main/java/org/apache/datasketches/hll/HllEstimators.java
index 8ff18f61..32a7893a 100644
--- a/src/main/java/org/apache/datasketches/hll/HllEstimators.java
+++ b/src/main/java/org/apache/datasketches/hll/HllEstimators.java
@@ -19,8 +19,6 @@
 
 package org.apache.datasketches.hll;
 
-import static org.apache.datasketches.hll.HllUtil.HLL_HIP_RSE_FACTOR;
-import static org.apache.datasketches.hll.HllUtil.HLL_NON_HIP_RSE_FACTOR;
 import static org.apache.datasketches.hll.HllUtil.MIN_LOG_K;
 
 /**
@@ -50,39 +48,17 @@ class HllEstimators {
     final int configK = 1 << lgConfigK;
     final double numNonZeros =
         (absHllArr.getCurMin() == 0) ? configK - absHllArr.getNumAtCurMin() : configK;
-    final double estimate;
-    final double rseFactor;
+    final double estimate = absHllArr.getEstimate();
     final boolean oooFlag = absHllArr.isOutOfOrder();
-    if (oooFlag) {
-      estimate = absHllArr.getCompositeEstimate();
-      rseFactor = HLL_NON_HIP_RSE_FACTOR;
-    } else {
-      estimate = absHllArr.getHipAccum();
-      rseFactor = HLL_HIP_RSE_FACTOR;
-    }
-    final double relErr = (lgConfigK > 12)
-        ? (numStdDev * rseFactor) / Math.sqrt(configK)
-        : RelativeErrorTables.getRelErr(false, oooFlag, lgConfigK, numStdDev);
+    final double relErr = BaseHllSketch.getRelErr(false, oooFlag, lgConfigK, numStdDev);
     return Math.max(estimate / (1.0 + relErr), numNonZeros);
   }
 
   static final double hllUpperBound(final AbstractHllArray absHllArr, final int numStdDev) {
     final int lgConfigK = absHllArr.lgConfigK;
-    final int configK = 1 << lgConfigK;
-    final double estimate;
-    final double rseFactor;
+    final double estimate = absHllArr.getEstimate();
     final boolean oooFlag = absHllArr.isOutOfOrder();
-    if (oooFlag) {
-      estimate = absHllArr.getCompositeEstimate();
-      rseFactor = HLL_NON_HIP_RSE_FACTOR;
-    } else {
-      estimate = absHllArr.getHipAccum();
-      rseFactor = HLL_HIP_RSE_FACTOR;
-    }
-
-    final double relErr = (lgConfigK > 12)
-        ? ((-1.0) * (numStdDev * rseFactor)) / Math.sqrt(configK)
-        : RelativeErrorTables.getRelErr(true, oooFlag, lgConfigK, numStdDev);
+    final double relErr = BaseHllSketch.getRelErr(true, oooFlag, lgConfigK, numStdDev);
     return estimate / (1.0 + relErr);
   }
 
@@ -134,7 +110,7 @@ class HllEstimators {
     final double avgEst = (adjEst + linEst) / 2.0;
 
     // The following constants comes from empirical measurements of the crossover point
-    // between the average error of the linear estimator and the adjusted hll estimator
+    // between the average error of the linear estimator and the adjusted HLL estimator
     double crossOver = 0.64;
     if (lgConfigK == 4)      { crossOver = 0.718; }
     else if (lgConfigK == 5) { crossOver = 0.672; }
diff --git a/src/main/java/org/apache/datasketches/hll/HllSketch.java b/src/main/java/org/apache/datasketches/hll/HllSketch.java
index 1ec74df9..55387f71 100644
--- a/src/main/java/org/apache/datasketches/hll/HllSketch.java
+++ b/src/main/java/org/apache/datasketches/hll/HllSketch.java
@@ -102,7 +102,7 @@ public class HllSketch extends BaseHllSketch {
    * Constructs a new on-heap sketch with the type of HLL sketch to configure.
    * @param lgConfigK The Log2 of K for the target HLL sketch. This value must be
    * between 4 and 21 inclusively.
-   * @param tgtHllType the desired Hll type.
+   * @param tgtHllType the desired HLL type.
    */
   public HllSketch(final int lgConfigK, final TgtHllType tgtHllType) {
     hllSketchImpl = new CouponList(HllUtil.checkLgK(lgConfigK), tgtHllType, CurMode.LIST);
@@ -118,7 +118,7 @@ public class HllSketch extends BaseHllSketch {
    * {@link #getMaxUpdatableSerializationBytes(int, TgtHllType)}.
    * @param lgConfigK The Log2 of K for the target HLL sketch. This value must be
    * between 4 and 21 inclusively.
-   * @param tgtHllType the desired Hll type.
+   * @param tgtHllType the desired HLL type.
    * @param dstMem the destination memory for the sketch.
    */
   public HllSketch(final int lgConfigK, final TgtHllType tgtHllType, final WritableMemory dstMem) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org