You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2023/02/24 21:08:34 UTC
[datasketches-java] 01/01: check lgK range, handle > 12
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch hll_rel_err
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit 6d30c7bb2942ebce4f4a305e6df5c642b1ccf02b
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri Feb 24 13:08:27 2023 -0800
check lgK range, handle > 12
---
.../apache/datasketches/hll/AbstractHllArray.java | 2 +-
.../org/apache/datasketches/hll/BaseHllSketch.java | 10 ++++++-
.../org/apache/datasketches/hll/HllEstimators.java | 34 ++++------------------
.../org/apache/datasketches/hll/HllSketch.java | 4 +--
4 files changed, 17 insertions(+), 33 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/hll/AbstractHllArray.java b/src/main/java/org/apache/datasketches/hll/AbstractHllArray.java
index d662e892..c60698eb 100644
--- a/src/main/java/org/apache/datasketches/hll/AbstractHllArray.java
+++ b/src/main/java/org/apache/datasketches/hll/AbstractHllArray.java
@@ -122,7 +122,7 @@ abstract class AbstractHllArray extends HllSketchImpl {
* <p>The paper Fig 2 defines</p>
* <pre>Z := ( sum[j=1,m](2^(-M[j])) )^(-1).</pre>
* But the HIP estimator requires a computation of the probability defined above.
- * We accomplish both by redefing Z as
+ * We accomplish both by redefining Z as
* <pre>Z := ( m + sum[j=1,m](2^(-M[j] - 1)) )^(-1).</pre>
* They are mathematically equivalent since:
* <pre>m + sum[j=1,m](2^(-M[j] - 1)) == m + sum[j=1,m](2^(-M[j])) - m == sum[j=1,m](2^(-M[j])).</pre>
diff --git a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
index a0296d43..52b69f39 100644
--- a/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
+++ b/src/main/java/org/apache/datasketches/hll/BaseHllSketch.java
@@ -21,6 +21,8 @@ package org.apache.datasketches.hll;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.datasketches.hash.MurmurHash3.hash;
+import static org.apache.datasketches.hll.HllUtil.HLL_HIP_RSE_FACTOR;
+import static org.apache.datasketches.hll.HllUtil.HLL_NON_HIP_RSE_FACTOR;
import static org.apache.datasketches.hll.HllUtil.KEY_BITS_26;
import static org.apache.datasketches.hll.HllUtil.KEY_MASK_26;
@@ -118,8 +120,14 @@ abstract class BaseHllSketch {
* <a href="{@docRoot}/resources/dictionary.html#numStdDev">Number of Standard Deviations</a>
* @return the current (approximate) RelativeError
*/
- public double getRelErr(final boolean upperBound, final boolean unioned,
+ public static double getRelErr(final boolean upperBound, final boolean unioned,
final int lgConfigK, final int numStdDev) {
+ HllUtil.checkLgK(lgConfigK);
+ if (lgConfigK > 12) {
+ final double rseFactor = unioned ? HLL_NON_HIP_RSE_FACTOR : HLL_HIP_RSE_FACTOR;
+ final int configK = 1 << lgConfigK;
+ return (upperBound ? -1.0 : 1.0) * (numStdDev * rseFactor) / Math.sqrt(configK);
+ }
return RelativeErrorTables.getRelErr(upperBound, unioned, lgConfigK, numStdDev);
}
diff --git a/src/main/java/org/apache/datasketches/hll/HllEstimators.java b/src/main/java/org/apache/datasketches/hll/HllEstimators.java
index 8ff18f61..32a7893a 100644
--- a/src/main/java/org/apache/datasketches/hll/HllEstimators.java
+++ b/src/main/java/org/apache/datasketches/hll/HllEstimators.java
@@ -19,8 +19,6 @@
package org.apache.datasketches.hll;
-import static org.apache.datasketches.hll.HllUtil.HLL_HIP_RSE_FACTOR;
-import static org.apache.datasketches.hll.HllUtil.HLL_NON_HIP_RSE_FACTOR;
import static org.apache.datasketches.hll.HllUtil.MIN_LOG_K;
/**
@@ -50,39 +48,17 @@ class HllEstimators {
final int configK = 1 << lgConfigK;
final double numNonZeros =
(absHllArr.getCurMin() == 0) ? configK - absHllArr.getNumAtCurMin() : configK;
- final double estimate;
- final double rseFactor;
+ final double estimate = absHllArr.getEstimate();
final boolean oooFlag = absHllArr.isOutOfOrder();
- if (oooFlag) {
- estimate = absHllArr.getCompositeEstimate();
- rseFactor = HLL_NON_HIP_RSE_FACTOR;
- } else {
- estimate = absHllArr.getHipAccum();
- rseFactor = HLL_HIP_RSE_FACTOR;
- }
- final double relErr = (lgConfigK > 12)
- ? (numStdDev * rseFactor) / Math.sqrt(configK)
- : RelativeErrorTables.getRelErr(false, oooFlag, lgConfigK, numStdDev);
+ final double relErr = BaseHllSketch.getRelErr(false, oooFlag, lgConfigK, numStdDev);
return Math.max(estimate / (1.0 + relErr), numNonZeros);
}
static final double hllUpperBound(final AbstractHllArray absHllArr, final int numStdDev) {
final int lgConfigK = absHllArr.lgConfigK;
- final int configK = 1 << lgConfigK;
- final double estimate;
- final double rseFactor;
+ final double estimate = absHllArr.getEstimate();
final boolean oooFlag = absHllArr.isOutOfOrder();
- if (oooFlag) {
- estimate = absHllArr.getCompositeEstimate();
- rseFactor = HLL_NON_HIP_RSE_FACTOR;
- } else {
- estimate = absHllArr.getHipAccum();
- rseFactor = HLL_HIP_RSE_FACTOR;
- }
-
- final double relErr = (lgConfigK > 12)
- ? ((-1.0) * (numStdDev * rseFactor)) / Math.sqrt(configK)
- : RelativeErrorTables.getRelErr(true, oooFlag, lgConfigK, numStdDev);
+ final double relErr = BaseHllSketch.getRelErr(true, oooFlag, lgConfigK, numStdDev);
return estimate / (1.0 + relErr);
}
@@ -134,7 +110,7 @@ class HllEstimators {
final double avgEst = (adjEst + linEst) / 2.0;
// The following constants comes from empirical measurements of the crossover point
- // between the average error of the linear estimator and the adjusted hll estimator
+ // between the average error of the linear estimator and the adjusted HLL estimator
double crossOver = 0.64;
if (lgConfigK == 4) { crossOver = 0.718; }
else if (lgConfigK == 5) { crossOver = 0.672; }
diff --git a/src/main/java/org/apache/datasketches/hll/HllSketch.java b/src/main/java/org/apache/datasketches/hll/HllSketch.java
index 1ec74df9..55387f71 100644
--- a/src/main/java/org/apache/datasketches/hll/HllSketch.java
+++ b/src/main/java/org/apache/datasketches/hll/HllSketch.java
@@ -102,7 +102,7 @@ public class HllSketch extends BaseHllSketch {
* Constructs a new on-heap sketch with the type of HLL sketch to configure.
* @param lgConfigK The Log2 of K for the target HLL sketch. This value must be
* between 4 and 21 inclusively.
- * @param tgtHllType the desired Hll type.
+ * @param tgtHllType the desired HLL type.
*/
public HllSketch(final int lgConfigK, final TgtHllType tgtHllType) {
hllSketchImpl = new CouponList(HllUtil.checkLgK(lgConfigK), tgtHllType, CurMode.LIST);
@@ -118,7 +118,7 @@ public class HllSketch extends BaseHllSketch {
* {@link #getMaxUpdatableSerializationBytes(int, TgtHllType)}.
* @param lgConfigK The Log2 of K for the target HLL sketch. This value must be
* between 4 and 21 inclusively.
- * @param tgtHllType the desired Hll type.
+ * @param tgtHllType the desired HLL type.
* @param dstMem the destination memory for the sketch.
*/
public HllSketch(final int lgConfigK, final TgtHllType tgtHllType, final WritableMemory dstMem) {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org