You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/06/25 00:29:23 UTC

[datasketches-java] branch kll_inclusive updated: inclusive PMF and CDF

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch kll_inclusive
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git


The following commit(s) were added to refs/heads/kll_inclusive by this push:
     new 7f23f329 inclusive PMF and CDF
7f23f329 is described below

commit 7f23f329cd370925eaf792becc3482ae2a804d84
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri Jun 24 17:29:16 2022 -0700

    inclusive PMF and CDF
---
 .../apache/datasketches/kll/KllDoublesHelper.java  | 21 ++++----
 .../apache/datasketches/kll/KllDoublesSketch.java  | 61 +++++++++++++++++++++-
 .../apache/datasketches/kll/KllFloatsHelper.java   | 21 ++++----
 .../apache/datasketches/kll/KllFloatsSketch.java   | 61 +++++++++++++++++++++-
 .../datasketches/kll/KllDoublesSketchTest.java     | 39 ++++++++++----
 .../datasketches/kll/KllFloatsSketchTest.java      | 38 ++++++++++----
 6 files changed, 197 insertions(+), 44 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java
index ceaeeb55..426402e7 100644
--- a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java
+++ b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java
@@ -59,7 +59,8 @@ final class KllDoublesHelper {
     return (double) total / mine.getN();
   }
 
-  static double[] getDoublesPmfOrCdf(final KllSketch mine, final double[] splitPoints, final boolean isCdf) {
+  static double[] getDoublesPmfOrCdf(final KllSketch mine, final double[] splitPoints,
+      final boolean isCdf, final boolean inclusive) {
     if (mine.isEmpty()) { return null; }
     validateDoubleValues(splitPoints);
     final double[] buckets = new double[splitPoints.length + 1];
@@ -71,9 +72,11 @@ final class KllDoublesHelper {
       final int fromIndex = myLevelsArr[level];
       final int toIndex = myLevelsArr[level + 1]; // exclusive
       if (level == 0 && !mine.isLevelZeroSorted()) {
-        KllDoublesHelper.incrementDoublesBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets);
+        KllDoublesHelper.incrementDoublesBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints,
+            buckets, inclusive);
       } else {
-        KllDoublesHelper.incrementDoublesBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets);
+        KllDoublesHelper.incrementDoublesBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints,
+            buckets, inclusive);
       }
       level++;
       weight *= 2;
@@ -446,13 +449,13 @@ final class KllDoublesHelper {
   }
 
   private static void incrementDoublesBucketsSortedLevel(
-      final KllSketch mine, final int fromIndex, final int toIndex,
-      final int weight, final double[] splitPoints, final double[] buckets) {
+      final KllSketch mine, final int fromIndex, final int toIndex, final int weight,
+      final double[] splitPoints, final double[] buckets, final boolean inclusive) {
     final double[] myDoubleItemsArr = mine.getDoubleItemsArray();
     int i = fromIndex;
     int j = 0;
     while (i <  toIndex && j < splitPoints.length) {
-      if (myDoubleItemsArr[i] < splitPoints[j]) {
+      if (inclusive ? splitPoints[j] >= myDoubleItemsArr[i] : myDoubleItemsArr[i] < splitPoints[j]) {
         buckets[j] += weight; // this sample goes into this bucket
         i++; // move on to next sample and see whether it also goes into this bucket
       } else {
@@ -468,13 +471,13 @@ final class KllDoublesHelper {
   }
 
   private static void incrementDoublesBucketsUnsortedLevel(
-      final KllSketch mine, final int fromIndex, final int toIndex,
-      final int weight, final double[] splitPoints, final double[] buckets) {
+      final KllSketch mine, final int fromIndex, final int toIndex, final int weight,
+      final double[] splitPoints, final double[] buckets, final boolean inclusive) {
     final double[] myDoubleItemsArr = mine.getDoubleItemsArray();
     for (int i = fromIndex; i < toIndex; i++) {
       int j;
       for (j = 0; j < splitPoints.length; j++) {
-        if (myDoubleItemsArr[i] < splitPoints[j]) {
+        if (inclusive ? splitPoints[j] >= myDoubleItemsArr[i] : myDoubleItemsArr[i] < splitPoints[j]) {
           break;
         }
       }
diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java
index cc9ccbef..905c3f8d 100644
--- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java
@@ -178,7 +178,35 @@ public abstract class KllDoublesSketch extends KllSketch {
    * in positions 0 through j of the returned PMF array.
    */
   public double[] getCDF(final double[] splitPoints) {
-    return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true);
+    return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true, false);
+  }
+
+  /**
+   * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
+   * cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
+   *
+   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+   * getNormalizedRankError(false) function.
+   *
+   * <p>If the sketch is empty this returns null.</p>
+   *
+   * @param splitPoints an array of <i>m</i> unique, monotonically increasing double values
+   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+   * exclusive of the right splitPoint, with the exception that the last interval will include
+   * the maximum value.
+   * It is not necessary to include either the min or max values in these split points.
+   *
+   * @param inclusive if true the weight of the given value is included into the rank.
+   * Otherwise the rank equals the sum of the weights of all values that are less than the given value
+   *
+   * @return an array of m+1 double values on the interval [0.0, 1.0),
+   * which are a consecutive approximation to the CDF of the input stream given the splitPoints.
+   * The value at array position j of the returned CDF array is the sum of the returned values
+   * in positions 0 through j of the returned PMF array.
+   */
+  public double[] getCDF(final double[] splitPoints, final boolean inclusive) {
+    return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true, inclusive);
   }
 
   /**
@@ -220,7 +248,36 @@ public abstract class KllDoublesSketch extends KllSketch {
    * splitPoint, with the exception that the last interval will include maximum value.
    */
   public double[] getPMF(final double[] splitPoints) {
-    return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false);
+    return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false, false);
+  }
+
+  /**
+   * Returns an approximation to the Probability Mass Function (PMF) of the input stream
+   * given a set of splitPoints (values).
+   *
+   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+   * getNormalizedRankError(true) function.
+   *
+   * <p>If the sketch is empty this returns null.</p>
+   *
+   * @param splitPoints an array of <i>m</i> unique, monotonically increasing double values
+   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+   * exclusive of the right splitPoint, with the exception that the last interval will include
+   * the maximum value.
+   * It is not necessary to include either the min or max values in these split points.
+   *
+   * @param inclusive if true the weight of the given value is included into the rank.
+   * Otherwise the rank equals the sum of the weights of all values that are less than the given value
+   *
+   * @return an array of m+1 doubles on the interval [0.0, 1.0),
+   * each of which is an approximation to the fraction of the total input stream values
+   * (the mass) that fall into one of those intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
+   * splitPoint, with the exception that the last interval will include maximum value.
+   */
+  public double[] getPMF(final double[] splitPoints, final boolean inclusive) {
+    return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false, inclusive);
   }
 
   /**
diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java
index ee2b7486..d4371ce1 100644
--- a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java
+++ b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java
@@ -59,7 +59,8 @@ final class KllFloatsHelper {
     return (double) total / mine.getN();
   }
 
-  static double[] getFloatsPmfOrCdf(final KllSketch mine, final float[] splitPoints, final boolean isCdf) {
+  static double[] getFloatsPmfOrCdf(final KllSketch mine, final float[] splitPoints,
+      final boolean isCdf, final boolean inclusive) {
     if (mine.isEmpty()) { return null; }
     validateFloatValues(splitPoints);
     final double[] buckets = new double[splitPoints.length + 1];
@@ -71,9 +72,11 @@ final class KllFloatsHelper {
       final int fromIndex = myLevelsArr[level];
       final int toIndex = myLevelsArr[level + 1]; // exclusive
       if (level == 0 && !mine.isLevelZeroSorted()) {
-        KllFloatsHelper.incrementFloatBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets);
+        KllFloatsHelper.incrementFloatBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints,
+            buckets, inclusive);
       } else {
-        KllFloatsHelper.incrementFloatBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets);
+        KllFloatsHelper.incrementFloatBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints,
+            buckets, inclusive);
       }
       level++;
       weight *= 2;
@@ -445,13 +448,13 @@ final class KllFloatsHelper {
   }
 
   private static void incrementFloatBucketsSortedLevel(
-      final KllSketch mine, final int fromIndex, final int toIndex,
-      final int weight, final float[] splitPoints, final double[] buckets) {
+      final KllSketch mine, final int fromIndex, final int toIndex, final int weight,
+      final float[] splitPoints, final double[] buckets, final boolean inclusive) {
     final float[] myFloatItemsArr = mine.getFloatItemsArray();
     int i = fromIndex;
     int j = 0;
     while (i <  toIndex && j < splitPoints.length) {
-      if (myFloatItemsArr[i] < splitPoints[j]) {
+      if (inclusive ? splitPoints[j] >= myFloatItemsArr[i] : myFloatItemsArr[i] < splitPoints[j]) {
         buckets[j] += weight; // this sample goes into this bucket
         i++; // move on to next sample and see whether it also goes into this bucket
       } else {
@@ -467,13 +470,13 @@ final class KllFloatsHelper {
   }
 
   private static void incrementFloatBucketsUnsortedLevel(
-      final KllSketch mine, final int fromIndex, final int toIndex,
-      final int weight, final float[] splitPoints, final double[] buckets) {
+      final KllSketch mine, final int fromIndex, final int toIndex, final int weight,
+      final float[] splitPoints, final double[] buckets, final boolean inclusive) {
     final float[] myFloatItemsArr = mine.getFloatItemsArray();
     for (int i = fromIndex; i < toIndex; i++) {
       int j;
       for (j = 0; j < splitPoints.length; j++) {
-        if (myFloatItemsArr[i] < splitPoints[j]) {
+        if (inclusive ? splitPoints[j] >= myFloatItemsArr[i] : myFloatItemsArr[i] < splitPoints[j]) {
           break;
         }
       }
diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
index 39772de3..e1debffe 100644
--- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
@@ -178,7 +178,35 @@ public abstract class KllFloatsSketch extends KllSketch {
    * in positions 0 through j of the returned PMF array.
    */
   public double[] getCDF(final float[] splitPoints) {
-    return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, true);
+    return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, true, false);
+  }
+
+  /**
+   * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
+   * cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
+   *
+   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+   * getNormalizedRankError(false) function.
+   *
+   * <p>If the sketch is empty this returns null.</p>
+   *
+   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
+   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+   * exclusive of the right splitPoint, with the exception that the last interval will include
+   * the maximum value.
+   * It is not necessary to include either the min or max values in these split points.
+   *
+   * @param inclusive if true the weight of the given value is included into the rank.
+   * Otherwise the rank equals the sum of the weights of all values that are less than the given value
+   *
+   * @return an array of m+1 double values on the interval [0.0, 1.0),
+   * which are a consecutive approximation to the CDF of the input stream given the splitPoints.
+   * The value at array position j of the returned CDF array is the sum of the returned values
+   * in positions 0 through j of the returned PMF array.
+   */
+  public double[] getCDF(final float[] splitPoints, final boolean inclusive) {
+    return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, true, inclusive);
   }
 
   /**
@@ -220,7 +248,36 @@ public abstract class KllFloatsSketch extends KllSketch {
    * splitPoint, with the exception that the last interval will include maximum value.
    */
   public double[] getPMF(final float[] splitPoints) {
-    return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, false);
+    return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, false, false);
+  }
+
+  /**
+   * Returns an approximation to the Probability Mass Function (PMF) of the input stream
+   * given a set of splitPoints (values).
+   *
+   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+   * getNormalizedRankError(true) function.
+   *
+   * <p>If the sketch is empty this returns null.</p>
+   *
+   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
+   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+   * exclusive of the right splitPoint, with the exception that the last interval will include
+   * the maximum value.
+   * It is not necessary to include either the min or max values in these split points.
+   *
+   * @param inclusive if true the weight of the given value is included into the rank.
+   * Otherwise the rank equals the sum of the weights of all values that are less than the given value
+   *
+   * @return an array of m+1 doubles on the interval [0.0, 1.0),
+   * each of which is an approximation to the fraction of the total input stream values
+   * (the mass) that fall into one of those intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
+   * splitPoint, with the exception that the last interval will include maximum value.
+   */
+  public double[] getPMF(final float[] splitPoints, final boolean inclusive) {
+    return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, false, inclusive);
   }
 
   /**
diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java
index 1744ab41..06afaa05 100644
--- a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java
+++ b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java
@@ -199,18 +199,35 @@ public class KllDoublesSketchTest {
       sketch.update(i);
       values[i] = i;
     }
-    final double[] ranks = sketch.getCDF(values);
-    final double[] pmf = sketch.getPMF(values);
-    double sumPmf = 0;
-    for (int i = 0; i < n; i++) {
-      assertEquals(ranks[i], sketch.getRank(values[i]), NUMERIC_NOISE_TOLERANCE,
-          "rank vs CDF for value " + i);
-      sumPmf += pmf[i];
-      assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+
+    { // inclusive = false (default)
+      final double[] ranks = sketch.getCDF(values);
+      final double[] pmf = sketch.getPMF(values);
+      double sumPmf = 0;
+      for (int i = 0; i < n; i++) {
+        assertEquals(ranks[i], sketch.getRank(values[i]), NUMERIC_NOISE_TOLERANCE,
+            "rank vs CDF for value " + i);
+        sumPmf += pmf[i];
+        assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+      }
+      sumPmf += pmf[n];
+      assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
+      assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
+    }
+    { // inclusive = true
+      final double[] ranks = sketch.getCDF(values, true);
+      final double[] pmf = sketch.getPMF(values, true);
+      double sumPmf = 0;
+      for (int i = 0; i < n; i++) {
+        assertEquals(ranks[i], sketch.getRank(values[i], true), NUMERIC_NOISE_TOLERANCE,
+            "rank vs CDF for value " + i);
+        sumPmf += pmf[i];
+        assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+      }
+      sumPmf += pmf[n];
+      assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
+      assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
     }
-    sumPmf += pmf[n];
-    assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
-    assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
   }
 
   @Test
diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java
index 9ead0adf..e121f93d 100644
--- a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java
+++ b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java
@@ -200,18 +200,34 @@ public class KllFloatsSketchTest {
       sketch.update(i);
       values[i] = i;
     }
-    final double[] ranks = sketch.getCDF(values);
-    final double[] pmf = sketch.getPMF(values);
-    double sumPmf = 0;
-    for (int i = 0; i < n; i++) {
-      assertEquals(ranks[i], sketch.getRank(values[i]), NUMERIC_NOISE_TOLERANCE,
-          "rank vs CDF for value " + i);
-      sumPmf += pmf[i];
-      assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+    { // inclusive = false (default)
+      final double[] ranks = sketch.getCDF(values);
+      final double[] pmf = sketch.getPMF(values);
+      double sumPmf = 0;
+      for (int i = 0; i < n; i++) {
+        assertEquals(ranks[i], sketch.getRank(values[i]), NUMERIC_NOISE_TOLERANCE,
+            "rank vs CDF for value " + i);
+        sumPmf += pmf[i];
+        assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+      }
+      sumPmf += pmf[n];
+      assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
+      assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
+    }
+    { // inclusive = false (default)
+      final double[] ranks = sketch.getCDF(values, true);
+      final double[] pmf = sketch.getPMF(values, true);
+      double sumPmf = 0;
+      for (int i = 0; i < n; i++) {
+        assertEquals(ranks[i], sketch.getRank(values[i], true), NUMERIC_NOISE_TOLERANCE,
+            "rank vs CDF for value " + i);
+        sumPmf += pmf[i];
+        assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+      }
+      sumPmf += pmf[n];
+      assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
+      assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
     }
-    sumPmf += pmf[n];
-    assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
-    assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
   }
 
   @Test


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org