You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/06/25 00:29:23 UTC
[datasketches-java] branch kll_inclusive updated: inclusive PMF and CDF
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch kll_inclusive
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
The following commit(s) were added to refs/heads/kll_inclusive by this push:
new 7f23f329 inclusive PMF and CDF
7f23f329 is described below
commit 7f23f329cd370925eaf792becc3482ae2a804d84
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Fri Jun 24 17:29:16 2022 -0700
inclusive PMF and CDF
---
.../apache/datasketches/kll/KllDoublesHelper.java | 21 ++++----
.../apache/datasketches/kll/KllDoublesSketch.java | 61 +++++++++++++++++++++-
.../apache/datasketches/kll/KllFloatsHelper.java | 21 ++++----
.../apache/datasketches/kll/KllFloatsSketch.java | 61 +++++++++++++++++++++-
.../datasketches/kll/KllDoublesSketchTest.java | 39 ++++++++++----
.../datasketches/kll/KllFloatsSketchTest.java | 38 ++++++++++----
6 files changed, 197 insertions(+), 44 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java
index ceaeeb55..426402e7 100644
--- a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java
+++ b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java
@@ -59,7 +59,8 @@ final class KllDoublesHelper {
return (double) total / mine.getN();
}
- static double[] getDoublesPmfOrCdf(final KllSketch mine, final double[] splitPoints, final boolean isCdf) {
+ static double[] getDoublesPmfOrCdf(final KllSketch mine, final double[] splitPoints,
+ final boolean isCdf, final boolean inclusive) {
if (mine.isEmpty()) { return null; }
validateDoubleValues(splitPoints);
final double[] buckets = new double[splitPoints.length + 1];
@@ -71,9 +72,11 @@ final class KllDoublesHelper {
final int fromIndex = myLevelsArr[level];
final int toIndex = myLevelsArr[level + 1]; // exclusive
if (level == 0 && !mine.isLevelZeroSorted()) {
- KllDoublesHelper.incrementDoublesBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets);
+ KllDoublesHelper.incrementDoublesBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints,
+ buckets, inclusive);
} else {
- KllDoublesHelper.incrementDoublesBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets);
+ KllDoublesHelper.incrementDoublesBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints,
+ buckets, inclusive);
}
level++;
weight *= 2;
@@ -446,13 +449,13 @@ final class KllDoublesHelper {
}
private static void incrementDoublesBucketsSortedLevel(
- final KllSketch mine, final int fromIndex, final int toIndex,
- final int weight, final double[] splitPoints, final double[] buckets) {
+ final KllSketch mine, final int fromIndex, final int toIndex, final int weight,
+ final double[] splitPoints, final double[] buckets, final boolean inclusive) {
final double[] myDoubleItemsArr = mine.getDoubleItemsArray();
int i = fromIndex;
int j = 0;
while (i < toIndex && j < splitPoints.length) {
- if (myDoubleItemsArr[i] < splitPoints[j]) {
+ if (inclusive ? splitPoints[j] >= myDoubleItemsArr[i] : myDoubleItemsArr[i] < splitPoints[j]) {
buckets[j] += weight; // this sample goes into this bucket
i++; // move on to next sample and see whether it also goes into this bucket
} else {
@@ -468,13 +471,13 @@ final class KllDoublesHelper {
}
private static void incrementDoublesBucketsUnsortedLevel(
- final KllSketch mine, final int fromIndex, final int toIndex,
- final int weight, final double[] splitPoints, final double[] buckets) {
+ final KllSketch mine, final int fromIndex, final int toIndex, final int weight,
+ final double[] splitPoints, final double[] buckets, final boolean inclusive) {
final double[] myDoubleItemsArr = mine.getDoubleItemsArray();
for (int i = fromIndex; i < toIndex; i++) {
int j;
for (j = 0; j < splitPoints.length; j++) {
- if (myDoubleItemsArr[i] < splitPoints[j]) {
+ if (inclusive ? splitPoints[j] >= myDoubleItemsArr[i] : myDoubleItemsArr[i] < splitPoints[j]) {
break;
}
}
diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java
index cc9ccbef..905c3f8d 100644
--- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java
@@ -178,7 +178,35 @@ public abstract class KllDoublesSketch extends KllSketch {
* in positions 0 through j of the returned PMF array.
*/
public double[] getCDF(final double[] splitPoints) {
- return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true);
+ return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true, false);
+ }
+
+ /**
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
+ * cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
+ *
+ * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+ * getNormalizedRankError(false) function.
+ *
+ * <p>If the sketch is empty this returns null.</p>
+ *
+ * @param splitPoints an array of <i>m</i> unique, monotonically increasing double values
+ * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+ * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+ * exclusive of the right splitPoint, with the exception that the last interval will include
+ * the maximum value.
+ * It is not necessary to include either the min or max values in these split points.
+ *
+ * @param inclusive if true the weight of the given value is included into the rank.
+ * Otherwise the rank equals the sum of the weights of all values that are less than the given value
+ *
+ * @return an array of m+1 double values on the interval [0.0, 1.0),
+ * which are a consecutive approximation to the CDF of the input stream given the splitPoints.
+ * The value at array position j of the returned CDF array is the sum of the returned values
+ * in positions 0 through j of the returned PMF array.
+ */
+ public double[] getCDF(final double[] splitPoints, final boolean inclusive) {
+ return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, true, inclusive);
}
/**
@@ -220,7 +248,36 @@ public abstract class KllDoublesSketch extends KllSketch {
* splitPoint, with the exception that the last interval will include maximum value.
*/
public double[] getPMF(final double[] splitPoints) {
- return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false);
+ return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false, false);
+ }
+
+ /**
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
+ * given a set of splitPoints (values).
+ *
+ * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+ * getNormalizedRankError(true) function.
+ *
+ * <p>If the sketch is empty this returns null.</p>
+ *
+ * @param splitPoints an array of <i>m</i> unique, monotonically increasing double values
+ * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+ * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+ * exclusive of the right splitPoint, with the exception that the last interval will include
+ * the maximum value.
+ * It is not necessary to include either the min or max values in these split points.
+ *
+ * @param inclusive if true the weight of the given value is included into the rank.
+ * Otherwise the rank equals the sum of the weights of all values that are less than the given value
+ *
+ * @return an array of m+1 doubles on the interval [0.0, 1.0),
+ * each of which is an approximation to the fraction of the total input stream values
+ * (the mass) that fall into one of those intervals.
+ * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
+ * splitPoint, with the exception that the last interval will include maximum value.
+ */
+ public double[] getPMF(final double[] splitPoints, final boolean inclusive) {
+ return KllDoublesHelper.getDoublesPmfOrCdf(this, splitPoints, false, inclusive);
}
/**
diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java
index ee2b7486..d4371ce1 100644
--- a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java
+++ b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java
@@ -59,7 +59,8 @@ final class KllFloatsHelper {
return (double) total / mine.getN();
}
- static double[] getFloatsPmfOrCdf(final KllSketch mine, final float[] splitPoints, final boolean isCdf) {
+ static double[] getFloatsPmfOrCdf(final KllSketch mine, final float[] splitPoints,
+ final boolean isCdf, final boolean inclusive) {
if (mine.isEmpty()) { return null; }
validateFloatValues(splitPoints);
final double[] buckets = new double[splitPoints.length + 1];
@@ -71,9 +72,11 @@ final class KllFloatsHelper {
final int fromIndex = myLevelsArr[level];
final int toIndex = myLevelsArr[level + 1]; // exclusive
if (level == 0 && !mine.isLevelZeroSorted()) {
- KllFloatsHelper.incrementFloatBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets);
+ KllFloatsHelper.incrementFloatBucketsUnsortedLevel(mine, fromIndex, toIndex, weight, splitPoints,
+ buckets, inclusive);
} else {
- KllFloatsHelper.incrementFloatBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints, buckets);
+ KllFloatsHelper.incrementFloatBucketsSortedLevel(mine, fromIndex, toIndex, weight, splitPoints,
+ buckets, inclusive);
}
level++;
weight *= 2;
@@ -445,13 +448,13 @@ final class KllFloatsHelper {
}
private static void incrementFloatBucketsSortedLevel(
- final KllSketch mine, final int fromIndex, final int toIndex,
- final int weight, final float[] splitPoints, final double[] buckets) {
+ final KllSketch mine, final int fromIndex, final int toIndex, final int weight,
+ final float[] splitPoints, final double[] buckets, final boolean inclusive) {
final float[] myFloatItemsArr = mine.getFloatItemsArray();
int i = fromIndex;
int j = 0;
while (i < toIndex && j < splitPoints.length) {
- if (myFloatItemsArr[i] < splitPoints[j]) {
+ if (inclusive ? splitPoints[j] >= myFloatItemsArr[i] : myFloatItemsArr[i] < splitPoints[j]) {
buckets[j] += weight; // this sample goes into this bucket
i++; // move on to next sample and see whether it also goes into this bucket
} else {
@@ -467,13 +470,13 @@ final class KllFloatsHelper {
}
private static void incrementFloatBucketsUnsortedLevel(
- final KllSketch mine, final int fromIndex, final int toIndex,
- final int weight, final float[] splitPoints, final double[] buckets) {
+ final KllSketch mine, final int fromIndex, final int toIndex, final int weight,
+ final float[] splitPoints, final double[] buckets, final boolean inclusive) {
final float[] myFloatItemsArr = mine.getFloatItemsArray();
for (int i = fromIndex; i < toIndex; i++) {
int j;
for (j = 0; j < splitPoints.length; j++) {
- if (myFloatItemsArr[i] < splitPoints[j]) {
+ if (inclusive ? splitPoints[j] >= myFloatItemsArr[i] : myFloatItemsArr[i] < splitPoints[j]) {
break;
}
}
diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
index 39772de3..e1debffe 100644
--- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
@@ -178,7 +178,35 @@ public abstract class KllFloatsSketch extends KllSketch {
* in positions 0 through j of the returned PMF array.
*/
public double[] getCDF(final float[] splitPoints) {
- return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, true);
+ return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, true, false);
+ }
+
+ /**
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
+ * cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
+ *
+ * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+ * getNormalizedRankError(false) function.
+ *
+ * <p>If the sketch is empty this returns null.</p>
+ *
+ * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
+ * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+ * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+ * exclusive of the right splitPoint, with the exception that the last interval will include
+ * the maximum value.
+ * It is not necessary to include either the min or max values in these split points.
+ *
+ * @param inclusive if true the weight of the given value is included into the rank.
+ * Otherwise the rank equals the sum of the weights of all values that are less than the given value
+ *
+ * @return an array of m+1 double values on the interval [0.0, 1.0),
+ * which are a consecutive approximation to the CDF of the input stream given the splitPoints.
+ * The value at array position j of the returned CDF array is the sum of the returned values
+ * in positions 0 through j of the returned PMF array.
+ */
+ public double[] getCDF(final float[] splitPoints, final boolean inclusive) {
+ return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, true, inclusive);
}
/**
@@ -220,7 +248,36 @@ public abstract class KllFloatsSketch extends KllSketch {
* splitPoint, with the exception that the last interval will include maximum value.
*/
public double[] getPMF(final float[] splitPoints) {
- return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, false);
+ return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, false, false);
+ }
+
+ /**
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
+ * given a set of splitPoints (values).
+ *
+ * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+ * getNormalizedRankError(true) function.
+ *
+ * <p>If the sketch is empty this returns null.</p>
+ *
+ * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
+ * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+ * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+ * exclusive of the right splitPoint, with the exception that the last interval will include
+ * the maximum value.
+ * It is not necessary to include either the min or max values in these split points.
+ *
+ * @param inclusive if true the weight of the given value is included into the rank.
+ * Otherwise the rank equals the sum of the weights of all values that are less than the given value
+ *
+ * @return an array of m+1 doubles on the interval [0.0, 1.0),
+ * each of which is an approximation to the fraction of the total input stream values
+ * (the mass) that fall into one of those intervals.
+ * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
+ * splitPoint, with the exception that the last interval will include maximum value.
+ */
+ public double[] getPMF(final float[] splitPoints, final boolean inclusive) {
+ return KllFloatsHelper.getFloatsPmfOrCdf(this, splitPoints, false, inclusive);
}
/**
diff --git a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java
index 1744ab41..06afaa05 100644
--- a/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java
+++ b/src/test/java/org/apache/datasketches/kll/KllDoublesSketchTest.java
@@ -199,18 +199,35 @@ public class KllDoublesSketchTest {
sketch.update(i);
values[i] = i;
}
- final double[] ranks = sketch.getCDF(values);
- final double[] pmf = sketch.getPMF(values);
- double sumPmf = 0;
- for (int i = 0; i < n; i++) {
- assertEquals(ranks[i], sketch.getRank(values[i]), NUMERIC_NOISE_TOLERANCE,
- "rank vs CDF for value " + i);
- sumPmf += pmf[i];
- assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+
+ { // inclusive = false (default)
+ final double[] ranks = sketch.getCDF(values);
+ final double[] pmf = sketch.getPMF(values);
+ double sumPmf = 0;
+ for (int i = 0; i < n; i++) {
+ assertEquals(ranks[i], sketch.getRank(values[i]), NUMERIC_NOISE_TOLERANCE,
+ "rank vs CDF for value " + i);
+ sumPmf += pmf[i];
+ assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+ }
+ sumPmf += pmf[n];
+ assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
+ assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
+ }
+ { // inclusive = true
+ final double[] ranks = sketch.getCDF(values, true);
+ final double[] pmf = sketch.getPMF(values, true);
+ double sumPmf = 0;
+ for (int i = 0; i < n; i++) {
+ assertEquals(ranks[i], sketch.getRank(values[i], true), NUMERIC_NOISE_TOLERANCE,
+ "rank vs CDF for value " + i);
+ sumPmf += pmf[i];
+ assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+ }
+ sumPmf += pmf[n];
+ assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
+ assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
}
- sumPmf += pmf[n];
- assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
- assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
}
@Test
diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java
index 9ead0adf..e121f93d 100644
--- a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java
+++ b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java
@@ -200,18 +200,34 @@ public class KllFloatsSketchTest {
sketch.update(i);
values[i] = i;
}
- final double[] ranks = sketch.getCDF(values);
- final double[] pmf = sketch.getPMF(values);
- double sumPmf = 0;
- for (int i = 0; i < n; i++) {
- assertEquals(ranks[i], sketch.getRank(values[i]), NUMERIC_NOISE_TOLERANCE,
- "rank vs CDF for value " + i);
- sumPmf += pmf[i];
- assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+ { // inclusive = false (default)
+ final double[] ranks = sketch.getCDF(values);
+ final double[] pmf = sketch.getPMF(values);
+ double sumPmf = 0;
+ for (int i = 0; i < n; i++) {
+ assertEquals(ranks[i], sketch.getRank(values[i]), NUMERIC_NOISE_TOLERANCE,
+ "rank vs CDF for value " + i);
+ sumPmf += pmf[i];
+ assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+ }
+ sumPmf += pmf[n];
+ assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
+ assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
+ }
+ { // inclusive = false (default)
+ final double[] ranks = sketch.getCDF(values, true);
+ final double[] pmf = sketch.getPMF(values, true);
+ double sumPmf = 0;
+ for (int i = 0; i < n; i++) {
+ assertEquals(ranks[i], sketch.getRank(values[i], true), NUMERIC_NOISE_TOLERANCE,
+ "rank vs CDF for value " + i);
+ sumPmf += pmf[i];
+ assertEquals(ranks[i], sumPmf, NUMERIC_NOISE_TOLERANCE, "CDF vs PMF for value " + i);
+ }
+ sumPmf += pmf[n];
+ assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
+ assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
}
- sumPmf += pmf[n];
- assertEquals(sumPmf, 1.0, NUMERIC_NOISE_TOLERANCE);
- assertEquals(ranks[n], 1.0, NUMERIC_NOISE_TOLERANCE);
}
@Test
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org