You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2022/07/05 16:39:59 UTC
[datasketches-java] 01/01: added inclusive mode
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch quantiles_inclusive
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
commit 3d64fb7a5dd517db386e1b99b11d6f7eb22e994e
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Jul 5 09:39:52 2022 -0700
added inclusive mode
---
.../datasketches/quantiles/DoublesAuxiliary.java | 5 +-
.../datasketches/quantiles/DoublesPmfCdfImpl.java | 24 +++---
.../datasketches/quantiles/DoublesSketch.java | 93 +++++++++++++++++---
.../datasketches/quantiles/ItemsAuxiliary.java | 6 +-
.../datasketches/quantiles/ItemsPmfCdfImpl.java | 29 ++++---
.../apache/datasketches/quantiles/ItemsSketch.java | 99 +++++++++++++++++-----
.../datasketches/quantiles/KolmogorovSmirnov.java | 4 +-
.../quantiles/HeapUpdateDoublesSketchTest.java | 72 +++++++++++++++-
.../datasketches/quantiles/ItemsSketchTest.java | 71 +++++++++++++++-
.../apache/datasketches/quantiles/UtilTest.java | 12 +--
10 files changed, 336 insertions(+), 79 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesAuxiliary.java b/src/main/java/org/apache/datasketches/quantiles/DoublesAuxiliary.java
index 344d9c14..96e3afa9 100644
--- a/src/main/java/org/apache/datasketches/quantiles/DoublesAuxiliary.java
+++ b/src/main/java/org/apache/datasketches/quantiles/DoublesAuxiliary.java
@@ -41,8 +41,9 @@ final class DoublesAuxiliary {
/**
* Constructs the Auxiliary structure from the DoublesSketch
* @param qs a DoublesSketch
+ * @param inclusive if true, fractional ranks are considered inclusive
*/
- DoublesAuxiliary(final DoublesSketch qs ) {
+ DoublesAuxiliary(final DoublesSketch qs, final boolean inclusive) {
final int k = qs.getK();
final long n = qs.getN();
final long bitPattern = qs.getBitPattern();
@@ -60,7 +61,7 @@ final class DoublesAuxiliary {
// taking advantage of the already sorted blocks of length k
blockyTandemMergeSort(itemsArr, cumWtsArr, numSamples, k);
- final long total = QuantilesHelper.convertToPrecedingCummulative(cumWtsArr, false);
+ final long total = QuantilesHelper.convertToPrecedingCummulative(cumWtsArr, inclusive);
assert total == n;
auxN_ = n;
diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesPmfCdfImpl.java b/src/main/java/org/apache/datasketches/quantiles/DoublesPmfCdfImpl.java
index 29134473..a0e63ea8 100644
--- a/src/main/java/org/apache/datasketches/quantiles/DoublesPmfCdfImpl.java
+++ b/src/main/java/org/apache/datasketches/quantiles/DoublesPmfCdfImpl.java
@@ -27,8 +27,9 @@ package org.apache.datasketches.quantiles;
*/
class DoublesPmfCdfImpl {
- static double[] getPMFOrCDF(final DoublesSketch sketch, final double[] splitPoints, final boolean isCDF) {
- final double[] buckets = internalBuildHistogram(sketch, splitPoints);
+ static double[] getPMFOrCDF(final DoublesSketch sketch, final double[] splitPoints,
+ final boolean isCDF, final boolean inclusive) {
+ final double[] buckets = internalBuildHistogram(sketch, splitPoints, inclusive);
final long n = sketch.getN();
if (isCDF) {
double subtotal = 0;
@@ -52,7 +53,8 @@ class DoublesPmfCdfImpl {
* that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
* @return the unnormalized, accumulated counts of <i>m + 1</i> intervals.
*/
- private static double[] internalBuildHistogram(final DoublesSketch sketch, final double[] splitPoints) {
+ private static double[] internalBuildHistogram(final DoublesSketch sketch, final double[] splitPoints,
+ final boolean inclusive) {
final DoublesSketchAccessor sketchAccessor = DoublesSketchAccessor.wrap(sketch);
Util.checkSplitPointsOrder(splitPoints);
@@ -65,12 +67,12 @@ class DoublesPmfCdfImpl {
if (numSplitPoints < 50) { // empirically determined crossover
// sort not worth it when few split points
DoublesPmfCdfImpl.bilinearTimeIncrementHistogramCounters(
- sketchAccessor, weight, splitPoints, counters);
+ sketchAccessor, weight, splitPoints, counters, inclusive);
} else {
sketchAccessor.sort();
// sort is worth it when many split points
DoublesPmfCdfImpl.linearTimeIncrementHistogramCounters(
- sketchAccessor, weight, splitPoints, counters);
+ sketchAccessor, weight, splitPoints, counters, inclusive);
}
long myBitPattern = sketch.getBitPattern();
@@ -82,7 +84,7 @@ class DoublesPmfCdfImpl {
// the levels are already sorted so we can use the fast version
sketchAccessor.setLevel(lvl);
DoublesPmfCdfImpl.linearTimeIncrementHistogramCounters(
- sketchAccessor, weight, splitPoints, counters);
+ sketchAccessor, weight, splitPoints, counters, inclusive);
}
}
return counters;
@@ -96,16 +98,17 @@ class DoublesPmfCdfImpl {
* @param weight of the samples
* @param splitPoints must be unique and sorted. Number of splitPoints + 1 == counters.length.
* @param counters array of counters
+ * @param inclusive if true the weight of a value is included into its rank.
*/
static void bilinearTimeIncrementHistogramCounters(final DoublesBufferAccessor samples, final long weight,
- final double[] splitPoints, final double[] counters) {
+ final double[] splitPoints, final double[] counters, final boolean inclusive) {
assert ((splitPoints.length + 1) == counters.length);
for (int i = 0; i < samples.numItems(); i++) {
final double sample = samples.get(i);
int j;
for (j = 0; j < splitPoints.length; j++) {
final double splitpoint = splitPoints[j];
- if (sample < splitpoint) {
+ if (inclusive ? sample <= splitpoint : sample < splitpoint) {
break;
}
}
@@ -126,13 +129,14 @@ class DoublesPmfCdfImpl {
* @param weight of the samples
* @param splitPoints must be unique and sorted. Number of splitPoints + 1 = counters.length.
* @param counters array of counters
+ * @param inclusive if true the weight of a value is included into its rank.
*/
static void linearTimeIncrementHistogramCounters(final DoublesBufferAccessor samples, final long weight,
- final double[] splitPoints, final double[] counters) {
+ final double[] splitPoints, final double[] counters, final boolean inclusive) {
int i = 0;
int j = 0;
while ((i < samples.numItems()) && (j < splitPoints.length)) {
- if (samples.get(i) < splitPoints[j]) {
+ if (inclusive ? samples.get(i) <= splitPoints[j] : samples.get(i) < splitPoints[j]) {
counters[j] += weight; // this sample goes into this bucket
i++; // move on to next sample and see whether it also goes into this bucket
} else {
diff --git a/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java b/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java
index 23f5bca0..4235a0e8 100644
--- a/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java
+++ b/src/main/java/org/apache/datasketches/quantiles/DoublesSketch.java
@@ -211,19 +211,30 @@ public abstract class DoublesSketch {
* If fraction = 0.0, the true minimum value of the stream is returned.
* If fraction = 1.0, the true maximum value of the stream is returned.
*
+ * @param inclusive if true, the given fraction (rank) is considered inclusive
+ *
* @return the approximation to the value at the above fraction
*/
- public double getQuantile(final double fraction) {
+ public double getQuantile(final double fraction, final boolean inclusive) {
if (isEmpty()) { return Double.NaN; }
if (fraction < 0.0 || fraction > 1.0) {
throw new SketchesArgumentException("Fraction cannot be less than zero or greater than 1.0");
}
if (fraction == 0.0) { return getMinValue(); }
if (fraction == 1.0) { return getMaxValue(); }
- final DoublesAuxiliary aux = new DoublesAuxiliary(this);
+ final DoublesAuxiliary aux = new DoublesAuxiliary(this, inclusive);
return aux.getQuantile(fraction);
}
+ /**
+ * Same as {@link #getQuantile(double, boolean) getQuantile(double fraction, false)}
+ * @param fraction fractional rank
+ * @return quantile
+ */
+ public double getQuantile(final double fraction) {
+ return getQuantile(fraction, false);
+ }
+
/**
* Gets the upper bound of the value interval in which the true quantile of the given rank
* exists with a confidence of at least 99%.
@@ -261,10 +272,12 @@ public abstract class DoublesSketch {
* sorted stream of all the input values seen so far.
* These fRanks must all be in the interval [0.0, 1.0] inclusively.
*
+ * @param inclusive if true, the given fractional ranks are considered inclusive
+ *
* @return array of approximate quantiles of the given fRanks in the same order as in the given
* fRanks array.
*/
- public double[] getQuantiles(final double[] fRanks) {
+ public double[] getQuantiles(final double[] fRanks, final boolean inclusive) {
if (isEmpty()) { return null; }
DoublesAuxiliary aux = null;
final double[] quantiles = new double[fRanks.length];
@@ -274,7 +287,7 @@ public abstract class DoublesSketch {
else if (fRank == 1.0) { quantiles[i] = getMaxValue(); }
else {
if (aux == null) {
- aux = new DoublesAuxiliary(this);
+ aux = new DoublesAuxiliary(this, inclusive);
}
quantiles[i] = aux.getQuantile(fRank);
}
@@ -282,23 +295,43 @@ public abstract class DoublesSketch {
return quantiles;
}
+ /**
+ * Same as {@link #getQuantiles(double[], boolean) getQuantiles(double[] fRanks, false)}
+ * @param fRanks fractional ranks
+ * @return quantiles
+ */
+ public double[] getQuantiles(final double[] fRanks) {
+ return getQuantiles(fRanks, false);
+ }
+
/**
* This is also a more efficient multiple-query version of getQuantile() and allows the caller to
* specify the number of evenly spaced fractional ranks.
*
* <p>If the sketch is empty this returns null.
*
- * @param evenlySpaced an integer that specifies the number of evenly spaced fractional ranks.
+ * @param numEvenlySpaced an integer that specifies the number of evenly spaced fractional ranks.
* This must be a positive integer greater than 1.
* A value of 2 will return the min and the max value. A value of 3 will return the min,
* the median and the max value, etc.
*
+ * @param inclusive if true, fractional ranks are considered inclusive
+ *
* @return array of approximations to the given fractions in the same order as given fractions
* array.
*/
- public double[] getQuantiles(final int evenlySpaced) {
+ public double[] getQuantiles(final int numEvenlySpaced, final boolean inclusive) {
if (isEmpty()) { return null; }
- return getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, evenlySpaced));
+ return getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, numEvenlySpaced));
+ }
+
+ /**
+ * Same as {@link #getQuantiles(int, boolean) getQuantiles(int numEvenlySpaced, false)}
+ * @param numEvenlySpaced number of evenly spaced fractional ranks
+ * @return quantiles
+ */
+ public double[] getQuantiles(final int numEvenlySpaced) {
+ return getQuantiles(numEvenlySpaced, false);
}
/**
@@ -311,16 +344,17 @@ public abstract class DoublesSketch {
* <p>If the sketch is empty this returns NaN.</p>
*
* @param value to be ranked
+ * @param inclusive if true the weight of the given value is included into the rank.
* @return an approximate rank of the given value
*/
- public double getRank(final double value) {
+ public double getRank(final double value, final boolean inclusive) {
if (isEmpty()) { return Double.NaN; }
final DoublesSketchAccessor samples = DoublesSketchAccessor.wrap(this);
long total = 0;
int weight = 1;
samples.setLevel(DoublesSketchAccessor.BB_LVL_IDX);
for (int i = 0; i < samples.numItems(); i++) {
- if (samples.get(i) < value) {
+ if (inclusive ? samples.get(i) <= value : samples.get(i) < value) {
total += weight;
}
}
@@ -330,7 +364,7 @@ public abstract class DoublesSketch {
if ((bitPattern & 1L) > 0) { // level is not empty
samples.setLevel(lvl);
for (int i = 0; i < samples.numItems(); i++) {
- if (samples.get(i) < value) {
+ if (inclusive ? samples.get(i) <= value : samples.get(i) < value) {
total += weight;
} else {
break; // levels are sorted, no point comparing further
@@ -341,6 +375,15 @@ public abstract class DoublesSketch {
return (double) total / getN();
}
+ /**
+ * Same as {@link #getRank(double, boolean) getRank(double value, false)}
+ * @param value value to be ranked
+ * @return fractional rank
+ */
+ public double getRank(final double value) {
+ return getRank(value, false);
+ }
+
/**
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
* given a set of splitPoints (values).
@@ -357,14 +400,25 @@ public abstract class DoublesSketch {
* the maximum value.
* It is not necessary to include either the min or max values in these splitpoints.
*
+ * @param inclusive if true the weight of the given value is included into the rank.
+ *
* @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream values (the mass) that fall into one of those intervals.
* The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
* splitPoint, with the exception that the last interval will include maximum value.
*/
- public double[] getPMF(final double[] splitPoints) {
+ public double[] getPMF(final double[] splitPoints, final boolean inclusive) {
if (isEmpty()) { return null; }
- return DoublesPmfCdfImpl.getPMFOrCDF(this, splitPoints, false);
+ return DoublesPmfCdfImpl.getPMFOrCDF(this, splitPoints, false, inclusive);
+ }
+
+ /**
+ * Same as {@link #getPMF(double[], boolean) getPMF(double[] splitPoints, false)}
+ * @param splitPoints splitPoints
+ * @return PMF
+ */
+ public double[] getPMF(final double[] splitPoints) {
+ return getPMF(splitPoints, false);
}
/**
@@ -383,14 +437,25 @@ public abstract class DoublesSketch {
* the maximum value.
* It is not necessary to include either the min or max values in these splitpoints.
*
+ * @param inclusive if true the weight of the given value is included into the rank.
+ *
* @return an array of m+1 double values, which are a consecutive approximation to the CDF
* of the input stream given the splitPoints. The value at array position j of the returned
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
* array.
*/
- public double[] getCDF(final double[] splitPoints) {
+ public double[] getCDF(final double[] splitPoints, final boolean inclusive) {
if (isEmpty()) { return null; }
- return DoublesPmfCdfImpl.getPMFOrCDF(this, splitPoints, true);
+ return DoublesPmfCdfImpl.getPMFOrCDF(this, splitPoints, true, inclusive);
+ }
+
+ /**
+ * Same as {@link #getCDF(double[], boolean) getCDF(double[] splitPoints, false)}
+ * @param splitPoints splitPoints
+ * @return CDF
+ */
+ public double[] getCDF(final double[] splitPoints) {
+ return getCDF(splitPoints, false);
}
/**
diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsAuxiliary.java b/src/main/java/org/apache/datasketches/quantiles/ItemsAuxiliary.java
index 4905fc22..b28740bf 100644
--- a/src/main/java/org/apache/datasketches/quantiles/ItemsAuxiliary.java
+++ b/src/main/java/org/apache/datasketches/quantiles/ItemsAuxiliary.java
@@ -42,7 +42,7 @@ final class ItemsAuxiliary<T> {
* @param qs an ItemsSketch
*/
@SuppressWarnings("unchecked")
- ItemsAuxiliary(final ItemsSketch<T> qs) {
+ ItemsAuxiliary(final ItemsSketch<T> qs, final boolean inclusive) {
final int k = qs.getK();
final long n = qs.getN();
final long bitPattern = qs.getBitPattern();
@@ -64,9 +64,9 @@ final class ItemsAuxiliary<T> {
// convert the item weights into totals of the weights preceding each item
long subtot = 0;
- for (int i = 0; i < (numSamples + 1); i++ ) {
+ for (int i = 0; i < (numSamples + 1); i++) {
final long newSubtot = subtot + cumWtsArr[i];
- cumWtsArr[i] = subtot;
+ cumWtsArr[i] = inclusive ? newSubtot : subtot;
subtot = newSubtot;
}
diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsPmfCdfImpl.java b/src/main/java/org/apache/datasketches/quantiles/ItemsPmfCdfImpl.java
index 4f7c0780..247a3058 100644
--- a/src/main/java/org/apache/datasketches/quantiles/ItemsPmfCdfImpl.java
+++ b/src/main/java/org/apache/datasketches/quantiles/ItemsPmfCdfImpl.java
@@ -24,8 +24,9 @@ import java.util.Comparator;
class ItemsPmfCdfImpl {
- static <T> double[] getPMFOrCDF(final ItemsSketch<T> sketch, final T[] splitPoints, final boolean isCDF) {
- final double[] buckets = internalBuildHistogram(splitPoints, sketch);
+ static <T> double[] getPMFOrCDF(final ItemsSketch<T> sketch, final T[] splitPoints,
+ final boolean isCDF, final boolean inclusive) {
+ final double[] buckets = internalBuildHistogram(splitPoints, sketch, inclusive);
final long n = sketch.getN();
if (isCDF) {
double subtotal = 0;
@@ -51,7 +52,8 @@ class ItemsPmfCdfImpl {
* @return the unnormalized, accumulated counts of <i>m + 1</i> intervals.
*/
@SuppressWarnings("unchecked")
- private static <T> double[] internalBuildHistogram(final T[] splitPoints, final ItemsSketch<T> sketch) {
+ private static <T> double[] internalBuildHistogram(final T[] splitPoints, final ItemsSketch<T> sketch,
+ final boolean inclusive) {
final Object[] samples = sketch.getCombinedBuffer();
final int bbCount = sketch.getBaseBufferCount();
ItemsUtil.validateValues(splitPoints, sketch.getComparator());
@@ -63,14 +65,13 @@ class ItemsPmfCdfImpl {
long weight = 1;
if (numSplitPoints < 50) { // empirically determined crossover
// sort not worth it when few split points
- ItemsPmfCdfImpl.bilinearTimeIncrementHistogramCounters(
- (T[]) samples, 0, bbCount, weight, splitPoints, counters, sketch.getComparator());
+ bilinearTimeIncrementHistogramCounters(
+ (T[]) samples, 0, bbCount, weight, splitPoints, counters, sketch.getComparator(), inclusive);
} else {
// sort is worth it when many split points
Arrays.sort((T[]) samples, 0, bbCount, sketch.getComparator());
linearTimeIncrementHistogramCounters(
- (T[]) samples, 0, bbCount, weight, splitPoints, counters, sketch.getComparator()
- );
+ (T[]) samples, 0, bbCount, weight, splitPoints, counters, sketch.getComparator(), inclusive);
}
long myBitPattern = sketch.getBitPattern();
@@ -81,7 +82,7 @@ class ItemsPmfCdfImpl {
if ((myBitPattern & 1L) > 0L) { //valid level exists
// the levels are already sorted so we can use the fast version
linearTimeIncrementHistogramCounters(
- (T[]) samples, (2 + lvl) * k, k, weight, splitPoints, counters, sketch.getComparator());
+ (T[]) samples, (2 + lvl) * k, k, weight, splitPoints, counters, sketch.getComparator(), inclusive);
}
}
return counters;
@@ -98,17 +99,18 @@ class ItemsPmfCdfImpl {
* @param splitPoints must be unique and sorted. Number of splitPoints + 1 == counters.length.
* @param counters array of counters
* @param comparator the comparator for data type T
+ * @param inclusive if true, rank is considered inclusive
*/
private static <T> void bilinearTimeIncrementHistogramCounters(final T[] samples, final int offset,
final int numSamples, final long weight, final T[] splitPoints, final double[] counters,
- final Comparator<? super T> comparator) {
+ final Comparator<? super T> comparator, final boolean inclusive) {
assert ((splitPoints.length + 1) == counters.length);
for (int i = 0; i < numSamples; i++) {
final T sample = samples[i + offset];
int j = 0;
for (j = 0; j < splitPoints.length; j++) {
final T splitpoint = splitPoints[j];
- if (comparator.compare(sample, splitpoint) < 0) {
+ if (inclusive ? comparator.compare(sample, splitpoint) <= 0 : comparator.compare(sample, splitpoint) < 0) {
break;
}
}
@@ -133,14 +135,17 @@ class ItemsPmfCdfImpl {
* @param splitPoints must be unique and sorted. Number of splitPoints + 1 = counters.length.
* @param counters array of counters
* @param comparator the comparator for data type T
+ * @param inclusive if true, rank is considered inclusive
*/
private static <T> void linearTimeIncrementHistogramCounters(final T[] samples, final int offset,
final int numSamples, final long weight, final T[] splitPoints, final double[] counters,
- final Comparator<? super T> comparator) {
+ final Comparator<? super T> comparator, final boolean inclusive) {
int i = 0;
int j = 0;
while ((i < numSamples) && (j < splitPoints.length)) {
- if (comparator.compare(samples[i + offset], splitPoints[j]) < 0) {
+ final T sample = samples[i + offset];
+ final T value = splitPoints[j];
+ if (inclusive ? comparator.compare(sample, value) <= 0 : comparator.compare(sample, value) < 0) {
counters[j] += weight; // this sample goes into this bucket
i++; // move on to next sample and see whether it also goes into this bucket
} else {
diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java
index 0726fda8..0400d545 100644
--- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java
+++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketch.java
@@ -271,20 +271,31 @@ public final class ItemsSketch<T> {
* If fraction = 0.0, the true minimum value of the stream is returned.
* If fraction = 1.0, the true maximum value of the stream is returned.
*
+ * @param inclusive if true, the given fraction (rank) is considered inclusive
+ *
* @return the approximation to the value at the above fraction
*/
- public T getQuantile(final double fraction) {
+ public T getQuantile(final double fraction, final boolean inclusive) {
if (fraction < 0.0 || fraction > 1.0) {
throw new SketchesArgumentException("Fraction cannot be less than zero or greater than 1.0");
}
if (fraction == 0.0) { return minValue_; }
else if (fraction == 1.0) { return maxValue_; }
else {
- final ItemsAuxiliary<T> aux = constructAuxiliary();
+ final ItemsAuxiliary<T> aux = new ItemsAuxiliary<>(this, inclusive);
return aux.getQuantile(fraction);
}
}
+ /**
+ * Same as {@link #getQuantile(double, boolean) getQuantile(double fraction, false)}
+ * @param fraction fractional rank
+ * @return quantile
+ */
+ public T getQuantile(final double fraction) {
+ return getQuantile(fraction, false);
+ }
+
/**
* Gets the upper bound of the value interval in which the true quantile of the given rank
* exists with a confidence of at least 99%.
@@ -322,10 +333,12 @@ public final class ItemsSketch<T> {
* sorted stream of all the input values seen so far.
* These fRanks must all be in the interval [0.0, 1.0] inclusively.
*
+ * @param inclusive if true, the given fractional ranks are considered inclusive
+ *
* @return array of approximate quantiles of the given fRanks in the same order as in the given
* fRanks array.
*/
- public T[] getQuantiles(final double[] fRanks) {
+ public T[] getQuantiles(final double[] fRanks, final boolean inclusive) {
if (isEmpty()) { return null; }
ItemsAuxiliary<T> aux = null;
@SuppressWarnings("unchecked")
@@ -336,7 +349,7 @@ public final class ItemsSketch<T> {
else if (fRank == 1.0) { quantiles[i] = maxValue_; }
else {
if (aux == null) {
- aux = this.constructAuxiliary();
+ aux = new ItemsAuxiliary<>(this, inclusive);
}
quantiles[i] = aux.getQuantile(fRank);
}
@@ -344,21 +357,41 @@ public final class ItemsSketch<T> {
return quantiles;
}
+ /**
+ * Same as {@link #getQuantiles(double[], boolean) getQuantiles(double[] fRanks, false)}
+ * @param fRanks fractional ranks
+ * @return quantiles
+ */
+ public T[] getQuantiles(final double[] fRanks) {
+ return getQuantiles(fRanks, false);
+ }
+
/**
* This is also a more efficient multiple-query version of getQuantile() and allows the caller to
* specify the number of evenly spaced fractional ranks.
*
- * @param evenlySpaced an integer that specifies the number of evenly spaced fractional ranks.
+ * @param numEvenlySpaced an integer that specifies the number of evenly spaced fractional ranks.
* This must be a positive integer greater than 1.
* A value of 2 will return the min and the max value. A value of 3 will return the min,
* the median and the max value, etc.
*
+ * @param inclusive if true, fractional ranks are considered inclusive
+ *
* @return array of approximations to the given fractions in the same order as given fractions
* array.
*/
- public T[] getQuantiles(final int evenlySpaced) {
+ public T[] getQuantiles(final int numEvenlySpaced, final boolean inclusive) {
if (isEmpty()) { return null; }
- return getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, evenlySpaced));
+ return getQuantiles(org.apache.datasketches.Util.evenlySpaced(0.0, 1.0, numEvenlySpaced), inclusive);
+ }
+
+ /**
+ * Same as {@link #getQuantiles(int, boolean) getQuantiles(int numEvenlySpaced, false)}
+ * @param numEvenlySpaced number of evenly spaced fractional ranks
+ * @return quantiles
+ */
+ public T[] getQuantiles(final int numEvenlySpaced) {
+ return getQuantiles(numEvenlySpaced, false);
}
/**
@@ -371,15 +404,17 @@ public final class ItemsSketch<T> {
* <p>If the sketch is empty this returns NaN.</p>
*
* @param value to be ranked
+ * @param inclusive if true the weight of the given value is included into the rank.
* @return an approximate rank of the given value
*/
@SuppressWarnings("unchecked")
- public double getRank(final T value) {
+ public double getRank(final T value, final boolean inclusive) {
if (isEmpty()) { return Double.NaN; }
long total = 0;
int weight = 1;
for (int i = 0; i < baseBufferCount_; i++) {
- if (comparator_.compare((T) combinedBuffer_[i], value) < 0) {
+ final T sample = (T) combinedBuffer_[i];
+ if (inclusive ? comparator_.compare(sample, value) <= 0 : comparator_.compare(sample, value) < 0) {
total += weight;
}
}
@@ -389,7 +424,8 @@ public final class ItemsSketch<T> {
if ((bitPattern & 1L) > 0) { // level is not empty
final int offset = (2 + lvl) * k_;
for (int i = 0; i < k_; i++) {
- if (comparator_.compare((T) combinedBuffer_[i + offset], value) < 0) {
+ final T sample = (T) combinedBuffer_[i + offset];
+ if (inclusive ? comparator_.compare(sample, value) <= 0 : comparator_.compare(sample, value) < 0) {
total += weight;
} else {
break; // levels are sorted, no point comparing further
@@ -400,6 +436,10 @@ public final class ItemsSketch<T> {
return (double) total / n_;
}
+ public double getRank(final T value) {
+ return getRank(value, false);
+ }
+
/**
* Returns an approximation to the Probability Mass Function (PMF) of the input stream
* given a set of splitPoints (values).
@@ -416,14 +456,25 @@ public final class ItemsSketch<T> {
* the maximum value.
* It is not necessary to include either the min or max values in these splitpoints.
*
+ * @param inclusive if true the weight of the given value is included into the rank.
+ *
* @return an array of m+1 doubles each of which is an approximation
* to the fraction of the input stream values (the mass) that fall into one of those intervals.
* The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
* splitPoint, with the exception that the last interval will include maximum value.
*/
- public double[] getPMF(final T[] splitPoints) {
+ public double[] getPMF(final T[] splitPoints, final boolean inclusive) {
if (isEmpty()) { return null; }
- return ItemsPmfCdfImpl.getPMFOrCDF(this, splitPoints, false);
+ return ItemsPmfCdfImpl.getPMFOrCDF(this, splitPoints, false, inclusive);
+ }
+
+ /**
+ * Same as {@link #getPMF(T[], boolean) getPMF(T[] splitPoints, false)}
+ * @param splitPoints splitPoints
+ * @return PMF
+ */
+ public double[] getPMF(final T[] splitPoints) {
+ return getPMF(splitPoints, false);
}
/**
@@ -442,14 +493,25 @@ public final class ItemsSketch<T> {
* the maximum value.
* It is not necessary to include either the min or max values in these splitpoints.
*
+ * @param inclusive if true the weight of the given value is included into the rank.
+ *
* @return an array of m+1 double values, which are a consecutive approximation to the CDF
* of the input stream given the splitPoints. The value at array position j of the returned
* CDF array is the sum of the returned values in positions 0 through j of the returned PMF
* array.
*/
- public double[] getCDF(final T[] splitPoints) {
+ public double[] getCDF(final T[] splitPoints, final boolean inclusive) {
if (isEmpty()) { return null; }
- return ItemsPmfCdfImpl.getPMFOrCDF(this, splitPoints, true);
+ return ItemsPmfCdfImpl.getPMFOrCDF(this, splitPoints, true, inclusive);
+ }
+
+ /**
+ * Same as {@link #getCDF(T[], boolean) getCDF(T[] splitPoints, false)}
+ * @param splitPoints splitPoints
+ * @return CDF
+ */
+ public double[] getCDF(final T[] splitPoints) {
+ return getCDF(splitPoints, false);
}
/**
@@ -723,15 +785,6 @@ public final class ItemsSketch<T> {
}
}
- /**
- * Returns the Auxiliary data structure which is only used for getQuantile() and getQuantiles()
- * queries.
- * @return the Auxiliary data structure
- */
- private ItemsAuxiliary<T> constructAuxiliary() {
- return new ItemsAuxiliary<>(this);
- }
-
private static <T> void growBaseBuffer(final ItemsSketch<T> sketch) {
final Object[] baseBuffer = sketch.getCombinedBuffer();
final int oldSize = sketch.getCombinedBufferAllocatedCount();
diff --git a/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java b/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java
index 2217fb52..d1cffa38 100644
--- a/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java
+++ b/src/main/java/org/apache/datasketches/quantiles/KolmogorovSmirnov.java
@@ -34,8 +34,8 @@ final class KolmogorovSmirnov {
* @return the raw delta area between two quantile sketches
*/
public static double computeKSDelta(final DoublesSketch sketch1, final DoublesSketch sketch2) {
- final DoublesAuxiliary p = new DoublesAuxiliary(sketch1);
- final DoublesAuxiliary q = new DoublesAuxiliary(sketch2);
+ final DoublesAuxiliary p = new DoublesAuxiliary(sketch1, false);
+ final DoublesAuxiliary q = new DoublesAuxiliary(sketch2, false);
final double[] pSamplesArr = p.auxSamplesArr_;
final double[] qSamplesArr = q.auxSamplesArr_;
diff --git a/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java
index de08b9c4..563885a8 100644
--- a/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java
+++ b/src/test/java/org/apache/datasketches/quantiles/HeapUpdateDoublesSketchTest.java
@@ -116,7 +116,7 @@ public class HeapUpdateDoublesSketchTest {
for (int k = 2; k <= 32; k *= 2) {
HeapUpdateDoublesSketch qs = HeapUpdateDoublesSketch.newInstance(k);
for (int numItemsSoFar = 0; numItemsSoFar < 1000; numItemsSoFar++) {
- DoublesAuxiliary aux = new DoublesAuxiliary(qs);
+ DoublesAuxiliary aux = new DoublesAuxiliary(qs, false);
int numSamples = qs.getRetainedItems();
double[] auxItems = aux.auxSamplesArr_;
long[] auxAccum = aux.auxCumWtsArr_;
@@ -984,9 +984,17 @@ public class HeapUpdateDoublesSketchTest {
sketch.update(i);
values[i] = i;
}
- final double[] ranks = sketch.getCDF(values);
- for (int i = 0; i < n; i++) {
- assertEquals(ranks[i], sketch.getRank(values[i]), 0.00001, "CDF vs rank for value " + i);
+ { // inclusive = false (default)
+ final double[] ranks = sketch.getCDF(values);
+ for (int i = 0; i < n; i++) {
+ assertEquals(ranks[i], sketch.getRank(values[i]), 0.00001, "CDF vs rank for value " + i);
+ }
+ }
+ { // inclusive = true
+ final double[] ranks = sketch.getCDF(values, true);
+ for (int i = 0; i < n; i++) {
+ assertEquals(ranks[i], sketch.getRank(values[i], true), 0.00001, "CDF vs rank for value " + i);
+ }
}
}
@@ -1025,6 +1033,62 @@ public class HeapUpdateDoublesSketchTest {
assertEquals(kEpsPmf, k);
}
+ @Test
+ public void tenItems() {
+ final UpdateDoublesSketch sketch = DoublesSketch.builder().build();
+ for (int i = 1; i <= 10; i++) { sketch.update(i); }
+ assertFalse(sketch.isEmpty());
+ assertEquals(sketch.getN(), 10);
+ assertEquals(sketch.getRetainedItems(), 10);
+ for (int i = 1; i <= 10; i++) {
+ assertEquals(sketch.getRank(i), (i - 1) / 10.0);
+ assertEquals(sketch.getRank(i, false), (i - 1) / 10.0);
+ assertEquals(sketch.getRank(i, true), i / 10.0);
+ }
+ // inclusive = false (default)
+ assertEquals(sketch.getQuantile(0), 1); // always min value
+ assertEquals(sketch.getQuantile(0.1), 2);
+ assertEquals(sketch.getQuantile(0.2), 3);
+ assertEquals(sketch.getQuantile(0.3), 4);
+ assertEquals(sketch.getQuantile(0.4), 5);
+ assertEquals(sketch.getQuantile(0.5), 6);
+ assertEquals(sketch.getQuantile(0.6), 7);
+ assertEquals(sketch.getQuantile(0.7), 8);
+ assertEquals(sketch.getQuantile(0.8), 9);
+ assertEquals(sketch.getQuantile(0.9), 10);
+ assertEquals(sketch.getQuantile(1), 10); // always max value
+ // inclusive = true
+ assertEquals(sketch.getQuantile(0, true), 1); // always min value
+ assertEquals(sketch.getQuantile(0.1, true), 1);
+ assertEquals(sketch.getQuantile(0.2, true), 2);
+ assertEquals(sketch.getQuantile(0.3, true), 3);
+ assertEquals(sketch.getQuantile(0.4, true), 4);
+ assertEquals(sketch.getQuantile(0.5, true), 5);
+ assertEquals(sketch.getQuantile(0.6, true), 6);
+ assertEquals(sketch.getQuantile(0.7, true), 7);
+ assertEquals(sketch.getQuantile(0.8, true), 8);
+ assertEquals(sketch.getQuantile(0.9, true), 9);
+ assertEquals(sketch.getQuantile(1, true), 10); // always max value
+
+ // getQuantile() and getQuantiles() equivalence
+ {
+ // inclusive = false (default)
+ final double[] quantiles =
+ sketch.getQuantiles(new double[] {0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1});
+ for (int i = 0; i <= 10; i++) {
+ assertEquals(sketch.getQuantile(i / 10.0), quantiles[i]);
+ }
+ }
+ {
+ // inclusive = true
+ final double[] quantiles =
+ sketch.getQuantiles(new double[] {0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}, true);
+ for (int i = 0; i <= 10; i++) {
+ assertEquals(sketch.getQuantile(i / 10.0, true), quantiles[i]);
+ }
+ }
+ }
+
//private methods
private static void checksForImproperK(final int k) {
diff --git a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java
index df4e864c..c50c1a32 100644
--- a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java
+++ b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchTest.java
@@ -20,6 +20,7 @@
package org.apache.datasketches.quantiles;
import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
import java.util.Arrays;
import java.util.Comparator;
@@ -108,6 +109,62 @@ public class ItemsSketchTest {
Assert.assertNull(sketch.getQuantile(0.5));
}
+ @Test
+ public void tenItems() {
+ final ItemsSketch<Integer> sketch = ItemsSketch.getInstance(128, Comparator.naturalOrder());
+ for (int i = 1; i <= 10; i++) { sketch.update(i); }
+ assertFalse(sketch.isEmpty());
+ assertEquals(sketch.getN(), 10);
+ assertEquals(sketch.getRetainedItems(), 10);
+ for (int i = 1; i <= 10; i++) {
+ assertEquals(sketch.getRank(i), (i - 1) / 10.0);
+ assertEquals(sketch.getRank(i, false), (i - 1) / 10.0);
+ assertEquals(sketch.getRank(i, true), i / 10.0);
+ }
+ // inclusive = false (default)
+ assertEquals(sketch.getQuantile(0), 1); // always min value
+ assertEquals(sketch.getQuantile(0.1), 2);
+ assertEquals(sketch.getQuantile(0.2), 3);
+ assertEquals(sketch.getQuantile(0.3), 4);
+ assertEquals(sketch.getQuantile(0.4), 5);
+ assertEquals(sketch.getQuantile(0.5), 6);
+ assertEquals(sketch.getQuantile(0.6), 7);
+ assertEquals(sketch.getQuantile(0.7), 8);
+ assertEquals(sketch.getQuantile(0.8), 9);
+ assertEquals(sketch.getQuantile(0.9), 10);
+ assertEquals(sketch.getQuantile(1), 10); // always max value
+ // inclusive = true
+ assertEquals(sketch.getQuantile(0, true), 1); // always min value
+ assertEquals(sketch.getQuantile(0.1, true), 1);
+ assertEquals(sketch.getQuantile(0.2, true), 2);
+ assertEquals(sketch.getQuantile(0.3, true), 3);
+ assertEquals(sketch.getQuantile(0.4, true), 4);
+ assertEquals(sketch.getQuantile(0.5, true), 5);
+ assertEquals(sketch.getQuantile(0.6, true), 6);
+ assertEquals(sketch.getQuantile(0.7, true), 7);
+ assertEquals(sketch.getQuantile(0.8, true), 8);
+ assertEquals(sketch.getQuantile(0.9, true), 9);
+ assertEquals(sketch.getQuantile(1, true), 10); // always max value
+
+ // getQuantile() and getQuantiles() equivalence
+ {
+ // inclusive = false (default)
+ final Integer[] quantiles =
+ sketch.getQuantiles(new double[] {0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1});
+ for (int i = 0; i <= 10; i++) {
+ assertEquals(sketch.getQuantile(i / 10.0), quantiles[i]);
+ }
+ }
+ {
+ // inclusive = true
+ final Integer[] quantiles =
+ sketch.getQuantiles(new double[] {0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1}, true);
+ for (int i = 0; i <= 10; i++) {
+ assertEquals(sketch.getQuantile(i / 10.0, true), quantiles[i]);
+ }
+ }
+ }
+
@Test
public void estimation() {
final ItemsSketch<Integer> sketch = ItemsSketch.getInstance(128, Comparator.naturalOrder());
@@ -432,9 +489,17 @@ public class ItemsSketchTest {
sketch.update(i);
values[i] = i;
}
- final double[] ranks = sketch.getCDF(values);
- for (int i = 0; i < n; i++) {
- Assert.assertEquals(ranks[i], sketch.getRank(values[i]), 0.00001, "CDF vs rank for value " + i);
+ { // inclusive = false (default)
+ final double[] ranks = sketch.getCDF(values);
+ for (int i = 0; i < n; i++) {
+ Assert.assertEquals(ranks[i], sketch.getRank(values[i]), 0.00001, "CDF vs rank for value " + i);
+ }
+ }
+ { // inclusive = true
+ final double[] ranks = sketch.getCDF(values, true);
+ for (int i = 0; i < n; i++) {
+ Assert.assertEquals(ranks[i], sketch.getRank(values[i], true), 0.00001, "CDF vs rank for value " + i);
+ }
}
}
diff --git a/src/test/java/org/apache/datasketches/quantiles/UtilTest.java b/src/test/java/org/apache/datasketches/quantiles/UtilTest.java
index 275ed7e4..54aed0e0 100644
--- a/src/test/java/org/apache/datasketches/quantiles/UtilTest.java
+++ b/src/test/java/org/apache/datasketches/quantiles/UtilTest.java
@@ -90,7 +90,7 @@ public class UtilTest {
final double[] splitPoints = {0.25, 0.4};
final double[] counters = {0, 0, 0};
final long[] answers = {200, 100, 200};
- DoublesPmfCdfImpl.bilinearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters);
+ DoublesPmfCdfImpl.bilinearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters, false);
for (int j = 0; j < counters.length; j++) {
assertEquals(counters[j], answers[j], 0.00001);
// System.out.printf ("counter[%d] = %d%n", j, counters[j]);
@@ -102,7 +102,7 @@ public class UtilTest {
final double[] splitPoints = {0.01, 0.02};
final double[] counters = {0, 0, 0};
final long[] answers = {0, 0, 500};
- DoublesPmfCdfImpl.bilinearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters);
+ DoublesPmfCdfImpl.bilinearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters, false);
for (int j = 0; j < counters.length; j++) {
assertEquals(counters[j], answers[j], 0.00001);
// System.out.printf ("counter[%d] = %d%n", j, counters[j]);
@@ -114,7 +114,7 @@ public class UtilTest {
final double[] splitPoints = {0.8, 0.9};
final double[] counters = {0, 0, 0};
final long[] answers = {500, 0, 0};
- DoublesPmfCdfImpl.bilinearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters);
+ DoublesPmfCdfImpl.bilinearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters, false);
for (int j = 0; j < counters.length; j++) {
assertEquals(counters[j], answers[j], 0.00001);
// System.out.printf ("counter[%d] = %d%n", j, counters[j]);
@@ -136,7 +136,7 @@ public class UtilTest {
final double[] splitPoints = {0.25, 0.4};
final double[] counters = {0, 0, 0};
final long[] answers = {200, 100, 200};
- DoublesPmfCdfImpl.linearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters);
+ DoublesPmfCdfImpl.linearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters, false);
for (int j = 0; j < counters.length; j++) {
assertEquals(counters[j], answers[j], 0.00001);
// System.out.printf ("counter[%d] = %d%n", j, counters[j]);
@@ -148,7 +148,7 @@ public class UtilTest {
final double[] splitPoints = {0.01, 0.02};
final double[] counters = {0, 0, 0};
final long[] answers = {0, 0, 500};
- DoublesPmfCdfImpl.linearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters);
+ DoublesPmfCdfImpl.linearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters, false);
for (int j = 0; j < counters.length; j++) {
assertEquals(counters[j], answers[j], 0.00001);
// System.out.printf ("counter[%d] = %d%n", j, counters[j]);
@@ -160,7 +160,7 @@ public class UtilTest {
final double[] splitPoints = {0.8, 0.9};
final double[] counters = {0, 0, 0};
final long[] answers = {500, 0, 0};
- DoublesPmfCdfImpl.linearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters);
+ DoublesPmfCdfImpl.linearTimeIncrementHistogramCounters(accessor, 100, splitPoints, counters, false);
for (int j = 0; j < counters.length; j++) {
assertEquals(counters[j], answers[j], 0.00001);
// System.out.printf ("counter[%d] = %d%n", j, counters[j]);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org