You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/08/10 22:48:12 UTC
[incubator-datasketches-java] 03/03: Moving methods around to make
them easier to find.
This is an automated email from the ASF dual-hosted git repository.
leerho pushed a commit to branch UpdateKLL
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git
commit 09cab7b1fa16e2fa2dcaf5a66aea48d8d8758925
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Mon Aug 10 15:47:29 2020 -0700
Moving methods around to make them easier to find.
---
.../apache/datasketches/kll/KllFloatsSketch.java | 505 +++++++++++----------
1 file changed, 253 insertions(+), 252 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
index 98b6169..e611233 100644
--- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
@@ -190,12 +190,12 @@ public class KllFloatsSketch {
* || 23 | 22 | 21 | 20 | 19 | 18 | 17 | 16 |
* 2 ||<--------------data----------------| unused |numLevels|-------min K-----------|
*
- * Serialized sketch layout, Single Item:
+ * Serialized sketch layout, Empty and Single Item:
* Adr:
* || 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 |
* 0 || unused | M |--------K--------| Flags | FamID | SerVer | PreambleInts |
* || 15 | 14 | 13 | 12 | 11 | 10 | 9 | 8 |
- * 1 ||<--------------------------------data-----------------------------------------|
+ * 1 || |-------------------data-------------------|
*/
// Preamble byte addresses
@@ -205,15 +205,17 @@ public class KllFloatsSketch {
private static final int FLAGS_BYTE = 3;
private static final int K_SHORT = 4; // to 5
private static final int M_BYTE = 6;
+ // 7 is reserved for future use
private static final int N_LONG = 8; // to 15
private static final int MIN_K_SHORT = 16; // to 17
private static final int NUM_LEVELS_BYTE = 18;
+ // 19 is reserved for future use
private static final int DATA_START = 20; // if using items larger than 4 bytes, use 24
private static final int DATA_START_SINGLE_ITEM = 8;
// Other static values
- private static final byte serialVersionUID1 = 1;
- private static final byte serialVersionUID2 = 2;
+ private static final byte serialVersionUID1 = 1;
+ private static final byte serialVersionUID2 = 2;
private static final int PREAMBLE_INTS_SMALL = 2; // for empty and single item
private static final int PREAMBLE_INTS_FULL = 5; // if using items larger than 4 bytes, use 6
@@ -240,15 +242,14 @@ public class KllFloatsSketch {
private int minK_; // for error estimation after merging with different k
private long n_; // number of items input into this sketch
private int numLevels_; // one-based number of current levels,
- private int[] levels_; // array of index offsets to the levels. Size = numLevels + 1.
+ private int[] levels_; // array of index offsets into the items[]. Size = numLevels + 1.
private boolean isLevelZeroSorted_;
// Specific to the floats sketch
- private float[] items_;
+ private float[] items_; // the continuous array of float items
private float minValue_;
private float maxValue_;
-
/**
* Heap constructor with the default <em>k = 200</em>, which has a rank error of about 1.65%.
*/
@@ -385,6 +386,31 @@ public class KllFloatsSketch {
// public functions
/**
+ * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
+ * cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
+ *
+ * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+ * getNormalizedRankError(false) function.
+ *
+ * <p>If the sketch is empty this returns null.</p>
+ *
+ * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
+ * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+ * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+ * exclusive of the right splitPoint, with the exception that the last interval will include
+ * the maximum value.
+ * It is not necessary to include either the min or max values in these split points.
+ *
+ * @return an array of m+1 double values, which are a consecutive approximation to the CDF
+ * of the input stream given the splitPoints. The value at array position j of the returned
+ * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
+ * array.
+ */
+ public double[] getCDF(final float[] splitPoints) {
+ return getPmfOrCdf(splitPoints, true);
+ }
+
+ /**
* Returns the parameter k
* @return parameter k
*/
@@ -393,6 +419,50 @@ public class KllFloatsSketch {
}
/**
+ * Gets the approximate value of <em>k</em> to use given epsilon, the normalized rank error.
+ * @param epsilon the normalized rank error between zero and one.
+ * @param pmf if true, this function returns the value of <em>k</em> assuming the input epsilon
+ * is the desired "double-sided" epsilon for the getPMF() function. Otherwise, this function
+ * returns the value of <em>k</em> assuming the input epsilon is the desired "single-sided"
+ * epsilon for all the other queries.
+ * @return the value of <i>k</i> given a value of epsilon.
+ * @see KllFloatsSketch
+ */
+ // constants were derived as the best fit to 99 percentile empirically measured max error in
+ // thousands of trials
+ public static int getKFromEpsilon(final double epsilon, final boolean pmf) {
+ //Ensure that eps is >= than the lowest possible eps given MAX_K and pmf=false.
+ final double eps = max(epsilon, 4.7634E-5);
+ final double kdbl = pmf
+ ? exp(log(2.446 / eps) / 0.9433)
+ : exp(log(2.296 / eps) / 0.9723);
+ final double krnd = round(kdbl);
+ final double del = abs(krnd - kdbl);
+ final int k = (int) ((del < 1E-6) ? krnd : ceil(kdbl));
+ return max(MIN_K, min(MAX_K, k));
+ }
+
+ /**
+ * Returns the max value of the stream.
+ * If the sketch is empty this returns NaN.
+ *
+ * @return the max value of the stream
+ */
+ public float getMaxValue() {
+ return maxValue_;
+ }
+
+ /**
+ * Returns the min value of the stream.
+ * If the sketch is empty this returns NaN.
+ *
+ * @return the min value of the stream
+ */
+ public float getMinValue() {
+ return minValue_;
+ }
+
+ /**
* Returns the length of the input stream.
* @return stream length
*/
@@ -401,100 +471,107 @@ public class KllFloatsSketch {
}
/**
- * Returns true if this sketch is empty.
- * @return empty flag
+ * Gets the approximate "double-sided" rank error for the <i>getPMF()</i> function of this
+ * sketch normalized as a fraction between zero and one.
+ *
+ * @return the rank error normalized as a fraction between zero and one.
+ * @deprecated replaced by {@link #getNormalizedRankError(boolean)}
+ * @see KllFloatsSketch
*/
- public boolean isEmpty() {
- return n_ == 0;
+ @Deprecated
+ public double getNormalizedRankError() {
+ return getNormalizedRankError(true);
}
/**
- * Returns the number of retained items (samples) in the sketch.
- * @return the number of retained items (samples) in the sketch
+ * Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
+ * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+ * @return if pmf is true, returns the normalized rank error for the getPMF() function.
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+ * @see KllFloatsSketch
*/
- public int getNumRetained() {
- return levels_[numLevels_] - levels_[0];
+ public double getNormalizedRankError(final boolean pmf) {
+ return getNormalizedRankError(minK_, pmf);
}
/**
- * Returns true if this sketch is in estimation mode.
- * @return estimation mode flag
+ * Static method version of the double-sided {@link #getNormalizedRankError()} that
+ * specifies <em>k</em>.
+ * @param k the configuration parameter
+ * @return the normalized "double-sided" rank error as a function of <em>k</em>.
+ * @see KllFloatsSketch
+ * @deprecated replaced by {@link #getNormalizedRankError(int, boolean)}
*/
- public boolean isEstimationMode() {
- return numLevels_ > 1;
+ @Deprecated
+ public static double getNormalizedRankError(final int k) {
+ return getNormalizedRankError(k, true);
}
/**
- * Updates this sketch with the given data item.
- *
- * @param value an item from a stream of items. NaNs are ignored.
+ * Gets the normalized rank error given k and pmf.
+ * Static method version of the {@link #getNormalizedRankError(boolean)}.
+ * @param k the configuation parameter
+ * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+ * @return if pmf is true, the normalized rank error for the getPMF() function.
+ * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+ * @see KllFloatsSketch
*/
- public void update(final float value) {
- if (Float.isNaN(value)) { return; }
- if (isEmpty()) {
- minValue_ = value;
- maxValue_ = value;
- } else {
- if (value < minValue_) { minValue_ = value; }
- if (value > maxValue_) { maxValue_ = value; }
- }
- if (levels_[0] == 0) {
- compressWhileUpdating();
- }
- n_++;
- isLevelZeroSorted_ = false;
- final int nextPos = levels_[0] - 1;
- assert levels_[0] >= 0;
- levels_[0] = nextPos;
- items_[nextPos] = value;
+ // constants were derived as the best fit to 99 percentile empirically measured max error in
+ // thousands of trials
+ public static double getNormalizedRankError(final int k, final boolean pmf) {
+ return pmf
+ ? 2.446 / pow(k, 0.9433)
+ : 2.296 / pow(k, 0.9723);
}
/**
- * Merges another sketch into this one.
- * @param other sketch to merge into this one
+ * Returns the number of retained items (samples) in the sketch.
+ * @return the number of retained items (samples) in the sketch
*/
- public void merge(final KllFloatsSketch other) {
- if ((other == null) || other.isEmpty()) { return; }
- if (m_ != other.m_) {
- throw new SketchesArgumentException("incompatible M: " + m_ + " and " + other.m_);
- }
- final long finalN = n_ + other.n_;
- //update this sketch with level0 items from the other sketch
- for (int i = other.levels_[0]; i < other.levels_[1]; i++) {
- update(other.items_[i]);
- }
- if (other.numLevels_ >= 2) { //now merge other levels if they exist
- mergeHigherLevels(other, finalN);
- }
- //update min, max values, n
- if (Float.isNaN(minValue_) || (other.minValue_ < minValue_)) { minValue_ = other.minValue_; }
- if (Float.isNaN(maxValue_) || (other.maxValue_ > maxValue_)) { maxValue_ = other.maxValue_; }
- n_ = finalN;
-
- assertCorrectTotalWeight();
- if (other.isEstimationMode()) {
- minK_ = min(minK_, other.minK_);
- }
+ public int getNumRetained() {
+ return levels_[numLevels_] - levels_[0];
}
/**
- * Returns the min value of the stream.
- * If the sketch is empty this returns NaN.
- *
- * @return the min value of the stream
+ * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
+ * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
+ * This method can be used if allocation of storage is necessary beforehand, but it is not
+ * optimal.
+ * @param k parameter that controls size of the sketch and accuracy of estimates
+ * @param n stream length
+ * @return upper bound on the serialized size
*/
- public float getMinValue() {
- return minValue_;
+ public static int getMaxSerializedSizeBytes(final int k, final long n) {
+ final int numLevels = KllHelper.ubOnNumLevels(n);
+ final int maxNumItems = KllHelper.computeTotalCapacity(k, DEFAULT_M, numLevels);
+ return getSerializedSizeBytes(numLevels, maxNumItems);
}
/**
- * Returns the max value of the stream.
- * If the sketch is empty this returns NaN.
+ * Returns an approximation to the Probability Mass Function (PMF) of the input stream
+ * given a set of splitPoints (values).
*
- * @return the max value of the stream
+ * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+ * getNormalizedRankError(true) function.
+ *
+ * <p>If the sketch is empty this returns null.</p>
+ *
+ * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
+ * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+ * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+ * exclusive of the right splitPoint, with the exception that the last interval will include
+ * the maximum value.
+ * It is not necessary to include either the min or max values in these split points.
+ *
+ * @return an array of m+1 doubles each of which is an approximation
+ * to the fraction of the input stream values (the mass) that fall into one of those intervals.
+ * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
+ * splitPoint, with the exception that the last interval will include maximum value.
*/
- public float getMaxValue() {
- return maxValue_;
+ public double[] getPMF(final float[] splitPoints) {
+ return getPmfOrCdf(splitPoints, false);
}
/**
@@ -640,157 +717,105 @@ public class KllFloatsSketch {
}
/**
- * Returns an approximation to the Probability Mass Function (PMF) of the input stream
- * given a set of splitPoints (values).
- *
- * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
- * getNormalizedRankError(true) function.
- *
- * <p>If the sketch is empty this returns null.</p>
- *
- * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
- * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
- * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
- * exclusive of the right splitPoint, with the exception that the last interval will include
- * the maximum value.
- * It is not necessary to include either the min or max values in these split points.
- *
- * @return an array of m+1 doubles each of which is an approximation
- * to the fraction of the input stream values (the mass) that fall into one of those intervals.
- * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
- * splitPoint, with the exception that the last interval will include maximum value.
- */
- public double[] getPMF(final float[] splitPoints) {
- return getPmfOrCdf(splitPoints, false);
- }
-
- /**
- * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
- * cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
- *
- * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
- * getNormalizedRankError(false) function.
- *
- * <p>If the sketch is empty this returns null.</p>
- *
- * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
- * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
- * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
- * exclusive of the right splitPoint, with the exception that the last interval will include
- * the maximum value.
- * It is not necessary to include either the min or max values in these split points.
- *
- * @return an array of m+1 double values, which are a consecutive approximation to the CDF
- * of the input stream given the splitPoints. The value at array position j of the returned
- * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
- * array.
- */
- public double[] getCDF(final float[] splitPoints) {
- return getPmfOrCdf(splitPoints, true);
- }
-
- /**
- * Gets the approximate "double-sided" rank error for the <i>getPMF()</i> function of this
- * sketch normalized as a fraction between zero and one.
- *
- * @return the rank error normalized as a fraction between zero and one.
- * @deprecated replaced by {@link #getNormalizedRankError(boolean)}
- * @see KllFloatsSketch
+ * Returns the number of bytes this sketch would require to store.
+ * @return the number of bytes this sketch would require to store.
*/
- @Deprecated
- public double getNormalizedRankError() {
- return getNormalizedRankError(true);
+ public int getSerializedSizeBytes() {
+ if (isEmpty()) { return N_LONG; }
+ return getSerializedSizeBytes(numLevels_, getNumRetained());
}
/**
- * Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
- * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
- * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
- * @return if pmf is true, returns the normalized rank error for the getPMF() function.
- * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
- * @see KllFloatsSketch
+ * Returns true if this sketch is empty.
+ * @return empty flag
*/
- public double getNormalizedRankError(final boolean pmf) {
- return getNormalizedRankError(minK_, pmf);
+ public boolean isEmpty() {
+ return n_ == 0;
}
/**
- * Static method version of the double-sided {@link #getNormalizedRankError()} that
- * specifies <em>k</em>.
- * @param k the configuration parameter
- * @return the normalized "double-sided" rank error as a function of <em>k</em>.
- * @see KllFloatsSketch
- * @deprecated replaced by {@link #getNormalizedRankError(int, boolean)}
+ * Returns true if this sketch is in estimation mode.
+ * @return estimation mode flag
*/
- @Deprecated
- public static double getNormalizedRankError(final int k) {
- return getNormalizedRankError(k, true);
+ public boolean isEstimationMode() {
+ return numLevels_ > 1;
}
/**
- * Gets the normalized rank error given k and pmf.
- * Static method version of the {@link #getNormalizedRankError(boolean)}.
- * @param k the configuation parameter
- * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
- * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
- * @return if pmf is true, the normalized rank error for the getPMF() function.
- * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
- * @see KllFloatsSketch
+ * @return the iterator for this class
*/
- // constants were derived as the best fit to 99 percentile empirically measured max error in
- // thousands of trials
- public static double getNormalizedRankError(final int k, final boolean pmf) {
- return pmf
- ? 2.446 / pow(k, 0.9433)
- : 2.296 / pow(k, 0.9723);
+ public KllFloatsSketchIterator iterator() {
+ return new KllFloatsSketchIterator(items_, levels_, numLevels_);
}
/**
- * Gets the approximate value of <em>k</em> to use given epsilon, the normalized rank error.
- * @param epsilon the normalized rank error between zero and one.
- * @param pmf if true, this function returns the value of <em>k</em> assuming the input epsilon
- * is the desired "double-sided" epsilon for the getPMF() function. Otherwise, this function
- * returns the value of <em>k</em> assuming the input epsilon is the desired "single-sided"
- * epsilon for all the other queries.
- * @return the value of <i>k</i> given a value of epsilon.
- * @see KllFloatsSketch
+ * Merges another sketch into this one.
+ * @param other sketch to merge into this one
*/
- // constants were derived as the best fit to 99 percentile empirically measured max error in
- // thousands of trials
- public static int getKFromEpsilon(final double epsilon, final boolean pmf) {
- //Ensure that eps is >= than the lowest possible eps given MAX_K and pmf=false.
- final double eps = max(epsilon, 4.7634E-5);
- final double kdbl = pmf
- ? exp(log(2.446 / eps) / 0.9433)
- : exp(log(2.296 / eps) / 0.9723);
- final double krnd = round(kdbl);
- final double del = abs(krnd - kdbl);
- final int k = (int) ((del < 1E-6) ? krnd : ceil(kdbl));
- return max(MIN_K, min(MAX_K, k));
- }
+ public void merge(final KllFloatsSketch other) {
+ if ((other == null) || other.isEmpty()) { return; }
+ if (m_ != other.m_) {
+ throw new SketchesArgumentException("incompatible M: " + m_ + " and " + other.m_);
+ }
+ final long finalN = n_ + other.n_;
+ //update this sketch with level0 items from the other sketch
+ for (int i = other.levels_[0]; i < other.levels_[1]; i++) {
+ update(other.items_[i]);
+ }
+ if (other.numLevels_ >= 2) { //now merge other levels if they exist
+ mergeHigherLevels(other, finalN);
+ }
+ //update min, max values, n
+ if (Float.isNaN(minValue_) || (other.minValue_ < minValue_)) { minValue_ = other.minValue_; }
+ if (Float.isNaN(maxValue_) || (other.maxValue_ > maxValue_)) { maxValue_ = other.maxValue_; }
+ n_ = finalN;
- /**
- * Returns the number of bytes this sketch would require to store.
- * @return the number of bytes this sketch would require to store.
- */
- public int getSerializedSizeBytes() {
- if (isEmpty()) { return N_LONG; }
- return getSerializedSizeBytes(numLevels_, getNumRetained());
+ assertCorrectTotalWeight();
+ if (other.isEstimationMode()) {
+ minK_ = min(minK_, other.minK_);
+ }
}
/**
- * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
- * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
- * This method can be used if allocation of storage is necessary beforehand, but it is not
- * optimal.
- * @param k parameter that controls size of the sketch and accuracy of estimates
- * @param n stream length
- * @return upper bound on the serialized size
+ * Returns serialized sketch in a byte array form.
+ * @return serialized sketch in a byte array form.
*/
- public static int getMaxSerializedSizeBytes(final int k, final long n) {
- final int numLevels = KllHelper.ubOnNumLevels(n);
- final int maxNumItems = KllHelper.computeTotalCapacity(k, DEFAULT_M, numLevels);
- return getSerializedSizeBytes(numLevels, maxNumItems);
+ public byte[] toByteArray() {
+ final byte[] bytes = new byte[getSerializedSizeBytes()];
+ final boolean isSingleItem = n_ == 1;
+ bytes[PREAMBLE_INTS_BYTE] = (byte) (isEmpty() || isSingleItem ? PREAMBLE_INTS_SMALL : PREAMBLE_INTS_FULL);
+ bytes[SER_VER_BYTE] = isSingleItem ? serialVersionUID2 : serialVersionUID1;
+ bytes[FAMILY_BYTE] = (byte) Family.KLL.getID();
+ bytes[FLAGS_BYTE] = (byte) (
+ (isEmpty() ? 1 << Flags.IS_EMPTY.ordinal() : 0)
+ | (isLevelZeroSorted_ ? 1 << Flags.IS_LEVEL_ZERO_SORTED.ordinal() : 0)
+ | (isSingleItem ? 1 << Flags.IS_SINGLE_ITEM.ordinal() : 0)
+ );
+ ByteArrayUtil.putShortLE(bytes, K_SHORT, (short) k_);
+ bytes[M_BYTE] = (byte) m_;
+ if (isEmpty()) { return bytes; }
+ int offset = DATA_START_SINGLE_ITEM;
+ if (!isSingleItem) {
+ ByteArrayUtil.putLongLE(bytes, N_LONG, n_);
+ ByteArrayUtil.putShortLE(bytes, MIN_K_SHORT, (short) minK_);
+ bytes[NUM_LEVELS_BYTE] = (byte) numLevels_;
+ offset = DATA_START;
+ // the last integer in levels_ is not serialized because it can be derived
+ for (int i = 0; i < numLevels_; i++) {
+ ByteArrayUtil.putIntLE(bytes, offset, levels_[i]);
+ offset += Integer.BYTES;
+ }
+ ByteArrayUtil.putFloatLE(bytes, offset, minValue_);
+ offset += Float.BYTES;
+ ByteArrayUtil.putFloatLE(bytes, offset, maxValue_);
+ offset += Float.BYTES;
+ }
+ final int numItems = getNumRetained();
+ for (int i = 0; i < numItems; i++) {
+ ByteArrayUtil.putFloatLE(bytes, offset, items_[levels_[0] + i]);
+ offset += Float.BYTES;
+ }
+ return bytes;
}
@Override
@@ -828,9 +853,9 @@ public class KllFloatsSketch {
if (withLevels) {
sb.append("### KLL sketch levels:").append(Util.LS)
- .append(" index: nominal capacity, actual size").append(Util.LS);
+ .append(" level, offset: nominal capacity, actual size").append(Util.LS);
for (int i = 0; i < numLevels_; i++) {
- sb.append(" ").append(i).append(": ")
+ sb.append(" ").append(i).append(", ").append(levels_[i]).append(": ")
.append(KllHelper.levelCapacity(k_, numLevels_, i, m_))
.append(", ").append(safeLevelSize(i)).append(Util.LS);
}
@@ -858,52 +883,28 @@ public class KllFloatsSketch {
}
/**
- * Returns serialized sketch in a byte array form.
- * @return serialized sketch in a byte array form.
+ * Updates this sketch with the given data item.
+ *
+ * @param value an item from a stream of items. NaNs are ignored.
*/
- public byte[] toByteArray() {
- final byte[] bytes = new byte[getSerializedSizeBytes()];
- final boolean isSingleItem = n_ == 1;
- bytes[PREAMBLE_INTS_BYTE] = (byte) (isEmpty() || isSingleItem ? PREAMBLE_INTS_SMALL : PREAMBLE_INTS_FULL);
- bytes[SER_VER_BYTE] = isSingleItem ? serialVersionUID2 : serialVersionUID1;
- bytes[FAMILY_BYTE] = (byte) Family.KLL.getID();
- bytes[FLAGS_BYTE] = (byte) (
- (isEmpty() ? 1 << Flags.IS_EMPTY.ordinal() : 0)
- | (isLevelZeroSorted_ ? 1 << Flags.IS_LEVEL_ZERO_SORTED.ordinal() : 0)
- | (isSingleItem ? 1 << Flags.IS_SINGLE_ITEM.ordinal() : 0)
- );
- ByteArrayUtil.putShortLE(bytes, K_SHORT, (short) k_);
- bytes[M_BYTE] = (byte) m_;
- if (isEmpty()) { return bytes; }
- int offset = DATA_START_SINGLE_ITEM;
- if (!isSingleItem) {
- ByteArrayUtil.putLongLE(bytes, N_LONG, n_);
- ByteArrayUtil.putShortLE(bytes, MIN_K_SHORT, (short) minK_);
- bytes[NUM_LEVELS_BYTE] = (byte) numLevels_;
- offset = DATA_START;
- // the last integer in levels_ is not serialized because it can be derived
- for (int i = 0; i < numLevels_; i++) {
- ByteArrayUtil.putIntLE(bytes, offset, levels_[i]);
- offset += Integer.BYTES;
- }
- ByteArrayUtil.putFloatLE(bytes, offset, minValue_);
- offset += Float.BYTES;
- ByteArrayUtil.putFloatLE(bytes, offset, maxValue_);
- offset += Float.BYTES;
+ public void update(final float value) {
+ if (Float.isNaN(value)) { return; }
+ if (isEmpty()) {
+ minValue_ = value;
+ maxValue_ = value;
+ } else {
+ if (value < minValue_) { minValue_ = value; }
+ if (value > maxValue_) { maxValue_ = value; }
}
- final int numItems = getNumRetained();
- for (int i = 0; i < numItems; i++) {
- ByteArrayUtil.putFloatLE(bytes, offset, items_[levels_[0] + i]);
- offset += Float.BYTES;
+ if (levels_[0] == 0) {
+ compressWhileUpdating();
}
- return bytes;
- }
-
- /**
- * @return the iterator for this class
- */
- public KllFloatsSketchIterator iterator() {
- return new KllFloatsSketchIterator(items_, levels_, numLevels_);
+ n_++;
+ isLevelZeroSorted_ = false;
+ final int nextPos = levels_[0] - 1;
+ assert levels_[0] >= 0;
+ levels_[0] = nextPos;
+ items_[nextPos] = value;
}
// Restricted Methods
@@ -912,7 +913,7 @@ public class KllFloatsSketch {
* Checks the validity of the given value k
* @param k must be greater than 7 and less than 65536.
*/
- static void checkK(final int k) {
+ private static void checkK(final int k) {
if ((k < MIN_K) || (k > MAX_K)) {
throw new SketchesArgumentException(
"K must be >= " + MIN_K + " and <= " + MAX_K + ": " + k);
@@ -1022,12 +1023,12 @@ public class KllFloatsSketch {
KllHelper.mergeSortedArrays(items_, adjBeg, halfAdjPop, items_, rawLim, popAbove,
items_, adjBeg + halfAdjPop);
}
- levels_[level + 1] -= halfAdjPop; // adjust boundaries of the level above
+ levels_[level + 1] -= halfAdjPop; // adjust boundaries of the level above
if (oddPop) {
levels_[level] = levels_[level + 1] - 1; // the current level now contains one item
items_[levels_[level]] = items_[rawBeg]; // namely this leftover guy
} else {
- levels_[level] = levels_[level + 1]; // the current level is now empty
+ levels_[level] = levels_[level + 1]; // the current level is now empty
}
// verify that we freed up halfAdjPop array slots just below the current level
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org