You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/08/10 22:48:12 UTC

[incubator-datasketches-java] 03/03: Moving methods around to make them easier to find.

This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch UpdateKLL
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit 09cab7b1fa16e2fa2dcaf5a66aea48d8d8758925
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Mon Aug 10 15:47:29 2020 -0700

    Moving methods around to make them easier to find.
---
 .../apache/datasketches/kll/KllFloatsSketch.java   | 505 +++++++++++----------
 1 file changed, 253 insertions(+), 252 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
index 98b6169..e611233 100644
--- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
@@ -190,12 +190,12 @@ public class KllFloatsSketch {
    *      ||   23    |   22  |   21   |   20   |   19   |    18   |   17   |      16      |
    *  2   ||<--------------data----------------| unused |numLevels|-------min K-----------|
    *
-   * Serialized sketch layout, Single Item:
+   * Serialized sketch layout, Empty and Single Item:
    *  Adr:
    *      ||    7    |   6   |    5   |    4   |    3   |    2    |    1   |      0       |
    *  0   || unused  |   M   |--------K--------|  Flags |  FamID  | SerVer | PreambleInts |
    *      ||   15    |   14  |   13   |   12   |   11   |   10    |    9   |      8       |
-   *  1   ||<--------------------------------data-----------------------------------------|
+   *  1   ||                                   |-------------------data-------------------|
    */
 
   // Preamble byte addresses
@@ -205,15 +205,17 @@ public class KllFloatsSketch {
   private static final int FLAGS_BYTE         = 3;
   private static final int K_SHORT            = 4;  // to 5
   private static final int M_BYTE             = 6;
+  //                                            7 is reserved for future use
   private static final int N_LONG             = 8;  // to 15
   private static final int MIN_K_SHORT        = 16; // to 17
   private static final int NUM_LEVELS_BYTE    = 18;
+  //                                            19 is reserved for future use
   private static final int DATA_START         = 20; // if using items larger than 4 bytes, use 24
   private static final int DATA_START_SINGLE_ITEM = 8;
 
   // Other static values
-  private static final byte serialVersionUID1 = 1;
-  private static final byte serialVersionUID2 = 2;
+  private static final byte serialVersionUID1  = 1;
+  private static final byte serialVersionUID2  = 2;
   private static final int PREAMBLE_INTS_SMALL = 2; // for empty and single item
   private static final int PREAMBLE_INTS_FULL  = 5; // if using items larger than 4 bytes, use 6
 
@@ -240,15 +242,14 @@ public class KllFloatsSketch {
   private int minK_;      // for error estimation after merging with different k
   private long n_;        // number of items input into this sketch
   private int numLevels_; // one-based number of current levels,
-  private int[] levels_;  // array of index offsets to the levels. Size = numLevels + 1.
+  private int[] levels_;  // array of index offsets into the items[]. Size = numLevels + 1.
   private boolean isLevelZeroSorted_;
 
   // Specific to the floats sketch
-  private float[] items_;
+  private float[] items_; // the continuous array of float items
   private float minValue_;
   private float maxValue_;
 
-
   /**
    * Heap constructor with the default <em>k = 200</em>, which has a rank error of about 1.65%.
    */
@@ -385,6 +386,31 @@ public class KllFloatsSketch {
   // public functions
 
   /**
+   * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
+   * cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
+   *
+   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+   * getNormalizedRankError(false) function.
+   *
+   * <p>If the sketch is empty this returns null.</p>
+   *
+   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
+   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+   * exclusive of the right splitPoint, with the exception that the last interval will include
+   * the maximum value.
+   * It is not necessary to include either the min or max values in these split points.
+   *
+   * @return an array of m+1 double values, which are a consecutive approximation to the CDF
+   * of the input stream given the splitPoints. The value at array position j of the returned
+   * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
+   * array.
+   */
+  public double[] getCDF(final float[] splitPoints) {
+    return getPmfOrCdf(splitPoints, true);
+  }
+
+  /**
    * Returns the parameter k
    * @return parameter k
    */
@@ -393,6 +419,50 @@ public class KllFloatsSketch {
   }
 
   /**
+   * Gets the approximate value of <em>k</em> to use given epsilon, the normalized rank error.
+   * @param epsilon the normalized rank error between zero and one.
+   * @param pmf if true, this function returns the value of <em>k</em> assuming the input epsilon
+   * is the desired "double-sided" epsilon for the getPMF() function. Otherwise, this function
+   * returns the value of <em>k</em> assuming the input epsilon is the desired "single-sided"
+   * epsilon for all the other queries.
+   * @return the value of <i>k</i> given a value of epsilon.
+   * @see KllFloatsSketch
+   */
+  // constants were derived as the best fit to 99 percentile empirically measured max error in
+  // thousands of trials
+  public static int getKFromEpsilon(final double epsilon, final boolean pmf) {
+    //Ensure that eps is >= than the lowest possible eps given MAX_K and pmf=false.
+    final double eps = max(epsilon, 4.7634E-5);
+    final double kdbl = pmf
+        ? exp(log(2.446 / eps) / 0.9433)
+        : exp(log(2.296 / eps) / 0.9723);
+    final double krnd = round(kdbl);
+    final double del = abs(krnd - kdbl);
+    final int k = (int) ((del < 1E-6) ? krnd : ceil(kdbl));
+    return max(MIN_K, min(MAX_K, k));
+  }
+
+  /**
+   * Returns the max value of the stream.
+   * If the sketch is empty this returns NaN.
+   *
+   * @return the max value of the stream
+   */
+  public float getMaxValue() {
+    return maxValue_;
+  }
+
+  /**
+   * Returns the min value of the stream.
+   * If the sketch is empty this returns NaN.
+   *
+   * @return the min value of the stream
+   */
+  public float getMinValue() {
+    return minValue_;
+  }
+
+  /**
    * Returns the length of the input stream.
    * @return stream length
    */
@@ -401,100 +471,107 @@ public class KllFloatsSketch {
   }
 
   /**
-   * Returns true if this sketch is empty.
-   * @return empty flag
+   * Gets the approximate "double-sided" rank error for the <i>getPMF()</i> function of this
+   * sketch normalized as a fraction between zero and one.
+   *
+   * @return the rank error normalized as a fraction between zero and one.
+   * @deprecated replaced by {@link #getNormalizedRankError(boolean)}
+   * @see KllFloatsSketch
    */
-  public boolean isEmpty() {
-    return n_ == 0;
+  @Deprecated
+  public double getNormalizedRankError() {
+    return getNormalizedRankError(true);
   }
 
   /**
-   * Returns the number of retained items (samples) in the sketch.
-   * @return the number of retained items (samples) in the sketch
+   * Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
+   * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
+   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+   * @return if pmf is true, returns the normalized rank error for the getPMF() function.
+   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+   * @see KllFloatsSketch
    */
-  public int getNumRetained() {
-    return levels_[numLevels_] - levels_[0];
+  public double getNormalizedRankError(final boolean pmf) {
+    return getNormalizedRankError(minK_, pmf);
   }
 
   /**
-   * Returns true if this sketch is in estimation mode.
-   * @return estimation mode flag
+   * Static method version of the double-sided {@link #getNormalizedRankError()} that
+   * specifies <em>k</em>.
+   * @param k the configuration parameter
+   * @return the normalized "double-sided" rank error as a function of <em>k</em>.
+   * @see KllFloatsSketch
+   * @deprecated replaced by {@link #getNormalizedRankError(int, boolean)}
    */
-  public boolean isEstimationMode() {
-    return numLevels_ > 1;
+  @Deprecated
+  public static double getNormalizedRankError(final int k) {
+    return getNormalizedRankError(k, true);
   }
 
   /**
-   * Updates this sketch with the given data item.
-   *
-   * @param value an item from a stream of items. NaNs are ignored.
+   * Gets the normalized rank error given k and pmf.
+   * Static method version of the {@link #getNormalizedRankError(boolean)}.
+   * @param k the configuation parameter
+   * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
+   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+   * @return if pmf is true, the normalized rank error for the getPMF() function.
+   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+   * @see KllFloatsSketch
    */
-  public void update(final float value) {
-    if (Float.isNaN(value)) { return; }
-    if (isEmpty()) {
-      minValue_ = value;
-      maxValue_ = value;
-    } else {
-      if (value < minValue_) { minValue_ = value; }
-      if (value > maxValue_) { maxValue_ = value; }
-    }
-    if (levels_[0] == 0) {
-      compressWhileUpdating();
-    }
-    n_++;
-    isLevelZeroSorted_ = false;
-    final int nextPos = levels_[0] - 1;
-    assert levels_[0] >= 0;
-    levels_[0] = nextPos;
-    items_[nextPos] = value;
+  // constants were derived as the best fit to 99 percentile empirically measured max error in
+  // thousands of trials
+  public static double getNormalizedRankError(final int k, final boolean pmf) {
+    return pmf
+        ? 2.446 / pow(k, 0.9433)
+        : 2.296 / pow(k, 0.9723);
   }
 
   /**
-   * Merges another sketch into this one.
-   * @param other sketch to merge into this one
+   * Returns the number of retained items (samples) in the sketch.
+   * @return the number of retained items (samples) in the sketch
    */
-  public void merge(final KllFloatsSketch other) {
-    if ((other == null) || other.isEmpty()) { return; }
-    if (m_ != other.m_) {
-      throw new SketchesArgumentException("incompatible M: " + m_ + " and " + other.m_);
-    }
-    final long finalN = n_ + other.n_;
-    //update this sketch with level0 items from the other sketch
-    for (int i = other.levels_[0]; i < other.levels_[1]; i++) {
-      update(other.items_[i]);
-    }
-    if (other.numLevels_ >= 2) { //now merge other levels if they exist
-      mergeHigherLevels(other, finalN);
-    }
-    //update min, max values, n
-    if (Float.isNaN(minValue_) || (other.minValue_ < minValue_)) { minValue_ = other.minValue_; }
-    if (Float.isNaN(maxValue_) || (other.maxValue_ > maxValue_)) { maxValue_ = other.maxValue_; }
-    n_ = finalN;
-
-    assertCorrectTotalWeight();
-    if (other.isEstimationMode()) {
-      minK_ = min(minK_, other.minK_);
-    }
+  public int getNumRetained() {
+    return levels_[numLevels_] - levels_[0];
   }
 
   /**
-   * Returns the min value of the stream.
-   * If the sketch is empty this returns NaN.
-   *
-   * @return the min value of the stream
+   * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
+   * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
+   * This method can be used if allocation of storage is necessary beforehand, but it is not
+   * optimal.
+   * @param k parameter that controls size of the sketch and accuracy of estimates
+   * @param n stream length
+   * @return upper bound on the serialized size
    */
-  public float getMinValue() {
-    return minValue_;
+  public static int getMaxSerializedSizeBytes(final int k, final long n) {
+    final int numLevels = KllHelper.ubOnNumLevels(n);
+    final int maxNumItems = KllHelper.computeTotalCapacity(k, DEFAULT_M, numLevels);
+    return getSerializedSizeBytes(numLevels, maxNumItems);
   }
 
   /**
-   * Returns the max value of the stream.
-   * If the sketch is empty this returns NaN.
+   * Returns an approximation to the Probability Mass Function (PMF) of the input stream
+   * given a set of splitPoints (values).
    *
-   * @return the max value of the stream
+   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+   * getNormalizedRankError(true) function.
+   *
+   * <p>If the sketch is empty this returns null.</p>
+   *
+   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
+   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+   * exclusive of the right splitPoint, with the exception that the last interval will include
+   * the maximum value.
+   * It is not necessary to include either the min or max values in these split points.
+   *
+   * @return an array of m+1 doubles each of which is an approximation
+   * to the fraction of the input stream values (the mass) that fall into one of those intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
+   * splitPoint, with the exception that the last interval will include maximum value.
    */
-  public float getMaxValue() {
-    return maxValue_;
+  public double[] getPMF(final float[] splitPoints) {
+    return getPmfOrCdf(splitPoints, false);
   }
 
   /**
@@ -640,157 +717,105 @@ public class KllFloatsSketch {
   }
 
   /**
-   * Returns an approximation to the Probability Mass Function (PMF) of the input stream
-   * given a set of splitPoints (values).
-   *
-   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
-   * getNormalizedRankError(true) function.
-   *
-   * <p>If the sketch is empty this returns null.</p>
-   *
-   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
-   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
-   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
-   * exclusive of the right splitPoint, with the exception that the last interval will include
-   * the maximum value.
-   * It is not necessary to include either the min or max values in these split points.
-   *
-   * @return an array of m+1 doubles each of which is an approximation
-   * to the fraction of the input stream values (the mass) that fall into one of those intervals.
-   * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
-   * splitPoint, with the exception that the last interval will include maximum value.
-   */
-  public double[] getPMF(final float[] splitPoints) {
-    return getPmfOrCdf(splitPoints, false);
-  }
-
-  /**
-   * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
-   * cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
-   *
-   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
-   * getNormalizedRankError(false) function.
-   *
-   * <p>If the sketch is empty this returns null.</p>
-   *
-   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
-   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
-   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
-   * exclusive of the right splitPoint, with the exception that the last interval will include
-   * the maximum value.
-   * It is not necessary to include either the min or max values in these split points.
-   *
-   * @return an array of m+1 double values, which are a consecutive approximation to the CDF
-   * of the input stream given the splitPoints. The value at array position j of the returned
-   * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
-   * array.
-   */
-  public double[] getCDF(final float[] splitPoints) {
-    return getPmfOrCdf(splitPoints, true);
-  }
-
-  /**
-   * Gets the approximate "double-sided" rank error for the <i>getPMF()</i> function of this
-   * sketch normalized as a fraction between zero and one.
-   *
-   * @return the rank error normalized as a fraction between zero and one.
-   * @deprecated replaced by {@link #getNormalizedRankError(boolean)}
-   * @see KllFloatsSketch
+   * Returns the number of bytes this sketch would require to store.
+   * @return the number of bytes this sketch would require to store.
    */
-  @Deprecated
-  public double getNormalizedRankError() {
-    return getNormalizedRankError(true);
+  public int getSerializedSizeBytes() {
+    if (isEmpty()) { return N_LONG; }
+    return getSerializedSizeBytes(numLevels_, getNumRetained());
   }
 
   /**
-   * Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
-   * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
-   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
-   * @return if pmf is true, returns the normalized rank error for the getPMF() function.
-   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
-   * @see KllFloatsSketch
+   * Returns true if this sketch is empty.
+   * @return empty flag
    */
-  public double getNormalizedRankError(final boolean pmf) {
-    return getNormalizedRankError(minK_, pmf);
+  public boolean isEmpty() {
+    return n_ == 0;
   }
 
   /**
-   * Static method version of the double-sided {@link #getNormalizedRankError()} that
-   * specifies <em>k</em>.
-   * @param k the configuration parameter
-   * @return the normalized "double-sided" rank error as a function of <em>k</em>.
-   * @see KllFloatsSketch
-   * @deprecated replaced by {@link #getNormalizedRankError(int, boolean)}
+   * Returns true if this sketch is in estimation mode.
+   * @return estimation mode flag
    */
-  @Deprecated
-  public static double getNormalizedRankError(final int k) {
-    return getNormalizedRankError(k, true);
+  public boolean isEstimationMode() {
+    return numLevels_ > 1;
   }
 
   /**
-   * Gets the normalized rank error given k and pmf.
-   * Static method version of the {@link #getNormalizedRankError(boolean)}.
-   * @param k the configuation parameter
-   * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
-   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
-   * @return if pmf is true, the normalized rank error for the getPMF() function.
-   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
-   * @see KllFloatsSketch
+   * @return the iterator for this class
    */
-  // constants were derived as the best fit to 99 percentile empirically measured max error in
-  // thousands of trials
-  public static double getNormalizedRankError(final int k, final boolean pmf) {
-    return pmf
-        ? 2.446 / pow(k, 0.9433)
-        : 2.296 / pow(k, 0.9723);
+  public KllFloatsSketchIterator iterator() {
+    return new KllFloatsSketchIterator(items_, levels_, numLevels_);
   }
 
   /**
-   * Gets the approximate value of <em>k</em> to use given epsilon, the normalized rank error.
-   * @param epsilon the normalized rank error between zero and one.
-   * @param pmf if true, this function returns the value of <em>k</em> assuming the input epsilon
-   * is the desired "double-sided" epsilon for the getPMF() function. Otherwise, this function
-   * returns the value of <em>k</em> assuming the input epsilon is the desired "single-sided"
-   * epsilon for all the other queries.
-   * @return the value of <i>k</i> given a value of epsilon.
-   * @see KllFloatsSketch
+   * Merges another sketch into this one.
+   * @param other sketch to merge into this one
    */
-  // constants were derived as the best fit to 99 percentile empirically measured max error in
-  // thousands of trials
-  public static int getKFromEpsilon(final double epsilon, final boolean pmf) {
-    //Ensure that eps is >= than the lowest possible eps given MAX_K and pmf=false.
-    final double eps = max(epsilon, 4.7634E-5);
-    final double kdbl = pmf
-        ? exp(log(2.446 / eps) / 0.9433)
-        : exp(log(2.296 / eps) / 0.9723);
-    final double krnd = round(kdbl);
-    final double del = abs(krnd - kdbl);
-    final int k = (int) ((del < 1E-6) ? krnd : ceil(kdbl));
-    return max(MIN_K, min(MAX_K, k));
-  }
+  public void merge(final KllFloatsSketch other) {
+    if ((other == null) || other.isEmpty()) { return; }
+    if (m_ != other.m_) {
+      throw new SketchesArgumentException("incompatible M: " + m_ + " and " + other.m_);
+    }
+    final long finalN = n_ + other.n_;
+    //update this sketch with level0 items from the other sketch
+    for (int i = other.levels_[0]; i < other.levels_[1]; i++) {
+      update(other.items_[i]);
+    }
+    if (other.numLevels_ >= 2) { //now merge other levels if they exist
+      mergeHigherLevels(other, finalN);
+    }
+    //update min, max values, n
+    if (Float.isNaN(minValue_) || (other.minValue_ < minValue_)) { minValue_ = other.minValue_; }
+    if (Float.isNaN(maxValue_) || (other.maxValue_ > maxValue_)) { maxValue_ = other.maxValue_; }
+    n_ = finalN;
 
-  /**
-   * Returns the number of bytes this sketch would require to store.
-   * @return the number of bytes this sketch would require to store.
-   */
-  public int getSerializedSizeBytes() {
-    if (isEmpty()) { return N_LONG; }
-    return getSerializedSizeBytes(numLevels_, getNumRetained());
+    assertCorrectTotalWeight();
+    if (other.isEstimationMode()) {
+      minK_ = min(minK_, other.minK_);
+    }
   }
 
   /**
-   * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
-   * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
-   * This method can be used if allocation of storage is necessary beforehand, but it is not
-   * optimal.
-   * @param k parameter that controls size of the sketch and accuracy of estimates
-   * @param n stream length
-   * @return upper bound on the serialized size
+   * Returns serialized sketch in a byte array form.
+   * @return serialized sketch in a byte array form.
    */
-  public static int getMaxSerializedSizeBytes(final int k, final long n) {
-    final int numLevels = KllHelper.ubOnNumLevels(n);
-    final int maxNumItems = KllHelper.computeTotalCapacity(k, DEFAULT_M, numLevels);
-    return getSerializedSizeBytes(numLevels, maxNumItems);
+  public byte[] toByteArray() {
+    final byte[] bytes = new byte[getSerializedSizeBytes()];
+    final boolean isSingleItem = n_ == 1;
+    bytes[PREAMBLE_INTS_BYTE] = (byte) (isEmpty() || isSingleItem ? PREAMBLE_INTS_SMALL : PREAMBLE_INTS_FULL);
+    bytes[SER_VER_BYTE] = isSingleItem ? serialVersionUID2 : serialVersionUID1;
+    bytes[FAMILY_BYTE] = (byte) Family.KLL.getID();
+    bytes[FLAGS_BYTE] = (byte) (
+        (isEmpty() ? 1 << Flags.IS_EMPTY.ordinal() : 0)
+      | (isLevelZeroSorted_ ? 1 << Flags.IS_LEVEL_ZERO_SORTED.ordinal() : 0)
+      | (isSingleItem ? 1 << Flags.IS_SINGLE_ITEM.ordinal() : 0)
+    );
+    ByteArrayUtil.putShortLE(bytes, K_SHORT, (short) k_);
+    bytes[M_BYTE] = (byte) m_;
+    if (isEmpty()) { return bytes; }
+    int offset = DATA_START_SINGLE_ITEM;
+    if (!isSingleItem) {
+      ByteArrayUtil.putLongLE(bytes, N_LONG, n_);
+      ByteArrayUtil.putShortLE(bytes, MIN_K_SHORT, (short) minK_);
+      bytes[NUM_LEVELS_BYTE] = (byte) numLevels_;
+      offset = DATA_START;
+      // the last integer in levels_ is not serialized because it can be derived
+      for (int i = 0; i < numLevels_; i++) {
+        ByteArrayUtil.putIntLE(bytes, offset, levels_[i]);
+        offset += Integer.BYTES;
+      }
+      ByteArrayUtil.putFloatLE(bytes, offset, minValue_);
+      offset += Float.BYTES;
+      ByteArrayUtil.putFloatLE(bytes, offset, maxValue_);
+      offset += Float.BYTES;
+    }
+    final int numItems = getNumRetained();
+    for (int i = 0; i < numItems; i++) {
+      ByteArrayUtil.putFloatLE(bytes, offset, items_[levels_[0] + i]);
+      offset += Float.BYTES;
+    }
+    return bytes;
   }
 
   @Override
@@ -828,9 +853,9 @@ public class KllFloatsSketch {
 
     if (withLevels) {
       sb.append("### KLL sketch levels:").append(Util.LS)
-      .append("   index: nominal capacity, actual size").append(Util.LS);
+      .append("   level, offset: nominal capacity, actual size").append(Util.LS);
       for (int i = 0; i < numLevels_; i++) {
-        sb.append("   ").append(i).append(": ")
+        sb.append("   ").append(i).append(", ").append(levels_[i]).append(": ")
         .append(KllHelper.levelCapacity(k_, numLevels_, i, m_))
         .append(", ").append(safeLevelSize(i)).append(Util.LS);
       }
@@ -858,52 +883,28 @@ public class KllFloatsSketch {
   }
 
   /**
-   * Returns serialized sketch in a byte array form.
-   * @return serialized sketch in a byte array form.
+   * Updates this sketch with the given data item.
+   *
+   * @param value an item from a stream of items. NaNs are ignored.
    */
-  public byte[] toByteArray() {
-    final byte[] bytes = new byte[getSerializedSizeBytes()];
-    final boolean isSingleItem = n_ == 1;
-    bytes[PREAMBLE_INTS_BYTE] = (byte) (isEmpty() || isSingleItem ? PREAMBLE_INTS_SMALL : PREAMBLE_INTS_FULL);
-    bytes[SER_VER_BYTE] = isSingleItem ? serialVersionUID2 : serialVersionUID1;
-    bytes[FAMILY_BYTE] = (byte) Family.KLL.getID();
-    bytes[FLAGS_BYTE] = (byte) (
-        (isEmpty() ? 1 << Flags.IS_EMPTY.ordinal() : 0)
-      | (isLevelZeroSorted_ ? 1 << Flags.IS_LEVEL_ZERO_SORTED.ordinal() : 0)
-      | (isSingleItem ? 1 << Flags.IS_SINGLE_ITEM.ordinal() : 0)
-    );
-    ByteArrayUtil.putShortLE(bytes, K_SHORT, (short) k_);
-    bytes[M_BYTE] = (byte) m_;
-    if (isEmpty()) { return bytes; }
-    int offset = DATA_START_SINGLE_ITEM;
-    if (!isSingleItem) {
-      ByteArrayUtil.putLongLE(bytes, N_LONG, n_);
-      ByteArrayUtil.putShortLE(bytes, MIN_K_SHORT, (short) minK_);
-      bytes[NUM_LEVELS_BYTE] = (byte) numLevels_;
-      offset = DATA_START;
-      // the last integer in levels_ is not serialized because it can be derived
-      for (int i = 0; i < numLevels_; i++) {
-        ByteArrayUtil.putIntLE(bytes, offset, levels_[i]);
-        offset += Integer.BYTES;
-      }
-      ByteArrayUtil.putFloatLE(bytes, offset, minValue_);
-      offset += Float.BYTES;
-      ByteArrayUtil.putFloatLE(bytes, offset, maxValue_);
-      offset += Float.BYTES;
+  public void update(final float value) {
+    if (Float.isNaN(value)) { return; }
+    if (isEmpty()) {
+      minValue_ = value;
+      maxValue_ = value;
+    } else {
+      if (value < minValue_) { minValue_ = value; }
+      if (value > maxValue_) { maxValue_ = value; }
     }
-    final int numItems = getNumRetained();
-    for (int i = 0; i < numItems; i++) {
-      ByteArrayUtil.putFloatLE(bytes, offset, items_[levels_[0] + i]);
-      offset += Float.BYTES;
+    if (levels_[0] == 0) {
+      compressWhileUpdating();
     }
-    return bytes;
-  }
-
-  /**
-   * @return the iterator for this class
-   */
-  public KllFloatsSketchIterator iterator() {
-    return new KllFloatsSketchIterator(items_, levels_, numLevels_);
+    n_++;
+    isLevelZeroSorted_ = false;
+    final int nextPos = levels_[0] - 1;
+    assert levels_[0] >= 0;
+    levels_[0] = nextPos;
+    items_[nextPos] = value;
   }
 
   // Restricted Methods
@@ -912,7 +913,7 @@ public class KllFloatsSketch {
    * Checks the validity of the given value k
    * @param k must be greater than 7 and less than 65536.
    */
-  static void checkK(final int k) {
+  private static void checkK(final int k) {
     if ((k < MIN_K) || (k > MAX_K)) {
       throw new SketchesArgumentException(
           "K must be >= " + MIN_K + " and <= " + MAX_K + ": " + k);
@@ -1022,12 +1023,12 @@ public class KllFloatsSketch {
       KllHelper.mergeSortedArrays(items_, adjBeg, halfAdjPop, items_, rawLim, popAbove,
           items_, adjBeg + halfAdjPop);
     }
-    levels_[level + 1] -= halfAdjPop; // adjust boundaries of the level above
+    levels_[level + 1] -= halfAdjPop;          // adjust boundaries of the level above
     if (oddPop) {
       levels_[level] = levels_[level + 1] - 1; // the current level now contains one item
       items_[levels_[level]] = items_[rawBeg]; // namely this leftover guy
     } else {
-      levels_[level] = levels_[level + 1]; // the current level is now empty
+      levels_[level] = levels_[level + 1];     // the current level is now empty
     }
 
     // verify that we freed up halfAdjPop array slots just below the current level


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org