You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/08/10 22:48:09 UTC

[incubator-datasketches-java] branch UpdateKLL created (now 09cab7b)

This is an automated email from the ASF dual-hosted git repository.

leerho pushed a change to branch UpdateKLL
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git.


      at 09cab7b  Moving methods around to make them easier to find.

This branch includes the following new commits:

     new a8eaf9e  Some minor bug fixes and rearrange KllFloatsSketch methods.
     new 81a5810  Merge branch 'master' into UpdateKLL
     new 09cab7b  Moving methods around to make them easier to find.

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-java] 03/03: Moving methods around to make them easier to find.

Posted by le...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch UpdateKLL
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit 09cab7b1fa16e2fa2dcaf5a66aea48d8d8758925
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Mon Aug 10 15:47:29 2020 -0700

    Moving methods around to make them easier to find.
---
 .../apache/datasketches/kll/KllFloatsSketch.java   | 505 +++++++++++----------
 1 file changed, 253 insertions(+), 252 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
index 98b6169..e611233 100644
--- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
@@ -190,12 +190,12 @@ public class KllFloatsSketch {
    *      ||   23    |   22  |   21   |   20   |   19   |    18   |   17   |      16      |
    *  2   ||<--------------data----------------| unused |numLevels|-------min K-----------|
    *
-   * Serialized sketch layout, Single Item:
+   * Serialized sketch layout, Empty and Single Item:
    *  Adr:
    *      ||    7    |   6   |    5   |    4   |    3   |    2    |    1   |      0       |
    *  0   || unused  |   M   |--------K--------|  Flags |  FamID  | SerVer | PreambleInts |
    *      ||   15    |   14  |   13   |   12   |   11   |   10    |    9   |      8       |
-   *  1   ||<--------------------------------data-----------------------------------------|
+   *  1   ||                                   |-------------------data-------------------|
    */
 
   // Preamble byte addresses
@@ -205,15 +205,17 @@ public class KllFloatsSketch {
   private static final int FLAGS_BYTE         = 3;
   private static final int K_SHORT            = 4;  // to 5
   private static final int M_BYTE             = 6;
+  //                                            7 is reserved for future use
   private static final int N_LONG             = 8;  // to 15
   private static final int MIN_K_SHORT        = 16; // to 17
   private static final int NUM_LEVELS_BYTE    = 18;
+  //                                            19 is reserved for future use
   private static final int DATA_START         = 20; // if using items larger than 4 bytes, use 24
   private static final int DATA_START_SINGLE_ITEM = 8;
 
   // Other static values
-  private static final byte serialVersionUID1 = 1;
-  private static final byte serialVersionUID2 = 2;
+  private static final byte serialVersionUID1  = 1;
+  private static final byte serialVersionUID2  = 2;
   private static final int PREAMBLE_INTS_SMALL = 2; // for empty and single item
   private static final int PREAMBLE_INTS_FULL  = 5; // if using items larger than 4 bytes, use 6
 
@@ -240,15 +242,14 @@ public class KllFloatsSketch {
   private int minK_;      // for error estimation after merging with different k
   private long n_;        // number of items input into this sketch
   private int numLevels_; // one-based number of current levels,
-  private int[] levels_;  // array of index offsets to the levels. Size = numLevels + 1.
+  private int[] levels_;  // array of index offsets into the items[]. Size = numLevels + 1.
   private boolean isLevelZeroSorted_;
 
   // Specific to the floats sketch
-  private float[] items_;
+  private float[] items_; // the continuous array of float items
   private float minValue_;
   private float maxValue_;
 
-
   /**
    * Heap constructor with the default <em>k = 200</em>, which has a rank error of about 1.65%.
    */
@@ -385,6 +386,31 @@ public class KllFloatsSketch {
   // public functions
 
   /**
+   * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
+   * cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
+   *
+   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+   * getNormalizedRankError(false) function.
+   *
+   * <p>If the sketch is empty this returns null.</p>
+   *
+   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
+   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+   * exclusive of the right splitPoint, with the exception that the last interval will include
+   * the maximum value.
+   * It is not necessary to include either the min or max values in these split points.
+   *
+   * @return an array of m+1 double values, which are a consecutive approximation to the CDF
+   * of the input stream given the splitPoints. The value at array position j of the returned
+   * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
+   * array.
+   */
+  public double[] getCDF(final float[] splitPoints) {
+    return getPmfOrCdf(splitPoints, true);
+  }
+
+  /**
    * Returns the parameter k
    * @return parameter k
    */
@@ -393,6 +419,50 @@ public class KllFloatsSketch {
   }
 
   /**
+   * Gets the approximate value of <em>k</em> to use given epsilon, the normalized rank error.
+   * @param epsilon the normalized rank error between zero and one.
+   * @param pmf if true, this function returns the value of <em>k</em> assuming the input epsilon
+   * is the desired "double-sided" epsilon for the getPMF() function. Otherwise, this function
+   * returns the value of <em>k</em> assuming the input epsilon is the desired "single-sided"
+   * epsilon for all the other queries.
+   * @return the value of <i>k</i> given a value of epsilon.
+   * @see KllFloatsSketch
+   */
+  // constants were derived as the best fit to 99 percentile empirically measured max error in
+  // thousands of trials
+  public static int getKFromEpsilon(final double epsilon, final boolean pmf) {
+    //Ensure that eps is >= than the lowest possible eps given MAX_K and pmf=false.
+    final double eps = max(epsilon, 4.7634E-5);
+    final double kdbl = pmf
+        ? exp(log(2.446 / eps) / 0.9433)
+        : exp(log(2.296 / eps) / 0.9723);
+    final double krnd = round(kdbl);
+    final double del = abs(krnd - kdbl);
+    final int k = (int) ((del < 1E-6) ? krnd : ceil(kdbl));
+    return max(MIN_K, min(MAX_K, k));
+  }
+
+  /**
+   * Returns the max value of the stream.
+   * If the sketch is empty this returns NaN.
+   *
+   * @return the max value of the stream
+   */
+  public float getMaxValue() {
+    return maxValue_;
+  }
+
+  /**
+   * Returns the min value of the stream.
+   * If the sketch is empty this returns NaN.
+   *
+   * @return the min value of the stream
+   */
+  public float getMinValue() {
+    return minValue_;
+  }
+
+  /**
    * Returns the length of the input stream.
    * @return stream length
    */
@@ -401,100 +471,107 @@ public class KllFloatsSketch {
   }
 
   /**
-   * Returns true if this sketch is empty.
-   * @return empty flag
+   * Gets the approximate "double-sided" rank error for the <i>getPMF()</i> function of this
+   * sketch normalized as a fraction between zero and one.
+   *
+   * @return the rank error normalized as a fraction between zero and one.
+   * @deprecated replaced by {@link #getNormalizedRankError(boolean)}
+   * @see KllFloatsSketch
    */
-  public boolean isEmpty() {
-    return n_ == 0;
+  @Deprecated
+  public double getNormalizedRankError() {
+    return getNormalizedRankError(true);
   }
 
   /**
-   * Returns the number of retained items (samples) in the sketch.
-   * @return the number of retained items (samples) in the sketch
+   * Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
+   * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
+   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+   * @return if pmf is true, returns the normalized rank error for the getPMF() function.
+   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+   * @see KllFloatsSketch
    */
-  public int getNumRetained() {
-    return levels_[numLevels_] - levels_[0];
+  public double getNormalizedRankError(final boolean pmf) {
+    return getNormalizedRankError(minK_, pmf);
   }
 
   /**
-   * Returns true if this sketch is in estimation mode.
-   * @return estimation mode flag
+   * Static method version of the double-sided {@link #getNormalizedRankError()} that
+   * specifies <em>k</em>.
+   * @param k the configuration parameter
+   * @return the normalized "double-sided" rank error as a function of <em>k</em>.
+   * @see KllFloatsSketch
+   * @deprecated replaced by {@link #getNormalizedRankError(int, boolean)}
    */
-  public boolean isEstimationMode() {
-    return numLevels_ > 1;
+  @Deprecated
+  public static double getNormalizedRankError(final int k) {
+    return getNormalizedRankError(k, true);
   }
 
   /**
-   * Updates this sketch with the given data item.
-   *
-   * @param value an item from a stream of items. NaNs are ignored.
+   * Gets the normalized rank error given k and pmf.
+   * Static method version of the {@link #getNormalizedRankError(boolean)}.
+   * @param k the configuation parameter
+   * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
+   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+   * @return if pmf is true, the normalized rank error for the getPMF() function.
+   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
+   * @see KllFloatsSketch
    */
-  public void update(final float value) {
-    if (Float.isNaN(value)) { return; }
-    if (isEmpty()) {
-      minValue_ = value;
-      maxValue_ = value;
-    } else {
-      if (value < minValue_) { minValue_ = value; }
-      if (value > maxValue_) { maxValue_ = value; }
-    }
-    if (levels_[0] == 0) {
-      compressWhileUpdating();
-    }
-    n_++;
-    isLevelZeroSorted_ = false;
-    final int nextPos = levels_[0] - 1;
-    assert levels_[0] >= 0;
-    levels_[0] = nextPos;
-    items_[nextPos] = value;
+  // constants were derived as the best fit to 99 percentile empirically measured max error in
+  // thousands of trials
+  public static double getNormalizedRankError(final int k, final boolean pmf) {
+    return pmf
+        ? 2.446 / pow(k, 0.9433)
+        : 2.296 / pow(k, 0.9723);
   }
 
   /**
-   * Merges another sketch into this one.
-   * @param other sketch to merge into this one
+   * Returns the number of retained items (samples) in the sketch.
+   * @return the number of retained items (samples) in the sketch
    */
-  public void merge(final KllFloatsSketch other) {
-    if ((other == null) || other.isEmpty()) { return; }
-    if (m_ != other.m_) {
-      throw new SketchesArgumentException("incompatible M: " + m_ + " and " + other.m_);
-    }
-    final long finalN = n_ + other.n_;
-    //update this sketch with level0 items from the other sketch
-    for (int i = other.levels_[0]; i < other.levels_[1]; i++) {
-      update(other.items_[i]);
-    }
-    if (other.numLevels_ >= 2) { //now merge other levels if they exist
-      mergeHigherLevels(other, finalN);
-    }
-    //update min, max values, n
-    if (Float.isNaN(minValue_) || (other.minValue_ < minValue_)) { minValue_ = other.minValue_; }
-    if (Float.isNaN(maxValue_) || (other.maxValue_ > maxValue_)) { maxValue_ = other.maxValue_; }
-    n_ = finalN;
-
-    assertCorrectTotalWeight();
-    if (other.isEstimationMode()) {
-      minK_ = min(minK_, other.minK_);
-    }
+  public int getNumRetained() {
+    return levels_[numLevels_] - levels_[0];
   }
 
   /**
-   * Returns the min value of the stream.
-   * If the sketch is empty this returns NaN.
-   *
-   * @return the min value of the stream
+   * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
+   * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
+   * This method can be used if allocation of storage is necessary beforehand, but it is not
+   * optimal.
+   * @param k parameter that controls size of the sketch and accuracy of estimates
+   * @param n stream length
+   * @return upper bound on the serialized size
    */
-  public float getMinValue() {
-    return minValue_;
+  public static int getMaxSerializedSizeBytes(final int k, final long n) {
+    final int numLevels = KllHelper.ubOnNumLevels(n);
+    final int maxNumItems = KllHelper.computeTotalCapacity(k, DEFAULT_M, numLevels);
+    return getSerializedSizeBytes(numLevels, maxNumItems);
   }
 
   /**
-   * Returns the max value of the stream.
-   * If the sketch is empty this returns NaN.
+   * Returns an approximation to the Probability Mass Function (PMF) of the input stream
+   * given a set of splitPoints (values).
    *
-   * @return the max value of the stream
+   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
+   * getNormalizedRankError(true) function.
+   *
+   * <p>If the sketch is empty this returns null.</p>
+   *
+   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
+   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
+   * exclusive of the right splitPoint, with the exception that the last interval will include
+   * the maximum value.
+   * It is not necessary to include either the min or max values in these split points.
+   *
+   * @return an array of m+1 doubles each of which is an approximation
+   * to the fraction of the input stream values (the mass) that fall into one of those intervals.
+   * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
+   * splitPoint, with the exception that the last interval will include maximum value.
    */
-  public float getMaxValue() {
-    return maxValue_;
+  public double[] getPMF(final float[] splitPoints) {
+    return getPmfOrCdf(splitPoints, false);
   }
 
   /**
@@ -640,157 +717,105 @@ public class KllFloatsSketch {
   }
 
   /**
-   * Returns an approximation to the Probability Mass Function (PMF) of the input stream
-   * given a set of splitPoints (values).
-   *
-   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
-   * getNormalizedRankError(true) function.
-   *
-   * <p>If the sketch is empty this returns null.</p>
-   *
-   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
-   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
-   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
-   * exclusive of the right splitPoint, with the exception that the last interval will include
-   * the maximum value.
-   * It is not necessary to include either the min or max values in these split points.
-   *
-   * @return an array of m+1 doubles each of which is an approximation
-   * to the fraction of the input stream values (the mass) that fall into one of those intervals.
-   * The definition of an "interval" is inclusive of the left splitPoint and exclusive of the right
-   * splitPoint, with the exception that the last interval will include maximum value.
-   */
-  public double[] getPMF(final float[] splitPoints) {
-    return getPmfOrCdf(splitPoints, false);
-  }
-
-  /**
-   * Returns an approximation to the Cumulative Distribution Function (CDF), which is the
-   * cumulative analog of the PMF, of the input stream given a set of splitPoint (values).
-   *
-   * <p>The resulting approximations have a probabilistic guarantee that can be obtained from the
-   * getNormalizedRankError(false) function.
-   *
-   * <p>If the sketch is empty this returns null.</p>
-   *
-   * @param splitPoints an array of <i>m</i> unique, monotonically increasing float values
-   * that divide the real number line into <i>m+1</i> consecutive disjoint intervals.
-   * The definition of an "interval" is inclusive of the left splitPoint (or minimum value) and
-   * exclusive of the right splitPoint, with the exception that the last interval will include
-   * the maximum value.
-   * It is not necessary to include either the min or max values in these split points.
-   *
-   * @return an array of m+1 double values, which are a consecutive approximation to the CDF
-   * of the input stream given the splitPoints. The value at array position j of the returned
-   * CDF array is the sum of the returned values in positions 0 through j of the returned PMF
-   * array.
-   */
-  public double[] getCDF(final float[] splitPoints) {
-    return getPmfOrCdf(splitPoints, true);
-  }
-
-  /**
-   * Gets the approximate "double-sided" rank error for the <i>getPMF()</i> function of this
-   * sketch normalized as a fraction between zero and one.
-   *
-   * @return the rank error normalized as a fraction between zero and one.
-   * @deprecated replaced by {@link #getNormalizedRankError(boolean)}
-   * @see KllFloatsSketch
+   * Returns the number of bytes this sketch would require to store.
+   * @return the number of bytes this sketch would require to store.
    */
-  @Deprecated
-  public double getNormalizedRankError() {
-    return getNormalizedRankError(true);
+  public int getSerializedSizeBytes() {
+    if (isEmpty()) { return N_LONG; }
+    return getSerializedSizeBytes(numLevels_, getNumRetained());
   }
 
   /**
-   * Gets the approximate rank error of this sketch normalized as a fraction between zero and one.
-   * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
-   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
-   * @return if pmf is true, returns the normalized rank error for the getPMF() function.
-   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
-   * @see KllFloatsSketch
+   * Returns true if this sketch is empty.
+   * @return empty flag
    */
-  public double getNormalizedRankError(final boolean pmf) {
-    return getNormalizedRankError(minK_, pmf);
+  public boolean isEmpty() {
+    return n_ == 0;
   }
 
   /**
-   * Static method version of the double-sided {@link #getNormalizedRankError()} that
-   * specifies <em>k</em>.
-   * @param k the configuration parameter
-   * @return the normalized "double-sided" rank error as a function of <em>k</em>.
-   * @see KllFloatsSketch
-   * @deprecated replaced by {@link #getNormalizedRankError(int, boolean)}
+   * Returns true if this sketch is in estimation mode.
+   * @return estimation mode flag
    */
-  @Deprecated
-  public static double getNormalizedRankError(final int k) {
-    return getNormalizedRankError(k, true);
+  public boolean isEstimationMode() {
+    return numLevels_ > 1;
   }
 
   /**
-   * Gets the normalized rank error given k and pmf.
-   * Static method version of the {@link #getNormalizedRankError(boolean)}.
-   * @param k the configuation parameter
-   * @param pmf if true, returns the "double-sided" normalized rank error for the getPMF() function.
-   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
-   * @return if pmf is true, the normalized rank error for the getPMF() function.
-   * Otherwise, it is the "single-sided" normalized rank error for all the other queries.
-   * @see KllFloatsSketch
+   * @return the iterator for this class
    */
-  // constants were derived as the best fit to 99 percentile empirically measured max error in
-  // thousands of trials
-  public static double getNormalizedRankError(final int k, final boolean pmf) {
-    return pmf
-        ? 2.446 / pow(k, 0.9433)
-        : 2.296 / pow(k, 0.9723);
+  public KllFloatsSketchIterator iterator() {
+    return new KllFloatsSketchIterator(items_, levels_, numLevels_);
   }
 
   /**
-   * Gets the approximate value of <em>k</em> to use given epsilon, the normalized rank error.
-   * @param epsilon the normalized rank error between zero and one.
-   * @param pmf if true, this function returns the value of <em>k</em> assuming the input epsilon
-   * is the desired "double-sided" epsilon for the getPMF() function. Otherwise, this function
-   * returns the value of <em>k</em> assuming the input epsilon is the desired "single-sided"
-   * epsilon for all the other queries.
-   * @return the value of <i>k</i> given a value of epsilon.
-   * @see KllFloatsSketch
+   * Merges another sketch into this one.
+   * @param other sketch to merge into this one
    */
-  // constants were derived as the best fit to 99 percentile empirically measured max error in
-  // thousands of trials
-  public static int getKFromEpsilon(final double epsilon, final boolean pmf) {
-    //Ensure that eps is >= than the lowest possible eps given MAX_K and pmf=false.
-    final double eps = max(epsilon, 4.7634E-5);
-    final double kdbl = pmf
-        ? exp(log(2.446 / eps) / 0.9433)
-        : exp(log(2.296 / eps) / 0.9723);
-    final double krnd = round(kdbl);
-    final double del = abs(krnd - kdbl);
-    final int k = (int) ((del < 1E-6) ? krnd : ceil(kdbl));
-    return max(MIN_K, min(MAX_K, k));
-  }
+  public void merge(final KllFloatsSketch other) {
+    if ((other == null) || other.isEmpty()) { return; }
+    if (m_ != other.m_) {
+      throw new SketchesArgumentException("incompatible M: " + m_ + " and " + other.m_);
+    }
+    final long finalN = n_ + other.n_;
+    //update this sketch with level0 items from the other sketch
+    for (int i = other.levels_[0]; i < other.levels_[1]; i++) {
+      update(other.items_[i]);
+    }
+    if (other.numLevels_ >= 2) { //now merge other levels if they exist
+      mergeHigherLevels(other, finalN);
+    }
+    //update min, max values, n
+    if (Float.isNaN(minValue_) || (other.minValue_ < minValue_)) { minValue_ = other.minValue_; }
+    if (Float.isNaN(maxValue_) || (other.maxValue_ > maxValue_)) { maxValue_ = other.maxValue_; }
+    n_ = finalN;
 
-  /**
-   * Returns the number of bytes this sketch would require to store.
-   * @return the number of bytes this sketch would require to store.
-   */
-  public int getSerializedSizeBytes() {
-    if (isEmpty()) { return N_LONG; }
-    return getSerializedSizeBytes(numLevels_, getNumRetained());
+    assertCorrectTotalWeight();
+    if (other.isEstimationMode()) {
+      minK_ = min(minK_, other.minK_);
+    }
   }
 
   /**
-   * Returns upper bound on the serialized size of a sketch given a parameter <em>k</em> and stream
-   * length. The resulting size is an overestimate to make sure actual sketches don't exceed it.
-   * This method can be used if allocation of storage is necessary beforehand, but it is not
-   * optimal.
-   * @param k parameter that controls size of the sketch and accuracy of estimates
-   * @param n stream length
-   * @return upper bound on the serialized size
+   * Returns serialized sketch in a byte array form.
+   * @return serialized sketch in a byte array form.
    */
-  public static int getMaxSerializedSizeBytes(final int k, final long n) {
-    final int numLevels = KllHelper.ubOnNumLevels(n);
-    final int maxNumItems = KllHelper.computeTotalCapacity(k, DEFAULT_M, numLevels);
-    return getSerializedSizeBytes(numLevels, maxNumItems);
+  public byte[] toByteArray() {
+    final byte[] bytes = new byte[getSerializedSizeBytes()];
+    final boolean isSingleItem = n_ == 1;
+    bytes[PREAMBLE_INTS_BYTE] = (byte) (isEmpty() || isSingleItem ? PREAMBLE_INTS_SMALL : PREAMBLE_INTS_FULL);
+    bytes[SER_VER_BYTE] = isSingleItem ? serialVersionUID2 : serialVersionUID1;
+    bytes[FAMILY_BYTE] = (byte) Family.KLL.getID();
+    bytes[FLAGS_BYTE] = (byte) (
+        (isEmpty() ? 1 << Flags.IS_EMPTY.ordinal() : 0)
+      | (isLevelZeroSorted_ ? 1 << Flags.IS_LEVEL_ZERO_SORTED.ordinal() : 0)
+      | (isSingleItem ? 1 << Flags.IS_SINGLE_ITEM.ordinal() : 0)
+    );
+    ByteArrayUtil.putShortLE(bytes, K_SHORT, (short) k_);
+    bytes[M_BYTE] = (byte) m_;
+    if (isEmpty()) { return bytes; }
+    int offset = DATA_START_SINGLE_ITEM;
+    if (!isSingleItem) {
+      ByteArrayUtil.putLongLE(bytes, N_LONG, n_);
+      ByteArrayUtil.putShortLE(bytes, MIN_K_SHORT, (short) minK_);
+      bytes[NUM_LEVELS_BYTE] = (byte) numLevels_;
+      offset = DATA_START;
+      // the last integer in levels_ is not serialized because it can be derived
+      for (int i = 0; i < numLevels_; i++) {
+        ByteArrayUtil.putIntLE(bytes, offset, levels_[i]);
+        offset += Integer.BYTES;
+      }
+      ByteArrayUtil.putFloatLE(bytes, offset, minValue_);
+      offset += Float.BYTES;
+      ByteArrayUtil.putFloatLE(bytes, offset, maxValue_);
+      offset += Float.BYTES;
+    }
+    final int numItems = getNumRetained();
+    for (int i = 0; i < numItems; i++) {
+      ByteArrayUtil.putFloatLE(bytes, offset, items_[levels_[0] + i]);
+      offset += Float.BYTES;
+    }
+    return bytes;
   }
 
   @Override
@@ -828,9 +853,9 @@ public class KllFloatsSketch {
 
     if (withLevels) {
       sb.append("### KLL sketch levels:").append(Util.LS)
-      .append("   index: nominal capacity, actual size").append(Util.LS);
+      .append("   level, offset: nominal capacity, actual size").append(Util.LS);
       for (int i = 0; i < numLevels_; i++) {
-        sb.append("   ").append(i).append(": ")
+        sb.append("   ").append(i).append(", ").append(levels_[i]).append(": ")
         .append(KllHelper.levelCapacity(k_, numLevels_, i, m_))
         .append(", ").append(safeLevelSize(i)).append(Util.LS);
       }
@@ -858,52 +883,28 @@ public class KllFloatsSketch {
   }
 
   /**
-   * Returns serialized sketch in a byte array form.
-   * @return serialized sketch in a byte array form.
+   * Updates this sketch with the given data item.
+   *
+   * @param value an item from a stream of items. NaNs are ignored.
    */
-  public byte[] toByteArray() {
-    final byte[] bytes = new byte[getSerializedSizeBytes()];
-    final boolean isSingleItem = n_ == 1;
-    bytes[PREAMBLE_INTS_BYTE] = (byte) (isEmpty() || isSingleItem ? PREAMBLE_INTS_SMALL : PREAMBLE_INTS_FULL);
-    bytes[SER_VER_BYTE] = isSingleItem ? serialVersionUID2 : serialVersionUID1;
-    bytes[FAMILY_BYTE] = (byte) Family.KLL.getID();
-    bytes[FLAGS_BYTE] = (byte) (
-        (isEmpty() ? 1 << Flags.IS_EMPTY.ordinal() : 0)
-      | (isLevelZeroSorted_ ? 1 << Flags.IS_LEVEL_ZERO_SORTED.ordinal() : 0)
-      | (isSingleItem ? 1 << Flags.IS_SINGLE_ITEM.ordinal() : 0)
-    );
-    ByteArrayUtil.putShortLE(bytes, K_SHORT, (short) k_);
-    bytes[M_BYTE] = (byte) m_;
-    if (isEmpty()) { return bytes; }
-    int offset = DATA_START_SINGLE_ITEM;
-    if (!isSingleItem) {
-      ByteArrayUtil.putLongLE(bytes, N_LONG, n_);
-      ByteArrayUtil.putShortLE(bytes, MIN_K_SHORT, (short) minK_);
-      bytes[NUM_LEVELS_BYTE] = (byte) numLevels_;
-      offset = DATA_START;
-      // the last integer in levels_ is not serialized because it can be derived
-      for (int i = 0; i < numLevels_; i++) {
-        ByteArrayUtil.putIntLE(bytes, offset, levels_[i]);
-        offset += Integer.BYTES;
-      }
-      ByteArrayUtil.putFloatLE(bytes, offset, minValue_);
-      offset += Float.BYTES;
-      ByteArrayUtil.putFloatLE(bytes, offset, maxValue_);
-      offset += Float.BYTES;
+  public void update(final float value) {
+    if (Float.isNaN(value)) { return; }
+    if (isEmpty()) {
+      minValue_ = value;
+      maxValue_ = value;
+    } else {
+      if (value < minValue_) { minValue_ = value; }
+      if (value > maxValue_) { maxValue_ = value; }
     }
-    final int numItems = getNumRetained();
-    for (int i = 0; i < numItems; i++) {
-      ByteArrayUtil.putFloatLE(bytes, offset, items_[levels_[0] + i]);
-      offset += Float.BYTES;
+    if (levels_[0] == 0) {
+      compressWhileUpdating();
     }
-    return bytes;
-  }
-
-  /**
-   * @return the iterator for this class
-   */
-  public KllFloatsSketchIterator iterator() {
-    return new KllFloatsSketchIterator(items_, levels_, numLevels_);
+    n_++;
+    isLevelZeroSorted_ = false;
+    final int nextPos = levels_[0] - 1;
+    assert levels_[0] >= 0;
+    levels_[0] = nextPos;
+    items_[nextPos] = value;
   }
 
   // Restricted Methods
@@ -912,7 +913,7 @@ public class KllFloatsSketch {
    * Checks the validity of the given value k
    * @param k must be greater than 7 and less than 65536.
    */
-  static void checkK(final int k) {
+  private static void checkK(final int k) {
     if ((k < MIN_K) || (k > MAX_K)) {
       throw new SketchesArgumentException(
           "K must be >= " + MIN_K + " and <= " + MAX_K + ": " + k);
@@ -1022,12 +1023,12 @@ public class KllFloatsSketch {
       KllHelper.mergeSortedArrays(items_, adjBeg, halfAdjPop, items_, rawLim, popAbove,
           items_, adjBeg + halfAdjPop);
     }
-    levels_[level + 1] -= halfAdjPop; // adjust boundaries of the level above
+    levels_[level + 1] -= halfAdjPop;          // adjust boundaries of the level above
     if (oddPop) {
       levels_[level] = levels_[level + 1] - 1; // the current level now contains one item
       items_[levels_[level]] = items_[rawBeg]; // namely this leftover guy
     } else {
-      levels_[level] = levels_[level + 1]; // the current level is now empty
+      levels_[level] = levels_[level + 1];     // the current level is now empty
     }
 
     // verify that we freed up halfAdjPop array slots just below the current level


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-java] 02/03: Merge branch 'master' into UpdateKLL

Posted by le...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch UpdateKLL
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit 81a58101ab50f09979cbe61b98f71ef331fab4de
Merge: a8eaf9e 5874474
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Fri Aug 7 11:01:07 2020 -0700

    Merge branch 'master' into UpdateKLL



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-java] 01/03: Some minor bug fixes and rearrange KllFloatsSketch methods.

Posted by le...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch UpdateKLL
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit a8eaf9ec6eefe965654dd909c8f0a627cf5fa2f6
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Fri Aug 7 10:59:41 2020 -0700

    Some minor bug fixes and rearrange KllFloatsSketch methods.
---
 .../apache/datasketches/kll/KllFloatsSketch.java   | 203 ++++++++++++---------
 .../org/apache/datasketches/kll/KllHelper.java     | 139 +++++++++-----
 .../datasketches/kll/KllFloatsSketchTest.java      |  24 +--
 3 files changed, 221 insertions(+), 145 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
index a287f46..98b6169 100644
--- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
+++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java
@@ -177,20 +177,28 @@ public class KllFloatsSketch {
    * The default value of K.
    */
   public static final int DEFAULT_K = 200;
-  static final int DEFAULT_M = 8;
+  private static final int DEFAULT_M = 8;
   static final int MIN_K = DEFAULT_M;
   static final int MAX_K = (1 << 16) - 1; // serialized as an unsigned short
 
-  /* Serialized sketch layout:
+  /* Serialized sketch layout, more than one item:
    *  Adr:
    *      ||    7    |   6   |    5   |    4   |    3   |    2    |    1   |      0       |
    *  0   || unused  |   M   |--------K--------|  Flags |  FamID  | SerVer | PreambleInts |
    *      ||   15    |   14  |   13   |   12   |   11   |   10    |    9   |      8       |
    *  1   ||---------------------------------N_LONG---------------------------------------|
    *      ||   23    |   22  |   21   |   20   |   19   |    18   |   17   |      16      |
-   *  2   ||---------------data----------------|--------|numLevels|-------min K-----------|
+   *  2   ||<--------------data----------------| unused |numLevels|-------min K-----------|
+   *
+   * Serialized sketch layout, Single Item:
+   *  Adr:
+   *      ||    7    |   6   |    5   |    4   |    3   |    2    |    1   |      0       |
+   *  0   || unused  |   M   |--------K--------|  Flags |  FamID  | SerVer | PreambleInts |
+   *      ||   15    |   14  |   13   |   12   |   11   |   10    |    9   |      8       |
+   *  1   ||<--------------------------------data-----------------------------------------|
    */
 
+  // Preamble byte addresses
   private static final int PREAMBLE_INTS_BYTE = 0;
   private static final int SER_VER_BYTE       = 1;
   private static final int FAMILY_BYTE        = 2;
@@ -198,20 +206,19 @@ public class KllFloatsSketch {
   private static final int K_SHORT            = 4;  // to 5
   private static final int M_BYTE             = 6;
   private static final int N_LONG             = 8;  // to 15
-  private static final int MIN_K_SHORT        = 16;  // to 17
+  private static final int MIN_K_SHORT        = 16; // to 17
   private static final int NUM_LEVELS_BYTE    = 18;
-  private static final int DATA_START         = 20;
-
+  private static final int DATA_START         = 20; // if using items larger than 4 bytes, use 24
   private static final int DATA_START_SINGLE_ITEM = 8;
 
+  // Other static values
   private static final byte serialVersionUID1 = 1;
   private static final byte serialVersionUID2 = 2;
+  private static final int PREAMBLE_INTS_SMALL = 2; // for empty and single item
+  private static final int PREAMBLE_INTS_FULL  = 5; // if using items larger than 4 bytes, use 6
 
   private enum Flags { IS_EMPTY, IS_LEVEL_ZERO_SORTED, IS_SINGLE_ITEM }
 
-  private static final int PREAMBLE_INTS_SHORT = 2; // for empty and single item
-  private static final int PREAMBLE_INTS_FULL = 5;
-
   /*
    * Data is stored in items_.
    * The data for level i lies in positions levels_[i] through levels_[i + 1] - 1 inclusive.
@@ -227,18 +234,61 @@ public class KllFloatsSketch {
    *  the sketch is exactly filled to capacity and must be compacted.
    */
 
-  private final int k_;
-  private final int m_; // minimum buffer "width"
+  private final int k_; // configured value of K
+  private final int m_; // configured minimum buffer "width", Must always be DEFAULT_M for now.
+
+  private int minK_;      // for error estimation after merging with different k
+  private long n_;        // number of items input into this sketch
+  private int numLevels_; // one-based number of current levels,
+  private int[] levels_;  // array of index offsets to the levels. Size = numLevels + 1.
+  private boolean isLevelZeroSorted_;
 
-  private int minK_; // for error estimation after merging with different k
-  private long n_;
-  private int numLevels_;
-  private int[] levels_;
+  // Specific to the floats sketch
   private float[] items_;
   private float minValue_;
   private float maxValue_;
-  private boolean isLevelZeroSorted_;
 
+
+  /**
+   * Heap constructor with the default <em>k = 200</em>, which has a rank error of about 1.65%.
+   */
+  public KllFloatsSketch() {
+    this(DEFAULT_K);
+  }
+
+  /**
+   * Heap constructor with a given parameter <em>k</em>. <em>k</em> can be any value between 8 and
+   * 65535, inclusive. The default <em>k</em> = 200 results in a normalized rank error of about
+   * 1.65%. Higher values of K will have smaller error but the sketch will be larger (and slower).
+   * @param k parameter that controls size of the sketch and accuracy of estimates
+   */
+  public KllFloatsSketch(final int k) {
+    this(k, DEFAULT_M);
+  }
+
+  /**
+   * Heap constructor.
+   * @param k configured size of sketch. Range [m, 2^16]
+   * @param m minimum level size. Default is 8.
+   */
+  private KllFloatsSketch(final int k, final int m) {
+    checkK(k);
+    k_ = k;
+    minK_ = k;
+    m_ = m;
+    numLevels_ = 1;
+    levels_ = new int[] {k, k};
+    items_ = new float[k];
+    minValue_ = Float.NaN;
+    maxValue_ = Float.NaN;
+    isLevelZeroSorted_ = false;
+
+  }
+
+  /**
+   * Off-heap constructor.
+   * @param mem Memory object that contains data serilized by this sketch.
+   */
   private KllFloatsSketch(final Memory mem) {
     m_ = DEFAULT_M;
     k_ = mem.getShort(K_SHORT) & 0xffff;
@@ -248,11 +298,11 @@ public class KllFloatsSketch {
     if (isEmpty) {
       numLevels_ = 1;
       levels_ = new int[] {k_, k_};
+      isLevelZeroSorted_ = false;
+      minK_ = k_;
       items_ = new float[k_];
       minValue_ = Float.NaN;
       maxValue_ = Float.NaN;
-      isLevelZeroSorted_ = false;
-      minK_ = k_;
     } else {
       if (isSingleItem) {
         n_ = 1;
@@ -290,35 +340,49 @@ public class KllFloatsSketch {
     }
   }
 
-  private KllFloatsSketch(final int k, final int m) {
-    checkK(k);
-    k_ = k;
-    m_ = m;
-    numLevels_ = 1;
-    levels_ = new int[] {k, k};
-    items_ = new float[k];
-    minValue_ = Float.NaN;
-    maxValue_ = Float.NaN;
-    isLevelZeroSorted_ = false;
-    minK_ = k;
-  }
-
   /**
-   * Constructor with the default <em>k</em> (rank error of about 1.65%)
+   * Factory heapify takes the sketch image in Memory and instantiates an on-heap sketch.
+   * The resulting sketch will not retain any link to the source Memory.
+   * @param mem a Memory image of a sketch serialized by this sketch.
+   * <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
+   * @return a heap-based sketch based on the given Memory.
    */
-  public KllFloatsSketch() {
-    this(DEFAULT_K);
+  public static KllFloatsSketch heapify(final Memory mem) {
+    final int preambleInts = mem.getByte(PREAMBLE_INTS_BYTE) & 0xff;
+    final int serialVersion = mem.getByte(SER_VER_BYTE) & 0xff;
+    final int family = mem.getByte(FAMILY_BYTE) & 0xff;
+    final int flags = mem.getByte(FLAGS_BYTE) & 0xff;
+    final int m = mem.getByte(M_BYTE) & 0xff;
+    if (m != DEFAULT_M) {
+      throw new SketchesArgumentException(
+          "Possible corruption: M must be " + DEFAULT_M + ": " + m);
+    }
+    final boolean isEmpty = (flags & (1 << Flags.IS_EMPTY.ordinal())) > 0;
+    final boolean isSingleItem = (flags & (1 << Flags.IS_SINGLE_ITEM.ordinal())) > 0;
+    if (isEmpty || isSingleItem) {
+      if (preambleInts != PREAMBLE_INTS_SMALL) {
+        throw new SketchesArgumentException("Possible corruption: preambleInts must be "
+            + PREAMBLE_INTS_SMALL + " for an empty or single item sketch: " + preambleInts);
+      }
+    } else {
+      if (preambleInts != PREAMBLE_INTS_FULL) {
+        throw new SketchesArgumentException("Possible corruption: preambleInts must be "
+            + PREAMBLE_INTS_FULL + " for a sketch with more than one item: " + preambleInts);
+      }
+    }
+    if ((serialVersion != serialVersionUID1) && (serialVersion != serialVersionUID2)) {
+      throw new SketchesArgumentException(
+          "Possible corruption: serial version mismatch: expected " + serialVersionUID1 + " or "
+              + serialVersionUID2 + ", got " + serialVersion);
+    }
+    if (family != Family.KLL.getID()) {
+      throw new SketchesArgumentException(
+      "Possible corruption: family mismatch: expected " + Family.KLL.getID() + ", got " + family);
+    }
+    return new KllFloatsSketch(mem);
   }
 
-  /**
-   * Constructor with a given parameter <em>k</em>. <em>k</em> can be any value between 8 and
-   * 65535, inclusive. The default <em>k</em> = 200 results in a normalized rank error of about
-   * 1.65%. Higher values of K will have smaller error but the sketch will be larger (and slower).
-   * @param k parameter that controls size of the sketch and accuracy of estimates
-   */
-  public KllFloatsSketch(final int k) {
-    this(k, DEFAULT_M);
-  }
+  // public functions
 
   /**
    * Returns the parameter k
@@ -395,15 +459,18 @@ public class KllFloatsSketch {
       throw new SketchesArgumentException("incompatible M: " + m_ + " and " + other.m_);
     }
     final long finalN = n_ + other.n_;
+    //update this sketch with level0 items from the other sketch
     for (int i = other.levels_[0]; i < other.levels_[1]; i++) {
       update(other.items_[i]);
     }
-    if (other.numLevels_ >= 2) {
+    if (other.numLevels_ >= 2) { //now merge other levels if they exist
       mergeHigherLevels(other, finalN);
     }
+    //update min, max values, n
     if (Float.isNaN(minValue_) || (other.minValue_ < minValue_)) { minValue_ = other.minValue_; }
     if (Float.isNaN(maxValue_) || (other.maxValue_ > maxValue_)) { maxValue_ = other.maxValue_; }
     n_ = finalN;
+
     assertCorrectTotalWeight();
     if (other.isEstimationMode()) {
       minK_ = min(minK_, other.minK_);
@@ -797,7 +864,7 @@ public class KllFloatsSketch {
   public byte[] toByteArray() {
     final byte[] bytes = new byte[getSerializedSizeBytes()];
     final boolean isSingleItem = n_ == 1;
-    bytes[PREAMBLE_INTS_BYTE] = (byte) (isEmpty() || isSingleItem ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
+    bytes[PREAMBLE_INTS_BYTE] = (byte) (isEmpty() || isSingleItem ? PREAMBLE_INTS_SMALL : PREAMBLE_INTS_FULL);
     bytes[SER_VER_BYTE] = isSingleItem ? serialVersionUID2 : serialVersionUID1;
     bytes[FAMILY_BYTE] = (byte) Family.KLL.getID();
     bytes[FLAGS_BYTE] = (byte) (
@@ -833,54 +900,14 @@ public class KllFloatsSketch {
   }
 
   /**
-   * Heapify takes the sketch image in Memory and instantiates an on-heap sketch.
-   * The resulting sketch will not retain any link to the source Memory.
-   * @param mem a Memory image of a sketch.
-   * <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
-   * @return a heap-based sketch based on the given Memory
-   */
-  public static KllFloatsSketch heapify(final Memory mem) {
-    final int preambleInts = mem.getByte(PREAMBLE_INTS_BYTE) & 0xff;
-    final int serialVersion = mem.getByte(SER_VER_BYTE) & 0xff;
-    final int family = mem.getByte(FAMILY_BYTE) & 0xff;
-    final int flags = mem.getByte(FLAGS_BYTE) & 0xff;
-    final int m = mem.getByte(M_BYTE) & 0xff;
-    if (m != DEFAULT_M) {
-      throw new SketchesArgumentException(
-          "Possible corruption: M must be " + DEFAULT_M + ": " + m);
-    }
-    final boolean isEmpty = (flags & (1 << Flags.IS_EMPTY.ordinal())) > 0;
-    final boolean isSingleItem = (flags & (1 << Flags.IS_SINGLE_ITEM.ordinal())) > 0;
-    if (isEmpty || isSingleItem) {
-      if (preambleInts != PREAMBLE_INTS_SHORT) {
-        throw new SketchesArgumentException("Possible corruption: preambleInts must be "
-            + PREAMBLE_INTS_SHORT + " for an empty or single item sketch: " + preambleInts);
-      }
-    } else {
-      if (preambleInts != PREAMBLE_INTS_FULL) {
-        throw new SketchesArgumentException("Possible corruption: preambleInts must be "
-            + PREAMBLE_INTS_FULL + " for a sketch with more than one item: " + preambleInts);
-      }
-    }
-    if ((serialVersion != serialVersionUID1) && (serialVersion != serialVersionUID2)) {
-      throw new SketchesArgumentException(
-          "Possible corruption: serial version mismatch: expected " + serialVersionUID1 + " or "
-              + serialVersionUID2 + ", got " + serialVersion);
-    }
-    if (family != Family.KLL.getID()) {
-      throw new SketchesArgumentException(
-      "Possible corruption: family mismatch: expected " + Family.KLL.getID() + ", got " + family);
-    }
-    return new KllFloatsSketch(mem);
-  }
-
-  /**
    * @return the iterator for this class
    */
   public KllFloatsSketchIterator iterator() {
     return new KllFloatsSketchIterator(items_, levels_, numLevels_);
   }
 
+  // Restricted Methods
+
   /**
    * Checks the validity of the given value k
    * @param k must be greater than 7 and less than 65536.
@@ -1137,7 +1164,7 @@ public class KllFloatsSketch {
     return levels_[level + 1] - levels_[level];
   }
 
- private int getNumRetainedAboveLevelZero() {
+  private int getNumRetainedAboveLevelZero() {
     if (numLevels_ == 1) { return 0; }
     return levels_[numLevels_] - levels_[1];
   }
diff --git a/src/main/java/org/apache/datasketches/kll/KllHelper.java b/src/main/java/org/apache/datasketches/kll/KllHelper.java
index 521cea3..bd62f08 100644
--- a/src/main/java/org/apache/datasketches/kll/KllHelper.java
+++ b/src/main/java/org/apache/datasketches/kll/KllHelper.java
@@ -20,7 +20,6 @@
 package org.apache.datasketches.kll;
 
 import static org.apache.datasketches.Util.floorPowerOf2;
-import static org.apache.datasketches.Util.simpleLog2OfLong;
 
 import java.util.Arrays;
 import java.util.Random;
@@ -44,10 +43,6 @@ class KllHelper {
     return (value & 1) == 1;
   }
 
-  static int floorOfLog2OfFraction(final long numer, final long denom) {
-    return simpleLog2OfLong(floorPowerOf2(numer / denom));
-  }
-
   /**
    * Checks the sequential validity of the given array of float values.
    * They must be unique, monotonically increasing and not NaN.
@@ -65,6 +60,12 @@ class KllHelper {
     }
   }
 
+  /**
+   * Copy the old array into a new larger array.
+   * @param oldArr the given old array with data
+   * @param newLen the new length larger than the oldArr.length.
+   * @return the new array
+   */
   static int[] growIntArray(final int[] oldArr, final int newLen) {
     final int oldLen = oldArr.length;
     assert newLen > oldLen;
@@ -73,47 +74,83 @@ class KllHelper {
     return newArr;
   }
 
+  /**
+   * Returns the upper bound of the number of levels based on <i>n</i>.
+   * @param n the length of the stream
+   * @return floor( log_2(n) )
+   */
   static int ubOnNumLevels(final long n) {
-    if (n == 0) { return 1; }
-    return 1 + floorOfLog2OfFraction(n, 1);
+    return 1 + Long.numberOfTrailingZeros(floorPowerOf2(n));
   }
 
+  /**
+   * Returns the maximum number of items that this sketch can handle
+   * @param k The sizing / accuracy parameter of the sketch in items.
+   * Note: this method actually works for k values up to k = 2^29 and 61 levels,
+   * however only k values up to (2^16 - 1) are currently used by the sketch.
+   * @param m the size of the smallest level in items.
+   * @param numLevels the upper bound number of levels based on <i>n</i> items.
+   * @return the total item capacity of the sketch.
+   */
   static int computeTotalCapacity(final int k, final int m, final int numLevels) {
-    int total = 0;
+    long total = 0;
     for (int h = 0; h < numLevels; h++) {
       total += levelCapacity(k, numLevels, h, m);
     }
-    return total;
+    return (int) total;
   }
 
+  /**
+   * Returns the capacity of a specific level.
+   * @param k the accuracy parameter of the sketch. Maximum is 2^29.
+   * @param numLevels the number of current levels in the sketch. Maximum is 61.
+   * @param height the zero-based index of a level with respect to the smallest level.
+   * This varies from 0 to 60.
+   * @param minWidth the minimum level width. Default is 8.
+   * @return the capacity of a specific level
+   */
   static int levelCapacity(final int k, final int numLevels, final int height, final int minWidth) {
-    assert height >= 0;
-    assert height < numLevels;
+    assert (k <= (1 << 29));
+    assert (numLevels >= 1) && (numLevels <= 61);
+    assert (height >= 0) && (height < numLevels);
     final int depth = numLevels - height - 1;
-    return Math.max(minWidth, intCapAux(k, depth));
+    return (int) Math.max(minWidth, intCapAux(k, depth));
   }
 
-  private static int intCapAux(final int k, final int depth) {
-    assert (k <= (1 << 30));
-    assert (depth <= 60);
-    if (depth <= 30) { return intCapAuxAux(k, depth); }
+  /**
+   * Computes the actual capacity of a given level given its depth index.
+   * If the depth of levels exceeds 30, this uses a folding technique to accurately compute the
+   * actual leval capacity upto a depth of 60. Without folding, the internal calculations would
+   * excceed the capacity of a long.
+   * @param k the configured k of the sketch
+   * @param depth the zero-based index of the level being computed.
+   * @return the actual capacity of a given level given its depth index.
+   */
+  private static long intCapAux(final int k, final int depth) {
+    if (depth <= 30) { return (int) intCapAuxAux(k, depth); }
     final int half = depth / 2;
     final int rest = depth - half;
-    final int tmp = intCapAuxAux(k, half);
+    final long tmp = intCapAuxAux(k, half);
     return intCapAuxAux(tmp, rest);
   }
 
-  private static int intCapAuxAux(final int k, final int depth) {
-    assert (k <= (1 << 30));
-    assert (depth <= 30);
-    final int twok = k << 1; // for rounding, we pre-multiply by 2
-    final int tmp = (int) (((long) twok << depth) / powersOfThree[depth]);
-    final int result = ((tmp + 1) >> 1); // then here we add 1 and divide by 2
+  /**
+   * Performs the integer based calculation of an individual level (or folded level).
+   * @param k the configured k of the sketch
+   * @param depth depth the zero-based index of the level being computed.
+   * @return the actual capacity of a given level given its depth index.
+   */
+  private static long intCapAuxAux(final long k, final int depth) {
+    final long twok = k << 1; // for rounding, we pre-multiply by 2
+    final long tmp = ((twok << depth) / powersOfThree[depth]);
+    final long result = ((tmp + 1) >> 1); // then here we add 1 and divide by 2
     assert (result <= k);
     return result;
   }
 
-  // 0 <= index <= 30
+  /**
+   * This is the exact powers of 3 from 3^0 to 3^30 where the exponent is the index
+   */
   private static final long[] powersOfThree =
       new long[] {1, 3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441,
   1594323, 4782969, 14348907, 43046721, 129140163, 387420489, 1162261467,
@@ -124,8 +161,8 @@ class KllHelper {
   static long sumTheSampleWeights(final int num_levels, final int[] levels) {
     long total = 0;
     long weight = 1;
-    for (int lvl = 0; lvl < num_levels; lvl++) {
-      total += weight * (levels[lvl + 1] - levels[lvl]);
+    for (int i = 0; i < num_levels; i++) {
+      total += weight * (levels[i + 1] - levels[i]);
       weight *= 2;
     }
     return total;
@@ -160,31 +197,47 @@ class KllHelper {
     assert b == limB;
   }
 
-  /*
-   * Here is what we do for each level:
-   * If it does not need to be compacted, then simply copy it over.
-   *
-   * Otherwise, it does need to be compacted, so...
-   *   Copy zero or one guy over.
-   *   If the level above is empty, halve up.
-   *   Else the level above is nonempty, so...
-   *        halve down, then merge up.
-   *   Adjust the boundaries of the level above.
+  /**
+   * Compression algorithm used to merge higher levels.
+   * <p>Here is what we do for each level:</p>
+   * <ul><li>If it does not need to be compacted, then simply copy it over.</li>
+   * <li>Otherwise, it does need to be compacted, so...
+   *   <ul><li>Copy zero or one guy over.</li>
+   *       <li>If the level above is empty, halve up.</li>
+   *       <li>Else the level above is nonempty, so halve down, then merge up.</li>
+   *   </ul></li>
+   * <li>Adjust the boundaries of the level above.</li>
+   * </ul>
    *
-   * It can be proved that generalCompress returns a sketch that satisfies the space constraints
+   * <p>It can be proved that generalCompress returns a sketch that satisfies the space constraints
    * no matter how much data is passed in.
    * We are pretty sure that it works correctly when inBuf and outBuf are the same.
    * All levels except for level zero must be sorted before calling this, and will still be
    * sorted afterwards.
-   * Level zero is not required to be sorted before, and may not be sorted afterwards.
+   * Level zero is not required to be sorted before, and may not be sorted afterwards.</p>
    *
-   * trashes inBuf and inLevels
-   * modifies outBuf and outLevels
+   * <p>This trashes inBuf and inLevels and modifies outBuf and outLevels.</p>
    *
-   * returns (finalNumLevels, finalCapacity, finalItemCount)
+   * @param k The sketch parameter k
+   * @param m The minimum level size
+   * @param numLevelsIn provisional number of number of levels = max(this.numLevels, other.numLevels)
+   * @param inBuf work buffer of size = this.getNumRetained() + other.getNumRetainedAboveLevelZero().
+   * This contains the float[] of the other sketch
+   * @param inLevels work levels array size = ubOnNumLevels(this.n + other.n) + 2
+   * @param outBuf the same array as inBuf
+   * @param outLevels the same size as inLevels
+   * @param isLevelZeroSorted true if this.level 0 is sorted
+   * @return int array of: {numLevels, targetItemCount, currentItemCount)
    */
-  static int[] generalCompress(final int k, final int m, final int numLevelsIn, final float[] inBuf,
-      final int[] inLevels, final float[] outBuf, final int[] outLevels, final boolean isLevelZeroSorted) {
+  static int[] generalCompress(
+      final int k,
+      final int m,
+      final int numLevelsIn,
+      final float[] inBuf,
+      final int[] inLevels,
+      final float[] outBuf,
+      final int[] outLevels,
+      final boolean isLevelZeroSorted) {
     assert numLevelsIn > 0; // things are too weird if zero levels are allowed
     int numLevels = numLevelsIn;
     int currentItemCount = inLevels[numLevels] - inLevels[0]; // decreases with each compaction
diff --git a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java
index facc32c..d439757 100644
--- a/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java
+++ b/src/test/java/org/apache/datasketches/kll/KllFloatsSketchTest.java
@@ -152,19 +152,6 @@ public class KllFloatsSketchTest {
   }
 
   @Test
-  public void floorLog2() {
-    assertEquals(KllHelper.floorOfLog2OfFraction(0, 1), 0);
-    assertEquals(KllHelper.floorOfLog2OfFraction(1, 2), 0);
-    assertEquals(KllHelper.floorOfLog2OfFraction(2, 2), 0);
-    assertEquals(KllHelper.floorOfLog2OfFraction(3, 2), 0);
-    assertEquals(KllHelper.floorOfLog2OfFraction(4, 2), 1);
-    assertEquals(KllHelper.floorOfLog2OfFraction(5, 2), 1);
-    assertEquals(KllHelper.floorOfLog2OfFraction(6, 2), 1);
-    assertEquals(KllHelper.floorOfLog2OfFraction(7, 2), 1);
-    assertEquals(KllHelper.floorOfLog2OfFraction(8, 2), 2);
-  }
-
-  @Test
   public void merge() {
     final KllFloatsSketch sketch1 = new KllFloatsSketch();
     final KllFloatsSketch sketch2 = new KllFloatsSketch();
@@ -407,8 +394,17 @@ public class KllFloatsSketchTest {
 
   @Test
   public void checkIntCapAux() {
-    final int lvlCap = KllHelper.levelCapacity(10, 100, 50, 8);
+    int lvlCap = KllHelper.levelCapacity(10, 61, 0, 8);
     assertEquals(lvlCap, 8);
+    lvlCap = KllHelper.levelCapacity(10, 61, 60, 8);
+    assertEquals(lvlCap, 10);
+  }
+
+  @Test
+  public void checkSuperLargeKandLevels() {
+    //This is beyond what the sketch can be configured for.
+    int size = KllHelper.computeTotalCapacity(1 << 29, 8, 61);
+    assertEquals(size, 1_610_612_846);
   }
 
   @Test


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org