You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2024/02/14 02:29:28 UTC

(datasketches-java) 01/06: Primary changes (so far) are to ItemsSketchSortedView::getPartitionBoundaries(..).

This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch improve_partition_boundaries
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit cc77ef0815af8c18632c82893dfa2c1120307545
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Sat Feb 10 11:54:33 2024 -0800

    Primary changes (so far) are to
    ItemsSketchSortedView::getPartitionBoundaries(..).
    
    This change dramatically improved the handling of the end points and
    isolates this adjustment from the SortedView class members.
    
    The changes to SortedViewIterator and GenericSortedViewIterator are to
    add some functionality of getting the key iteration variables without
    having to specify the sorting criteria.
    
    Added a specific test to check the upper and lower boundary adjustments
    of a partition.
---
 .../quantiles/ItemsSketchSortedView.java           | 73 +++++++++++++-----
 .../quantilescommon/GenericSortedViewIterator.java | 25 ++++++-
 .../quantilescommon/SortedViewIterator.java        | 35 +++++++--
 .../ItemsSketchPartitionBoundariesTest.java        | 87 ++++++++++++++++++++++
 4 files changed, 191 insertions(+), 29 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java
index 9638a9a9..fa6d9541 100644
--- a/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java
+++ b/src/main/java/org/apache/datasketches/quantiles/ItemsSketchSortedView.java
@@ -156,26 +156,58 @@ public class ItemsSketchSortedView<T> implements GenericSortedView<T>, Partition
     if (isEmpty()) { throw new IllegalArgumentException(QuantilesAPI.EMPTY_MSG); }
     final long totalN = this.totalN;
     final int svLen = cumWeights.length;
-    //adjust ends of sortedView arrays
-    cumWeights[0] = 1L;
-    cumWeights[svLen - 1] = totalN;
-    quantiles[0] = this.getMinItem();
-    quantiles[svLen - 1] = this.getMaxItem();
-
-    final double[] evSpNormRanks = evenlySpacedDoubles(0, 1.0, numEquallySized + 1);
-    final int len = evSpNormRanks.length;
-    final T[] evSpQuantiles = (T[]) Array.newInstance(clazz, len);
-    final long[] evSpNatRanks = new long[len];
-    for (int i = 0; i < len; i++) {
-      final int index = getQuantileIndex(evSpNormRanks[i], searchCrit);
-      evSpQuantiles[i] = quantiles[index];
-      evSpNatRanks[i] = cumWeights[index];
+    if (numEquallySized > svLen / 2) {
+      throw new IllegalArgumentException(QuantilesAPI.UNSUPPORTED_MSG
+          + " Number of requested partitions is too large for this sized sketch. "
+          + "Requested Partitions: " + numEquallySized + " > "
+          + "Retained Items: " + svLen + " / 2.");
+    }
+
+    final double[] searchNormRanks = evenlySpacedDoubles(0, 1.0, numEquallySized + 1);
+    final int partArrLen = searchNormRanks.length;
+    final T[] partQuantiles = (T[]) Array.newInstance(clazz, partArrLen);
+    final long[] partNatRanks = new long[partArrLen];
+    final double[] partNormRanks = new double[partArrLen];
+    //adjust ends
+    int adjLen = svLen;
+    final boolean adjLow = quantiles[0] != minItem;
+    final boolean adjHigh = quantiles[svLen - 1] != maxItem;
+    adjLen += adjLow ? 1 : 0;
+    adjLen += adjHigh ? 1 : 0;
+
+    final T[] adjQuantiles;
+    final long[] adjCumWeights;
+    if (adjLen > svLen) {
+      adjQuantiles = (T[]) new Object[adjLen];
+      adjCumWeights = new long[adjLen];
+      final int offset = adjLow ? 1 : 0;
+      System.arraycopy(quantiles, 0, adjQuantiles, offset, svLen);
+      System.arraycopy(cumWeights,0, adjCumWeights, offset, svLen);
+      if (adjLow) {
+        adjQuantiles[0] = minItem;
+        adjCumWeights[0] = 1;
+      }
+      if (adjHigh) {
+        adjQuantiles[adjLen - 1] = maxItem;
+        adjCumWeights[adjLen - 1] = cumWeights[svLen - 1];
+        adjCumWeights[adjLen - 2] = cumWeights[svLen - 1] - 1;
+      }
+    } else {
+      adjQuantiles = quantiles;
+      adjCumWeights = cumWeights;
+    }
+    for (int i = 0; i < partArrLen; i++) {
+      final int index = getQuantileIndex(searchNormRanks[i], adjCumWeights, searchCrit);
+      partQuantiles[i] = adjQuantiles[index];
+      final long cumWt = adjCumWeights[index];
+      partNatRanks[i] = cumWt;
+      partNormRanks[i] = (double)cumWt / totalN;
     }
     final GenericPartitionBoundaries<T> gpb = new GenericPartitionBoundaries<>(
         this.totalN,
-        evSpQuantiles,
-        evSpNatRanks,
-        evSpNormRanks,
+        partQuantiles,
+        partNatRanks,
+        partNormRanks,
         getMaxItem(),
         getMinItem(),
         searchCrit);
@@ -198,13 +230,14 @@ public class ItemsSketchSortedView<T> implements GenericSortedView<T>, Partition
   public T getQuantile(final double rank, final QuantileSearchCriteria searchCrit) {
     if (isEmpty()) { throw new IllegalArgumentException(EMPTY_MSG); }
     QuantilesUtil.checkNormalizedRankBounds(rank);
-    final int index = getQuantileIndex(rank, searchCrit);
+    final int index = getQuantileIndex(rank, cumWeights, searchCrit);
     return quantiles[index];
   }
 
-  private int getQuantileIndex(final double rank, final QuantileSearchCriteria searchCrit) {
+  private int getQuantileIndex(final double normRank, final long[] cumWeights,
+      final QuantileSearchCriteria searchCrit) {
     final int len = cumWeights.length;
-    final double naturalRank = getNaturalRank(rank, totalN, searchCrit);
+    final double naturalRank = getNaturalRank(normRank, totalN, searchCrit);
     final InequalitySearch crit = (searchCrit == INCLUSIVE) ? InequalitySearch.GE : InequalitySearch.GT;
     final int index = InequalitySearch.find(cumWeights, 0, len - 1, naturalRank, crit);
     if (index == -1) { return len - 1; }
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java
index 5a5c00e2..fcfefa12 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/GenericSortedViewIterator.java
@@ -19,6 +19,8 @@
 
 package org.apache.datasketches.quantilescommon;
 
+import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE;
+
 /**
  * Iterator over quantile sketches of generic type.
  * @param <T> The generic quantile type
@@ -32,10 +34,10 @@ public class GenericSortedViewIterator<T> extends SortedViewIterator {
   }
 
   /**
-   * Gets the quantile at the current index.
+   * Gets the quantile at the current index
+   * This is equivalent to <i>getQuantile(INCLUSIVE)</i>.
    *
-   * <p>Don't call this before calling next() for the first time
-   * or after getting false from next().</p>
+   * <p>Don't call this before calling next() for the first time or after getting false from next().</p>
    *
    * @return the quantile at the current index.
    */
@@ -43,4 +45,21 @@ public class GenericSortedViewIterator<T> extends SortedViewIterator {
     return quantiles[index];
   }
 
+  /**
+   * Gets the quantile at the current index (or previous index)
+   * based on the chosen search criterion.
+   *
+   * <p>Don't call this before calling next() for the first time or after getting false from next().</p>
+   *
+   * @param searchCrit if INCLUSIVE, includes the quantile at the current index.
+   * Otherwise, returns the quantile of the previous index.
+   *
+   * @return the quantile at the current index (or previous index)
+   * based on the chosen search criterion. If the chosen search criterion is <i>EXCLUSIVE</i> and
+   * the current index is at zero, this will return <i>null</i>.
+   */
+  public T getQuantile(final QuantileSearchCriteria searchCrit) {
+    if (searchCrit == INCLUSIVE) { return quantiles[index]; }
+    return (index == 0) ? null : quantiles[index - 1];
+  }
 }
diff --git a/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java b/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java
index 06c298d4..01af51ea 100644
--- a/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java
+++ b/src/main/java/org/apache/datasketches/quantilescommon/SortedViewIterator.java
@@ -48,17 +48,29 @@ public class SortedViewIterator {
     index = -1;
   }
 
+  /**
+   * Gets the natural rank at the current index.
+   * This is equivalent to <i>getNaturalRank(INCLUSIVE)</i>.
+   *
+   * <p>Don't call this before calling next() for the first time or after getting false from next().</p>
+   *
+   * @return the natural rank at the current index.
+   */
+  public long getNaturalRank() {
+    return cumWeights[index];
+  }
+
   /**
    * Gets the natural rank at the current index (or previous index) based on the chosen search criterion.
    * This is also referred to as the "cumulative weight". The natural rank is a number in the range <i>[1, N]</i>,
    * where <i>N</i> ({@link #getN()}) is the total number of items fed to the sketch.
    *
-   * <p>Don't call this before calling next() for the first time
-   * or after getting false from next().</p>
+   * <p>Don't call this before calling next() for the first time or after getting false from next().</p>
    *
    * @param searchCrit if INCLUSIVE, includes the weight of the item at the current index in the computation of
    * the natural rank.
    * Otherwise, it will return the natural rank of the previous index.
+   *
    * @return the natural rank at the current index (or previous index) based on the chosen search criterion.
    */
   public long getNaturalRank(final QuantileSearchCriteria searchCrit) {
@@ -74,16 +86,28 @@ public class SortedViewIterator {
     return totalN;
   }
 
+  /**
+   * Gets the normalized rank at the current index.
+   * This is equivalent to <i>getNormalizedRank(INCLUSIVE)</i>.
+   *
+   * <p>Don't call this before calling next() for the first time or after getting false from next().</p>
+   *
+   * @return the normalized rank at the current index
+   */
+  public double getNormalizedRank() {
+    return (double) getNaturalRank() / totalN;
+  }
+
   /**
    * Gets the normalized rank at the current index (or previous index)
    * based on the chosen search criterion. Where <i>normalized rank = natural rank / N</i> ({@link #getN()})
    * and is a fraction in the range (0,1.0].
    *
-   * <p>Don't call this before calling next() for the first time
-   * or after getting false from next().</p>
+   * <p>Don't call this before calling next() for the first time or after getting false from next().</p>
    *
    * @param searchCrit if INCLUSIVE, includes the normalized rank at the current index.
    * Otherwise, returns the normalized rank of the previous index.
+   *
    * @return the normalized rank at the current index (or previous index)
    * based on the chosen search criterion.
    */
@@ -94,8 +118,7 @@ public class SortedViewIterator {
   /**
    * Gets the weight contribution of the item at the current index.
    *
-   * <p>Don't call this before calling next() for the first time
-   * or after getting false from next().</p>
+   * <p>Don't call this before calling next() for the first time or after getting false from next().</p>
    *
    * @return the weight contribution of the item at the current index.
    */
diff --git a/src/test/java/org/apache/datasketches/quantiles/ItemsSketchPartitionBoundariesTest.java b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchPartitionBoundariesTest.java
new file mode 100644
index 00000000..a980c01b
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/quantiles/ItemsSketchPartitionBoundariesTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.quantiles;
+
+import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE;
+import static org.testng.Assert.assertEquals;
+
+import java.util.Comparator;
+
+import org.apache.datasketches.quantilescommon.GenericPartitionBoundaries;
+import org.apache.datasketches.quantilescommon.GenericSortedViewIterator;
+import org.testng.annotations.Test;
+
+public class ItemsSketchPartitionBoundariesTest {
+
+  @Test
+  public void checkSimpleEndsAdjustment() {
+    final String[] quantiles = {"2","4","6","7"};
+    final long[] cumWeights = {2, 4, 6, 8};
+    final long totalN = 8;
+    final Comparator<String> comparator = Comparator.naturalOrder();
+    final String maxItem = "8";
+    final String minItem = "1";
+    ItemsSketchSortedViewString sv = new ItemsSketchSortedViewString(
+        quantiles, cumWeights, totalN, comparator, maxItem, minItem);
+
+    GenericSortedViewIterator<String> itr = sv.iterator();
+    while (itr.next()) {
+      println(itr.getNaturalRank(INCLUSIVE) + ", " + itr.getQuantile(INCLUSIVE));
+    }
+    GenericPartitionBoundaries<String> gpb = sv.getPartitionBoundaries(2);
+    String[] boundaries = gpb.getBoundaries();
+    long[] natRanks = gpb.getNaturalRanks();
+    double[] normRanks = gpb.getNormalizedRanks();
+    long[] deltaItems = gpb.getNumDeltaItems();
+    int numParts = gpb.getNumPartitions();
+    String maxItm = gpb.getMaxItem();
+    String minItm = gpb.getMinItem();
+    assertEquals(boundaries, new String[] {"1","4","8"});
+    assertEquals(natRanks, new long[] {1,4,8});
+    assertEquals(normRanks, new double[] {.125,.5,1.0});
+    assertEquals(deltaItems, new long[] {0,4,4});
+    assertEquals(numParts, 2);
+    assertEquals(maxItm, "8");
+    assertEquals(minItm, "1");
+  }
+
+  @Test
+  public void printlnTest() {
+    println("PRINTING: "+this.getClass().getName());
+  }
+
+  private final static boolean enablePrinting = true;
+
+  /**
+   * @param format the format
+   * @param args the args
+   */
+  static final void printf(final String format, final Object ...args) {
+    if (enablePrinting) { System.out.printf(format, args); }
+  }
+
+  /**
+   * @param o the Object to println
+   */
+  static final void println(final Object o) {
+    if (enablePrinting) { System.out.println(o.toString()); }
+  }
+
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org