You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/01/17 00:36:58 UTC

[incubator-datasketches-java] branch fix_tuple_union created (now dcaa2dd)

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a change to branch fix_tuple_union
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git.


      at dcaa2dd  removed obsolete section

This branch includes the following new commits:

     new 8be970c  fixed getResult()
     new dcaa2dd  removed obsolete section

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-java] 01/02: fixed getResult()

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch fix_tuple_union
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit 8be970c09331ec5f52072e1d179556a39069c956
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Thu Jan 16 16:36:18 2020 -0800

    fixed getResult()
---
 .../java/org/apache/datasketches/tuple/Union.java  | 54 +++++++++++++++++++---
 .../UpdatableSketchWithDoubleSummaryTest.java      | 23 ++++++---
 2 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/tuple/Union.java b/src/main/java/org/apache/datasketches/tuple/Union.java
index 6b024c1..f37155c 100644
--- a/src/main/java/org/apache/datasketches/tuple/Union.java
+++ b/src/main/java/org/apache/datasketches/tuple/Union.java
@@ -19,8 +19,13 @@
 
 package org.apache.datasketches.tuple;
 
+import static java.lang.Math.min;
 import static org.apache.datasketches.Util.DEFAULT_NOMINAL_ENTRIES;
 
+import java.lang.reflect.Array;
+
+import org.apache.datasketches.QuickSelect;
+
 /**
  * Compute a union of two or more tuple sketches.
  * A new instance represents an empty set.
@@ -33,6 +38,7 @@ public class Union<S extends Summary> {
   private final SummarySetOperations<S> summarySetOps_;
   private QuickSelectSketch<S> sketch_;
   private long theta_; // need to maintain outside of the sketch
+  private boolean isEmpty_;
 
   /**
    * Creates new instance with default nominal entries
@@ -53,6 +59,7 @@ public class Union<S extends Summary> {
     summarySetOps_ = summarySetOps;
     sketch_ = new QuickSelectSketch<S>(nomEntries, null);
     theta_ = sketch_.getThetaLong();
+    isEmpty_ = true;
   }
 
   /**
@@ -61,30 +68,65 @@ public class Union<S extends Summary> {
    */
   public void update(final Sketch<S> sketchIn) {
     if (sketchIn == null || sketchIn.isEmpty()) { return; }
+    isEmpty_ = false;
     if (sketchIn.theta_ < theta_) { theta_ = sketchIn.theta_; }
     final SketchIterator<S> it = sketchIn.iterator();
     while (it.next()) {
       sketch_.merge(it.getKey(), it.getSummary(), summarySetOps_);
     }
+    if (sketch_.theta_ < theta_) theta_ = sketch_.theta_;
   }
 
   /**
    * Gets the internal set as a CompactSketch
    * @return result of the unions so far
    */
+  @SuppressWarnings("unchecked")
   public CompactSketch<S> getResult() {
-    sketch_.trim();
-    if (theta_ < sketch_.theta_) {
-      sketch_.setThetaLong(theta_);
-      sketch_.rebuild();
+    if (isEmpty_) return sketch_.compact();
+    if (theta_ >= sketch_.theta_ && sketch_.getRetainedEntries() <= sketch_.getNominalEntries()) {
+      return sketch_.compact();
+    }
+    long theta = min(theta_, sketch_.theta_);
+
+    int num = 0;
+    {
+      final SketchIterator<S> it = sketch_.iterator();
+      while (it.next()) {
+        if (it.getKey() < theta) { num++; }
+      }
     }
-    return sketch_.compact();
+    if (num == 0) return new CompactSketch<>(null, null, theta, isEmpty_);
+    if (num > sketch_.getNominalEntries()) {
+      final long[] keys = new long[num]; // temporary since the order will be destroyed by quick select
+      final SketchIterator<S> it = sketch_.iterator();
+      int i = 0;
+      while (it.next()) {
+        if (it.getKey() < theta) { keys[i++] = it.getKey(); }
+      }
+      theta = QuickSelect.select(keys, 0, num - 1, sketch_.getNominalEntries());
+      num = sketch_.getNominalEntries();
+    }
+    final long[] keys = new long[num];
+    final S[] summaries = (S[]) Array.newInstance(sketch_.summaries_.getClass().getComponentType(), num);
+    final SketchIterator<S> it = sketch_.iterator();
+    int i = 0;
+    while (it.next()) {
+      if (it.getKey() < theta) {
+        keys[i] = it.getKey();
+        summaries[i] = (S) it.getSummary().copy();
+        i++;
+      }
+    }
+    return new CompactSketch<>(keys, summaries, theta, isEmpty_);
   }
 
   /**
    * Resets the internal set to the initial state, which represents an empty set
    */
   public void reset() {
-    sketch_ = new QuickSelectSketch<S>(nomEntries_, null);
+    sketch_.reset();
+    theta_ = sketch_.getThetaLong();
+    isEmpty_ = true;
   }
 }
diff --git a/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java b/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java
index 8e1aefa..cfc4dcf 100644
--- a/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java
+++ b/src/test/java/org/apache/datasketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java
@@ -230,13 +230,6 @@ public class UpdatableSketchWithDoubleSummaryTest {
     Assert.assertEquals(sketch.getEstimate(), 6.0);
   }
 
-//  @Test
-//  public void updateDoubleSummary() {
-//    DoubleSummary ds = new DoubleSummary();
-//    ds.update(1.0);
-//    Assert.assertEquals(ds.getValue(), 1.0);
-//  }
-
   @Test
   public void doubleSummaryDefaultSumMode() {
     UpdatableSketch<Double, DoubleSummary> sketch =
@@ -403,6 +396,22 @@ public class UpdatableSketchWithDoubleSummaryTest {
   }
 
   @Test
+  public void unionEmptySampling() {
+    UpdatableSketch<Double, DoubleSummary> sketch =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).setSamplingProbability(0.01f).build();
+    sketch.update(1, 1.0);
+    Assert.assertEquals(sketch.getRetainedEntries(), 0); // not retained due to low sampling probability
+
+    Union<DoubleSummary> union = new Union<>(new DoubleSummarySetOperations(mode));
+    union.update(sketch);
+    CompactSketch<DoubleSummary> result = union.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 0);
+    Assert.assertFalse(result.isEmpty());
+    Assert.assertTrue(result.isEstimationMode());
+    Assert.assertEquals(result.getEstimate(), 0.0);
+  }
+
+  @Test
   public void unionExactMode() {
     UpdatableSketch<Double, DoubleSummary> sketch1 =
         new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).build();


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-java] 02/02: removed obsolete section

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch fix_tuple_union
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit dcaa2dd303a438a0f6a1f7f98ae52a918f3a198a
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Thu Jan 16 16:36:42 2020 -0800

    removed obsolete section
---
 src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java
index e32b735..a50d920 100644
--- a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java
+++ b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java
@@ -172,11 +172,6 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
       count = mem.getInt(offset);
       offset += Integer.BYTES;
     }
-    //    if (version == serialVersionWithSummaryFactoryUID) {
-    //      final DeserializeResult<SummaryFactory<S>> factoryResult =
-    //          SerializerDeserializer.deserializeFromMemory(mem, offset);
-    //      offset += factoryResult.getSize();
-    //    }
     final int currentCapacity = 1 << lgCurrentCapacity_;
     keys_ = new long[currentCapacity];
     for (int i = 0; i < count; i++) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org