You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2019/07/23 00:34:19 UTC

[incubator-datasketches-java] branch tuple_a_not_b_fix created (now 9de9f94)

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a change to branch tuple_a_not_b_fix
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git.


      at 9de9f94  fixed tuple sketch A-not-B

This branch includes the following new commits:

     new 9de9f94  fixed tuple sketch A-not-B

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-java] 01/01: fixed tuple sketch A-not-B

Posted by al...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tuple_a_not_b_fix
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit 9de9f9451befcfa7f2dcb9d54bf2f227b9fbd8d9
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Mon Jul 22 17:34:04 2019 -0700

    fixed tuple sketch A-not-B
---
 src/main/java/com/yahoo/sketches/tuple/AnotB.java  |  2 +-
 .../sketches/tuple/HeapArrayOfDoublesAnotB.java    | 20 ++++++-----
 .../sketches/tuple/ArrayOfDoublesAnotBTest.java    | 36 +++++++++++++++++++
 .../UpdatableSketchWithDoubleSummaryTest.java      | 40 ++++++++++++++++++++--
 4 files changed, 87 insertions(+), 11 deletions(-)

diff --git a/src/main/java/com/yahoo/sketches/tuple/AnotB.java b/src/main/java/com/yahoo/sketches/tuple/AnotB.java
index 280477d..af1f253 100644
--- a/src/main/java/com/yahoo/sketches/tuple/AnotB.java
+++ b/src/main/java/com/yahoo/sketches/tuple/AnotB.java
@@ -69,7 +69,7 @@ public final class AnotB<S extends Summary> {
       keys_ = new long[noMatchSize];
       summaries_ = (S[]) Array.newInstance(a.summaries_.getClass().getComponentType(), noMatchSize);
       for (int i = 0; i < a.keys_.length; i++) {
-        if (a.keys_[i] != 0) {
+        if (a.keys_[i] != 0 && a.keys_[i] < theta_) {
           final int index = HashOperations.hashSearch(hashTable, lgHashTableSize, a.keys_[i]);
           if (index == -1) {
             keys_[count_] = a.keys_[i];
diff --git a/src/main/java/com/yahoo/sketches/tuple/HeapArrayOfDoublesAnotB.java b/src/main/java/com/yahoo/sketches/tuple/HeapArrayOfDoublesAnotB.java
index e0a72bd..6e228a2 100644
--- a/src/main/java/com/yahoo/sketches/tuple/HeapArrayOfDoublesAnotB.java
+++ b/src/main/java/com/yahoo/sketches/tuple/HeapArrayOfDoublesAnotB.java
@@ -67,18 +67,20 @@ final class HeapArrayOfDoublesAnotB extends ArrayOfDoublesAnotB {
       getNoMatchSetFromSketch(a);
     } else {
       final long[] hashTable;
-      hashTable = convertToHashTable(b);
+      hashTable = convertToHashTable(b, theta_);
       final int lgHashTableSize = Integer.numberOfTrailingZeros(hashTable.length);
       final int noMatchSize = a.getRetainedEntries();
       keys_ = new long[noMatchSize];
       values_ = new double[noMatchSize * numValues_];
       final ArrayOfDoublesSketchIterator it = a.iterator();
       while (it.next()) {
-        final int index = HashOperations.hashSearch(hashTable, lgHashTableSize, it.getKey());
-        if (index == -1) {
-          keys_[count_] = it.getKey();
-          System.arraycopy(it.getValues(), 0, values_, count_ * numValues_, numValues_);
-          count_++;
+        if (it.getKey() < theta_) {
+          final int index = HashOperations.hashSearch(hashTable, lgHashTableSize, it.getKey());
+          if (index == -1) {
+            keys_[count_] = it.getKey();
+            System.arraycopy(it.getValues(), 0, values_, count_ * numValues_, numValues_);
+            count_++;
+          }
         }
       }
     }
@@ -118,7 +120,7 @@ final class HeapArrayOfDoublesAnotB extends ArrayOfDoublesAnotB {
     return result;
   }
 
-  private static long[] convertToHashTable(final ArrayOfDoublesSketch sketch) {
+  private static long[] convertToHashTable(final ArrayOfDoublesSketch sketch, long theta) {
     final int size = Math.max(
       ceilingPowerOf2((int) Math.ceil(sketch.getRetainedEntries() / REBUILD_THRESHOLD)),
       1 << MIN_LG_NOM_LONGS
@@ -127,7 +129,9 @@ final class HeapArrayOfDoublesAnotB extends ArrayOfDoublesAnotB {
     final ArrayOfDoublesSketchIterator it = sketch.iterator();
     final int lgSize = Integer.numberOfTrailingZeros(size);
     while (it.next()) {
-      HashOperations.hashInsertOnly(hashTable, lgSize, it.getKey());
+      if (it.getKey() < theta) {
+        HashOperations.hashInsertOnly(hashTable, lgSize, it.getKey());
+      }
     }
     return hashTable;
   }
diff --git a/src/test/java/com/yahoo/sketches/tuple/ArrayOfDoublesAnotBTest.java b/src/test/java/com/yahoo/sketches/tuple/ArrayOfDoublesAnotBTest.java
index a26090e..cc5c84e 100644
--- a/src/test/java/com/yahoo/sketches/tuple/ArrayOfDoublesAnotBTest.java
+++ b/src/test/java/com/yahoo/sketches/tuple/ArrayOfDoublesAnotBTest.java
@@ -250,6 +250,42 @@ public class ArrayOfDoublesAnotBTest {
     }
   }
 
+  @Test
+  public void estimationModeLargeB() {
+    int key = 0;
+    ArrayOfDoublesUpdatableSketch sketchA = new ArrayOfDoublesUpdatableSketchBuilder().build();
+    for (int i = 0; i < 10000; i++) sketchA.update(key++, new double[] {1});
+
+    key -= 2000; // overlap
+    ArrayOfDoublesUpdatableSketch sketchB = new ArrayOfDoublesUpdatableSketchBuilder().build();
+    for (int i = 0; i < 100000; i++) sketchB.update(key++, new double[] {1});
+
+    final int expected = 10000 - 2000;
+    ArrayOfDoublesAnotB aNotB = new ArrayOfDoublesSetOperationBuilder().buildAnotB();
+    aNotB.update(sketchA, sketchB);
+    ArrayOfDoublesCompactSketch result = aNotB.getResult();
+    Assert.assertFalse(result.isEmpty());
+    Assert.assertEquals(result.getEstimate(), expected, expected * 0.1); // crude estimate of RSE(95%) = 2 / sqrt(result.getRetainedEntries())
+    Assert.assertTrue(result.getLowerBound(1) <= result.getEstimate());
+    Assert.assertTrue(result.getUpperBound(1) > result.getEstimate());
+    ArrayOfDoublesSketchIterator it = result.iterator();
+    while (it.next()) {
+      Assert.assertEquals(it.getValues(), new double[] {1});
+    }
+
+    // same operation, but compact sketches and off-heap result
+    aNotB.update(sketchA.compact(), sketchB.compact());
+    result = aNotB.getResult(WritableMemory.wrap(new byte[1000000]));
+    Assert.assertFalse(result.isEmpty());
+    Assert.assertEquals(result.getEstimate(), expected, expected * 0.1); // crude estimate of RSE(95%) = 2 / sqrt(result.getRetainedEntries())
+    Assert.assertTrue(result.getLowerBound(1) <= result.getEstimate());
+    Assert.assertTrue(result.getUpperBound(1) > result.getEstimate());
+    it = result.iterator();
+    while (it.next()) {
+      Assert.assertEquals(it.getValues(), new double[] {1});
+    }
+  }
+
   @Test(expectedExceptions = SketchesArgumentException.class)
   public void incompatibleSeedA() {
     ArrayOfDoublesSketch sketch = new ArrayOfDoublesUpdatableSketchBuilder().setSeed(1).build();
diff --git a/src/test/java/com/yahoo/sketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java b/src/test/java/com/yahoo/sketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java
index c35213c..e738461 100644
--- a/src/test/java/com/yahoo/sketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java
+++ b/src/test/java/com/yahoo/sketches/tuple/adouble/UpdatableSketchWithDoubleSummaryTest.java
@@ -770,7 +770,7 @@ public class UpdatableSketchWithDoubleSummaryTest {
     aNotB.update(sketchA, sketchB);
     CompactSketch<DoubleSummary> result = aNotB.getResult();
     Assert.assertFalse(result.isEmpty());
- // crude estimate of RSE(95%) = 2 / sqrt(result.getRetainedEntries())
+    // crude estimate of RSE(95%) = 2 / sqrt(result.getRetainedEntries())
     Assert.assertEquals(result.getEstimate(), 4096.0, 4096 * 0.03);
     Assert.assertTrue(result.getLowerBound(1) <= result.getEstimate());
     Assert.assertTrue(result.getUpperBound(1) > result.getEstimate());
@@ -783,7 +783,7 @@ public class UpdatableSketchWithDoubleSummaryTest {
     aNotB.update(sketchA.compact(), sketchB.compact());
     result = aNotB.getResult();
     Assert.assertFalse(result.isEmpty());
- // crude estimate of RSE(95%) = 2 / sqrt(result.getRetainedEntries())
+    // crude estimate of RSE(95%) = 2 / sqrt(result.getRetainedEntries())
     Assert.assertEquals(result.getEstimate(), 4096.0, 4096 * 0.03);
     Assert.assertTrue(result.getLowerBound(1) <= result.getEstimate());
     Assert.assertTrue(result.getUpperBound(1) > result.getEstimate());
@@ -793,6 +793,42 @@ public class UpdatableSketchWithDoubleSummaryTest {
     }
   }
 
+  @Test
+  public void aNotBEstimationModeLargeB() {
+    int key = 0;
+    UpdatableSketch<Double, DoubleSummary> sketchA =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory()).build();
+    for (int i = 0; i < 10000; i++) {
+      sketchA.update(key++, 1.0);
+    }
+
+    key -= 2000; // overlap
+    UpdatableSketch<Double, DoubleSummary> sketchB =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory()).build();
+    for (int i = 0; i < 100000; i++) {
+      sketchB.update(key++, 1.0);
+    }
+
+    final int expected = 10000 - 2000;
+    AnotB<DoubleSummary> aNotB = new AnotB<>();
+    aNotB.update(sketchA, sketchB);
+    CompactSketch<DoubleSummary> result = aNotB.getResult();
+    Assert.assertFalse(result.isEmpty());
+    // crude estimate of RSE(95%) = 2 / sqrt(result.getRetainedEntries())
+    Assert.assertEquals(result.getEstimate(), expected, expected * 0.1);
+    Assert.assertTrue(result.getLowerBound(1) <= result.getEstimate());
+    Assert.assertTrue(result.getUpperBound(1) > result.getEstimate());
+
+    // same thing, but compact sketches
+    aNotB.update(sketchA.compact(), sketchB.compact());
+    result = aNotB.getResult();
+    Assert.assertFalse(result.isEmpty());
+    // crude estimate of RSE(95%) = 2 / sqrt(result.getRetainedEntries())
+    Assert.assertEquals(result.getEstimate(), expected, expected * 0.1);
+    Assert.assertTrue(result.getLowerBound(1) <= result.getEstimate());
+    Assert.assertTrue(result.getUpperBound(1) > result.getEstimate());
+  }
+
   @Test(expectedExceptions = SketchesArgumentException.class)
   public void invalidSamplingProbability() {
     new UpdatableSketchBuilder<>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org