You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by GitBox <gi...@apache.org> on 2020/06/01 15:02:20 UTC

[GitHub] [incubator-datasketches-java] davecromberge commented on a change in pull request #319: Tuple theta extension

davecromberge commented on a change in pull request #319:
URL: https://github.com/apache/incubator-datasketches-java/pull/319#discussion_r433281646



##########
File path: src/test/java/org/apache/datasketches/tuple/adouble/AdoubleAnotBTest.java
##########
@@ -0,0 +1,247 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.tuple.adouble;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+
+import org.apache.datasketches.theta.UpdateSketch;
+import org.apache.datasketches.theta.UpdateSketchBuilder;
+import org.apache.datasketches.tuple.AnotB;
+import org.apache.datasketches.tuple.CompactSketch;
+import org.apache.datasketches.tuple.Sketch;
+import org.apache.datasketches.tuple.SketchIterator;
+import org.apache.datasketches.tuple.UpdatableSketch;
+import org.apache.datasketches.tuple.UpdatableSketchBuilder;
+import org.apache.datasketches.tuple.adouble.DoubleSummary.Mode;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+/**
+ * @author Lee Rhodes
+ */
+@SuppressWarnings("javadoc")
+public class AdoubleAnotBTest {
+  private final DoubleSummary.Mode mode = Mode.Sum;
+  private final Results results = new Results();
+
+  private static void threeMethodsWithTheta(
+      final AnotB<DoubleSummary> aNotB,
+      final Sketch<DoubleSummary> sketchA,
+      final Sketch<DoubleSummary> sketchB,
+      final org.apache.datasketches.theta.Sketch skThetaB,
+      final Results results) {
+    //deprecated
+    aNotB.update(sketchA, sketchB);
+    CompactSketch<DoubleSummary> result = aNotB.getResult();
+    results.check(result);
+    //Stateless
+    result = AnotB.aNotB(sketchA, sketchB);
+    results.check(result);
+    result = AnotB.aNotB(sketchA, skThetaB);
+    //Stateful w B = Tuple
+    aNotB.setA(sketchA);
+    aNotB.notB(sketchB);
+    result = aNotB.getResult(true);
+    results.check(result);
+    //Stateful w B = Theta
+    aNotB.setA(sketchA);
+    aNotB.notB(skThetaB);
+    result = aNotB.getResult(false);
+    results.check(result);
+    result = aNotB.getResult(true);
+    results.check(result);
+  }

Review comment:
       This is a very neat way to ensure consistency between all the variants on ANotB 👍 

##########
File path: src/test/java/org/apache/datasketches/tuple/adouble/AdoubleAnotBTest.java
##########
@@ -0,0 +1,247 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.tuple.adouble;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+
+import org.apache.datasketches.theta.UpdateSketch;
+import org.apache.datasketches.theta.UpdateSketchBuilder;
+import org.apache.datasketches.tuple.AnotB;
+import org.apache.datasketches.tuple.CompactSketch;
+import org.apache.datasketches.tuple.Sketch;
+import org.apache.datasketches.tuple.SketchIterator;
+import org.apache.datasketches.tuple.UpdatableSketch;
+import org.apache.datasketches.tuple.UpdatableSketchBuilder;
+import org.apache.datasketches.tuple.adouble.DoubleSummary.Mode;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+/**
+ * @author Lee Rhodes
+ */
+@SuppressWarnings("javadoc")
+public class AdoubleAnotBTest {
+  private final DoubleSummary.Mode mode = Mode.Sum;
+  private final Results results = new Results();
+
+  private static void threeMethodsWithTheta(
+      final AnotB<DoubleSummary> aNotB,
+      final Sketch<DoubleSummary> sketchA,
+      final Sketch<DoubleSummary> sketchB,
+      final org.apache.datasketches.theta.Sketch skThetaB,
+      final Results results) {
+    //deprecated
+    aNotB.update(sketchA, sketchB);
+    CompactSketch<DoubleSummary> result = aNotB.getResult();
+    results.check(result);
+    //Stateless
+    result = AnotB.aNotB(sketchA, sketchB);
+    results.check(result);
+    result = AnotB.aNotB(sketchA, skThetaB);
+    //Stateful w B = Tuple
+    aNotB.setA(sketchA);
+    aNotB.notB(sketchB);
+    result = aNotB.getResult(true);
+    results.check(result);
+    //Stateful w B = Theta
+    aNotB.setA(sketchA);
+    aNotB.notB(skThetaB);
+    result = aNotB.getResult(false);
+    results.check(result);
+    result = aNotB.getResult(true);
+    results.check(result);
+  }
+
+  private static class Results {
+    private int retEnt = 0;
+    private boolean empty = true;
+    private double expect = 0.0;
+    private double toll = 0.0;

Review comment:
       Is `toll` shorthand for `tolerance`, as in error tolerance?

##########
File path: src/main/java/org/apache/datasketches/tuple/Intersection.java
##########
@@ -19,98 +19,176 @@
 
 package org.apache.datasketches.tuple;
 
+import static java.lang.Math.ceil;
+import static java.lang.Math.max;
 import static java.lang.Math.min;
+import static org.apache.datasketches.HashOperations.hashInsertOnly;
+import static org.apache.datasketches.HashOperations.hashSearch;
+import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
+import static org.apache.datasketches.Util.ceilingPowerOf2;
 
 import java.lang.reflect.Array;
 
-import org.apache.datasketches.ResizeFactor;
 import org.apache.datasketches.SketchesStateException;
 
+
 /**
  * Computes an intersection of two or more generic tuple sketches.
- * A new instance represents the Universal Set.
- * Every update() computes an intersection with the internal set
- * and can only reduce the internal set.
+ * A new instance represents the Universal Set. Because the Universal Set
+ * cannot be realized a <i>getResult()</i> on a new instance will produce an error.
+ * Every update() computes an intersection with the internal state, which will never
+ * grow larger and may be reduced to zero.
+ *
  * @param <S> Type of Summary
  */
+@SuppressWarnings("unchecked")
 public class Intersection<S extends Summary> {
-
   private final SummarySetOperations<S> summarySetOps_;
-  private QuickSelectSketch<S> sketch_;
-  private boolean isEmpty_;
-  private long theta_;
-  private boolean isFirstCall_;
+  //private QuickSelectSketch<S> sketch_;
+  private boolean empty_;
+  private long thetaLong_;
+  private HashTables hashTables_;
+  private boolean firstCall_;
 
   /**
    * Creates new instance
    * @param summarySetOps instance of SummarySetOperations
    */
   public Intersection(final SummarySetOperations<S> summarySetOps) {
     summarySetOps_ = summarySetOps;
-    isEmpty_ = false; // universal set at the start
-    theta_ = Long.MAX_VALUE;
-    isFirstCall_ = true;
+    empty_ = false; // universal set at the start
+    thetaLong_ = Long.MAX_VALUE;
+    hashTables_ = new HashTables();
+    firstCall_ = true;
   }
 
   /**
-   * Updates the internal set by intersecting it with the given sketch
-   * @param sketchIn input sketch to intersect with the internal set
+   * Updates the internal state by intersecting it with the given sketch
+   * @param sketchIn input sketch to intersect with the internal state
    */
-  @SuppressWarnings({ "unchecked", "null" })
   public void update(final Sketch<S> sketchIn) {
-    final boolean isFirstCall = isFirstCall_;
-    isFirstCall_ = false;
+    final boolean firstCall = firstCall_;
+    firstCall_ = false;
     if (sketchIn == null) {
-      isEmpty_ = true;
-      sketch_ = null;
+      empty_ = true;
+      thetaLong_ = Long.MAX_VALUE;
+      hashTables_.clear();
       return;
     }
-    theta_ = min(theta_, sketchIn.getThetaLong());
-    isEmpty_ |= sketchIn.isEmpty();
-    if (isEmpty_ || (sketchIn.getRetainedEntries() == 0)) {
-      sketch_ = null;
+    // input sketch is not null, could be first or next call
+    final long thetaLongIn = sketchIn.getThetaLong();
+    final int countIn = sketchIn.getRetainedEntries();
+    thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
+    // Empty rule extended in case incoming sketch does not have empty bit properly set
+    empty_ |= (countIn == 0) && (thetaLongIn == Long.MAX_VALUE);
+    if (countIn == 0) {
+      hashTables_.clear();
       return;
     }
-    // assumes that constructor of QuickSelectSketch bumps the requested size up to the nearest power of 2
-    if (isFirstCall) {
-      sketch_ = new QuickSelectSketch<>(sketchIn.getRetainedEntries(), ResizeFactor.X1.lg(), null);
-      final SketchIterator<S> it = sketchIn.iterator();
-      while (it.next()) {
-        final S summary = (S)it.getSummary().copy();
-        sketch_.insert(it.getKey(), summary);
-      }
-    } else {
-      if (sketch_ == null) {
+    // input sketch will have valid entries > 0
+
+    if (firstCall) {
+      final Sketch<S> firstSketch = sketchIn;
+      //Copy firstSketch data into local instance hashTables_
+      hashTables_.fromSketch(firstSketch);
+    }
+
+    //Next Call
+    else {
+      if (hashTables_.count_ == 0) {
         return;
       }
-      final int matchSize = min(sketch_.getRetainedEntries(), sketchIn.getRetainedEntries());
-      final long[] matchKeys = new long[matchSize];
+      final Sketch<S> nextSketch = sketchIn;
+      //Match nextSketch data with local instance data, filtering by theta
+      final int maxMatchSize = min(hashTables_.count_, nextSketch.getRetainedEntries());
+
+      final long[] matchKeys = new long[maxMatchSize];
       S[] matchSummaries = null;
       int matchCount = 0;
-      final SketchIterator<S> it = sketchIn.iterator();
+
+      final SketchIterator<S> it = nextSketch.iterator();
       while (it.next()) {
-        final S summary = sketch_.find(it.getKey());
-        if (summary != null) {
-          matchKeys[matchCount] = it.getKey();
-          if (matchSummaries == null) {
-            matchSummaries = (S[]) Array.newInstance(summary.getClass(), matchSize);
-          }
-          matchSummaries[matchCount] =
-              summarySetOps_.intersection(summary, it.getSummary());
-          matchCount++;
+        final long key = it.getKey();
+        if (key >= thetaLong_) { continue; }
+        final int index = hashSearch(hashTables_.keys_, hashTables_.lgTableSize_, key);
+        if (index < 0) { continue; }
+        //Copy the intersecting items from local hashTables_
+        // sequentially into local matchKeys_ and matchSummaries_
+        final S mySummary = hashTables_.summaries_[index];
+
+        if (matchSummaries == null) {
+          matchSummaries = (S[]) Array.newInstance(mySummary.getClass(), maxMatchSize);
         }
+        matchKeys[matchCount] = key;
+        matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, it.getSummary());
+        matchCount++;
+      }
+      hashTables_.fromArrays(matchKeys, matchSummaries, matchCount);
+    }
+  }
+
+  /**
+   * Updates the internal set by intersecting it with the given Theta sketch
+   * @param sketchIn input Theta Sketch to intersect with the internal state.
+   * @param summary the given proxy summary for the Theta Sketch, which doesn't have one.
+   */
+  public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S summary) {

Review comment:
       For unions, the summary is checked for null.  Is the same check necessary here, or does the internal intersection on summaries already verify this?

##########
File path: src/main/java/org/apache/datasketches/tuple/AnotB.java
##########
@@ -22,45 +22,327 @@
 import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
 import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
 import static org.apache.datasketches.Util.ceilingPowerOf2;
+import static org.apache.datasketches.Util.simpleIntLog2;
 
 import java.lang.reflect.Array;
 import java.util.Arrays;
 
 import org.apache.datasketches.HashOperations;
+import org.apache.datasketches.theta.HashIterator;
 
 /**
- * Computes a set difference of two generic tuple sketches
+ * Computes a set difference, A-AND-NOT-B, of two generic tuple sketches
  * @param <S> Type of Summary
  */
 public final class AnotB<S extends Summary> {
-  private boolean isEmpty_ = true;
-  private long theta_ = Long.MAX_VALUE;
-  private long[] keys_;
-  private S[] summaries_;
-  private int count_;
+  private boolean empty_ = true;
+  private long thetaLong_ = Long.MAX_VALUE;
+  private long[] keys_ = null;   //always in compact form, not necessarily sorted
+  private S[] summaries_ = null; //always in compact form, not necessarily sorted
+  private int count_ = 0;
+
+  /**
+   * Sets the given Tuple sketch as the first argument <i>A</i>. This overwrites the internal state of
+   * this AnotB operator with the contents of the given sketch. This sets the stage for multiple
+   * following <i>notB</i> operations.
+   * @param skA The incoming sketch for the first argument, <i>A</i>.
+   */
+  public void setA(final Sketch<S> skA) {
+    if ((skA == null) || skA.isEmpty()) { return; }
+    //skA is not empty
+    empty_ = false;
+    thetaLong_ = skA.getThetaLong();
+    final CompactSketch<S> cskA = (skA instanceof CompactSketch)
+        ? (CompactSketch<S>)skA
+        : ((QuickSelectSketch<S>)skA).compact();
+    keys_ = cskA.keys_;
+    summaries_ = cskA.summaries_;
+    count_ = cskA.getRetainedEntries();
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of this AnoB operator.
+   * @param skB The incoming Tuple sketch for the second (or following) argument <i>B</i>.
+   */
+  @SuppressWarnings("unchecked")
+  public void notB(final Sketch<S> skB) {
+    if (empty_) { return; }
+    if ((skB == null) || skB.isEmpty()) { return; }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    thetaLong_ = Math.min(thetaLong_, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB = convertToHashTable(skB.keys_, countB, thetaLong_);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[count_];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summaries_.getClass().getComponentType(), count_);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < count_; i++) {
+      final long key = keys_[i];
+      if ((key != 0) && (key < thetaLong_)) { //skips keys of A >= theta
+        final int index =
+            HashOperations.hashSearch(hashTableKeysB, simpleIntLog2(hashTableKeysB.length), key);
+        if (index == -1) {
+          tmpKeysA[nonMatches] = key;
+          tmpSummariesA[nonMatches] = summaries_[i];
+          nonMatches++;
+        }
+      }
+    }
+    keys_ = Arrays.copyOfRange(tmpKeysA, 0, nonMatches);
+    summaries_ = Arrays.copyOfRange(tmpSummariesA, 0, nonMatches);
+    count_ = nonMatches;
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of this AnoB operator.
+   * @param skB The incoming Theta sketch for the second (or following) argument <i>B</i>.
+   */
+  @SuppressWarnings("unchecked")
+  public void notB(final org.apache.datasketches.theta.Sketch skB) {
+    if (empty_) { return; }
+    if ((skB == null) || skB.isEmpty()) { return; }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    thetaLong_ = Math.min(thetaLong_, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB =
+        convertToHashTable(extractThetaHashArray(skB, countB), countB, thetaLong_);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[count_];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summaries_.getClass().getComponentType(), count_);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < count_; i++) {
+      final long key = keys_[i];
+      if ((key != 0) && (key < thetaLong_)) { //skips keys of A >= theta
+        final int index =
+            HashOperations.hashSearch(hashTableKeysB, simpleIntLog2(hashTableKeysB.length), key);
+        if (index == -1) {
+          tmpKeysA[nonMatches] = key;
+          tmpSummariesA[nonMatches] = summaries_[i];
+          nonMatches++;
+        }
+      }
+    }
+    keys_ = Arrays.copyOfRange(tmpKeysA, 0, nonMatches);
+    summaries_ = Arrays.copyOfRange(tmpSummariesA, 0, nonMatches);
+    count_ = nonMatches;
+  }
+
+
+  /**
+   * Returns the A-and-not-B set operation on the two given sketches.
+   * A null sketch argument is interpreted as an empty sketch.
+   * This is not an accumulating update.
+   *
+   * @param skA The incoming sketch for the first argument
+   * @param skB The incoming sketch for the second argument
+   * @param <S> Type of Summary
+   * @return the result as a compact sketch
+   */
+  @SuppressWarnings("unchecked")
+  public static <S extends Summary>
+        CompactSketch<S> aNotB(final Sketch<S> skA, final Sketch<S> skB) {
+    if ((skA == null) || skA.isEmpty()) {
+      return new CompactSketch<>(null, null, Long.MAX_VALUE, true);
+    }
+    //skA is not empty
+    final boolean empty = false;
+    final long thetaLongA = skA.getThetaLong();
+    final CompactSketch<S> cskA = (skA instanceof CompactSketch)
+        ? (CompactSketch<S>)skA
+        : ((QuickSelectSketch<S>)skA).compact();
+    final long[] keysA = cskA.keys_;
+    final S[] summariesA = cskA.summaries_;
+    final int countA = cskA.getRetainedEntries();
+
+    if ((skB == null) || skB.isEmpty()) {
+      return new CompactSketch<>(keysA, summariesA, thetaLongA, empty);
+    }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    final long thetaLong = Math.min(thetaLongA, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB =
+        convertToHashTable(skB.keys_, countB, thetaLong);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[countA];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summariesA.getClass().getComponentType(), countA);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < countA; i++) {
+      final long key = keysA[i];
+      if ((key != 0) && (key < thetaLong)) { //skips keys of A >= theta
+        final int index =
+            HashOperations.hashSearch(hashTableKeysB, simpleIntLog2(hashTableKeysB.length), key);
+        if (index == -1) {
+          tmpKeysA[nonMatches] = key;
+          tmpSummariesA[nonMatches] = summariesA[i];
+          nonMatches++;
+        }
+      }
+    }
+    final long[] keys = Arrays.copyOfRange(tmpKeysA, 0, nonMatches);
+    final S[] summaries = Arrays.copyOfRange(tmpSummariesA, 0, nonMatches);
+    final CompactSketch<S> result =
+        new CompactSketch<>(keys, summaries, thetaLong, empty);
+    return result;
+  }
+
+  /**
+   * Returns the A-and-not-B set operation on a Tuple sketch and a Theta sketch.
+   * A null sketch argument is interpreted as an empty sketch.
+   * This is not an accumulating update.
+   *
+   * @param skA The incoming Tuple sketch for the first argument
+   * @param skB The incoming Theta sketch for the second argument
+   * @param <S> Type of Summary
+   * @return the result as a compact sketch
+   */
+  @SuppressWarnings("unchecked")
+  public static <S extends Summary>
+        CompactSketch<S> aNotB(final Sketch<S> skA, final org.apache.datasketches.theta.Sketch skB) {
+    if ((skA == null) || skA.isEmpty()) {
+      return new CompactSketch<>(null, null, Long.MAX_VALUE, true);
+    }
+    //skA is not empty
+    final boolean empty = false;
+    final long thetaLongA = skA.getThetaLong();
+    final CompactSketch<S> cskA = (skA instanceof CompactSketch)
+        ? (CompactSketch<S>)skA
+        : ((QuickSelectSketch<S>)skA).compact();
+    final long[] keysA = cskA.keys_;
+    final S[] summariesA = cskA.summaries_;
+    final int countA = cskA.getRetainedEntries();
+
+    if ((skB == null) || skB.isEmpty()) {
+      return new CompactSketch<>(keysA, summariesA, thetaLongA, empty);
+    }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    final long thetaLong = Math.min(thetaLongA, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB =
+        convertToHashTable(extractThetaHashArray(skB, countB), countB, thetaLong);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[countA];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summariesA.getClass().getComponentType(), countA);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < countA; i++) {
+      final long key = keysA[i];
+      if ((key != 0) && (key < thetaLong)) { //skips keys of A >= theta
+        final int index =
+            HashOperations.hashSearch(hashTableKeysB, simpleIntLog2(hashTableKeysB.length), key);
+        if (index == -1) {
+          tmpKeysA[nonMatches] = key;
+          tmpSummariesA[nonMatches] = summariesA[i];
+          nonMatches++;
+        }
+      }
+    }
+    final long[] keys = Arrays.copyOfRange(tmpKeysA, 0, nonMatches);
+    final S[] summaries = Arrays.copyOfRange(tmpSummariesA, 0, nonMatches);
+    final CompactSketch<S> result =
+        new CompactSketch<>(keys, summaries, thetaLong, empty);
+    return result;
+  }
+
+  /**
+   * Gets the result of this operation.
+   * @param reset if true, clears this operator to the empty state after result is returned.
+   * @return the result of this operation as a CompactSketch.
+   */
+  public CompactSketch<S> getResult(final boolean reset) {
+    if (count_ == 0) {
+      return new CompactSketch<>(null, null, thetaLong_, empty_);
+    }
+    final CompactSketch<S> result =
+        new CompactSketch<>(Arrays.copyOfRange(keys_, 0, count_),
+            Arrays.copyOfRange(summaries_, 0, count_), thetaLong_, empty_);
+    if (reset) { reset(); }
+    return result;
+  }
+
+  private static long[] extractThetaHashArray(
+      final org.apache.datasketches.theta.Sketch sketch,
+      final int count) {
+    final HashIterator itr = sketch.iterator();
+    final long[] hashArr = new long[count];
+    int ctr = 0;
+    while (itr.next()) {
+      hashArr[ctr++] = itr.get();
+    }
+    assert ctr == count;

Review comment:
       Since the count is often sourced from the sketch retained entries, is there a case for another count?  Is it a performance concern to cache the count in the caller?

##########
File path: src/main/java/org/apache/datasketches/tuple/Intersection.java
##########
@@ -19,98 +19,176 @@
 
 package org.apache.datasketches.tuple;
 
+import static java.lang.Math.ceil;
+import static java.lang.Math.max;
 import static java.lang.Math.min;
+import static org.apache.datasketches.HashOperations.hashInsertOnly;
+import static org.apache.datasketches.HashOperations.hashSearch;
+import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
+import static org.apache.datasketches.Util.ceilingPowerOf2;
 
 import java.lang.reflect.Array;
 
-import org.apache.datasketches.ResizeFactor;
 import org.apache.datasketches.SketchesStateException;
 
+
 /**
  * Computes an intersection of two or more generic tuple sketches.
- * A new instance represents the Universal Set.
- * Every update() computes an intersection with the internal set
- * and can only reduce the internal set.
+ * A new instance represents the Universal Set. Because the Universal Set
+ * cannot be realized a <i>getResult()</i> on a new instance will produce an error.
+ * Every update() computes an intersection with the internal state, which will never
+ * grow larger and may be reduced to zero.
+ *
  * @param <S> Type of Summary
  */
+@SuppressWarnings("unchecked")
 public class Intersection<S extends Summary> {
-
   private final SummarySetOperations<S> summarySetOps_;
-  private QuickSelectSketch<S> sketch_;
-  private boolean isEmpty_;
-  private long theta_;
-  private boolean isFirstCall_;
+  //private QuickSelectSketch<S> sketch_;

Review comment:
       Is it intentional to leave this local state commented?

##########
File path: src/main/java/org/apache/datasketches/tuple/Intersection.java
##########
@@ -19,98 +19,176 @@
 
 package org.apache.datasketches.tuple;
 
+import static java.lang.Math.ceil;
+import static java.lang.Math.max;
 import static java.lang.Math.min;
+import static org.apache.datasketches.HashOperations.hashInsertOnly;
+import static org.apache.datasketches.HashOperations.hashSearch;
+import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
+import static org.apache.datasketches.Util.ceilingPowerOf2;
 
 import java.lang.reflect.Array;
 
-import org.apache.datasketches.ResizeFactor;
 import org.apache.datasketches.SketchesStateException;
 
+
 /**
  * Computes an intersection of two or more generic tuple sketches.
- * A new instance represents the Universal Set.
- * Every update() computes an intersection with the internal set
- * and can only reduce the internal set.
+ * A new instance represents the Universal Set. Because the Universal Set
+ * cannot be realized a <i>getResult()</i> on a new instance will produce an error.
+ * Every update() computes an intersection with the internal state, which will never
+ * grow larger and may be reduced to zero.
+ *

Review comment:
       This additional documentation is great!

##########
File path: src/main/java/org/apache/datasketches/tuple/AnotB.java
##########
@@ -22,45 +22,327 @@
 import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
 import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
 import static org.apache.datasketches.Util.ceilingPowerOf2;
+import static org.apache.datasketches.Util.simpleIntLog2;
 
 import java.lang.reflect.Array;
 import java.util.Arrays;
 
 import org.apache.datasketches.HashOperations;
+import org.apache.datasketches.theta.HashIterator;
 
 /**
- * Computes a set difference of two generic tuple sketches
+ * Computes a set difference, A-AND-NOT-B, of two generic tuple sketches
  * @param <S> Type of Summary
  */
 public final class AnotB<S extends Summary> {
-  private boolean isEmpty_ = true;
-  private long theta_ = Long.MAX_VALUE;
-  private long[] keys_;
-  private S[] summaries_;
-  private int count_;
+  private boolean empty_ = true;
+  private long thetaLong_ = Long.MAX_VALUE;
+  private long[] keys_ = null;   //always in compact form, not necessarily sorted
+  private S[] summaries_ = null; //always in compact form, not necessarily sorted
+  private int count_ = 0;
+
+  /**
+   * Sets the given Tuple sketch as the first argument <i>A</i>. This overwrites the internal state of
+   * this AnotB operator with the contents of the given sketch. This sets the stage for multiple
+   * following <i>notB</i> operations.
+   * @param skA The incoming sketch for the first argument, <i>A</i>.
+   */
+  public void setA(final Sketch<S> skA) {
+    if ((skA == null) || skA.isEmpty()) { return; }
+    //skA is not empty
+    empty_ = false;
+    thetaLong_ = skA.getThetaLong();
+    final CompactSketch<S> cskA = (skA instanceof CompactSketch)
+        ? (CompactSketch<S>)skA
+        : ((QuickSelectSketch<S>)skA).compact();
+    keys_ = cskA.keys_;
+    summaries_ = cskA.summaries_;
+    count_ = cskA.getRetainedEntries();
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of this AnoB operator.
+   * @param skB The incoming Tuple sketch for the second (or following) argument <i>B</i>.
+   */
+  @SuppressWarnings("unchecked")
+  public void notB(final Sketch<S> skB) {
+    if (empty_) { return; }
+    if ((skB == null) || skB.isEmpty()) { return; }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    thetaLong_ = Math.min(thetaLong_, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB = convertToHashTable(skB.keys_, countB, thetaLong_);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[count_];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summaries_.getClass().getComponentType(), count_);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < count_; i++) {
+      final long key = keys_[i];
+      if ((key != 0) && (key < thetaLong_)) { //skips keys of A >= theta
+        final int index =
+            HashOperations.hashSearch(hashTableKeysB, simpleIntLog2(hashTableKeysB.length), key);
+        if (index == -1) {
+          tmpKeysA[nonMatches] = key;
+          tmpSummariesA[nonMatches] = summaries_[i];
+          nonMatches++;
+        }
+      }
+    }
+    keys_ = Arrays.copyOfRange(tmpKeysA, 0, nonMatches);
+    summaries_ = Arrays.copyOfRange(tmpSummariesA, 0, nonMatches);
+    count_ = nonMatches;
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of this AnoB operator.
+   * @param skB The incoming Theta sketch for the second (or following) argument <i>B</i>.
+   */
+  @SuppressWarnings("unchecked")
+  public void notB(final org.apache.datasketches.theta.Sketch skB) {
+    if (empty_) { return; }
+    if ((skB == null) || skB.isEmpty()) { return; }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    thetaLong_ = Math.min(thetaLong_, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB =
+        convertToHashTable(extractThetaHashArray(skB, countB), countB, thetaLong_);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[count_];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summaries_.getClass().getComponentType(), count_);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < count_; i++) {
+      final long key = keys_[i];
+      if ((key != 0) && (key < thetaLong_)) { //skips keys of A >= theta

Review comment:
       How do you interpret a hashed key that equals 0?

##########
File path: src/test/java/org/apache/datasketches/tuple/adouble/AdoubleIntersectionTest.java
##########
@@ -0,0 +1,332 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.tuple.adouble;
+
+import static org.testng.Assert.fail;
+
+import org.apache.datasketches.SketchesStateException;
+import org.apache.datasketches.theta.UpdateSketch;
+import org.apache.datasketches.theta.UpdateSketchBuilder;
+import org.apache.datasketches.tuple.CompactSketch;
+import org.apache.datasketches.tuple.Intersection;
+import org.apache.datasketches.tuple.Sketch;
+import org.apache.datasketches.tuple.SketchIterator;
+import org.apache.datasketches.tuple.Sketches;
+import org.apache.datasketches.tuple.UpdatableSketch;
+import org.apache.datasketches.tuple.UpdatableSketchBuilder;
+import org.apache.datasketches.tuple.adouble.DoubleSummary.Mode;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+/**
+ * @author Lee Rhodes
+ */
+@SuppressWarnings("javadoc")
+public class AdoubleIntersectionTest {
+  private final DoubleSummary.Mode mode = Mode.Sum;
+
+  @Test
+  public void intersectionNotEmptyNoEntries() {
+    UpdatableSketch<Double, DoubleSummary> sketch1 =
+        new UpdatableSketchBuilder<>
+          (new DoubleSummaryFactory(mode)).setSamplingProbability(0.01f).build();
+    sketch1.update("a", 1.0); // this happens to get rejected because of sampling with low probability
+    Intersection<DoubleSummary> intersection =
+        new Intersection<>(new DoubleSummarySetOperations(mode, mode));
+    intersection.update(sketch1);
+    CompactSketch<DoubleSummary> result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 0);
+    Assert.assertFalse(result.isEmpty());
+    Assert.assertEquals(result.getEstimate(), 0.0);
+    Assert.assertEquals(result.getLowerBound(1), 0.0, 0.0001);
+    Assert.assertTrue(result.getUpperBound(1) > 0);
+  }
+
+  @Test
+  public void intersectionExactWithNull() {
+    UpdatableSketch<Double, DoubleSummary> sketch1 =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).build();
+    sketch1.update(1, 1.0);
+    sketch1.update(2, 1.0);
+    sketch1.update(3, 1.0);
+
+    Intersection<DoubleSummary> intersection =
+        new Intersection<>(new DoubleSummarySetOperations(mode, mode));
+    intersection.update(sketch1);
+    intersection.update(null);
+    CompactSketch<DoubleSummary> result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 0);
+    Assert.assertTrue(result.isEmpty());
+    Assert.assertEquals(result.getEstimate(), 0.0);
+    Assert.assertEquals(result.getLowerBound(1), 0.0);
+    Assert.assertEquals(result.getUpperBound(1), 0.0);
+  }
+
+  @Test
+  public void intersectionExactWithEmpty() {
+    UpdatableSketch<Double, DoubleSummary> sketch1 =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).build();
+    sketch1.update(1, 1.0);
+    sketch1.update(2, 1.0);
+    sketch1.update(3, 1.0);
+
+    Sketch<DoubleSummary> sketch2 = Sketches.createEmptySketch();
+
+    Intersection<DoubleSummary> intersection =
+        new Intersection<>(new DoubleSummarySetOperations(mode, mode));
+    intersection.update(sketch1);
+    intersection.update(sketch2);
+    CompactSketch<DoubleSummary> result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 0);
+    Assert.assertTrue(result.isEmpty());
+    Assert.assertEquals(result.getEstimate(), 0.0);
+    Assert.assertEquals(result.getLowerBound(1), 0.0);
+    Assert.assertEquals(result.getUpperBound(1), 0.0);
+  }
+
+  @Test
+  public void intersectionExactMode() {
+    UpdatableSketch<Double, DoubleSummary> sketch1 =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).build();
+    sketch1.update(1, 1.0);
+    sketch1.update(1, 1.0);
+    sketch1.update(2, 1.0);
+    sketch1.update(2, 1.0);
+
+    UpdatableSketch<Double, DoubleSummary> sketch2 =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).build();
+    sketch2.update(2, 1.0);
+    sketch2.update(2, 1.0);
+    sketch2.update(3, 1.0);
+    sketch2.update(3, 1.0);
+
+    Intersection<DoubleSummary> intersection =
+        new Intersection<>(new DoubleSummarySetOperations(mode, mode));
+    intersection.update(sketch1);
+    intersection.update(sketch2);
+    CompactSketch<DoubleSummary> result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 1);
+    Assert.assertFalse(result.isEmpty());
+    Assert.assertEquals(result.getEstimate(), 1.0);
+    Assert.assertEquals(result.getLowerBound(1), 1.0);
+    Assert.assertEquals(result.getUpperBound(1), 1.0);
+    SketchIterator<DoubleSummary> it = result.iterator();
+    Assert.assertTrue(it.next());
+    Assert.assertEquals(it.getSummary().getValue(), 4.0);
+    Assert.assertFalse(it.next());
+
+    intersection.reset();
+    intersection.update(null);
+    result = intersection.getResult();
+    Assert.assertTrue(result.isEmpty());
+    Assert.assertFalse(result.isEstimationMode());
+    Assert.assertEquals(result.getEstimate(), 0.0);
+    Assert.assertEquals(result.getUpperBound(1), 0.0);
+    Assert.assertEquals(result.getLowerBound(1), 0.0);
+    Assert.assertEquals(result.getTheta(), 1.0);
+}
+
+  @Test
+  public void intersectionDisjointEstimationMode() {
+    int key = 0;
+    UpdatableSketch<Double, DoubleSummary> sketch1 =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).build();
+    for (int i = 0; i < 8192; i++) {
+      sketch1.update(key++, 1.0);
+    }
+
+    UpdatableSketch<Double, DoubleSummary> sketch2 =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).build();
+    for (int i = 0; i < 8192; i++) {
+      sketch2.update(key++, 1.0);
+    }
+
+    Intersection<DoubleSummary> intersection =
+        new Intersection<>(new DoubleSummarySetOperations(mode, mode));
+    intersection.update(sketch1);
+    intersection.update(sketch2);
+    CompactSketch<DoubleSummary> result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 0);
+    Assert.assertFalse(result.isEmpty());
+    Assert.assertEquals(result.getEstimate(), 0.0);
+    Assert.assertEquals(result.getLowerBound(1), 0.0);
+    Assert.assertTrue(result.getUpperBound(1) > 0);
+
+    // an intersection with no entries must survive more updates
+    intersection.update(sketch1);
+    result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 0);
+    Assert.assertFalse(result.isEmpty());
+    Assert.assertEquals(result.getEstimate(), 0.0);
+    Assert.assertEquals(result.getLowerBound(1), 0.0);
+    Assert.assertTrue(result.getUpperBound(1) > 0);
+  }
+
+  @Test
+  public void intersectionEstimationMode() {
+    int key = 0;
+    UpdatableSketch<Double, DoubleSummary> sketch1 =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).build();
+    for (int i = 0; i < 8192; i++) {
+      sketch1.update(key++, 1.0);
+    }
+
+    key -= 4096; // overlap half of the entries
+    UpdatableSketch<Double, DoubleSummary> sketch2 =
+        new UpdatableSketchBuilder<>(new DoubleSummaryFactory(mode)).build();
+    for (int i = 0; i < 8192; i++) {
+      sketch2.update(key++, 1.0);
+    }
+
+    Intersection<DoubleSummary> intersection =
+        new Intersection<>(new DoubleSummarySetOperations(mode, mode));
+    intersection.update(sketch1);
+    intersection.update(sketch2);
+    CompactSketch<DoubleSummary> result = intersection.getResult();
+    Assert.assertFalse(result.isEmpty());
+    // crude estimate of RSE(95%) = 2 / sqrt(result.getRetainedEntries())
+    Assert.assertEquals(result.getEstimate(), 4096.0, 4096 * 0.03);
+    Assert.assertTrue(result.getLowerBound(1) <= result.getEstimate());
+    Assert.assertTrue(result.getUpperBound(1) > result.getEstimate());
+    SketchIterator<DoubleSummary> it = result.iterator();
+    while (it.next()) {
+      Assert.assertEquals(it.getSummary().getValue(), 2.0);
+    }
+  }
+
+  @Test
+  public void checkExactIntersectionWithTheta() {
+    UpdateSketch thSkNull = null;
+    UpdateSketch thSkEmpty = new UpdateSketchBuilder().build();
+    UpdateSketch thSk10 = new UpdateSketchBuilder().build();
+    UpdateSketch thSk15 = new UpdateSketchBuilder().build();
+    for (int i = 0; i < 10; i++) { thSk10.update(i); }
+    for (int i = 0; i < 10; i++) { thSk15.update(i + 5); } //overlap = 5
+
+    DoubleSummary dsum = new DoubleSummaryFactory(mode).newSummary();
+    Intersection<DoubleSummary> intersection =
+        new Intersection<>(new DoubleSummarySetOperations(mode, mode));
+    CompactSketch<DoubleSummary> result;
+
+    try { intersection.getResult(); fail(); }
+    catch (SketchesStateException e ) { } //OK.
+
+    intersection.update(thSkNull, dsum);
+    result = intersection.getResult();
+    Assert.assertTrue(result.isEmpty()); //Empty after null first call
+    intersection.reset();
+
+    intersection.update(thSkEmpty, dsum);
+    result = intersection.getResult();
+    Assert.assertTrue(result.isEmpty()); //Empty after empty first call
+    intersection.reset();
+
+    intersection.update(thSk10, dsum);
+    result = intersection.getResult();
+    Assert.assertEquals(result.getEstimate(), 10.0); //Returns valid first call
+    intersection.reset();
+
+    intersection.update(thSk10, dsum);  // Valid first call
+    intersection.update(thSkNull, dsum);
+    result = intersection.getResult();
+    Assert.assertTrue(result.isEmpty()); //Returns Empty after null second call
+    intersection.reset();
+
+    intersection.update(thSk10, dsum);  // Valid first call
+    intersection.update(thSkEmpty, dsum);
+    result = intersection.getResult();
+    Assert.assertTrue(result.isEmpty()); //Returns Empty after empty second call
+    intersection.reset();
+
+    intersection.update(thSk10, dsum);
+    intersection.update(thSk15, dsum);
+    result = intersection.getResult();
+    Assert.assertEquals(result.getEstimate(), 5.0); //Returns intersection
+    intersection.reset();
+  }
+
+  @Test
+  public void checkExactIntersectionWithThetaDisjoint() {
+    UpdateSketch thSkA = new UpdateSketchBuilder().setLogNominalEntries(10).build();
+    UpdateSketch thSkB = new UpdateSketchBuilder().setLogNominalEntries(10).build();
+    int key = 0;
+    for (int i = 0; i < 32;  i++) { thSkA.update(key++); }
+    for (int i = 0; i < 32; i++) { thSkB.update(key++); }
+
+    DoubleSummary dsum = new DoubleSummaryFactory(mode).newSummary();
+    Intersection<DoubleSummary> intersection =
+        new Intersection<>(new DoubleSummarySetOperations(mode, mode));
+    CompactSketch<DoubleSummary> result;
+
+    intersection.update(thSkA, dsum);
+    intersection.update(thSkB, dsum);
+    result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 0);
+
+    // an intersection with no entries must survive more updates
+    intersection.update(thSkA, dsum);
+    result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 0);
+    intersection.reset();
+  }
+
+  @Test
+  public void checkEstimatingIntersectionWithThetaOverlapping() {
+    UpdateSketch thSkA = new UpdateSketchBuilder().setLogNominalEntries(4).build();
+    UpdateSketch thSkB = new UpdateSketchBuilder().setLogNominalEntries(10).build();
+    for (int i = 0; i < 64;  i++) { thSkA.update(i); } //dense mode, low theta
+    for (int i = 32; i < 96; i++) { thSkB.update(i); } //exact overlapping
+
+    DoubleSummary dsum = new DoubleSummaryFactory(mode).newSummary();
+    Intersection<DoubleSummary> intersection =
+        new Intersection<>(new DoubleSummarySetOperations(mode, mode));
+    CompactSketch<DoubleSummary> result;
+
+    intersection.update(thSkA, dsum);
+    intersection.update(thSkB, dsum);
+    result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 14);
+
+    thSkB.reset();
+    for (int i = 100; i < 164; i++) { thSkB.update(i); } //exact, disjoint
+    intersection.update(thSkB, dsum); //remove existing entries
+    result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 0);
+    intersection.update(thSkB, dsum);
+    result = intersection.getResult();
+    Assert.assertEquals(result.getRetainedEntries(), 0);

Review comment:
       When I attempted adding the set intersection with theta, I struggled with how many of the existing test cases to duplicate for theta.  I think that you have struck a good balance here.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org