You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by GitBox <gi...@apache.org> on 2020/06/05 04:33:15 UTC

[GitHub] [incubator-datasketches-java] AlexanderSaydakov commented on a change in pull request #319: Tuple theta extension

AlexanderSaydakov commented on a change in pull request #319:
URL: https://github.com/apache/incubator-datasketches-java/pull/319#discussion_r435680993



##########
File path: src/main/java/org/apache/datasketches/tuple/AnotB.java
##########
@@ -22,45 +22,351 @@
 import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
 import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
 import static org.apache.datasketches.Util.ceilingPowerOf2;
+import static org.apache.datasketches.Util.simpleIntLog2;
 
 import java.lang.reflect.Array;
 import java.util.Arrays;
 
 import org.apache.datasketches.HashOperations;
+import org.apache.datasketches.SketchesArgumentException;
+import org.apache.datasketches.theta.HashIterator;
 
 /**
- * Computes a set difference of two generic tuple sketches
+ * Computes a set difference, A-AND-NOT-B, of two generic tuple sketches
  * @param <S> Type of Summary
  */
 public final class AnotB<S extends Summary> {
-  private boolean isEmpty_ = true;
-  private long theta_ = Long.MAX_VALUE;
-  private long[] keys_;
-  private S[] summaries_;
-  private int count_;
+  private boolean empty_ = true;
+  private long thetaLong_ = Long.MAX_VALUE;
+  private long[] keys_ = null;   //always in compact form, not necessarily sorted
+  private S[] summaries_ = null; //always in compact form, not necessarily sorted
+  private int count_ = 0;
+
+  /**
+   * Sets the given Tuple sketch as the first argument <i>A</i>. This overwrites the internal state of
+   * this AnotB operator with the contents of the given sketch. This sets the stage for multiple
+   * following <i>notB</i> operations.
+   *
+   * <p>An input argument of null will throw an exception.</p>
+   *
+   * @param skA The incoming sketch for the first argument, <i>A</i>.
+   */
+  public void setA(final Sketch<S> skA) {
+    if (skA == null) {
+      throw new SketchesArgumentException("The input argument may not be null");
+    }
+    if (skA.isEmpty()) { return; }
+    //skA is not empty
+    empty_ = false;
+    thetaLong_ = skA.getThetaLong();
+    final CompactSketch<S> cskA = (skA instanceof CompactSketch)
+        ? (CompactSketch<S>)skA
+        : ((QuickSelectSketch<S>)skA).compact();
+    keys_ = cskA.keys_;
+    summaries_ = cskA.summaries_;
+    count_ = cskA.getRetainedEntries();
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of this AnoB operator.
+   *
+   * <p>An input argument of null or empty is ignored.</p>
+   *
+   * @param skB The incoming Tuple sketch for the second (or following) argument <i>B</i>.
+   */
+  @SuppressWarnings("unchecked")
+  public void notB(final Sketch<S> skB) {
+    if (empty_ || (skB == null) || skB.isEmpty()) { return; }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    thetaLong_ = Math.min(thetaLong_, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB = convertToHashTable(skB.keys_, countB, thetaLong_);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[count_];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summaries_.getClass().getComponentType(), count_);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < count_; i++) {
+      final long key = keys_[i];
+      if ((key != 0) && (key < thetaLong_)) { //skips keys of A >= theta
+        final int index =
+            HashOperations.hashSearch(hashTableKeysB, simpleIntLog2(hashTableKeysB.length), key);
+        if (index == -1) {
+          tmpKeysA[nonMatches] = key;
+          tmpSummariesA[nonMatches] = summaries_[i];
+          nonMatches++;
+        }
+      }
+    }
+    keys_ = Arrays.copyOfRange(tmpKeysA, 0, nonMatches);
+    summaries_ = Arrays.copyOfRange(tmpSummariesA, 0, nonMatches);
+    count_ = nonMatches;
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of this AnoB operator.

Review comment:
       AnotB

##########
File path: src/main/java/org/apache/datasketches/tuple/AnotB.java
##########
@@ -22,45 +22,351 @@
 import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
 import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
 import static org.apache.datasketches.Util.ceilingPowerOf2;
+import static org.apache.datasketches.Util.simpleIntLog2;
 
 import java.lang.reflect.Array;
 import java.util.Arrays;
 
 import org.apache.datasketches.HashOperations;
+import org.apache.datasketches.SketchesArgumentException;
+import org.apache.datasketches.theta.HashIterator;
 
 /**
- * Computes a set difference of two generic tuple sketches
+ * Computes a set difference, A-AND-NOT-B, of two generic tuple sketches
  * @param <S> Type of Summary
  */
 public final class AnotB<S extends Summary> {
-  private boolean isEmpty_ = true;
-  private long theta_ = Long.MAX_VALUE;
-  private long[] keys_;
-  private S[] summaries_;
-  private int count_;
+  private boolean empty_ = true;
+  private long thetaLong_ = Long.MAX_VALUE;
+  private long[] keys_ = null;   //always in compact form, not necessarily sorted
+  private S[] summaries_ = null; //always in compact form, not necessarily sorted
+  private int count_ = 0;
+
+  /**
+   * Sets the given Tuple sketch as the first argument <i>A</i>. This overwrites the internal state of
+   * this AnotB operator with the contents of the given sketch. This sets the stage for multiple
+   * following <i>notB</i> operations.
+   *
+   * <p>An input argument of null will throw an exception.</p>
+   *
+   * @param skA The incoming sketch for the first argument, <i>A</i>.
+   */
+  public void setA(final Sketch<S> skA) {
+    if (skA == null) {
+      throw new SketchesArgumentException("The input argument may not be null");
+    }
+    if (skA.isEmpty()) { return; }
+    //skA is not empty
+    empty_ = false;
+    thetaLong_ = skA.getThetaLong();
+    final CompactSketch<S> cskA = (skA instanceof CompactSketch)
+        ? (CompactSketch<S>)skA
+        : ((QuickSelectSketch<S>)skA).compact();
+    keys_ = cskA.keys_;
+    summaries_ = cskA.summaries_;
+    count_ = cskA.getRetainedEntries();
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of this AnoB operator.
+   *
+   * <p>An input argument of null or empty is ignored.</p>
+   *
+   * @param skB The incoming Tuple sketch for the second (or following) argument <i>B</i>.
+   */
+  @SuppressWarnings("unchecked")
+  public void notB(final Sketch<S> skB) {
+    if (empty_ || (skB == null) || skB.isEmpty()) { return; }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    thetaLong_ = Math.min(thetaLong_, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta

Review comment:
       removes -> remove

##########
File path: src/main/java/org/apache/datasketches/tuple/AnotB.java
##########
@@ -22,45 +22,351 @@
 import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
 import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
 import static org.apache.datasketches.Util.ceilingPowerOf2;
+import static org.apache.datasketches.Util.simpleIntLog2;
 
 import java.lang.reflect.Array;
 import java.util.Arrays;
 
 import org.apache.datasketches.HashOperations;
+import org.apache.datasketches.SketchesArgumentException;
+import org.apache.datasketches.theta.HashIterator;
 
 /**
- * Computes a set difference of two generic tuple sketches
+ * Computes a set difference, A-AND-NOT-B, of two generic tuple sketches
  * @param <S> Type of Summary
  */
 public final class AnotB<S extends Summary> {
-  private boolean isEmpty_ = true;
-  private long theta_ = Long.MAX_VALUE;
-  private long[] keys_;
-  private S[] summaries_;
-  private int count_;
+  private boolean empty_ = true;
+  private long thetaLong_ = Long.MAX_VALUE;
+  private long[] keys_ = null;   //always in compact form, not necessarily sorted
+  private S[] summaries_ = null; //always in compact form, not necessarily sorted
+  private int count_ = 0;
+
+  /**
+   * Sets the given Tuple sketch as the first argument <i>A</i>. This overwrites the internal state of
+   * this AnotB operator with the contents of the given sketch. This sets the stage for multiple
+   * following <i>notB</i> operations.
+   *
+   * <p>An input argument of null will throw an exception.</p>
+   *
+   * @param skA The incoming sketch for the first argument, <i>A</i>.
+   */
+  public void setA(final Sketch<S> skA) {
+    if (skA == null) {
+      throw new SketchesArgumentException("The input argument may not be null");
+    }
+    if (skA.isEmpty()) { return; }
+    //skA is not empty
+    empty_ = false;
+    thetaLong_ = skA.getThetaLong();
+    final CompactSketch<S> cskA = (skA instanceof CompactSketch)
+        ? (CompactSketch<S>)skA
+        : ((QuickSelectSketch<S>)skA).compact();
+    keys_ = cskA.keys_;
+    summaries_ = cskA.summaries_;
+    count_ = cskA.getRetainedEntries();
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of this AnoB operator.

Review comment:
       AnotB

##########
File path: src/main/java/org/apache/datasketches/tuple/AnotB.java
##########
@@ -22,45 +22,351 @@
 import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
 import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
 import static org.apache.datasketches.Util.ceilingPowerOf2;
+import static org.apache.datasketches.Util.simpleIntLog2;
 
 import java.lang.reflect.Array;
 import java.util.Arrays;
 
 import org.apache.datasketches.HashOperations;
+import org.apache.datasketches.SketchesArgumentException;
+import org.apache.datasketches.theta.HashIterator;
 
 /**
- * Computes a set difference of two generic tuple sketches
+ * Computes a set difference, A-AND-NOT-B, of two generic tuple sketches
  * @param <S> Type of Summary
  */
 public final class AnotB<S extends Summary> {
-  private boolean isEmpty_ = true;
-  private long theta_ = Long.MAX_VALUE;
-  private long[] keys_;
-  private S[] summaries_;
-  private int count_;
+  private boolean empty_ = true;
+  private long thetaLong_ = Long.MAX_VALUE;
+  private long[] keys_ = null;   //always in compact form, not necessarily sorted
+  private S[] summaries_ = null; //always in compact form, not necessarily sorted
+  private int count_ = 0;
+
+  /**
+   * Sets the given Tuple sketch as the first argument <i>A</i>. This overwrites the internal state of
+   * this AnotB operator with the contents of the given sketch. This sets the stage for multiple
+   * following <i>notB</i> operations.
+   *
+   * <p>An input argument of null will throw an exception.</p>
+   *
+   * @param skA The incoming sketch for the first argument, <i>A</i>.
+   */
+  public void setA(final Sketch<S> skA) {
+    if (skA == null) {
+      throw new SketchesArgumentException("The input argument may not be null");
+    }
+    if (skA.isEmpty()) { return; }
+    //skA is not empty
+    empty_ = false;
+    thetaLong_ = skA.getThetaLong();
+    final CompactSketch<S> cskA = (skA instanceof CompactSketch)
+        ? (CompactSketch<S>)skA
+        : ((QuickSelectSketch<S>)skA).compact();
+    keys_ = cskA.keys_;
+    summaries_ = cskA.summaries_;
+    count_ = cskA.getRetainedEntries();
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of this AnoB operator.
+   *
+   * <p>An input argument of null or empty is ignored.</p>
+   *
+   * @param skB The incoming Tuple sketch for the second (or following) argument <i>B</i>.
+   */
+  @SuppressWarnings("unchecked")
+  public void notB(final Sketch<S> skB) {
+    if (empty_ || (skB == null) || skB.isEmpty()) { return; }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    thetaLong_ = Math.min(thetaLong_, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB = convertToHashTable(skB.keys_, countB, thetaLong_);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[count_];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summaries_.getClass().getComponentType(), count_);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < count_; i++) {
+      final long key = keys_[i];
+      if ((key != 0) && (key < thetaLong_)) { //skips keys of A >= theta
+        final int index =
+            HashOperations.hashSearch(hashTableKeysB, simpleIntLog2(hashTableKeysB.length), key);
+        if (index == -1) {
+          tmpKeysA[nonMatches] = key;
+          tmpSummariesA[nonMatches] = summaries_[i];
+          nonMatches++;
+        }
+      }
+    }
+    keys_ = Arrays.copyOfRange(tmpKeysA, 0, nonMatches);
+    summaries_ = Arrays.copyOfRange(tmpSummariesA, 0, nonMatches);
+    count_ = nonMatches;
+  }
+
+  /**
+   * Performs an <i>AND NOT</i> operation with the existing internal state of this AnoB operator.
+   *
+   * <p>An input argument of null or empty is ignored.</p>
+   *
+   * @param skB The incoming Theta sketch for the second (or following) argument <i>B</i>.
+   */
+  @SuppressWarnings("unchecked")
+  public void notB(final org.apache.datasketches.theta.Sketch skB) {
+    if (empty_ || (skB == null) || skB.isEmpty()) { return; }
+    //skB is not empty
+    final long thetaLongB = skB.getThetaLong();
+    thetaLong_ = Math.min(thetaLong_, thetaLongB);
+    //Build hashtable and removes keys of skB >= theta
+    final int countB = skB.getRetainedEntries();
+    final long[] hashTableKeysB =
+        convertToHashTable(extractThetaHashArray(skB, countB), countB, thetaLong_);
+
+    //build temporary arrays of skA
+    final long[] tmpKeysA = new long[count_];
+    final S[] tmpSummariesA =
+        (S[]) Array.newInstance(summaries_.getClass().getComponentType(), count_);
+
+    //search for non matches and build temp arrays
+    int nonMatches = 0;
+    for (int i = 0; i < count_; i++) {
+      final long key = keys_[i];
+      if ((key > 0) && (key < thetaLong_)) { //skips keys of A >= theta
+        final int index =
+            HashOperations.hashSearch(hashTableKeysB, simpleIntLog2(hashTableKeysB.length), key);
+        if (index == -1) {
+          tmpKeysA[nonMatches] = key;
+          tmpSummariesA[nonMatches] = summaries_[i];
+          nonMatches++;
+        }
+      }
+    }
+    keys_ = Arrays.copyOfRange(tmpKeysA, 0, nonMatches);
+    summaries_ = Arrays.copyOfRange(tmpSummariesA, 0, nonMatches);
+    count_ = nonMatches;
+  }
+
+  /**
+   * Returns the A-and-not-B set operation on the two given Tuple sketches.

Review comment:
       the operation is named inconsistently: AnotB, A-AND-NOT-B, A-and-not-B




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org