You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/06/01 00:06:47 UTC

[incubator-datasketches-java] branch Tuple_Theta_Extension updated: Rewrote Tuple Intersection. There were too many problems to list.

This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch Tuple_Theta_Extension
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git


The following commit(s) were added to refs/heads/Tuple_Theta_Extension by this push:
     new 5c4e41c  Rewrote Tuple Intersection.  There were too many problems to list.
5c4e41c is described below

commit 5c4e41c6297632e8f1c82f9985c9618495a6887e
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Sun May 31 17:06:31 2020 -0700

    Rewrote Tuple Intersection.  There were too many problems to list.
---
 .../org/apache/datasketches/HashOperations.java    |   1 -
 .../apache/datasketches/tuple/Intersection.java    | 281 ++++++++++++++-------
 2 files changed, 190 insertions(+), 92 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/HashOperations.java b/src/main/java/org/apache/datasketches/HashOperations.java
index 73b17e7..abb0b93 100644
--- a/src/main/java/org/apache/datasketches/HashOperations.java
+++ b/src/main/java/org/apache/datasketches/HashOperations.java
@@ -125,7 +125,6 @@ public final class HashOperations {
    * Useful for rebuilding tables to avoid unnecessary comparisons.
    * Returns the index of insertion, which is always positive or zero. Throws an exception if the
    * table is full with no empty slot.
-   * Throws an exception if table has no empty slot.
    *
    * @param hashTable the hash table to insert into.
    * @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
diff --git a/src/main/java/org/apache/datasketches/tuple/Intersection.java b/src/main/java/org/apache/datasketches/tuple/Intersection.java
index 4fe6eb3..0578f08 100644
--- a/src/main/java/org/apache/datasketches/tuple/Intersection.java
+++ b/src/main/java/org/apache/datasketches/tuple/Intersection.java
@@ -19,26 +19,35 @@
 
 package org.apache.datasketches.tuple;
 
+import static java.lang.Math.ceil;
+import static java.lang.Math.max;
 import static java.lang.Math.min;
+import static org.apache.datasketches.HashOperations.hashInsertOnly;
+import static org.apache.datasketches.HashOperations.hashSearch;
+import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
+import static org.apache.datasketches.Util.ceilingPowerOf2;
 
 import java.lang.reflect.Array;
 
-import org.apache.datasketches.ResizeFactor;
 import org.apache.datasketches.SketchesStateException;
 
+
 /**
  * Computes an intersection of two or more generic tuple sketches.
  * A new instance represents the Universal Set. Because the Universal Set
  * cannot be realized a <i>getResult()</i> on a new instance will produce an error.
  * Every update() computes an intersection with the internal state, which will never
  * grow larger and may be reduced to zero.
+ *
  * @param <S> Type of Summary
  */
+@SuppressWarnings("unchecked")
 public class Intersection<S extends Summary> {
   private final SummarySetOperations<S> summarySetOps_;
-  private QuickSelectSketch<S> sketch_;
+  //private QuickSelectSketch<S> sketch_;
   private boolean empty_;
   private long thetaLong_;
+  private HashTables hashTables_;
   private boolean firstCall_;
 
   /**
@@ -47,9 +56,9 @@ public class Intersection<S extends Summary> {
    */
   public Intersection(final SummarySetOperations<S> summarySetOps) {
     summarySetOps_ = summarySetOps;
-    sketch_ = null;
     empty_ = false; // universal set at the start
     thetaLong_ = Long.MAX_VALUE;
+    hashTables_ = new HashTables();
     firstCall_ = true;
   }
 
@@ -57,67 +66,64 @@ public class Intersection<S extends Summary> {
    * Updates the internal state by intersecting it with the given sketch
    * @param sketchIn input sketch to intersect with the internal state
    */
-  @SuppressWarnings({ "unchecked", "null" })
   public void update(final Sketch<S> sketchIn) {
     final boolean firstCall = firstCall_;
     firstCall_ = false;
     if (sketchIn == null) {
-      empty_ = (thetaLong_ == Long.MAX_VALUE);
-      sketch_ = null;
+      empty_ = true;
+      thetaLong_ = Long.MAX_VALUE;
+      hashTables_.clear();
       return;
     }
-    thetaLong_ = min(thetaLong_, sketchIn.getThetaLong()); //Theta rule
-    empty_ |= sketchIn.isEmpty();                          //Empty rule
-    if (empty_ || (sketchIn.getRetainedEntries() == 0)) {
-      empty_ = (thetaLong_ == Long.MAX_VALUE);
-      sketch_ = null;
+    // input sketch is not null, could be first or next call
+    final long thetaLongIn = sketchIn.getThetaLong();
+    final int countIn = sketchIn.getRetainedEntries();
+    thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
+    // Empty rule extended in case incoming sketch does not have empty bit properly set
+    empty_ |= (countIn == 0) && (thetaLongIn == Long.MAX_VALUE);
+    if (countIn == 0) {
+      hashTables_.clear();
       return;
     }
+    // input sketch will have valid entries > 0
+
     if (firstCall) {
-      sketch_ = new QuickSelectSketch<>(sketchIn.getRetainedEntries(), ResizeFactor.X1.lg(), null);
-      final SketchIterator<S> it = sketchIn.iterator();
-      while (it.next()) {
-        sketch_.insert(it.getKey(), (S)it.getSummary().copy());
-      }
-    } else {
-      if (sketch_ == null) {
-        empty_ = (thetaLong_ == Long.MAX_VALUE);
+      final Sketch<S> firstSketch = sketchIn;
+      //Copy firstSketch data into local instance hashTables_
+      hashTables_.fromSketch(firstSketch);
+    }
+
+    //Next Call
+    else {
+      if (hashTables_.count_ == 0) {
         return;
       }
-      final int maxMatchSize = min(sketch_.getRetainedEntries(), sketchIn.getRetainedEntries());
+      final Sketch<S> nextSketch = sketchIn;
+      //Match nextSketch data with local instance data, filtering by theta
+      final int maxMatchSize = min(hashTables_.count_, nextSketch.getRetainedEntries());
+
       final long[] matchKeys = new long[maxMatchSize];
       S[] matchSummaries = null;
       int matchCount = 0;
-      final SketchIterator<S> it = sketchIn.iterator();
+
+      final SketchIterator<S> it = nextSketch.iterator();
       while (it.next()) {
         final long key = it.getKey();
-        if (key >= thetaLong_) {
-          continue;
-        }
-        final S mySummary = sketch_.find(key);
-        if (mySummary != null) { //key found
-          matchKeys[matchCount] = key;
-          if (matchSummaries == null) {
-            matchSummaries = (S[]) Array.newInstance(mySummary.getClass(), maxMatchSize);
-          }
-          matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, it.getSummary());
-          matchCount++;
-        }
-      }
-      if (matchCount > 0) { //therefore matchSummaries != null.
-        // assumes that constructor of QuickSelectSketch bumps the requested size
-        // up to the nearest power of 2
-        sketch_ = new QuickSelectSketch<>(matchCount, ResizeFactor.X1.lg(), null);
-        for (int i = 0; i < matchCount; i++) {
-          sketch_.insert(matchKeys[i], matchSummaries[i]);
+        if (key >= thetaLong_) { continue; }
+        final int index = hashSearch(hashTables_.keys_, hashTables_.lgTableSize_, key);
+        if (index < 0) { continue; }
+        //Copy the intersecting items from local hashTables_
+        // sequentially into local matchKeys_ and matchSummaries_
+        final S mySummary = hashTables_.summaries_[index];
+
+        if (matchSummaries == null) {
+          matchSummaries = (S[]) Array.newInstance(mySummary.getClass(), maxMatchSize);
         }
-        sketch_.setThetaLong(thetaLong_);
-        empty_ = (thetaLong_ == Long.MAX_VALUE) && (sketch_.getRetainedEntries() == 0);
-        sketch_.setEmpty(empty_);
-      } else {
-        sketch_ = null;
-        empty_ = (thetaLong_ == Long.MAX_VALUE);
+        matchKeys[matchCount] = key;
+        matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, it.getSummary());
+        matchCount++;
       }
+      hashTables_.fromArrays(matchKeys, matchSummaries, matchCount);
     }
   }
 
@@ -126,67 +132,63 @@ public class Intersection<S extends Summary> {
    * @param sketchIn input Theta Sketch to intersect with the internal state.
    * @param summary the given proxy summary for the Theta Sketch, which doesn't have one.
    */
-  @SuppressWarnings({ "unchecked", "null" })
   public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S summary) {
     final boolean firstCall = firstCall_;
     firstCall_ = false;
     if (sketchIn == null) {
-      empty_ = (thetaLong_ == Long.MAX_VALUE);
-      sketch_ = null;
+      empty_ = true;
+      thetaLong_ = Long.MAX_VALUE;
+      hashTables_.clear();
       return;
     }
-    thetaLong_ = min(thetaLong_, sketchIn.getThetaLong()); //Theta rule
-    empty_ |= sketchIn.isEmpty();                          //Empty rule
-    if (empty_ || (sketchIn.getRetainedEntries() == 0)) {
-      empty_ = (thetaLong_ == Long.MAX_VALUE);
-      sketch_ = null;
+    // input sketch is not null, could be first or next call
+    final long thetaLongIn = sketchIn.getThetaLong();
+    final int countIn = sketchIn.getRetainedEntries();
+    thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
+    // Empty rule extended in case incoming sketch does not have empty bit properly set
+    empty_ |= (countIn == 0) && (thetaLongIn == Long.MAX_VALUE);
+    if (countIn == 0) {
+      hashTables_.clear();
       return;
     }
+    // input sketch will have valid entries > 0
     if (firstCall) {
-      sketch_ = new QuickSelectSketch<>(sketchIn.getRetainedEntries(), ResizeFactor.X1.lg(), null);
-      final org.apache.datasketches.theta.HashIterator it = sketchIn.iterator();
-      while (it.next()) {
-        sketch_.insert(it.get(), (S)summary.copy());
-      }
-    } else {
-      if (sketch_ == null) {
-        empty_ = (thetaLong_ == Long.MAX_VALUE);
+      final org.apache.datasketches.theta.Sketch firstSketch = sketchIn;
+      //Copy firstSketch data into local instance hashTables_
+      hashTables_.fromSketch(firstSketch, summary);
+    }
+
+    //Next Call
+    else {
+      if (hashTables_.count_ == 0) {
         return;
       }
-      final int maxMatchSize = min(sketch_.getRetainedEntries(), sketchIn.getRetainedEntries());
+      final org.apache.datasketches.theta.Sketch nextSketch = sketchIn;
+      //Match nextSketch data with local instance data, filtering by theta
+      final int maxMatchSize = min(hashTables_.count_, nextSketch.getRetainedEntries());
+
       final long[] matchKeys = new long[maxMatchSize];
       S[] matchSummaries = null;
       int matchCount = 0;
+
       final org.apache.datasketches.theta.HashIterator it = sketchIn.iterator();
       while (it.next()) {
         final long key = it.get();
-        if (key >= thetaLong_) {
-          continue;
-        }
-        final S mySummary = sketch_.find(key);
-        if (mySummary != null) { //key found
-          matchKeys[matchCount] = key;
-          if (matchSummaries == null) {
-            matchSummaries = (S[]) Array.newInstance(mySummary.getClass(), maxMatchSize);
-          }
-          matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, (S)summary.copy());
-          matchCount++;
-        }
-      }
-      if (matchCount > 0) { //therefore matchSummaries != null.
-        // assumes that constructor of QuickSelectSketch bumps the requested size
-        // up to the nearest power of 2
-        sketch_ = new QuickSelectSketch<>(matchCount, ResizeFactor.X1.lg(), null);
-        for (int i = 0; i < matchCount; i++) {
-          sketch_.insert(matchKeys[i], matchSummaries[i]);
+        if (key >= thetaLong_) { continue; }
+        final int index = hashSearch(hashTables_.keys_, hashTables_.lgTableSize_, key);
+        if (index < 0) { continue; }
+        //Copy the intersecting items from local hashTables_
+        // sequentially into local matchKeys_ and matchSummaries_
+        final S mySummary = hashTables_.summaries_[index];
+
+        if (matchSummaries == null) {
+          matchSummaries = (S[]) Array.newInstance(mySummary.getClass(), maxMatchSize);
         }
-        sketch_.setThetaLong(thetaLong_);
-        empty_ = (thetaLong_ == Long.MAX_VALUE) && (sketch_.getRetainedEntries() == 0);
-        sketch_.setEmpty(empty_);
-      } else {
-        sketch_ = null;
-        empty_ = (thetaLong_ == Long.MAX_VALUE);
+        matchKeys[matchCount] = key;
+        matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, (S)mySummary.copy());
+        matchCount++;
       }
+      hashTables_.fromArrays(matchKeys, matchSummaries, matchCount);
     }
   }
 
@@ -199,10 +201,27 @@ public class Intersection<S extends Summary> {
       throw new SketchesStateException(
         "getResult() with no intervening intersections is not a legal result.");
     }
-    if (sketch_ == null) {
+    if (hashTables_.count_ == 0) {
       return new CompactSketch<>(null, null, thetaLong_, empty_);
     }
-    return sketch_.compact();
+    //Compact the hash tables
+    final int tableSize = hashTables_.keys_.length;
+    final long[] keys = new long[hashTables_.count_];
+    S[] summaries = null;
+    int cnt = 0;
+    for (int i = 0; i < tableSize; i++) {
+      final long key = hashTables_.keys_[i];
+      if ((key == 0) || (key > thetaLong_)) { continue; }
+      final S summary = hashTables_.summaries_[i];
+      if (summaries == null) {
+        summaries = (S[]) Array.newInstance(summary.getClass(), hashTables_.count_);
+      }
+      keys[cnt] = key;
+      summaries[cnt] = summary;
+      cnt++;
+    }
+    assert cnt == hashTables_.count_;
+    return new CompactSketch<>(keys, summaries, thetaLong_, empty_);
   }
 
   /**
@@ -211,7 +230,87 @@ public class Intersection<S extends Summary> {
   public void reset() {
     empty_ = false;
     thetaLong_ = Long.MAX_VALUE;
-    sketch_ = null;
+    hashTables_.clear();
     firstCall_ = true;
   }
+
+  private static int getLgTableSize(final int count) {
+    final int tableSize = max(ceilingPowerOf2((int) ceil(count / 0.75)), 1 << MIN_LG_NOM_LONGS);
+    return Integer.numberOfTrailingZeros(tableSize);
+  }
+
+  private class HashTables {
+    long[] keys_ = null;
+    S[] summaries_ = null;
+    int lgTableSize_ = 0;
+    int count_ = 0;
+
+    HashTables() { }
+
+    public void fromSketch(final Sketch<S> sketch) {
+      count_ = sketch.getRetainedEntries();
+      lgTableSize_ = getLgTableSize(count_);
+      S mySummary = null;
+
+      keys_ = new long[1 << lgTableSize_];
+      final SketchIterator<S> it = sketch.iterator();
+      while (it.next()) {
+        final long key = it.getKey();
+        final int index = hashInsertOnly(keys_, lgTableSize_, key);
+        mySummary = (S)it.getSummary().copy();
+        if (summaries_ == null) {
+          summaries_ = (S[]) Array.newInstance(mySummary.getClass(), 1 << lgTableSize_);
+        }
+        keys_[index] = key;
+        summaries_[index] = mySummary;
+      }
+    }
+
+    public void fromSketch(final org.apache.datasketches.theta.Sketch sketch, final S summary) {
+      count_ = sketch.getRetainedEntries();
+      lgTableSize_ = getLgTableSize(count_);
+      S mySummary = null;
+
+      keys_ = new long[1 << lgTableSize_];
+      final org.apache.datasketches.theta.HashIterator it = sketch.iterator();
+      while (it.next()) {
+        final long key = it.get();
+        final int index = hashInsertOnly(keys_, lgTableSize_, key);
+        mySummary = summary;
+        if (summaries_ == null) {
+          summaries_ = (S[]) Array.newInstance(mySummary.getClass(), 1 << lgTableSize_);
+        }
+        keys_[index] = key;
+        summaries_[index] = mySummary;
+      }
+    }
+
+
+    public void fromArrays(final long[] keys, final S[] summaries, final int count) {
+      count_ = count;
+      lgTableSize_ = getLgTableSize(count);
+
+      S mySummary = null;
+      summaries_ = null;
+      keys_ = new long[1 << lgTableSize_];
+      for (int i = 0; i < count; i++) {
+        final long key = keys[i];
+        final int index = hashInsertOnly(keys_, lgTableSize_, key);
+        mySummary = summaries[i];
+        if (summaries_ == null) {
+          summaries_ = (S[]) Array.newInstance(mySummary.getClass(), 1 << lgTableSize_);
+        }
+        keys_[index] = key;
+        summaries_[index] = summaries[i];
+      }
+    }
+
+    public void clear() {
+      keys_ = null;
+      summaries_ = null;
+      lgTableSize_ = 0;
+      count_ = 0;
+    }
+  }
+
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org