You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/06/01 00:06:47 UTC
[incubator-datasketches-java] branch Tuple_Theta_Extension updated:
Rewrote Tuple Intersection. There were too many problems to list.
This is an automated email from the ASF dual-hosted git repository.
leerho pushed a commit to branch Tuple_Theta_Extension
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git
The following commit(s) were added to refs/heads/Tuple_Theta_Extension by this push:
new 5c4e41c Rewrote Tuple Intersection. There were too many problems to list.
5c4e41c is described below
commit 5c4e41c6297632e8f1c82f9985c9618495a6887e
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Sun May 31 17:06:31 2020 -0700
Rewrote Tuple Intersection. There were too many problems to list.
---
.../org/apache/datasketches/HashOperations.java | 1 -
.../apache/datasketches/tuple/Intersection.java | 281 ++++++++++++++-------
2 files changed, 190 insertions(+), 92 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/HashOperations.java b/src/main/java/org/apache/datasketches/HashOperations.java
index 73b17e7..abb0b93 100644
--- a/src/main/java/org/apache/datasketches/HashOperations.java
+++ b/src/main/java/org/apache/datasketches/HashOperations.java
@@ -125,7 +125,6 @@ public final class HashOperations {
* Useful for rebuilding tables to avoid unnecessary comparisons.
* Returns the index of insertion, which is always positive or zero. Throws an exception if the
* table is full with no empty slot.
- * Throws an exception if table has no empty slot.
*
* @param hashTable the hash table to insert into.
* @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
diff --git a/src/main/java/org/apache/datasketches/tuple/Intersection.java b/src/main/java/org/apache/datasketches/tuple/Intersection.java
index 4fe6eb3..0578f08 100644
--- a/src/main/java/org/apache/datasketches/tuple/Intersection.java
+++ b/src/main/java/org/apache/datasketches/tuple/Intersection.java
@@ -19,26 +19,35 @@
package org.apache.datasketches.tuple;
+import static java.lang.Math.ceil;
+import static java.lang.Math.max;
import static java.lang.Math.min;
+import static org.apache.datasketches.HashOperations.hashInsertOnly;
+import static org.apache.datasketches.HashOperations.hashSearch;
+import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
+import static org.apache.datasketches.Util.ceilingPowerOf2;
import java.lang.reflect.Array;
-import org.apache.datasketches.ResizeFactor;
import org.apache.datasketches.SketchesStateException;
+
/**
* Computes an intersection of two or more generic tuple sketches.
* A new instance represents the Universal Set. Because the Universal Set
* cannot be realized a <i>getResult()</i> on a new instance will produce an error.
* Every update() computes an intersection with the internal state, which will never
* grow larger and may be reduced to zero.
+ *
* @param <S> Type of Summary
*/
+@SuppressWarnings("unchecked")
public class Intersection<S extends Summary> {
private final SummarySetOperations<S> summarySetOps_;
- private QuickSelectSketch<S> sketch_;
+ //private QuickSelectSketch<S> sketch_;
private boolean empty_;
private long thetaLong_;
+ private HashTables hashTables_;
private boolean firstCall_;
/**
@@ -47,9 +56,9 @@ public class Intersection<S extends Summary> {
*/
public Intersection(final SummarySetOperations<S> summarySetOps) {
summarySetOps_ = summarySetOps;
- sketch_ = null;
empty_ = false; // universal set at the start
thetaLong_ = Long.MAX_VALUE;
+ hashTables_ = new HashTables();
firstCall_ = true;
}
@@ -57,67 +66,64 @@ public class Intersection<S extends Summary> {
* Updates the internal state by intersecting it with the given sketch
* @param sketchIn input sketch to intersect with the internal state
*/
- @SuppressWarnings({ "unchecked", "null" })
public void update(final Sketch<S> sketchIn) {
final boolean firstCall = firstCall_;
firstCall_ = false;
if (sketchIn == null) {
- empty_ = (thetaLong_ == Long.MAX_VALUE);
- sketch_ = null;
+ empty_ = true;
+ thetaLong_ = Long.MAX_VALUE;
+ hashTables_.clear();
return;
}
- thetaLong_ = min(thetaLong_, sketchIn.getThetaLong()); //Theta rule
- empty_ |= sketchIn.isEmpty(); //Empty rule
- if (empty_ || (sketchIn.getRetainedEntries() == 0)) {
- empty_ = (thetaLong_ == Long.MAX_VALUE);
- sketch_ = null;
+ // input sketch is not null, could be first or next call
+ final long thetaLongIn = sketchIn.getThetaLong();
+ final int countIn = sketchIn.getRetainedEntries();
+ thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
+ // Empty rule extended in case incoming sketch does not have empty bit properly set
+ empty_ |= (countIn == 0) && (thetaLongIn == Long.MAX_VALUE);
+ if (countIn == 0) {
+ hashTables_.clear();
return;
}
+ // input sketch will have valid entries > 0
+
if (firstCall) {
- sketch_ = new QuickSelectSketch<>(sketchIn.getRetainedEntries(), ResizeFactor.X1.lg(), null);
- final SketchIterator<S> it = sketchIn.iterator();
- while (it.next()) {
- sketch_.insert(it.getKey(), (S)it.getSummary().copy());
- }
- } else {
- if (sketch_ == null) {
- empty_ = (thetaLong_ == Long.MAX_VALUE);
+ final Sketch<S> firstSketch = sketchIn;
+ //Copy firstSketch data into local instance hashTables_
+ hashTables_.fromSketch(firstSketch);
+ }
+
+ //Next Call
+ else {
+ if (hashTables_.count_ == 0) {
return;
}
- final int maxMatchSize = min(sketch_.getRetainedEntries(), sketchIn.getRetainedEntries());
+ final Sketch<S> nextSketch = sketchIn;
+ //Match nextSketch data with local instance data, filtering by theta
+ final int maxMatchSize = min(hashTables_.count_, nextSketch.getRetainedEntries());
+
final long[] matchKeys = new long[maxMatchSize];
S[] matchSummaries = null;
int matchCount = 0;
- final SketchIterator<S> it = sketchIn.iterator();
+
+ final SketchIterator<S> it = nextSketch.iterator();
while (it.next()) {
final long key = it.getKey();
- if (key >= thetaLong_) {
- continue;
- }
- final S mySummary = sketch_.find(key);
- if (mySummary != null) { //key found
- matchKeys[matchCount] = key;
- if (matchSummaries == null) {
- matchSummaries = (S[]) Array.newInstance(mySummary.getClass(), maxMatchSize);
- }
- matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, it.getSummary());
- matchCount++;
- }
- }
- if (matchCount > 0) { //therefore matchSummaries != null.
- // assumes that constructor of QuickSelectSketch bumps the requested size
- // up to the nearest power of 2
- sketch_ = new QuickSelectSketch<>(matchCount, ResizeFactor.X1.lg(), null);
- for (int i = 0; i < matchCount; i++) {
- sketch_.insert(matchKeys[i], matchSummaries[i]);
+ if (key >= thetaLong_) { continue; }
+ final int index = hashSearch(hashTables_.keys_, hashTables_.lgTableSize_, key);
+ if (index < 0) { continue; }
+ //Copy the intersecting items from local hashTables_
+ // sequentially into local matchKeys_ and matchSummaries_
+ final S mySummary = hashTables_.summaries_[index];
+
+ if (matchSummaries == null) {
+ matchSummaries = (S[]) Array.newInstance(mySummary.getClass(), maxMatchSize);
}
- sketch_.setThetaLong(thetaLong_);
- empty_ = (thetaLong_ == Long.MAX_VALUE) && (sketch_.getRetainedEntries() == 0);
- sketch_.setEmpty(empty_);
- } else {
- sketch_ = null;
- empty_ = (thetaLong_ == Long.MAX_VALUE);
+ matchKeys[matchCount] = key;
+ matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, it.getSummary());
+ matchCount++;
}
+ hashTables_.fromArrays(matchKeys, matchSummaries, matchCount);
}
}
@@ -126,67 +132,63 @@ public class Intersection<S extends Summary> {
* @param sketchIn input Theta Sketch to intersect with the internal state.
* @param summary the given proxy summary for the Theta Sketch, which doesn't have one.
*/
- @SuppressWarnings({ "unchecked", "null" })
public void update(final org.apache.datasketches.theta.Sketch sketchIn, final S summary) {
final boolean firstCall = firstCall_;
firstCall_ = false;
if (sketchIn == null) {
- empty_ = (thetaLong_ == Long.MAX_VALUE);
- sketch_ = null;
+ empty_ = true;
+ thetaLong_ = Long.MAX_VALUE;
+ hashTables_.clear();
return;
}
- thetaLong_ = min(thetaLong_, sketchIn.getThetaLong()); //Theta rule
- empty_ |= sketchIn.isEmpty(); //Empty rule
- if (empty_ || (sketchIn.getRetainedEntries() == 0)) {
- empty_ = (thetaLong_ == Long.MAX_VALUE);
- sketch_ = null;
+ // input sketch is not null, could be first or next call
+ final long thetaLongIn = sketchIn.getThetaLong();
+ final int countIn = sketchIn.getRetainedEntries();
+ thetaLong_ = min(thetaLong_, thetaLongIn); //Theta rule
+ // Empty rule extended in case incoming sketch does not have empty bit properly set
+ empty_ |= (countIn == 0) && (thetaLongIn == Long.MAX_VALUE);
+ if (countIn == 0) {
+ hashTables_.clear();
return;
}
+ // input sketch will have valid entries > 0
if (firstCall) {
- sketch_ = new QuickSelectSketch<>(sketchIn.getRetainedEntries(), ResizeFactor.X1.lg(), null);
- final org.apache.datasketches.theta.HashIterator it = sketchIn.iterator();
- while (it.next()) {
- sketch_.insert(it.get(), (S)summary.copy());
- }
- } else {
- if (sketch_ == null) {
- empty_ = (thetaLong_ == Long.MAX_VALUE);
+ final org.apache.datasketches.theta.Sketch firstSketch = sketchIn;
+ //Copy firstSketch data into local instance hashTables_
+ hashTables_.fromSketch(firstSketch, summary);
+ }
+
+ //Next Call
+ else {
+ if (hashTables_.count_ == 0) {
return;
}
- final int maxMatchSize = min(sketch_.getRetainedEntries(), sketchIn.getRetainedEntries());
+ final org.apache.datasketches.theta.Sketch nextSketch = sketchIn;
+ //Match nextSketch data with local instance data, filtering by theta
+ final int maxMatchSize = min(hashTables_.count_, nextSketch.getRetainedEntries());
+
final long[] matchKeys = new long[maxMatchSize];
S[] matchSummaries = null;
int matchCount = 0;
+
final org.apache.datasketches.theta.HashIterator it = sketchIn.iterator();
while (it.next()) {
final long key = it.get();
- if (key >= thetaLong_) {
- continue;
- }
- final S mySummary = sketch_.find(key);
- if (mySummary != null) { //key found
- matchKeys[matchCount] = key;
- if (matchSummaries == null) {
- matchSummaries = (S[]) Array.newInstance(mySummary.getClass(), maxMatchSize);
- }
- matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, (S)summary.copy());
- matchCount++;
- }
- }
- if (matchCount > 0) { //therefore matchSummaries != null.
- // assumes that constructor of QuickSelectSketch bumps the requested size
- // up to the nearest power of 2
- sketch_ = new QuickSelectSketch<>(matchCount, ResizeFactor.X1.lg(), null);
- for (int i = 0; i < matchCount; i++) {
- sketch_.insert(matchKeys[i], matchSummaries[i]);
+ if (key >= thetaLong_) { continue; }
+ final int index = hashSearch(hashTables_.keys_, hashTables_.lgTableSize_, key);
+ if (index < 0) { continue; }
+ //Copy the intersecting items from local hashTables_
+ // sequentially into local matchKeys_ and matchSummaries_
+ final S mySummary = hashTables_.summaries_[index];
+
+ if (matchSummaries == null) {
+ matchSummaries = (S[]) Array.newInstance(mySummary.getClass(), maxMatchSize);
}
- sketch_.setThetaLong(thetaLong_);
- empty_ = (thetaLong_ == Long.MAX_VALUE) && (sketch_.getRetainedEntries() == 0);
- sketch_.setEmpty(empty_);
- } else {
- sketch_ = null;
- empty_ = (thetaLong_ == Long.MAX_VALUE);
+ matchKeys[matchCount] = key;
+ matchSummaries[matchCount] = summarySetOps_.intersection(mySummary, (S)mySummary.copy());
+ matchCount++;
}
+ hashTables_.fromArrays(matchKeys, matchSummaries, matchCount);
}
}
@@ -199,10 +201,27 @@ public class Intersection<S extends Summary> {
throw new SketchesStateException(
"getResult() with no intervening intersections is not a legal result.");
}
- if (sketch_ == null) {
+ if (hashTables_.count_ == 0) {
return new CompactSketch<>(null, null, thetaLong_, empty_);
}
- return sketch_.compact();
+ //Compact the hash tables
+ final int tableSize = hashTables_.keys_.length;
+ final long[] keys = new long[hashTables_.count_];
+ S[] summaries = null;
+ int cnt = 0;
+ for (int i = 0; i < tableSize; i++) {
+ final long key = hashTables_.keys_[i];
+ if ((key == 0) || (key > thetaLong_)) { continue; }
+ final S summary = hashTables_.summaries_[i];
+ if (summaries == null) {
+ summaries = (S[]) Array.newInstance(summary.getClass(), hashTables_.count_);
+ }
+ keys[cnt] = key;
+ summaries[cnt] = summary;
+ cnt++;
+ }
+ assert cnt == hashTables_.count_;
+ return new CompactSketch<>(keys, summaries, thetaLong_, empty_);
}
/**
@@ -211,7 +230,87 @@ public class Intersection<S extends Summary> {
public void reset() {
empty_ = false;
thetaLong_ = Long.MAX_VALUE;
- sketch_ = null;
+ hashTables_.clear();
firstCall_ = true;
}
+
+ private static int getLgTableSize(final int count) {
+ final int tableSize = max(ceilingPowerOf2((int) ceil(count / 0.75)), 1 << MIN_LG_NOM_LONGS);
+ return Integer.numberOfTrailingZeros(tableSize);
+ }
+
+ private class HashTables {
+ long[] keys_ = null;
+ S[] summaries_ = null;
+ int lgTableSize_ = 0;
+ int count_ = 0;
+
+ HashTables() { }
+
+ public void fromSketch(final Sketch<S> sketch) {
+ count_ = sketch.getRetainedEntries();
+ lgTableSize_ = getLgTableSize(count_);
+ S mySummary = null;
+
+ keys_ = new long[1 << lgTableSize_];
+ final SketchIterator<S> it = sketch.iterator();
+ while (it.next()) {
+ final long key = it.getKey();
+ final int index = hashInsertOnly(keys_, lgTableSize_, key);
+ mySummary = (S)it.getSummary().copy();
+ if (summaries_ == null) {
+ summaries_ = (S[]) Array.newInstance(mySummary.getClass(), 1 << lgTableSize_);
+ }
+ keys_[index] = key;
+ summaries_[index] = mySummary;
+ }
+ }
+
+ public void fromSketch(final org.apache.datasketches.theta.Sketch sketch, final S summary) {
+ count_ = sketch.getRetainedEntries();
+ lgTableSize_ = getLgTableSize(count_);
+ S mySummary = null;
+
+ keys_ = new long[1 << lgTableSize_];
+ final org.apache.datasketches.theta.HashIterator it = sketch.iterator();
+ while (it.next()) {
+ final long key = it.get();
+ final int index = hashInsertOnly(keys_, lgTableSize_, key);
+ mySummary = summary;
+ if (summaries_ == null) {
+ summaries_ = (S[]) Array.newInstance(mySummary.getClass(), 1 << lgTableSize_);
+ }
+ keys_[index] = key;
+ summaries_[index] = mySummary;
+ }
+ }
+
+
+ public void fromArrays(final long[] keys, final S[] summaries, final int count) {
+ count_ = count;
+ lgTableSize_ = getLgTableSize(count);
+
+ S mySummary = null;
+ summaries_ = null;
+ keys_ = new long[1 << lgTableSize_];
+ for (int i = 0; i < count; i++) {
+ final long key = keys[i];
+ final int index = hashInsertOnly(keys_, lgTableSize_, key);
+ mySummary = summaries[i];
+ if (summaries_ == null) {
+ summaries_ = (S[]) Array.newInstance(mySummary.getClass(), 1 << lgTableSize_);
+ }
+ keys_[index] = key;
+ summaries_[index] = summaries[i];
+ }
+ }
+
+ public void clear() {
+ keys_ = null;
+ summaries_ = null;
+ lgTableSize_ = 0;
+ count_ = 0;
+ }
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org