You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/07/25 01:22:10 UTC
[incubator-datasketches-java] 01/02: Interim
This is an automated email from the ASF dual-hosted git repository.
leerho pushed a commit to branch Refactor_Intersection
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git
commit 7e35246400eb07f2920f1eac5d10169167929e25
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Tue Jul 21 12:16:48 2020 -0700
Interim
---
.../org/apache/datasketches/HashOperations.java | 27 +++--
.../org/apache/datasketches/theta/AnotBimpl.java | 3 +-
.../datasketches/theta/IntersectionImpl.java | 130 +++++++++++++--------
.../datasketches/theta/IntersectionImplR.java | 75 +++++++-----
.../apache/datasketches/theta/SetOperation.java | 105 ++++++++++-------
.../org/apache/datasketches/theta/UnionImpl.java | 5 +-
.../apache/datasketches/theta/AnotBimplTest.java | 2 +-
.../datasketches/theta/DirectIntersectionTest.java | 6 +-
.../datasketches/theta/HeapIntersectionTest.java | 8 +-
.../datasketches/theta/SetOperationTest.java | 18 +--
.../apache/datasketches/theta/UnionImplTest.java | 4 +-
11 files changed, 228 insertions(+), 155 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/HashOperations.java b/src/main/java/org/apache/datasketches/HashOperations.java
index 9067b79..6541397 100644
--- a/src/main/java/org/apache/datasketches/HashOperations.java
+++ b/src/main/java/org/apache/datasketches/HashOperations.java
@@ -19,7 +19,8 @@
package org.apache.datasketches;
-import static org.apache.datasketches.Util.MIN_LG_NOM_LONGS;
+import static java.lang.Math.max;
+import static org.apache.datasketches.Util.MIN_LG_ARR_LONGS;
import static org.apache.datasketches.Util.ceilingPowerOf2;
import org.apache.datasketches.memory.Memory;
@@ -334,17 +335,27 @@ public final class HashOperations {
final int count,
final long thetaLong,
final double rebuildThreshold) {
- final int size = Math.max(
- ceilingPowerOf2((int) Math.ceil(count / rebuildThreshold)),
- 1 << MIN_LG_NOM_LONGS
- );
- final long[] hashTable = new long[size];
- hashArrayInsert(
- hashArr, hashTable, Integer.numberOfTrailingZeros(size), thetaLong);
+ final int lgArrLongs = minLgHashTableSize(count, rebuildThreshold);
+ final int arrLongs = 1 << lgArrLongs;
+ final long[] hashTable = new long[arrLongs];
+ hashArrayInsert(hashArr, hashTable, lgArrLongs, thetaLong);
return hashTable;
}
/**
+ * Returns the smallest log hash table size given the count of items and the rebuild threshold.
+ * @param count the given count of items
+ * @param rebuild_threshold the rebuild threshold as a fraction between zero and one.
+ * @return the smallest log hash table size
+ */
+ public static int minLgHashTableSize(final int count, final double rebuild_threshold) {
+ final int upperCount = (int) Math.ceil(count / rebuild_threshold);
+ final int arrLongs = max(ceilingPowerOf2(upperCount), 1 << MIN_LG_ARR_LONGS);
+ final int newLgArrLongs = Integer.numberOfTrailingZeros(arrLongs);
+ return newLgArrLongs;
+ }
+
+ /**
* Counts the cardinality of the first Log2 values of the given source array.
* @param srcArr the given source array
* @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>
diff --git a/src/main/java/org/apache/datasketches/theta/AnotBimpl.java b/src/main/java/org/apache/datasketches/theta/AnotBimpl.java
index 4686cb8..9eecee3 100644
--- a/src/main/java/org/apache/datasketches/theta/AnotBimpl.java
+++ b/src/main/java/org/apache/datasketches/theta/AnotBimpl.java
@@ -23,6 +23,7 @@ import static org.apache.datasketches.HashOperations.convertToHashTable;
import static org.apache.datasketches.HashOperations.hashSearch;
import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
import static org.apache.datasketches.Util.checkSeedHashes;
+import static org.apache.datasketches.Util.computeSeedHash;
import static org.apache.datasketches.Util.simpleIntLog2;
import java.util.Arrays;
@@ -141,7 +142,7 @@ final class AnotBimpl extends AnotB {
}
@Override
- int getRetainedEntries(final boolean valid) {
+ int getRetainedEntries() {
return curCount_;
}
diff --git a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
index 8166fb3..aa49451 100644
--- a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
+++ b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
@@ -20,7 +20,13 @@
package org.apache.datasketches.theta;
import static java.lang.Math.min;
+import static org.apache.datasketches.HashOperations.continueCondition;
+import static org.apache.datasketches.HashOperations.hashInsertOnly;
+import static org.apache.datasketches.HashOperations.hashInsertOnlyMemory;
+import static org.apache.datasketches.HashOperations.hashSearch;
+import static org.apache.datasketches.HashOperations.minLgHashTableSize;
import static org.apache.datasketches.Util.MIN_LG_ARR_LONGS;
+import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.FLAGS_BYTE;
@@ -35,15 +41,16 @@ import static org.apache.datasketches.theta.PreambleUtil.insertCurCount;
import static org.apache.datasketches.theta.PreambleUtil.insertFamilyID;
import static org.apache.datasketches.theta.PreambleUtil.insertFlags;
import static org.apache.datasketches.theta.PreambleUtil.insertLgArrLongs;
+import static org.apache.datasketches.theta.PreambleUtil.insertLgNomLongs;
import static org.apache.datasketches.theta.PreambleUtil.insertP;
import static org.apache.datasketches.theta.PreambleUtil.insertPreLongs;
import static org.apache.datasketches.theta.PreambleUtil.insertSerVer;
import static org.apache.datasketches.theta.PreambleUtil.insertThetaLong;
+import static org.apache.datasketches.theta.PreambleUtil.setEmpty;
import java.util.Arrays;
import org.apache.datasketches.Family;
-import org.apache.datasketches.HashOperations;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.Util;
import org.apache.datasketches.memory.Memory;
@@ -62,12 +69,13 @@ final class IntersectionImpl extends IntersectionImplR {
* @return a new IntersectionImpl on the Java heap
*/
static IntersectionImpl initNewHeapInstance(final long seed) {
+ //compute & store seedHash,
final IntersectionImpl impl = new IntersectionImpl(null, seed, false);
impl.lgArrLongs_ = 0;
impl.curCount_ = -1; //Universal Set is true
impl.thetaLong_ = Long.MAX_VALUE;
impl.empty_ = false; //A virgin intersection represents the Universal Set so empty is FALSE!
- impl.hashTable_ = null;
+ impl.hashTable_ = null; //retained entries of the intersection as a hash table, on-heap only.
return impl;
}
@@ -81,27 +89,30 @@ final class IntersectionImpl extends IntersectionImplR {
* @return a new IntersectionImpl that may be off-heap
*/
static IntersectionImpl initNewDirectInstance(final long seed, final WritableMemory dstMem) {
- final IntersectionImpl impl = new IntersectionImpl(dstMem, seed, true);
+ //DstMem: compute & store seedHash, true means no seedhash checking
+ final IntersectionImpl impl = new IntersectionImpl(dstMem, seed, true); //compute & store seedHash
//Load Preamble
+
insertPreLongs(dstMem, CONST_PREAMBLE_LONGS); //RF not used = 0
insertSerVer(dstMem, SER_VER);
insertFamilyID(dstMem, Family.INTERSECTION.getID());
- //Note: Intersection does not use lgNomLongs or k, per se.
+ insertLgNomLongs(dstMem, 0); //Note: Intersection does not use lgNomLongs or k, per se.
//set lgArrLongs initially to minimum. Don't clear cache in mem
insertLgArrLongs(dstMem, MIN_LG_ARR_LONGS);
insertFlags(dstMem, 0); //bigEndian = readOnly = compact = ordered = empty = false;
//seedHash loaded and checked in private constructor
insertCurCount(dstMem, -1);
insertP(dstMem, (float) 1.0);
+ //pre2
insertThetaLong(dstMem, Long.MAX_VALUE);
- //Initialize
+ //internal initialize
impl.lgArrLongs_ = MIN_LG_ARR_LONGS;
impl.curCount_ = -1; //set in mem below
impl.thetaLong_ = Long.MAX_VALUE;
impl.empty_ = false;
- impl.maxLgArrLongs_ = checkMaxLgArrLongs(dstMem); //Only Off Heap
+ impl.maxLgArrLongs_ = getMaxLgArrLongs(dstMem); //Only Off Heap
return impl;
}
@@ -114,6 +125,7 @@ final class IntersectionImpl extends IntersectionImplR {
* @return a IntersectionImplR instance on the Java heap
*/
static IntersectionImplR heapifyInstance(final Memory srcMem, final long seed) {
+ //compute & store seedHash,
final IntersectionImpl impl = new IntersectionImpl(null, seed, false);
//Get Preamble
@@ -164,13 +176,14 @@ final class IntersectionImpl extends IntersectionImplR {
}
/**
- * Wrap an Intersection target around the given source Memory containing intersection data.
- * @param srcMem The source Memory image.
+ * Wrap an Intersection target around the given source WritableMemory containing intersection data.
+ * @param srcMem The source WritableMemory image.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See seed</a>
- * @return a IntersectionImpl that wraps a source Memory that contains an Intersection image
+ * @return a IntersectionImpl that wraps a source WritableMemory that contains an Intersection image
*/
static IntersectionImpl wrapInstance(final WritableMemory srcMem, final long seed) {
+ //SrcMem:gets and stores the seedHash, checks mem_seedHash against the seed
final IntersectionImpl impl = new IntersectionImpl(srcMem, seed, false);
return (IntersectionImpl) internalWrapInstance(srcMem, impl);
}
@@ -193,31 +206,31 @@ final class IntersectionImpl extends IntersectionImplR {
curCount_ = 0;
thetaLong_ = Long.MAX_VALUE;
empty_ = true;
- maxLgArrLongs_ = 0;
hashTable_ = null;
- if (mem_ != null) {
- PreambleUtil.setEmpty(mem_); //true
- insertThetaLong(mem_, thetaLong_);
- insertCurCount(mem_, 0);
- insertLgArrLongs(mem_, lgArrLongs_);
+ if (wmem_ != null) {
+ setEmpty(wmem_); //true
+ insertThetaLong(wmem_, thetaLong_);
+ insertCurCount(wmem_, 0);
+ insertLgArrLongs(wmem_, lgArrLongs_);
}
return;
}
Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash());
thetaLong_ = min(thetaLong_, sketchIn.getThetaLong()); //Theta rule
empty_ = false;
- if (mem_ != null) {
- insertThetaLong(mem_, thetaLong_);
- PreambleUtil.clearEmpty(mem_); //false
+ if (wmem_ != null) {
+ insertThetaLong(wmem_, thetaLong_);
+ clearEmpty(wmem_); //false
}
final int sketchInEntries = sketchIn.getRetainedEntries(true);
// The truth table for the following state machine
// Case curCount sketchInEntries | Actions
- // 1 <0 0 | First intersect, curCount = 0; HT = null; exit
+ // 1 <0 0 | First intersect, curCount < 0; HT = null; exit
// 2 0 0 | CurCount = 0; HT = null; exit
// 3 >0 0 | CurCount = 0; HT = null; exit
+ // 4 | Not used
// 5 <0 >0 | First intersect, clone SketchIn; exit
// 6 0 >0 | CurCount = 0; HT = null; exit
// 7 >0 >0 | Perform full intersect
@@ -230,26 +243,27 @@ final class IntersectionImpl extends IntersectionImplR {
case 6: { //(curCount_ == 0) || (sketchInEntries == 0)
//All future intersections result in zero data, but theta can still be reduced.
curCount_ = 0;
- if (mem_ != null) { insertCurCount(mem_, 0); }
+ if (wmem_ != null) { insertCurCount(wmem_, 0); }
hashTable_ = null; //No need for a HT. Don't bother clearing mem if valid
break;
}
case 5: { // curCount_ < 0; This is the 1st intersect, clone the incoming sketch
curCount_ = sketchIn.getRetainedEntries(true);
- final int requiredLgArrLongs = computeMinLgArrLongsFromCount(curCount_);
+ final int requiredLgArrLongs = minLgHashTableSize(curCount_, REBUILD_THRESHOLD);
final int priorLgArrLongs = lgArrLongs_; //prior only used in error message
lgArrLongs_ = requiredLgArrLongs;
- if (mem_ != null) { //Off heap, check if current dstMem is large enough
- insertCurCount(mem_, curCount_);
- insertLgArrLongs(mem_, lgArrLongs_);
+ if (wmem_ != null) { //Off heap, check if current dstMem is large enough
+ insertCurCount(wmem_, curCount_);
+ insertLgArrLongs(wmem_, lgArrLongs_);
if (requiredLgArrLongs <= maxLgArrLongs_) { //OK
- mem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear only what required
+ wmem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear only what required
}
else { //not enough space in dstMem
+ final int requiredBytes = (8 << requiredLgArrLongs) + 24;
+ final int givenBytes = (8 << priorLgArrLongs) + 24;
throw new SketchesArgumentException(
- "Insufficient dstMem hash table space: "
- + (1 << requiredLgArrLongs) + " > " + (1 << priorLgArrLongs));
+ "Insufficient internal Memory space: " + requiredBytes + " > " + givenBytes);
}
}
else { //On the heap, allocate a HT
@@ -283,11 +297,35 @@ final class IntersectionImpl extends IntersectionImplR {
thetaLong_ = Long.MAX_VALUE;
empty_ = false;
hashTable_ = null;
- if (mem_ != null) {
- insertLgArrLongs(mem_, lgArrLongs_); //make sure
- insertCurCount(mem_, -1);
- insertThetaLong(mem_, Long.MAX_VALUE);
- clearEmpty(mem_);
+ if (wmem_ != null) {
+ insertLgArrLongs(wmem_, 0);
+ insertCurCount(wmem_, -1);
+ insertThetaLong(wmem_, Long.MAX_VALUE);
+ clearEmpty(wmem_);
+ }
+ }
+
+ private void hardReset() { //Universal Set
+ resetToEmpty();
+ curCount_ = -1;
+ empty_ = false;
+ if (wmem_ != null) {
+ clearEmpty(wmem_);
+ insertCurCount(wmem_, -1);
+ }
+ }
+
+ private void resetToEmpty() { //empty state
+ lgArrLongs_ = 0;
+ curCount_ = 0;
+ thetaLong_ = Long.MAX_VALUE;
+ empty_ = true;
+ hashTable_ = null;
+ if (wmem_ != null) {
+ insertLgArrLongs(wmem_, 0);
+ insertCurCount(wmem_, 0);
+ insertThetaLong(wmem_, Long.MAX_VALUE);
+ setEmpty(wmem_); //true
}
}
@@ -299,10 +337,10 @@ final class IntersectionImpl extends IntersectionImplR {
final long[] cacheIn = sketchIn.getCache();
final int arrLongsIn = cacheIn.length;
final long[] hashTable;
- if (mem_ != null) {
+ if (wmem_ != null) {
final int htLen = 1 << lgArrLongs_;
hashTable = new long[htLen];
- mem_.getLongArray(CONST_PREAMBLE_LONGS << 3, hashTable, 0, htLen);
+ wmem_.getLongArray(CONST_PREAMBLE_LONGS << 3, hashTable, 0, htLen);
} else {
hashTable = hashTable_;
}
@@ -318,7 +356,7 @@ final class IntersectionImpl extends IntersectionImplR {
if (hashIn >= thetaLong_) {
break; //early stop assumes that hashes in input sketch are ordered!
}
- final int foundIdx = HashOperations.hashSearch(hashTable, lgArrLongs_, hashIn);
+ final int foundIdx = hashSearch(hashTable, lgArrLongs_, hashIn);
if (foundIdx == -1) { continue; }
matchSet[matchSetCount++] = hashIn;
}
@@ -328,18 +366,18 @@ final class IntersectionImpl extends IntersectionImplR {
for (int i = 0; i < arrLongsIn; i++ ) {
final long hashIn = cacheIn[i];
if ((hashIn <= 0L) || (hashIn >= thetaLong_)) { continue; }
- final int foundIdx = HashOperations.hashSearch(hashTable, lgArrLongs_, hashIn);
+ final int foundIdx = hashSearch(hashTable, lgArrLongs_, hashIn);
if (foundIdx == -1) { continue; }
matchSet[matchSetCount++] = hashIn;
}
}
//reduce effective array size to minimum
curCount_ = matchSetCount;
- lgArrLongs_ = computeMinLgArrLongsFromCount(matchSetCount);
- if (mem_ != null) {
- insertCurCount(mem_, matchSetCount);
- insertLgArrLongs(mem_, lgArrLongs_);
- mem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear for rebuild
+ lgArrLongs_ = minLgHashTableSize(matchSetCount, REBUILD_THRESHOLD);
+ if (wmem_ != null) {
+ insertCurCount(wmem_, matchSetCount);
+ insertLgArrLongs(wmem_, lgArrLongs_);
+ wmem_.clear(CONST_PREAMBLE_LONGS << 3, 8 << lgArrLongs_); //clear for rebuild
} else {
Arrays.fill(hashTable_, 0, 1 << lgArrLongs_, 0L); //clear for rebuild
}
@@ -356,21 +394,21 @@ final class IntersectionImpl extends IntersectionImplR {
private void moveDataToTgt(final long[] arr, final int count) {
final int arrLongsIn = arr.length;
int tmpCnt = 0;
- if (mem_ != null) { //Off Heap puts directly into mem
+ if (wmem_ != null) { //Off Heap puts directly into mem
final int preBytes = CONST_PREAMBLE_LONGS << 3;
final int lgArrLongs = lgArrLongs_;
final long thetaLong = thetaLong_;
for (int i = 0; i < arrLongsIn; i++ ) {
final long hashIn = arr[i];
- if (HashOperations.continueCondition(thetaLong, hashIn)) { continue; }
- HashOperations.hashInsertOnlyMemory(mem_, lgArrLongs, hashIn, preBytes);
+ if (continueCondition(thetaLong, hashIn)) { continue; }
+ hashInsertOnlyMemory(wmem_, lgArrLongs, hashIn, preBytes);
tmpCnt++;
}
} else { //On Heap. Assumes HT exists and is large enough
for (int i = 0; i < arrLongsIn; i++ ) {
final long hashIn = arr[i];
- if (HashOperations.continueCondition(thetaLong_, hashIn)) { continue; }
- HashOperations.hashInsertOnly(hashTable_, lgArrLongs_, hashIn);
+ if (continueCondition(thetaLong_, hashIn)) { continue; }
+ hashInsertOnly(hashTable_, lgArrLongs_, hashIn);
tmpCnt++;
}
}
diff --git a/src/main/java/org/apache/datasketches/theta/IntersectionImplR.java b/src/main/java/org/apache/datasketches/theta/IntersectionImplR.java
index f752a7e..658d293 100644
--- a/src/main/java/org/apache/datasketches/theta/IntersectionImplR.java
+++ b/src/main/java/org/apache/datasketches/theta/IntersectionImplR.java
@@ -20,6 +20,7 @@
package org.apache.datasketches.theta;
import static org.apache.datasketches.Util.MIN_LG_ARR_LONGS;
+import static org.apache.datasketches.Util.computeSeedHash;
import static org.apache.datasketches.Util.floorPowerOf2;
import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
@@ -57,7 +58,8 @@ import org.apache.datasketches.memory.WritableMemory;
*/
class IntersectionImplR extends Intersection {
protected final short seedHash_;
- protected final WritableMemory mem_;
+ protected final boolean readOnly_;
+ protected final WritableMemory wmem_;
//Note: Intersection does not use lgNomLongs or k, per se.
protected int lgArrLongs_; //current size of hash table
@@ -66,31 +68,42 @@ class IntersectionImplR extends Intersection {
protected boolean empty_;
protected long[] hashTable_ = null; //HT => Data. Only used On Heap
- protected int maxLgArrLongs_ = 0; //max size of hash table. Only used Off Heap
+ protected int maxLgArrLongs_ = 0; //max size of wmem_ hash table. Only used Off Heap, never reset.
- protected IntersectionImplR(final WritableMemory mem, final long seed, final boolean newMem) {
- mem_ = mem;
- if (mem != null) {
- if (newMem) {
+ /**
+ * Constructor.
+ * @param wmem Can be either a Source(e.g. wrap) or Destination (new Direct) WritableMemory
+ * @param seed Used to validate incoming sketch arguments
+ * @param dstMemFlag The given memory is a Destination (new Direct) WritableMemory
+ */
+ protected IntersectionImplR(final WritableMemory wmem, final long seed, final boolean dstMemFlag) {
+ readOnly_ = !dstMemFlag;
+ if (wmem != null) {
+ wmem_ = wmem;
+ if (dstMemFlag) { //DstMem: compute & store seedHash, newMem means no seedhash checking
+ checkMinSizeMemory(wmem);
seedHash_ = computeSeedHash(seed);
- mem_.putShort(SEED_HASH_SHORT, seedHash_);
- } else {
- seedHash_ = mem_.getShort(SEED_HASH_SHORT);
+ wmem_.putShort(SEED_HASH_SHORT, seedHash_);
+ } else { //SrcMem:gets and stores the seedHash, checks mem_seedHash against the seed
+ seedHash_ = wmem_.getShort(SEED_HASH_SHORT);
Util.checkSeedHashes(seedHash_, computeSeedHash(seed)); //check for seed hash conflict
}
- } else {
+ } else { //compute & store seedHash
+ wmem_ = null;
seedHash_ = computeSeedHash(seed);
}
}
/**
* Wrap an Intersection target around the given source Memory containing intersection data.
+ * @See SetOperation#wrap(Memory, long)
* @param srcMem The source Memory image.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See seed</a>
- * @return an IntersectionImplR that wraps a read-only Intersection image referenced by srcMem
+ * @return an IntersectionImplR that wraps a read-only Intersection image
*/
static IntersectionImplR wrapInstance(final Memory srcMem, final long seed) {
+ //SrcMem:gets and stores the seedHash, checks mem_seedHash against the seed
final IntersectionImplR impl = new IntersectionImplR((WritableMemory) srcMem, seed, false);
return internalWrapInstance(srcMem, impl);
}
@@ -134,7 +147,7 @@ class IntersectionImplR extends Intersection {
impl.curCount_ = curCount;
impl.thetaLong_ = thetaLong;
impl.empty_ = empty;
- impl.maxLgArrLongs_ = checkMaxLgArrLongs(srcMem); //Only Off Heap, check for min size
+ impl.maxLgArrLongs_ = getMaxLgArrLongs(srcMem); //Only Off Heap, check for min size
return impl;
}
@@ -154,10 +167,10 @@ class IntersectionImplR extends Intersection {
}
//else curCount > 0
final long[] hashTable;
- if (mem_ != null) {
+ if (wmem_ != null) {
final int htLen = 1 << lgArrLongs_;
hashTable = new long[htLen];
- mem_.getLongArray(CONST_PREAMBLE_LONGS << 3, hashTable, 0, htLen);
+ wmem_.getLongArray(CONST_PREAMBLE_LONGS << 3, hashTable, 0, htLen);
} else {
hashTable = hashTable_;
}
@@ -172,13 +185,13 @@ class IntersectionImplR extends Intersection {
* as the infinite <i>Universal Set</i>.
*/
@Override
- int getRetainedEntries(final boolean valid) {
+ int getRetainedEntries() {
return curCount_;
}
@Override
public boolean hasResult() {
- return (mem_ != null) ? mem_.getInt(RETAINED_ENTRIES_INT) >= 0 : curCount_ >= 0;
+ return (wmem_ != null) ? wmem_.getInt(RETAINED_ENTRIES_INT) >= 0 : curCount_ >= 0;
}
@Override
@@ -188,7 +201,7 @@ class IntersectionImplR extends Intersection {
@Override
public boolean isSameResource(final Memory that) {
- return (mem_ != null) ? mem_.isSameResource(that) : false;
+ return (wmem_ != null) ? wmem_.isSameResource(that) : false;
}
@Override
@@ -201,8 +214,8 @@ class IntersectionImplR extends Intersection {
final int preBytes = CONST_PREAMBLE_LONGS << 3;
final int dataBytes = (curCount_ > 0) ? 8 << lgArrLongs_ : 0;
final byte[] byteArrOut = new byte[preBytes + dataBytes];
- if (mem_ != null) {
- mem_.getByteArray(0, byteArrOut, 0, preBytes + dataBytes);
+ if (wmem_ != null) {
+ wmem_.getByteArray(0, byteArrOut, 0, preBytes + dataBytes);
}
else {
final WritableMemory memOut = WritableMemory.wrap(byteArrOut);
@@ -253,13 +266,13 @@ class IntersectionImplR extends Intersection {
@Override
long[] getCache() {
- if (mem_ == null) {
+ if (wmem_ == null) {
return (hashTable_ != null) ? hashTable_ : new long[0];
}
//Direct
final int arrLongs = 1 << lgArrLongs_;
final long[] outArr = new long[arrLongs];
- mem_.getLongArray(CONST_PREAMBLE_LONGS << 3, outArr, 0, arrLongs);
+ wmem_.getLongArray(CONST_PREAMBLE_LONGS << 3, outArr, 0, arrLongs);
return outArr;
}
@@ -274,21 +287,23 @@ class IntersectionImplR extends Intersection {
}
/**
- * Returns the correct maximum lgArrLongs given the capacity of the Memory. Checks that the
- * capacity is large enough for the minimum sized hash table.
+ * Returns the maximum lgArrLongs given the capacity of the Memory.
* @param dstMem the given Memory
- * @return the correct maximum lgArrLongs given the capacity of the Memory
+ * @return the maximum lgArrLongs given the capacity of the Memory
*/
- static final int checkMaxLgArrLongs(final Memory dstMem) {
+ static final int getMaxLgArrLongs(final Memory dstMem) {
final int preBytes = CONST_PREAMBLE_LONGS << 3;
final long cap = dstMem.getCapacity();
- final int maxLgArrLongs =
- Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3);
- if (maxLgArrLongs < MIN_LG_ARR_LONGS) {
+ return Integer.numberOfTrailingZeros(floorPowerOf2((int)(cap - preBytes)) >>> 3);
+ }
+
+ static final void checkMinSizeMemory(final Memory mem) {
+ final int minBytes = (CONST_PREAMBLE_LONGS << 3) + (8 << MIN_LG_ARR_LONGS);//280
+ final long cap = mem.getCapacity();
+ if (cap < minBytes) {
throw new SketchesArgumentException(
- "dstMem not large enough for minimum sized hash table: " + cap);
+ "Memory must be at least " + minBytes + " bytes. Actual capacity: " + cap);
}
- return maxLgArrLongs;
}
/**
diff --git a/src/main/java/org/apache/datasketches/theta/SetOperation.java b/src/main/java/org/apache/datasketches/theta/SetOperation.java
index b43977c..837d1a9 100644
--- a/src/main/java/org/apache/datasketches/theta/SetOperation.java
+++ b/src/main/java/org/apache/datasketches/theta/SetOperation.java
@@ -19,18 +19,14 @@
package org.apache.datasketches.theta;
-import static java.lang.Math.max;
import static org.apache.datasketches.Family.idToFamily;
import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
-import static org.apache.datasketches.Util.MIN_LG_ARR_LONGS;
-import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
import static org.apache.datasketches.Util.ceilingPowerOf2;
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
import org.apache.datasketches.Family;
import org.apache.datasketches.SketchesArgumentException;
-import org.apache.datasketches.Util;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;
@@ -58,6 +54,10 @@ public abstract class SetOperation {
* SetOperation using the
* <a href="{@docRoot}/resources/dictionary.html#defaultUpdateSeed">Default Update Seed</a>.
* The resulting SetOperation will not retain any link to the source Memory.
+ *
+ * <p>Note: Only certain set operators during stateful operations can be serialized and thus
+ * heapified.</p>
+ *
* @param srcMem an image of a SetOperation where the image seed hash matches the default seed hash.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @return a Heap-based SetOperation from the given Memory
@@ -70,6 +70,10 @@ public abstract class SetOperation {
* Heapify takes the SetOperation image in Memory and instantiates an on-heap
* SetOperation using the given seed.
* The resulting SetOperation will not retain any link to the source Memory.
+ *
+ * <p>Note: Only certain set operators during stateful operations can be serialized and thus
+ * heapified.</p>
+ *
* @param srcMem an image of a SetOperation where the hash of the given seed matches the image seed hash.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
@@ -95,9 +99,12 @@ public abstract class SetOperation {
/**
* Wrap takes the SetOperation image in Memory and refers to it directly.
* There is no data copying onto the java heap.
- * Only "Direct" SetOperations that have been explicitly stored as direct can be wrapped.
* This method assumes the
* <a href="{@docRoot}/resources/dictionary.html#defaultUpdateSeed">Default Update Seed</a>.
+ *
+ * <p>Note: Only certain set operators during stateful operations can be serialized and thus
+ * wrapped.</p>
+ *
* @param srcMem an image of a SetOperation where the image seed hash matches the default seed hash.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @return a SetOperation backed by the given Memory
@@ -109,7 +116,10 @@ public abstract class SetOperation {
/**
* Wrap takes the SetOperation image in Memory and refers to it directly.
* There is no data copying onto the java heap.
- * Only "Direct" SetOperations that have been explicitly stored as direct can be wrapped.
+ *
+ * <p>Note: Only certain set operators during stateful operations can be serialized and thus
+ * wrapped.</p>
+ *
* @param srcMem an image of a SetOperation where the hash of the given seed matches the image seed hash.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
@@ -122,7 +132,7 @@ public abstract class SetOperation {
if (serVer != 3) {
throw new SketchesArgumentException("SerVer must be 3: " + serVer);
}
- switch (family) { //TODO Do we need to add the stateful AnotB ?
+ switch (family) {
case UNION : {
return UnionImpl.wrapInstance(srcMem, seed);
}
@@ -137,9 +147,12 @@ public abstract class SetOperation {
/**
* Wrap takes the SetOperation image in Memory and refers to it directly.
* There is no data copying onto the java heap.
- * Only "Direct" SetOperations that have been explicitly stored as direct can be wrapped.
* This method assumes the
* <a href="{@docRoot}/resources/dictionary.html#defaultUpdateSeed">Default Update Seed</a>.
+ *
+ * <p>Note: Only certain set operators during stateful operations can be serialized and thus
+ * wrapped.</p>
+ *
* @param srcMem an image of a SetOperation where the image seed hash matches the default seed hash.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @return a SetOperation backed by the given Memory
@@ -151,7 +164,10 @@ public abstract class SetOperation {
/**
* Wrap takes the SetOperation image in Memory and refers to it directly.
* There is no data copying onto the java heap.
- * Only "Direct" SetOperations that have been explicitly stored as direct can be wrapped.
+ *
+ * <p>Note: Only certain set operators during stateful operations can be serialized and thus
+ * wrapped.</p>
+ *
* @param srcMem an image of a SetOperation where the hash of the given seed matches the image seed hash.
* <a href="{@docRoot}/resources/dictionary.html#mem">See Memory</a>
* @param seed <a href="{@docRoot}/resources/dictionary.html#seed">See Update Hash Seed</a>.
@@ -202,17 +218,16 @@ public abstract class SetOperation {
}
/**
- * Returns the maximum number of bytes for the returned CompactSketch, given the maximum
+ * Returns the maximum number of bytes for the returned CompactSketch, given the
* value of nomEntries of the first sketch A of AnotB.
- * @param maxNomEntries the given value must be a power of 2.
+ * @param nomEntries this value must be a power of 2.
* @return the maximum number of bytes.
*/
- public static int getMaxAnotBResultBytes(final int maxNomEntries) {
- final int ceil = ceilingPowerOf2(maxNomEntries);
+ public static int getMaxAnotBResultBytes(final int nomEntries) {
+ final int ceil = ceilingPowerOf2(nomEntries);
return 24 + (15 * ceil);
}
-
/**
* Gets the Family of this SetOperation
* @return the Family of this SetOperation
@@ -223,6 +238,10 @@ public abstract class SetOperation {
* Returns true if the backing resource of <i>this</i> is identical with the backing resource
* of <i>that</i>. The capacities must be the same. If <i>this</i> is a region,
* the region offset must also be the same.
+ *
+ * <p>Note: Only certain set operators during stateful operations can be serialized.
+ * Only when they are stored into Memory will this be relevant.</p>
+ *
* @param that A different non-null object
* @return true if the backing resource of <i>this</i> is the same as the backing resource
* of <i>that</i>.
@@ -231,45 +250,43 @@ public abstract class SetOperation {
//restricted
+ /**
+ * Gets the hash array in compact form.
+ * This is only useful during stateful operations.
+ * This should never be made public.
+ * @return the hash array
+ */
abstract long[] getCache();
- //intentionally not made public because behavior will be confusing to end user.
- abstract int getRetainedEntries(boolean valid);
+ /**
+ * Gets the current count of retained entries.
+ * This is only useful during stateful operations.
+ * Intentionally not made public because behavior will be confusing to end user.
+ *
+ * @return Gets the current count of retained entries.
+ */
+ abstract int getRetainedEntries();
+ /**
+ * Returns the seedHash established during class construction.
+ * @return the seedHash.
+ */
abstract short getSeedHash();
- abstract long getThetaLong();
-
- static short computeSeedHash(final long seed) {
- return Util.computeSeedHash(seed);
- }
-
- //intentionally not made public because behavior will be confusing to end user.
- abstract boolean isEmpty();
-
/**
- * Computes minimum lgArrLongs from a current count.
- * @param count the given current count
- * @return the minimum lgArrLongs from a current count.
+ * Gets the current value of ThetaLong.
+ * Only useful during stateful operations.
+ * Intentionally not made public because behavior will be confusing to end user.
+ * @return the current value of ThetaLong.
*/
- //Used by intersection and AnotB
- static final int computeMinLgArrLongsFromCount(final int count) {
- final int upperCount = (int) Math.ceil(count / REBUILD_THRESHOLD);
- final int arrLongs = max(ceilingPowerOf2(upperCount), 1 << MIN_LG_ARR_LONGS);
- final int newLgArrLongs = Integer.numberOfTrailingZeros(arrLongs);
- return newLgArrLongs;
- }
+ abstract long getThetaLong();
/**
- * Returns true if given Family id is one of the set operations
- * @param id the given Family id
- * @return true if given Family id is one of the set operations
+ * Returns true if this set operator is empty.
+ * Only useful during stateful operations.
+ * Intentionally not made public because behavior will be confusing to end user.
+ * @return true if this set operator is empty.
*/
- static boolean isValidSetOpID(final int id) {
- final Family family = Family.idToFamily(id);
- final boolean ret = ((family == Family.UNION) || (family == Family.INTERSECTION)
- || (family == Family.A_NOT_B));
- return ret;
- }
+ abstract boolean isEmpty();
}
diff --git a/src/main/java/org/apache/datasketches/theta/UnionImpl.java b/src/main/java/org/apache/datasketches/theta/UnionImpl.java
index a1b20bc..72515f6 100644
--- a/src/main/java/org/apache/datasketches/theta/UnionImpl.java
+++ b/src/main/java/org/apache/datasketches/theta/UnionImpl.java
@@ -22,6 +22,7 @@ package org.apache.datasketches.theta;
import static java.lang.Math.min;
import static org.apache.datasketches.QuickSelect.selectExcludingZeros;
import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
+import static org.apache.datasketches.Util.computeSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
@@ -477,8 +478,8 @@ final class UnionImpl extends Union {
}
@Override
- int getRetainedEntries(final boolean valid) {
- return gadget_.getRetainedEntries(valid);
+ int getRetainedEntries() {
+ return gadget_.getRetainedEntries(true);
}
@Override
diff --git a/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java b/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java
index c7cc1ef..5c61c01 100644
--- a/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java
+++ b/src/test/java/org/apache/datasketches/theta/AnotBimplTest.java
@@ -59,7 +59,7 @@ public class AnotBimplTest {
aNb.setA(usk1);
aNb.notB(usk2);
- assertEquals(aNb.getRetainedEntries(true), k/2);
+ assertEquals(aNb.getRetainedEntries(), k/2);
CompactSketch rsk1;
diff --git a/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java b/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java
index f9ede9e..062c190 100644
--- a/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java
+++ b/src/test/java/org/apache/datasketches/theta/DirectIntersectionTest.java
@@ -69,7 +69,7 @@ public class DirectIntersectionTest {
inter.intersect(usk1);
inter.intersect(usk2);
- long[] cache = inter.getCache();
+ long[] cache = inter.getCache(); //only applies to stateful
assertEquals(cache.length, 32);
CompactSketch rsk1;
@@ -454,7 +454,7 @@ public class DirectIntersectionTest {
@Test
public void checkWrapVirginEmpty() {
int lgK = 5;
- int k = 1<<lgK;
+ int k = 1 << lgK;
Intersection inter1, inter2;
UpdateSketch sk1;
@@ -484,7 +484,7 @@ public class DirectIntersectionTest {
//test the path via toByteArray, wrap, now in a different state
iMem = WritableMemory.wrap(inter1.toByteArray());
- inter2 = Sketches.wrapIntersection(iMem);
+ inter2 = Sketches.wrapIntersection((Memory)iMem);
assertTrue(inter2.hasResult()); //still true
//test the compaction path
diff --git a/src/test/java/org/apache/datasketches/theta/HeapIntersectionTest.java b/src/test/java/org/apache/datasketches/theta/HeapIntersectionTest.java
index 179e847..2f6fa8f 100644
--- a/src/test/java/org/apache/datasketches/theta/HeapIntersectionTest.java
+++ b/src/test/java/org/apache/datasketches/theta/HeapIntersectionTest.java
@@ -472,10 +472,10 @@ public class HeapIntersectionTest {
Intersection inter = Sketches.setOperationBuilder().buildIntersection();
inter.intersect(usk);
assertTrue(inter.isEmpty());
- assertEquals(inter.getRetainedEntries(true), 0);
+ assertEquals(inter.getRetainedEntries(), 0);
assertTrue(inter.getSeedHash() != 0);
assertEquals(inter.getThetaLong(), Long.MAX_VALUE);
- long[] longArr = inter.getCache();
+ long[] longArr = inter.getCache(); //only applies to stateful
assertEquals(longArr.length, 0);
}
@@ -486,10 +486,10 @@ public class HeapIntersectionTest {
Intersection inter = Sketches.setOperationBuilder().buildIntersection();
inter.intersect(usk);
assertFalse(inter.isEmpty());
- assertEquals(inter.getRetainedEntries(true), 1);
+ assertEquals(inter.getRetainedEntries(), 1);
assertTrue(inter.getSeedHash() != 0);
assertEquals(inter.getThetaLong(), Long.MAX_VALUE);
- long[] longArr = inter.getCache();
+ long[] longArr = inter.getCache(); //only applies to stateful
assertEquals(longArr.length, 32);
}
diff --git a/src/test/java/org/apache/datasketches/theta/SetOperationTest.java b/src/test/java/org/apache/datasketches/theta/SetOperationTest.java
index ca32df4..a599f40 100644
--- a/src/test/java/org/apache/datasketches/theta/SetOperationTest.java
+++ b/src/test/java/org/apache/datasketches/theta/SetOperationTest.java
@@ -19,11 +19,9 @@
package org.apache.datasketches.theta;
-import static org.apache.datasketches.Family.A_NOT_B;
-import static org.apache.datasketches.Family.INTERSECTION;
-import static org.apache.datasketches.Family.UNION;
+import static org.apache.datasketches.HashOperations.minLgHashTableSize;
import static org.apache.datasketches.ResizeFactor.X4;
-import static org.apache.datasketches.theta.SetOperation.computeMinLgArrLongsFromCount;
+import static org.apache.datasketches.Util.REBUILD_THRESHOLD;
import static org.apache.datasketches.theta.Sketch.getMaxUpdateSketchBytes;
import static org.testng.Assert.assertEquals;
//import static org.testng.Assert.assertTrue;
@@ -227,8 +225,8 @@ public class SetOperationTest {
@Test
public void checkComputeLgArrLongs() {
- assertEquals(computeMinLgArrLongsFromCount(30), 5);
- assertEquals(computeMinLgArrLongsFromCount(31), 6);
+ assertEquals(minLgHashTableSize(30, REBUILD_THRESHOLD), 5);
+ assertEquals(minLgHashTableSize(31, REBUILD_THRESHOLD), 6);
}
/**
@@ -280,14 +278,6 @@ public class SetOperationTest {
}
@Test
- public void checkValidSetOpID() {
- assertFalse(SetOperation.isValidSetOpID(1)); //Alpha
- assertTrue(SetOperation.isValidSetOpID(UNION.getID()));
- assertTrue(SetOperation.isValidSetOpID(INTERSECTION.getID()));
- assertTrue(SetOperation.isValidSetOpID(A_NOT_B.getID()));
- }
-
- @Test
public void setOpsExample() {
println("Set Operations Example:");
int k = 4096;
diff --git a/src/test/java/org/apache/datasketches/theta/UnionImplTest.java b/src/test/java/org/apache/datasketches/theta/UnionImplTest.java
index dbb359d..c8e8817 100644
--- a/src/test/java/org/apache/datasketches/theta/UnionImplTest.java
+++ b/src/test/java/org/apache/datasketches/theta/UnionImplTest.java
@@ -193,8 +193,8 @@ public class UnionImplTest {
assertTrue(union.isEmpty());
assertEquals(union.getThetaLong(), Long.MAX_VALUE);
assertEquals(union.getSeedHash(), Util.computeSeedHash(DEFAULT_UPDATE_SEED));
- assertEquals(union.getRetainedEntries(true), 0);
- assertEquals(union.getCache().length, 128);
+ assertEquals(union.getRetainedEntries(), 0);
+ assertEquals(union.getCache().length, 128); //only applies to stateful
}
@Test
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org