You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/07/07 23:37:12 UTC
[incubator-datasketches-java] 09/10: Close to final set of changes.
This is an automated email from the ASF dual-hosted git repository.
leerho pushed a commit to branch Refactor_Theta_Tuple
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git
commit e3f4ad2dee4d75356f587a4143634ac862c3493e
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Tue Jul 7 14:38:28 2020 -0700
Close to final set of changes.
---
.../org/apache/datasketches/HashOperations.java | 193 +++++++++++----------
.../theta/DirectQuickSelectSketch.java | 2 +-
.../theta/DirectQuickSelectSketchR.java | 18 +-
.../datasketches/theta/HeapQuickSelectSketch.java | 13 +-
.../datasketches/theta/HeapUpdateSketch.java | 12 +-
.../datasketches/theta/IntersectionImpl.java | 2 +-
.../java/org/apache/datasketches/theta/Sketch.java | 19 +-
.../apache/datasketches/theta/UpdateSketch.java | 39 ++---
.../datasketches/tuple/QuickSelectSketch.java | 39 +++--
.../DirectArrayOfDoublesQuickSelectSketch.java | 6 +-
.../apache/datasketches/HashOperationsTest.java | 23 +--
11 files changed, 194 insertions(+), 172 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/HashOperations.java b/src/main/java/org/apache/datasketches/HashOperations.java
index 47bf16a..9067b79 100644
--- a/src/main/java/org/apache/datasketches/HashOperations.java
+++ b/src/main/java/org/apache/datasketches/HashOperations.java
@@ -33,7 +33,6 @@ import org.apache.datasketches.memory.WritableMemory;
*/
public final class HashOperations {
private static final int STRIDE_HASH_BITS = 7;
-
private static final int EMPTY = 0;
/**
@@ -43,47 +42,8 @@ public final class HashOperations {
private HashOperations() {}
- /**
- * Counts the cardinality of the first Log2 values of the given source array.
- * @param srcArr the given source array
- * @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>
- * @param thetaLong <a href="{@docRoot}/resources/dictionary.html#thetaLong">See Theta Long</a>
- * @return the cardinality
- */
- public static int countPart(final long[] srcArr, final int lgArrLongs, final long thetaLong) {
- int cnt = 0;
- final int len = 1 << lgArrLongs;
- for (int i = len; i-- > 0;) {
- final long hash = srcArr[i];
- if (continueCondition(thetaLong, hash) ) {
- continue;
- }
- cnt++ ;
- }
- return cnt;
- }
-
- /**
- * Counts the cardinality of the given source array.
- * @param srcArr the given source array
- * @param thetaLong <a href="{@docRoot}/resources/dictionary.html#thetaLong">See Theta Long</a>
- * @return the cardinality
- */
- public static int count(final long[] srcArr, final long thetaLong) {
- int cnt = 0;
- final int len = srcArr.length;
- for (int i = len; i-- > 0;) {
- final long hash = srcArr[i];
- if (continueCondition(thetaLong, hash) ) {
- continue;
- }
- cnt++ ;
- }
- return cnt;
- }
-
- // make odd and independent of index assuming lgArrLongs lowest bits of the hash were used for
- // index
+ //Make odd and independent of index assuming lgArrLongs lowest bits of the hash were used for
+ // index. This results in a 8 bit value that is always odd.
private static int getStride(final long hash, final int lgArrLongs) {
return (2 * (int) ((hash >>> lgArrLongs) & STRIDE_MASK) ) + 1;
}
@@ -91,18 +51,18 @@ public final class HashOperations {
//ON-HEAP
/**
- * This is a classical Knuth-style Open Addressing, Double Hash search scheme for on-heap.
+ * This is a classical Knuth-style Open Addressing, Double Hash (OADH) search scheme for on-heap.
* Returns the index if found, -1 if not found.
*
- * @param hashTable The hash table to search. Must be a power of 2 in size.
- * @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
- * lgArrLongs ≤ log2(hashTable.length).
- * @param hash A hash value to search for. Must not be zero.
+ * @param hashTable The hash table to search. Its size must be a power of 2.
+ * @param lgArrLongs The log_base2(hashTable.length).
+ * <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
+ * @param hash The hash value to search for. It must not be zero.
* @return Current probe index if found, -1 if not found.
*/
public static int hashSearch(final long[] hashTable, final int lgArrLongs, final long hash) {
if (hash == 0) {
- throw new SketchesArgumentException("Given hash cannot be zero: " + hash);
+ throw new SketchesArgumentException("Given hash must not be zero: " + hash);
}
final int arrayMask = (1 << lgArrLongs) - 1; // current Size -1
final int stride = getStride(hash, lgArrLongs);
@@ -123,16 +83,16 @@ public final class HashOperations {
}
/**
- * This is a classical Knuth-style Open Addressing, Double Hash insert scheme for on-heap.
+ * This is a classical Knuth-style Open Addressing, Double Hash (OADH) insert scheme for on-heap.
* This method assumes that the input hash is not a duplicate.
* Useful for rebuilding tables to avoid unnecessary comparisons.
- * Returns the index of insertion, which is always positive or zero. Throws an exception if the
- * table is full with no empty slot.
+ * Returns the index of insertion, which is always positive or zero.
+ * Throws an exception if the table has no empty slot.
*
- * @param hashTable the hash table to insert into.
- * @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
- * lgArrLongs ≤ log2(hashTable.length).
- * @param hash value that must not be zero and will be inserted into the array into an empty slot.
+ * @param hashTable the hash table to insert into. Its size must be a power of 2.
+ * @param lgArrLongs The log_base2(hashTable.length).
+ * <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
+ * @param hash The hash value to be potentially inserted into an empty slot. It must not be zero.
* @return index of insertion. Always positive or zero.
*/
public static int hashInsertOnly(final long[] hashTable, final int lgArrLongs, final long hash) {
@@ -153,15 +113,15 @@ public final class HashOperations {
}
/**
- * This is a classical Knuth-style Open Addressing, Double Hash insert scheme for on-heap.
+ * This is a classical Knuth-style Open Addressing, Double Hash (OADH) insert scheme for on-heap.
* Returns index ≥ 0 if found (duplicate); < 0 if inserted, inserted at -(index + 1).
* Throws an exception if the value is not found and table has no empty slot.
*
- * @param hashTable the hash table to insert into.
- * @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
- * lgArrLongs ≤ log2(hashTable.length).
- * @param hash hash value that must not be zero and if not a duplicate will be inserted into the
- * array into an empty slot
+ * @param hashTable The hash table to insert into. Its size must be a power of 2.
+ * @param lgArrLongs The log_base2(hashTable.length).
+ * <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
+ * @param hash The hash value to be potentially inserted into an empty slot only if it is not
+ * a duplicate of any other hash value in the table. It must not be zero.
* @return index ≥ 0 if found (duplicate); < 0 if inserted, inserted at -(index + 1).
*/
public static int hashSearchOrInsert(final long[] hashTable, final int lgArrLongs,
@@ -182,22 +142,24 @@ public final class HashOperations {
}
curProbe = (curProbe + stride) & arrayMask;
} while (curProbe != loopIndex);
- throw new SketchesArgumentException("Key not found and no empty slots!");
+ throw new SketchesArgumentException("Hash not found and no empty slots!");
}
/**
- * Inserts the given long array into the given hash table array of the target size,
- * removes any negative input values, ignores duplicates and counts the values inserted.
+ * Inserts the given long array into the given OADH hashTable of the target size,
+ * ignores duplicates and counts the values inserted.
+ * The hash values must not be negative, zero values and values ≥ thetaLong are ignored.
* The given hash table may have values, but they must have been inserted by this method or one
- * of the other OADH insert methods in this class and they may not be dirty.
+ * of the other OADH insert methods in this class.
* This method performs additional checks against potentially invalid hash values or theta values.
* Returns the count of values actually inserted.
*
* @param srcArr the source hash array to be potentially inserted
- * @param hashTable The correctly sized target hash table that must be a power of two.
- * @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
- * lgArrLongs ≤ log2(hashTable.length).
- * @param thetaLong must greater than zero
+ * @param hashTable The hash table to insert into. Its size must be a power of 2.
+ * @param lgArrLongs The log_base2(hashTable.length).
+ * <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
+ * @param thetaLong The theta value that all input hash values are compared against.
+ * It must greater than zero.
* <a href="{@docRoot}/resources/dictionary.html#thetaLong">See Theta Long</a>
* @return the count of values actually inserted
*/
@@ -219,21 +181,25 @@ public final class HashOperations {
return count;
}
- //OFF-HEAP (these are kept for backward compatibility)
+ //With Memory or WritableMemory
/**
- * This is a classical Knuth-style Open Addressing, Double Hash search scheme for off-heap.
+ * This is a classical Knuth-style Open Addressing, Double Hash (OADH) search scheme for Memory.
* Returns the index if found, -1 if not found.
*
- * @param mem The Memory hash table to search.
- * @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
- * lgArrLongs ≤ log2(hashTable.length).
- * @param hash A hash value to search for. Must not be zero.
- * @param memOffsetBytes offset in the memory where the hash array starts
- * @return index if found, -1 if not found.
+ * @param mem The <i>Memory</i> containing the hash table to search.
+ * The hash table portion must be a power of 2 in size.
+ * @param lgArrLongs The log_base2(hashTable.length).
+ * <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
+ * @param hash The hash value to search for. Must not be zero.
+ * @param memOffsetBytes offset in the memory where the hashTable starts
+ * @return Current probe index if found, -1 if not found.
*/
- public static int hashSearch(final Memory mem, final int lgArrLongs, final long hash,
+ public static int hashSearchMemory(final Memory mem, final int lgArrLongs, final long hash,
final int memOffsetBytes) {
+ if (hash == 0) {
+ throw new SketchesArgumentException("Given hash must not be zero: " + hash);
+ }
final int arrayMask = (1 << lgArrLongs) - 1;
final int stride = getStride(hash, lgArrLongs);
int curProbe = (int) (hash & arrayMask);
@@ -249,21 +215,21 @@ public final class HashOperations {
}
/**
- * This is a classical Knuth-style Open Addressing, Double Hash insert scheme, but inserts
- * values directly into a Memory.
+ * This is a classical Knuth-style Open Addressing, Double Hash (OADH) insert scheme for Memory.
* This method assumes that the input hash is not a duplicate.
* Useful for rebuilding tables to avoid unnecessary comparisons.
* Returns the index of insertion, which is always positive or zero.
* Throws an exception if table has no empty slot.
*
- * @param wmem The writable memory
- * @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
- * lgArrLongs ≤ log2(hashTable.length).
+ * @param wmem The <i>WritableMemory</i> that contains the hashTable to insert into.
+ * The size of the hashTable portion must be a power of 2.
+ * @param lgArrLongs The log_base2(hashTable.length.
+ * <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
* @param hash value that must not be zero and will be inserted into the array into an empty slot.
- * @param memOffsetBytes offset in the memory where the hash array starts
+ * @param memOffsetBytes offset in the <i>WritableMemory</i> where the hashTable starts
* @return index of insertion. Always positive or zero.
*/
- public static int fastHashInsertOnly(final WritableMemory wmem, final int lgArrLongs,
+ public static int hashInsertOnlyMemory(final WritableMemory wmem, final int lgArrLongs,
final long hash, final int memOffsetBytes) {
final int arrayMask = (1 << lgArrLongs) - 1; // current Size -1
final int stride = getStride(hash, lgArrLongs);
@@ -282,23 +248,21 @@ public final class HashOperations {
throw new SketchesArgumentException("No empty slot in table!");
}
- //FAST OFF-HEAP
-
/**
* This is a classical Knuth-style Open Addressing, Double Hash insert scheme, but inserts
* values directly into a Memory.
* Returns index ≥ 0 if found (duplicate); < 0 if inserted, inserted at -(index + 1).
* Throws an exception if the value is not found and table has no empty slot.
*
- * @param wmem the WritableMemory
- * @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
- * lgArrLongs ≤ log2(hashTable.length).
- * @param hash A hash value that must not be zero and if not a duplicate will be inserted into the
- * array into an empty slot.
- * @param memOffsetBytes offset in the memory where the hash array starts
+ * @param wmem The <i>WritableMemory</i> that contains the hashTable to insert into.
+ * @param lgArrLongs The log_base2(hashTable.length).
+ * <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>.
+ * @param hash The hash value to be potentially inserted into an empty slot only if it is not
+ * a duplicate of any other hash value in the table. It must not be zero.
+ * @param memOffsetBytes offset in the <i>WritableMemory</i> where the hash array starts
* @return index ≥ 0 if found (duplicate); < 0 if inserted, inserted at -(index + 1).
*/
- public static int fastHashSearchOrInsert(final WritableMemory wmem, final int lgArrLongs,
+ public static int hashSearchOrInsertMemory(final WritableMemory wmem, final int lgArrLongs,
final long hash, final int memOffsetBytes) {
final int arrayMask = (1 << lgArrLongs) - 1; // current Size -1
final int stride = getStride(hash, lgArrLongs);
@@ -318,6 +282,8 @@ public final class HashOperations {
throw new SketchesArgumentException("Key not found and no empty slot in table!");
}
+ //Other related methods
+
/**
* @param thetaLong must be greater than zero otherwise throws an exception.
* <a href="{@docRoot}/resources/dictionary.html#thetaLong">See Theta Long</a>
@@ -378,4 +344,43 @@ public final class HashOperations {
return hashTable;
}
+ /**
+ * Counts the cardinality of the first Log2 values of the given source array.
+ * @param srcArr the given source array
+ * @param lgArrLongs <a href="{@docRoot}/resources/dictionary.html#lgArrLongs">See lgArrLongs</a>
+ * @param thetaLong <a href="{@docRoot}/resources/dictionary.html#thetaLong">See Theta Long</a>
+ * @return the cardinality
+ */
+ public static int countPart(final long[] srcArr, final int lgArrLongs, final long thetaLong) {
+ int cnt = 0;
+ final int len = 1 << lgArrLongs;
+ for (int i = len; i-- > 0;) {
+ final long hash = srcArr[i];
+ if (continueCondition(thetaLong, hash) ) {
+ continue;
+ }
+ cnt++ ;
+ }
+ return cnt;
+ }
+
+ /**
+ * Counts the cardinality of the given source array.
+ * @param srcArr the given source array
+ * @param thetaLong <a href="{@docRoot}/resources/dictionary.html#thetaLong">See Theta Long</a>
+ * @return the cardinality
+ */
+ public static int count(final long[] srcArr, final long thetaLong) {
+ int cnt = 0;
+ final int len = srcArr.length;
+ for (int i = len; i-- > 0;) {
+ final long hash = srcArr[i];
+ if (continueCondition(thetaLong, hash) ) {
+ continue;
+ }
+ cnt++ ;
+ }
+ return cnt;
+ }
+
}
diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java
index dc34bc5..df24c11 100644
--- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java
@@ -260,7 +260,7 @@ class DirectQuickSelectSketch extends DirectQuickSelectSketchR {
//The duplicate test
final int index =
- HashOperations.fastHashSearchOrInsert(wmem_, lgArrLongs, hash, preambleLongs << 3);
+ HashOperations.hashSearchOrInsertMemory(wmem_, lgArrLongs, hash, preambleLongs << 3);
if (index >= 0) {
return RejectedDuplicate; //Duplicate, not inserted
}
diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java
index d5290b1..d79b71b 100644
--- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java
+++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java
@@ -188,11 +188,21 @@ class DirectQuickSelectSketchR extends UpdateSketch {
}
@Override
+ float getP() {
+ return wmem_.getFloat(P_FLOAT);
+ }
+
+ @Override
public ResizeFactor getResizeFactor() {
return ResizeFactor.getRF(getLgRF());
}
@Override
+ long getSeed() {
+ return seed_;
+ }
+
+ @Override
public UpdateSketch rebuild() {
throw new SketchesReadOnlyException();
}
@@ -229,15 +239,7 @@ class DirectQuickSelectSketchR extends UpdateSketch {
return wmem_;
}
- @Override
- float getP() {
- return wmem_.getFloat(P_FLOAT);
- }
- @Override
- long getSeed() {
- return seed_;
- }
@Override
short getSeedHash() {
diff --git a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
index b3dcf6b..14197aa 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
@@ -280,14 +280,15 @@ class HeapQuickSelectSketch extends HeapUpdateSketch {
return numEntries > hashTableThreshold_;
}
- //Must resize. Changes lgArrLongs_ and cache_. theta and count don't change.
+ //Must resize. Changes lgArrLongs_, cache_, hashTableThreshold;
+ // theta and count don't change.
// Used by hashUpdate()
private final void resizeCache() {
final ResizeFactor rf = getResizeFactor();
- final int lgTgtLongs = lgNomLongs_ + 1;
- final int lgDeltaLongs = lgTgtLongs - lgArrLongs_;
+ final int lgMaxArrLongs = lgNomLongs_ + 1;
+ final int lgDeltaLongs = lgMaxArrLongs - lgArrLongs_;
final int lgResizeFactor = max(min(rf.lg(), lgDeltaLongs), 1); //rf_.lg() could be 0
- lgArrLongs_ += lgResizeFactor; // new tgt size
+ lgArrLongs_ += lgResizeFactor; // new arr size
final long[] tgtArr = new long[1 << lgArrLongs_];
final int newCount = HashOperations.hashArrayInsert(cache_, tgtArr, lgArrLongs_, thetaLong_);
@@ -301,9 +302,9 @@ class HeapQuickSelectSketch extends HeapUpdateSketch {
//array stays the same size. Changes theta and thus count
private final void quickSelectAndRebuild() {
- final int arrLongs = 1 << lgArrLongs_;
+ final int arrLongs = 1 << lgArrLongs_; // generally 2 * k,
- final int pivot = (1 << lgNomLongs_) + 1; // pivot for QS
+ final int pivot = (1 << lgNomLongs_) + 1; // pivot for QS = k + 1
thetaLong_ = selectExcludingZeros(cache_, curCount_, pivot); //messes up the cache_
diff --git a/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java b/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java
index d62e88d..2bc2915 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapUpdateSketch.java
@@ -85,15 +85,13 @@ abstract class HeapUpdateSketch extends UpdateSketch {
}
@Override
- public ResizeFactor getResizeFactor() {
- return rf_;
+ float getP() {
+ return p_;
}
- //restricted methods
-
@Override
- float getP() {
- return p_;
+ public ResizeFactor getResizeFactor() {
+ return rf_;
}
@Override
@@ -101,6 +99,8 @@ abstract class HeapUpdateSketch extends UpdateSketch {
return seed_;
}
+ //restricted methods
+
@Override
short getSeedHash() {
return Util.computeSeedHash(getSeed());
diff --git a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
index d5478e4..611130d 100644
--- a/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
+++ b/src/main/java/org/apache/datasketches/theta/IntersectionImpl.java
@@ -359,7 +359,7 @@ final class IntersectionImpl extends IntersectionImplR {
for (int i = 0; i < arrLongsIn; i++ ) {
final long hashIn = arr[i];
if (HashOperations.continueCondition(thetaLong, hashIn)) { continue; }
- HashOperations.fastHashInsertOnly(mem_, lgArrLongs, hashIn, preBytes);
+ HashOperations.hashInsertOnlyMemory(mem_, lgArrLongs, hashIn, preBytes);
tmpCnt++;
}
} else { //On Heap. Assumes HT exists and is large enough
diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java
index d65c5cb..0075b82 100644
--- a/src/main/java/org/apache/datasketches/theta/Sketch.java
+++ b/src/main/java/org/apache/datasketches/theta/Sketch.java
@@ -280,14 +280,6 @@ public abstract class Sketch {
public abstract Family getFamily();
/**
- * Returns a HashIterator that can be used to iterate over the retained hash values of the
- * Theta sketch.
- * @return a HashIterator that can be used to iterate over the retained hash values of the
- * Theta sketch.
- */
- public abstract HashIterator iterator();
-
- /**
* Gets the approximate lower error bound given the specified number of Standard Deviations.
* This will return getEstimate() if isEmpty() is true.
*
@@ -437,6 +429,14 @@ public abstract class Sketch {
}
/**
+ * Returns a HashIterator that can be used to iterate over the retained hash values of the
+ * Theta sketch.
+ * @return a HashIterator that can be used to iterate over the retained hash values of the
+ * Theta sketch.
+ */
+ public abstract HashIterator iterator();
+
+ /**
* Serialize this sketch to a byte array form.
* @return byte array of this sketch
*/
@@ -568,7 +568,7 @@ public abstract class Sketch {
/**
* Gets the internal cache array. For on-heap sketches this will return a reference to the actual
- * cache array. For off-heap sketches this returns a copy.
+ * cache array. For Memory-based sketches this returns a copy.
* @return the internal cache array.
*/
abstract long[] getCache();
@@ -633,7 +633,6 @@ public abstract class Sketch {
}
}
-
static final double estimate(final long thetaLong, final int curCount) {
return curCount * (LONG_MAX_VALUE_AS_DOUBLE / thetaLong);
}
diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
index c28c983..740b03e 100644
--- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
@@ -167,7 +167,6 @@ public abstract class UpdateSketch extends Sketch {
/**
* Returns a new builder
- *
* @return a new builder
*/
public static final UpdateSketchBuilder builder() {
@@ -175,6 +174,25 @@ public abstract class UpdateSketch extends Sketch {
}
/**
+ * Returns the configured ResizeFactor
+ * @return the configured ResizeFactor
+ */
+ public abstract ResizeFactor getResizeFactor();
+
+ /**
+ * Gets the configured sampling probability, <i>p</i>.
+ * <a href="{@docRoot}/resources/dictionary.html#p">See Sampling Probability, <i>p</i></a>
+ * @return the sampling probability, <i>p</i>
+ */
+ abstract float getP();
+
+ /**
+ * Gets the configured seed
+ * @return the configured seed
+ */
+ abstract long getSeed();
+
+ /**
* Resets this sketch back to a virgin empty state.
*/
public abstract void reset();
@@ -187,12 +205,6 @@ public abstract class UpdateSketch extends Sketch {
public abstract UpdateSketch rebuild();
/**
- * Returns the configured ResizeFactor
- * @return the configured ResizeFactor
- */
- public abstract ResizeFactor getResizeFactor();
-
- /**
* Present this sketch with a long.
*
* @param datum The given long datum.
@@ -330,19 +342,6 @@ public abstract class UpdateSketch extends Sketch {
public abstract int getLgNomLongs();
/**
- * Gets the configured sampling probability, <i>p</i>.
- * <a href="{@docRoot}/resources/dictionary.html#p">See Sampling Probability, <i>p</i></a>
- * @return the sampling probability, <i>p</i>
- */
- abstract float getP();
-
- /**
- * Gets the configured seed
- * @return the configured seed
- */
- abstract long getSeed();
-
- /**
* Returns true if the internal cache contains "dirty" values that are greater than or equal
* to thetaLong.
* @return true if the internal cache is dirty.
diff --git a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java
index 26abe61..e6fb943 100644
--- a/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java
+++ b/src/main/java/org/apache/datasketches/tuple/QuickSelectSketch.java
@@ -41,7 +41,6 @@ import org.apache.datasketches.memory.Memory;
* @param <S> type of Summary
*/
class QuickSelectSketch<S extends Summary> extends Sketch<S> {
- //private static final byte serialVersionWithSummaryFactoryUID = 1;
private static final byte serialVersionUID = 2;
private enum Flags { IS_BIG_ENDIAN, IS_IN_SAMPLING_MODE, IS_EMPTY, HAS_ENTRIES, IS_THETA_INCLUDED }
@@ -63,7 +62,9 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
* given value.
* @param summaryFactory An instance of a SummaryFactory.
*/
- QuickSelectSketch(final int nomEntries, final SummaryFactory<S> summaryFactory) {
+ QuickSelectSketch(
+ final int nomEntries,
+ final SummaryFactory<S> summaryFactory) {
this(nomEntries, DEFAULT_LG_RESIZE_FACTOR, summaryFactory);
}
@@ -80,7 +81,9 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
* </pre>
* @param summaryFactory An instance of a SummaryFactory.
*/
- QuickSelectSketch(final int nomEntries, final int lgResizeFactor,
+ QuickSelectSketch(
+ final int nomEntries,
+ final int lgResizeFactor,
final SummaryFactory<S> summaryFactory) {
this(nomEntries, lgResizeFactor, 1f, summaryFactory);
}
@@ -100,7 +103,10 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
* @param samplingProbability the given sampling probability
* @param summaryFactory An instance of a SummaryFactory.
*/
- QuickSelectSketch(final int nomEntries, final int lgResizeFactor, final float samplingProbability,
+ QuickSelectSketch(
+ final int nomEntries,
+ final int lgResizeFactor,
+ final float samplingProbability,
final SummaryFactory<S> summaryFactory) {
this(
nomEntries,
@@ -111,8 +117,12 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
);
}
- QuickSelectSketch(final int nomEntries, final int lgResizeFactor, final float samplingProbability,
- final SummaryFactory<S> summaryFactory, final int startingSize) {
+ QuickSelectSketch(
+ final int nomEntries,
+ final int lgResizeFactor,
+ final float samplingProbability,
+ final SummaryFactory<S> summaryFactory,
+ final int startingSize) {
nomEntries_ = ceilingPowerOf2(nomEntries);
lgResizeFactor_ = lgResizeFactor;
samplingProbability_ = samplingProbability;
@@ -130,7 +140,9 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
* @param deserializer the SummaryDeserializer
* @param summaryFactory the SummaryFactory
*/
- QuickSelectSketch(final Memory mem, final SummaryDeserializer<S> deserializer,
+ QuickSelectSketch(
+ final Memory mem,
+ final SummaryDeserializer<S> deserializer,
final SummaryFactory<S> summaryFactory) {
summaryFactory_ = summaryFactory;
int offset = 0;
@@ -202,7 +214,6 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
return summaryTable_;
}
-
/**
* Get configured nominal number of entries
* @return nominal number of entries
@@ -249,7 +260,7 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
public void trim() {
if (count_ > nomEntries_) {
updateTheta();
- rebuild(hashTable_.length);
+ resize(hashTable_.length);
}
}
@@ -427,13 +438,13 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
updateTheta();
rebuild();
} else {
- rebuild(hashTable_.length * (1 << lgResizeFactor_));
+ resize(hashTable_.length * (1 << lgResizeFactor_));
}
return true;
}
void rebuild() {
- rebuild(hashTable_.length);
+ resize(hashTable_.length);
}
void insert(final long hash, final S summary) {
@@ -446,6 +457,10 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
private void updateTheta() {
final long[] hashArr = new long[count_];
int i = 0;
+ //Because of the association of the hashTable with the summaryTable we cannot destroy the
+ // hashTable structure. So we must copy. May as well compact at the same time.
+ // Might consider a whole table clone and use the selectExcludingZeros method instead.
+ // Not sure if there would be any speed advantage.
for (int j = 0; j < hashTable_.length; j++) {
if (summaryTable_[j] != null) {
hashArr[i++] = hashTable_[j];
@@ -455,7 +470,7 @@ class QuickSelectSketch<S extends Summary> extends Sketch<S> {
}
@SuppressWarnings({"unchecked"})
- private void rebuild(final int newSize) {
+ private void resize(final int newSize) {
final long[] oldHashTable = hashTable_;
final S[] oldSummaryTable = summaryTable_;
final Class<S> summaryType = (Class<S>) summaryTable_.getClass().getComponentType();
diff --git a/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java b/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java
index c47dd1a..5c4e4b4 100644
--- a/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java
+++ b/src/main/java/org/apache/datasketches/tuple/arrayofdoubles/DirectArrayOfDoublesQuickSelectSketch.java
@@ -294,17 +294,17 @@ class DirectArrayOfDoublesQuickSelectSketch extends ArrayOfDoublesQuickSelectSke
@Override
protected int insertKey(final long key) {
- return HashOperations.fastHashInsertOnly(mem_, lgCurrentCapacity_, key, ENTRIES_START);
+ return HashOperations.hashInsertOnlyMemory(mem_, lgCurrentCapacity_, key, ENTRIES_START);
}
@Override
protected int findOrInsertKey(final long key) {
- return HashOperations.fastHashSearchOrInsert(mem_, lgCurrentCapacity_, key, ENTRIES_START);
+ return HashOperations.hashSearchOrInsertMemory(mem_, lgCurrentCapacity_, key, ENTRIES_START);
}
@Override
protected double[] find(final long key) {
- final int index = HashOperations.hashSearch(mem_, lgCurrentCapacity_, key, ENTRIES_START);
+ final int index = HashOperations.hashSearchMemory(mem_, lgCurrentCapacity_, key, ENTRIES_START);
if (index == -1) { return null; }
final double[] array = new double[numValues_];
mem_.getDoubleArray(valuesOffset_ + ((long) SIZE_OF_VALUE_BYTES * numValues_ * index),
diff --git a/src/test/java/org/apache/datasketches/HashOperationsTest.java b/src/test/java/org/apache/datasketches/HashOperationsTest.java
index 00c34fa..f7495aa 100644
--- a/src/test/java/org/apache/datasketches/HashOperationsTest.java
+++ b/src/test/java/org/apache/datasketches/HashOperationsTest.java
@@ -22,11 +22,12 @@ package org.apache.datasketches;
import static org.apache.datasketches.HashOperations.checkHashCorruption;
import static org.apache.datasketches.HashOperations.checkThetaCorruption;
import static org.apache.datasketches.HashOperations.continueCondition;
-import static org.apache.datasketches.HashOperations.fastHashInsertOnly;
-import static org.apache.datasketches.HashOperations.fastHashSearchOrInsert;
+import static org.apache.datasketches.HashOperations.hashInsertOnlyMemory;
+import static org.apache.datasketches.HashOperations.hashSearchOrInsertMemory;
import static org.apache.datasketches.HashOperations.hashArrayInsert;
import static org.apache.datasketches.HashOperations.hashInsertOnly;
import static org.apache.datasketches.HashOperations.hashSearch;
+import static org.apache.datasketches.HashOperations.hashSearchMemory;
import static org.apache.datasketches.HashOperations.hashSearchOrInsert;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import static org.testng.Assert.assertEquals;
@@ -106,7 +107,7 @@ public class HashOperationsTest {
public void testHashInsertOnlyMemoryNoStride() {
final long[] table = new long[32];
final WritableMemory mem = WritableMemory.wrap(table);
- final int index = fastHashInsertOnly(mem, 5, 1, 0);
+ final int index = hashInsertOnlyMemory(mem, 5, 1, 0);
assertEquals(index, 1);
assertEquals(table[1], 1L);
}
@@ -116,7 +117,7 @@ public class HashOperationsTest {
final long[] table = new long[32];
table[1] = 1;
final WritableMemory mem = WritableMemory.wrap(table);
- final int index = fastHashInsertOnly(mem, 5, 1, 0);
+ final int index = hashInsertOnlyMemory(mem, 5, 1, 0);
assertEquals(index, 2);
assertEquals(table[2], 1L);
}
@@ -152,22 +153,22 @@ public class HashOperationsTest {
final long[] table = new long[32];
final WritableMemory mem = WritableMemory.wrap(table);
for (int i = 1; i <= 32; ++i) {
- fastHashInsertOnly(mem, 5, i, 0);
+ hashInsertOnlyMemory(mem, 5, i, 0);
}
// table full; search returns not found, others throw exception
- final int retVal = hashSearch(mem, 5, 33, 0);
+ final int retVal = hashSearchMemory(mem, 5, 33, 0);
assertEquals(retVal, -1);
try {
- fastHashInsertOnly(mem, 5, 33, 0);
+ hashInsertOnlyMemory(mem, 5, 33, 0);
fail();
} catch (final SketchesArgumentException e) {
// expected
}
try {
- fastHashSearchOrInsert(mem, 5, 33, 0);
+ hashSearchOrInsertMemory(mem, 5, 33, 0);
fail();
} catch (final SketchesArgumentException e) {
// expected
@@ -180,19 +181,19 @@ public class HashOperationsTest {
final WritableMemory wmem = WritableMemory.wrap(table);
for (int i = 1; i <= 32; ++i) {
- fastHashInsertOnly(wmem, 5, i, 0);
+ hashInsertOnlyMemory(wmem, 5, i, 0);
}
// table full; throws exception
try {
- fastHashInsertOnly(wmem, 5, 33, 0);
+ hashInsertOnlyMemory(wmem, 5, 33, 0);
fail();
} catch (final SketchesArgumentException e) {
// expected
}
try {
- fastHashSearchOrInsert(wmem, 5, 33, 0);
+ hashSearchOrInsertMemory(wmem, 5, 33, 0);
fail();
} catch (final SketchesArgumentException e) {
// expected
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org