You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/06/09 05:36:05 UTC
[incubator-datasketches-java] 01/01: Improvements in Theta Sketch
to (hopefully) eliminate the misinterpretation of SingleItemSketches.
This is an automated email from the ASF dual-hosted git repository.
leerho pushed a commit to branch FixSingleItem
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git
commit d90ab751dca0a73bad335fa45e81c0c534d3b3a3
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Mon Jun 8 22:35:36 2020 -0700
Improvements in Theta Sketch to (hopefully) eliminate the
misinterpretation of SingleItemSketches.
We should try to test this in the field.
---
.../theta/DirectCompactOrderedSketch.java | 8 +-
.../theta/DirectCompactUnorderedSketch.java | 8 +-
.../datasketches/theta/ForwardCompatibility.java | 9 +--
.../theta/HeapCompactOrderedSketch.java | 9 +--
.../theta/HeapCompactUnorderedSketch.java | 9 +--
.../apache/datasketches/theta/PreambleUtil.java | 58 +++++++++++----
.../datasketches/theta/SingleItemSketch.java | 85 +++++++++-------------
.../java/org/apache/datasketches/theta/Sketch.java | 23 +++---
.../org/apache/datasketches/theta/UnionImpl.java | 33 +++++----
.../apache/datasketches/theta/UpdateSketch.java | 4 +-
.../datasketches/theta/CompactSketchTest.java | 13 ++--
.../org/apache/datasketches/theta/EmptyTest.java | 45 ------------
.../datasketches/theta/SingleItemSketchTest.java | 46 ++++++++----
13 files changed, 157 insertions(+), 193 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactOrderedSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactOrderedSketch.java
index b68bc08..6a88213 100644
--- a/src/main/java/org/apache/datasketches/theta/DirectCompactOrderedSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/DirectCompactOrderedSketch.java
@@ -19,13 +19,11 @@
package org.apache.datasketches.theta;
-import static org.apache.datasketches.Util.checkSeedHashes;
-import static org.apache.datasketches.Util.computeSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.SEED_HASH_SHORT;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;
@@ -54,9 +52,7 @@ final class DirectCompactOrderedSketch extends DirectCompactSketch {
* @return this sketch
*/
static DirectCompactOrderedSketch wrapInstance(final Memory srcMem, final long seed) {
- final short memSeedHash = srcMem.getShort(SEED_HASH_SHORT);
- final short computedSeedHash = computeSeedHash(seed);
- checkSeedHashes(memSeedHash, computedSeedHash);
+ checkMemorySeedHash(srcMem, seed);
return new DirectCompactOrderedSketch(srcMem);
}
diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactUnorderedSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactUnorderedSketch.java
index 9ea6965..eb9d425 100644
--- a/src/main/java/org/apache/datasketches/theta/DirectCompactUnorderedSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/DirectCompactUnorderedSketch.java
@@ -19,12 +19,10 @@
package org.apache.datasketches.theta;
-import static org.apache.datasketches.Util.checkSeedHashes;
-import static org.apache.datasketches.Util.computeSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.SEED_HASH_SHORT;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;
@@ -53,9 +51,7 @@ final class DirectCompactUnorderedSketch extends DirectCompactSketch {
* @return this sketch
*/
static DirectCompactUnorderedSketch wrapInstance(final Memory srcMem, final long seed) {
- final short memSeedHash = srcMem.getShort(SEED_HASH_SHORT);
- final short computedSeedHash = computeSeedHash(seed);
- checkSeedHashes(memSeedHash, computedSeedHash);
+ checkMemorySeedHash(srcMem, seed);
return new DirectCompactUnorderedSketch(srcMem);
}
diff --git a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java b/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java
index 825e925..535fba6 100644
--- a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java
+++ b/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java
@@ -19,9 +19,9 @@
package org.apache.datasketches.theta;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
-import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import org.apache.datasketches.SketchesArgumentException;
@@ -38,7 +38,7 @@ import org.apache.datasketches.memory.Memory;
final class ForwardCompatibility {
/**
- * Convert a serialization version (SerVer) 1 sketch to a SerVer 3 sketch.
+ * Convert a serialization version (SerVer) 1 sketch (~Feb 2014) to a SerVer 3 sketch.
* Note: SerVer 1 sketches always have metadata-longs of 3 and are always stored
* in a compact ordered form, but with 3 different sketch types. All SerVer 1 sketches will
* be converted to a SerVer 3 sketches. There is no concept of p-sampling, no empty bit.
@@ -91,10 +91,7 @@ final class ForwardCompatibility {
* @return a SerVer 3 HeapCompactOrderedSketch
*/
static final CompactSketch heapify2to3(final Memory srcMem, final long seed) {
- final short seedHash = Util.computeSeedHash(seed);
- final short memSeedHash = (short)extractSeedHash(srcMem);
- Util.checkSeedHashes(seedHash, memSeedHash);
-
+ final short seedHash = checkMemorySeedHash(srcMem, seed);
final int memCap = (int) srcMem.getCapacity();
final int preLongs = extractPreLongs(srcMem); //1,2 or 3
int reqBytesIn = 8;
diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactOrderedSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactOrderedSketch.java
index 9b55833..220f95c 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapCompactOrderedSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapCompactOrderedSketch.java
@@ -19,11 +19,9 @@
package org.apache.datasketches.theta;
-import static org.apache.datasketches.Util.checkSeedHashes;
-import static org.apache.datasketches.Util.computeSeedHash;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
-import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import org.apache.datasketches.memory.Memory;
@@ -57,10 +55,7 @@ final class HeapCompactOrderedSketch extends HeapCompactSketch {
* @return a CompactSketch
*/
static CompactSketch heapifyInstance(final Memory srcMem, final long seed) {
- final short memSeedHash = (short) extractSeedHash(srcMem);
- final short computedSeedHash = computeSeedHash(seed);
- checkSeedHashes(memSeedHash, computedSeedHash);
-
+ final short memSeedHash = checkMemorySeedHash(srcMem, seed);
final int preLongs = extractPreLongs(srcMem);
final boolean empty = PreambleUtil.isEmpty(srcMem); //checks for cap <= 8
long thetaLong = Long.MAX_VALUE;
diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactUnorderedSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactUnorderedSketch.java
index 2c375bb..c9ce337 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapCompactUnorderedSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapCompactUnorderedSketch.java
@@ -19,11 +19,9 @@
package org.apache.datasketches.theta;
-import static org.apache.datasketches.Util.checkSeedHashes;
-import static org.apache.datasketches.Util.computeSeedHash;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
-import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import org.apache.datasketches.memory.Memory;
@@ -57,10 +55,7 @@ final class HeapCompactUnorderedSketch extends HeapCompactSketch {
* @return a CompactSketch
*/
static CompactSketch heapifyInstance(final Memory srcMem, final long seed) {
- final short memSeedHash = (short) extractSeedHash(srcMem);
- final short computedSeedHash = computeSeedHash(seed);
- checkSeedHashes(memSeedHash, computedSeedHash);
-
+ final short memSeedHash = checkMemorySeedHash(srcMem, seed);
final int preLongs = extractPreLongs(srcMem); //must be > 1
final boolean empty = PreambleUtil.isEmpty(srcMem); //checks for cap <= 8
int curCount = 0;
diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
index 7fd6210..94345eb 100644
--- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
+++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
@@ -20,6 +20,8 @@
package org.apache.datasketches.theta;
import static org.apache.datasketches.Util.LS;
+import static org.apache.datasketches.Util.checkSeedHashes;
+import static org.apache.datasketches.Util.computeSeedHash;
import static org.apache.datasketches.Util.zeroPad;
import java.nio.ByteOrder;
@@ -227,22 +229,24 @@ final class PreambleUtil {
*/
static String preambleToString(final Memory mem) {
final int preLongs = getAndCheckPreLongs(mem);
- final ResizeFactor rf = ResizeFactor.getRF(extractLgResizeFactor(mem));
+ final int rfId = extractLgResizeFactor(mem);
+ final ResizeFactor rf = ResizeFactor.getRF(rfId);
final int serVer = extractSerVer(mem);
- final Family family = Family.idToFamily(extractFamilyID(mem));
+ final int familyId = extractFamilyID(mem);
+ final Family family = Family.idToFamily(familyId);
final int lgNomLongs = extractLgNomLongs(mem);
final int lgArrLongs = extractLgArrLongs(mem);
//Flags
final int flags = extractFlags(mem);
- final String flagsStr = zeroPad(Integer.toBinaryString(flags), 8) + ", " + (flags);
+ final String flagsStr = (flags) + ", " + zeroPad(Integer.toBinaryString(flags), 8);
final String nativeOrder = ByteOrder.nativeOrder().toString();
final boolean bigEndian = (flags & BIG_ENDIAN_FLAG_MASK) > 0;
final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0;
final boolean empty = (flags & EMPTY_FLAG_MASK) > 0;
final boolean compact = (flags & COMPACT_FLAG_MASK) > 0;
final boolean ordered = (flags & ORDERED_FLAG_MASK) > 0;
- final boolean singleItem = !empty && (preLongs == 1);
+ final boolean singleItem = (flags & SINGLEITEM_FLAG_MASK) > 0; //!empty && (preLongs == 1);
final int seedHash = extractSeedHash(mem);
@@ -278,23 +282,24 @@ final class PreambleUtil {
final StringBuilder sb = new StringBuilder();
sb.append(LS);
sb.append("### SKETCH PREAMBLE SUMMARY:").append(LS);
+ sb.append("Native Byte Order : ").append(nativeOrder).append(LS);
sb.append("Byte 0: Preamble Longs : ").append(preLongs).append(LS);
- sb.append("Byte 0: ResizeFactor : ").append(rf.toString()).append(LS);
+ sb.append("Byte 0: ResizeFactor : ").append(rfId + ", " + rf.toString()).append(LS);
sb.append("Byte 1: Serialization Version: ").append(serVer).append(LS);
- sb.append("Byte 2: Family : ").append(family.toString()).append(LS);
+ sb.append("Byte 2: Family : ").append(familyId + ", " + family.toString()).append(LS);
sb.append("Byte 3: LgNomLongs : ").append(lgNomLongs).append(LS);
sb.append("Byte 4: LgArrLongs : ").append(lgArrLongs).append(LS);
sb.append("Byte 5: Flags Field : ").append(flagsStr).append(LS);
- sb.append(" (Native Byte Order) : ").append(nativeOrder).append(LS);
- sb.append(" BIG_ENDIAN_STORAGE : ").append(bigEndian).append(LS);
- sb.append(" READ_ONLY : ").append(readOnly).append(LS);
- sb.append(" EMPTY : ").append(empty).append(LS);
- sb.append(" COMPACT : ").append(compact).append(LS);
- sb.append(" ORDERED : ").append(ordered).append(LS);
- sb.append(" SINGLEITEM (derived) : ").append(singleItem).append(LS);
- sb.append("Bytes 6-7 : Seed Hash : ").append(Integer.toHexString(seedHash)).append(LS);
+ sb.append(" Bit Flag Name : State:").append(LS);
+ sb.append(" 0 BIG_ENDIAN_STORAGE : ").append(bigEndian).append(LS);
+ sb.append(" 1 READ_ONLY : ").append(readOnly).append(LS);
+ sb.append(" 2 EMPTY : ").append(empty).append(LS);
+ sb.append(" 3 COMPACT : ").append(compact).append(LS);
+ sb.append(" 4 ORDERED : ").append(ordered).append(LS);
+ sb.append(" 5 SINGLE_ITEM : ").append(singleItem).append(LS);
+ sb.append("Bytes 6-7 : Seed Hash Hex : ").append(Integer.toHexString(seedHash)).append(LS);
if (preLongs == 1) {
- sb.append(" --ABSENT, ASSUMED:").append(LS);
+ sb.append(" --ABSENT FIELDS, ASSUMED:").append(LS);
sb.append("Bytes 8-11 : CurrentCount : ").append(curCount).append(LS);
sb.append("Bytes 12-15: P : ").append(p).append(LS);
sb.append("Bytes 16-23: Theta (double) : ").append(thetaDbl).append(LS);
@@ -328,7 +333,8 @@ final class PreambleUtil {
}
sb.append( "Preamble Bytes : ").append(preLongs * 8).append(LS);
sb.append( "Data Bytes : ").append(curCount * 8).append(LS);
- sb.append( "TOTAL Sketch Bytes : ").append(mem.getCapacity()).append(LS);
+ sb.append( "TOTAL Sketch Bytes : ").append((preLongs + curCount) * 8).append(LS);
+ sb.append( "TOTAL Capacity Bytes : ").append(mem.getCapacity()).append(LS);
sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS);
return sb.toString();
}
@@ -462,6 +468,20 @@ final class PreambleUtil {
return emptyFlag || emptyCap;
}
+ static boolean isSingleItem(final Memory mem) {
+ // Flags byte must be LittleEndian, ReadOnly, Not Empty, Compact, Ordered = 11010 = 0x1A.
+ // Flags mask will be 0x1F.
+ // SingleItem flag may not be set due to a historical bug, so we can't depend on it for now.
+ // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3,
+ // and the hash seed matches (not done here), it is virtually guaranteed that we have a
+ // SingleItem Sketch.
+ final boolean preLongs = extractPreLongs(mem) == 1;
+ final boolean serVer = extractSerVer(mem) >= 3;
+ final boolean famId = extractFamilyID(mem) == 3; //compact
+ final boolean flags = (extractFlags(mem) & 0x1F) == 0x1A; //no SI, yet
+ return preLongs && serVer && famId && flags;
+ }
+
/**
* Checks Memory for capacity to hold the preamble and returns the extracted preLongs.
* @param mem the given Memory
@@ -480,6 +500,12 @@ final class PreambleUtil {
return preLongs;
}
+ static final short checkMemorySeedHash(final Memory mem, final long seed) {
+ final short seedHashMem = (short) extractSeedHash(mem);
+ checkSeedHashes(seedHashMem, computeSeedHash(seed)); //throws if bad seedHash
+ return seedHashMem;
+ }
+
private static void throwNotBigEnough(final long cap, final int required) {
throw new SketchesArgumentException(
"Possible Corruption: Size of byte array or Memory not large enough: Size: " + cap
diff --git a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
index 2d56a12..d82a5c2 100644
--- a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
@@ -20,64 +20,57 @@
package org.apache.datasketches.theta;
import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.datasketches.ByteArrayUtil.putLongLE;
import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
-import static org.apache.datasketches.Util.checkSeedHashes;
import static org.apache.datasketches.Util.computeSeedHash;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import static org.apache.datasketches.theta.PreambleUtil.MAX_THETA_LONG_AS_DOUBLE;
-import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
+import static org.apache.datasketches.theta.PreambleUtil.isSingleItem;
import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.Util;
import org.apache.datasketches.memory.Memory;
/**
+ * A CompactSketch that holds only one item hash.
+ *
* @author Lee Rhodes
*/
public final class SingleItemSketch extends CompactSketch {
private static final long DEFAULT_SEED_HASH = computeSeedHash(DEFAULT_UPDATE_SEED) & 0xFFFFL;
- private static final String LS = System.getProperty("line.separator");
- //For backward compatibility, a candidate pre0 long must have Flags= compact, read-only, ordered,
- // empty=0; COMPACT-Family=3, SerVer=3, PreLongs=1, plus the relevant seedHash.
- private static final long PRE0_LO6 = 0X00_00_1A_00_00_03_03_01L; //no SI flag, requires not empty
- private static final long PRE0_MASK = 0XFF_FF_DF_FF_FF_FF_FF_FFL; //removes SI flag
+ // For backward compatibility, a candidate pre0_ long must have:
+ // Flags byte 5 must be Ordered, Compact, NOT Empty, Read Only, LittleEndian = 11010 = 0x1A.
+ // Flags mask will be 0x1F.
+ // SingleItem flag may not be set due to a historical bug, so we can't depend on it for now.
+ // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3,
+ // and the hash seed matches, it is virtually guaranteed that we have a SingleItem Sketch.
- private final long[] arr = new long[2];
+ private static final long PRE0_LO6_SI = 0X00_00_3A_00_00_03_03_01L; //with SI flag
- //use to test a candidate pre0 given a seed
- static boolean testPre0Seed(final long candidate, final long seed) {
- final long seedHash = computeSeedHash(seed) & 0xFFFFL;
- return testPre0SeedHash(candidate, seedHash);
- }
+ private long pre0_ = 0;
+ private long hash_ = 0;
- //use to test a candidate pre0 given a seedHash
- static boolean testPre0SeedHash(final long candidate, final long seedHash) {
- final long test1 = (seedHash << 48) | PRE0_LO6; //no SI bit
- final long test2 = test1 | ((long)SINGLEITEM_FLAG_MASK << 40); //adds the SI bit
- final long mask = PRE0_MASK; //ignores the SI flag
- final long masked = candidate & mask;
- return (masked == test1) || (candidate == test2);
- }
- //Internal Constructor. All checking has been done, assumes default seed
+ //Internal Constructor. All checking & hashing has been done, assumes default seed
private SingleItemSketch(final long hash) {
- arr[0] = (DEFAULT_SEED_HASH << 48) | PRE0_LO6 | ((long)SINGLEITEM_FLAG_MASK << 40);
- arr[1] = hash;
+ pre0_ = (DEFAULT_SEED_HASH << 48) | PRE0_LO6_SI;
+ hash_ = hash;
}
- //Internal Constructor.All checking has been done, given the relevant seed
+ //All checking & hashing has been done, given the relevant seed
SingleItemSketch(final long hash, final long seed) {
final long seedHash = computeSeedHash(seed) & 0xFFFFL;
- arr[0] = (seedHash << 48) | PRE0_LO6 | ((long)SINGLEITEM_FLAG_MASK << 40);
- arr[1] = hash;
+ pre0_ = (seedHash << 48) | PRE0_LO6_SI;
+ hash_ = hash;
}
- //All checking has been done, given the relevant seedHash
+ //All checking & hashing has been done, given the relevant seedHash
SingleItemSketch(final long hash, final short seedHash) {
final long seedH = seedHash & 0xFFFFL;
- arr[0] = (seedH << 48) | PRE0_LO6 | ((long)SINGLEITEM_FLAG_MASK << 40);
- arr[1] = hash;
+ pre0_ = (seedH << 48) | PRE0_LO6_SI;
+ hash_ = hash;
}
/**
@@ -93,22 +86,16 @@ public final class SingleItemSketch extends CompactSketch {
/**
* Creates a SingleItemSketch on the heap given a SingleItemSketch Memory image and a seed.
* Checks the seed hash of the given Memory against a hash of the given seed.
- * @param srcMem the Memory to be heapified
+ * @param srcMem the Memory to be heapified.
* @param seed a given hash seed
* @return a SingleItemSketch
*/
public static SingleItemSketch heapify(final Memory srcMem, final long seed) {
- final long memPre0 = srcMem.getLong(0);
- final short seedHashMem = srcMem.getShort(6);
- final short seedHashCk = computeSeedHash(seed);
- checkSeedHashes(seedHashMem, seedHashCk);
- if (testPre0SeedHash(memPre0, seedHashCk)) {
- return new SingleItemSketch(srcMem.getLong(8), seedHashCk);
+ final short seedHashMem = checkMemorySeedHash(srcMem, seed);
+ if (isSingleItem(srcMem)) {
+ return new SingleItemSketch(srcMem.getLong(8), seedHashMem);
}
- final long def = (((long)seedHashCk << 48) | PRE0_LO6);
- throw new SketchesArgumentException("Input Memory does not match required Preamble. " + LS
- + "Memory Pre0 : " + Long.toHexString(memPre0) + LS
- + "Should be Pre0 : " + Long.toHexString(def));
+ throw new SketchesArgumentException("Input Memory Preamble is not a SingleItemSketch.");
}
//Create methods using the default seed
@@ -302,7 +289,7 @@ public final class SingleItemSketch extends CompactSketch {
}
/**
- * Create this sketch with the given long array and a seed.
+ * Create this sketch with the given long array (as an item) and a seed.
* If the long array is null or empty no create attempt is made and the method returns null.
*
* @param data The given long array.
@@ -318,7 +305,7 @@ public final class SingleItemSketch extends CompactSketch {
@Override
public int getCountLessThanTheta(final double theta) {
- return (arr[1] < (theta * MAX_THETA_LONG_AS_DOUBLE)) ? 1 : 0;
+ return (hash_ < (theta * MAX_THETA_LONG_AS_DOUBLE)) ? 1 : 0;
}
@Override
@@ -333,7 +320,7 @@ public final class SingleItemSketch extends CompactSketch {
@Override
public HashIterator iterator() {
- return new HeapHashIterator(new long[] { arr[1] }, 1, Long.MAX_VALUE);
+ return new HeapHashIterator(new long[] { hash_ }, 1, Long.MAX_VALUE);
}
@Override
@@ -379,10 +366,8 @@ public final class SingleItemSketch extends CompactSketch {
@Override
public byte[] toByteArray() {
final byte[] out = new byte[16];
- for (int i = 0; i < 8; i++) {
- out[i] = (byte) (arr[0] >>> (i * 8));
- out[i + 8] = (byte) (arr[1] >>> (i * 8));
- }
+ putLongLE(out, 0, pre0_);
+ putLongLE(out, 8, hash_);
return out;
}
@@ -390,7 +375,7 @@ public final class SingleItemSketch extends CompactSketch {
@Override
long[] getCache() {
- return new long[] { arr[1] };
+ return new long[] { hash_ };
}
@Override
@@ -405,7 +390,7 @@ public final class SingleItemSketch extends CompactSketch {
@Override
short getSeedHash() {
- return (short) (arr[0] >>> 48);
+ return (short) (pre0_ >>> 48);
}
}
diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java
index 50a1544..75ded37 100644
--- a/src/main/java/org/apache/datasketches/theta/Sketch.java
+++ b/src/main/java/org/apache/datasketches/theta/Sketch.java
@@ -24,7 +24,6 @@ import static org.apache.datasketches.HashOperations.count;
import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
import static org.apache.datasketches.Util.LS;
import static org.apache.datasketches.Util.ceilingPowerOf2;
-import static org.apache.datasketches.Util.computeSeedHash;
import static org.apache.datasketches.Util.zeroPad;
import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
@@ -34,7 +33,8 @@ import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
-import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
+import static org.apache.datasketches.theta.PreambleUtil.isSingleItem;
import org.apache.datasketches.BinomialBoundsN;
import org.apache.datasketches.Family;
@@ -136,14 +136,14 @@ public abstract class Sketch {
}
case COMPACT: { //serVer 1, 2, or 3, preLongs = 1, 2, or 3
if (serVer == 3) {
- final long cap = srcMem.getCapacity();
- if (cap < 16) { //EMPTY?
+ if (srcMem.getCapacity() < 16) { //EMPTY
return EmptyCompactSketch.getInstance(srcMem);
}
- if (cap == 16) { //SINGLEITEM?
+
+ if (isSingleItem(srcMem)) { //SINGLEITEM?
return SingleItemSketch.heapify(srcMem, seed);
}
- //not empty & not singleItem, assume cap > 16
+ //not empty & not singleItem
final int flags = srcMem.getByte(FLAGS_BYTE);
final boolean orderedFlag = (flags & ORDERED_FLAG_MASK) > 0;
final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0;
@@ -677,15 +677,14 @@ public abstract class Sketch {
throw new SketchesArgumentException(
"Corrupted: COMPACT family sketch image must have Read-Only flag set");
}
- final boolean empty = PreambleUtil.isEmpty(srcMem); //also checks memCap < 16
- if (empty) { //empty flag or < 16 bytes
+ final boolean empty = PreambleUtil.isEmpty(srcMem);
+ if (empty) { //TODO
return EmptyCompactSketch.getInstance(srcMem);
}
- //cap >= 16 and not emptyFlag. Note older sketches may have missing empty flag.
+ checkMemorySeedHash(srcMem, seed);
+ //cap >= 16 and not emptyFlag. Note very old sketches (<2014) may have missing empty flag.
if (preLongs == 1) {
- final short memSeedHash = (short) extractSeedHash(srcMem);
- final short computedSeedHash = computeSeedHash(seed);
- if ((memSeedHash == computedSeedHash) && orderedFlag) { //SINGLE ITEM
+ if (isSingleItem(srcMem)) { //SINGLE ITEM
return SingleItemSketch.heapify(srcMem, seed);
} else { //EMPTY
return EmptyCompactSketch.getInstance(srcMem);
diff --git a/src/main/java/org/apache/datasketches/theta/UnionImpl.java b/src/main/java/org/apache/datasketches/theta/UnionImpl.java
index dcb5ccd..ebaae45 100644
--- a/src/main/java/org/apache/datasketches/theta/UnionImpl.java
+++ b/src/main/java/org/apache/datasketches/theta/UnionImpl.java
@@ -38,6 +38,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import static org.apache.datasketches.theta.PreambleUtil.extractUnionThetaLong;
import static org.apache.datasketches.theta.PreambleUtil.insertUnionThetaLong;
+import static org.apache.datasketches.theta.PreambleUtil.isSingleItem;
import org.apache.datasketches.Family;
import org.apache.datasketches.HashOperations;
@@ -276,6 +277,10 @@ final class UnionImpl extends Union {
}
//sketchIn is valid and not empty
Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash());
+ if (sketchIn instanceof SingleItemSketch) {
+ gadget_.hashUpdate(sketchIn.getCache()[0]);
+ return;
+ }
Sketch.checkSketchAndMemoryFlags(sketchIn);
unionThetaLong_ = min(min(unionThetaLong_, sketchIn.getThetaLong()), gadget_.getThetaLong()); //Theta rule
@@ -330,7 +335,7 @@ final class UnionImpl extends Union {
final int serVer = extractSerVer(skMem);
final int fam = extractFamilyID(skMem);
- if (serVer == 3) { //The OpenSource sketches (Aug 4, 2015)
+ if (serVer >= 3) { //The OpenSource sketches (Aug 4, 2015) starts with serVer = 3
if ((fam < 1) || (fam > 3)) {
throw new SketchesArgumentException(
"Family must be Alpha, QuickSelect, or Compact: " + Family.idToFamily(fam));
@@ -339,11 +344,6 @@ final class UnionImpl extends Union {
return;
}
- if (fam != 3) { //In older sketches this family was called the SetSketch
- throw new SketchesArgumentException(
- "Family must be old SET_SKETCH (now COMPACT) = 3: " + Family.idToFamily(fam));
- }
-
if (serVer == 2) { //older Sketch, which is compact and ordered
Util.checkSeedHashes(seedHash_, (short)extractSeedHash(skMem));
processVer2(skMem);
@@ -359,31 +359,26 @@ final class UnionImpl extends Union {
}
//Has seedHash, p, could have 0 entries & theta < 1.0,
- //could be unordered, ordered, compact, or not compact, size >= 16,
+ //could be unordered, ordered, compact, or not compact,
//could be Alpha, QuickSelect, or Compact.
private void processVer3(final Memory skMem) {
final int preLongs = extractPreLongs(skMem);
- if (preLongs == 1) { //we know cap >= 16
- //This test requires compact, ordered, notEmpty, ReadOnly, LE, seedHash is OK;
- // OR the above and the SI bit is set
- if (SingleItemSketch.testPre0SeedHash(skMem.getLong(0), seedHash_)) {
+ if (preLongs == 1) {
+ if (isSingleItem(skMem)) {
final long hash = skMem.getLong(8);
- //backdoor update, hash function is bypassed. A hash < 1 will be rejected later
gadget_.hashUpdate(hash);
return;
}
return; //empty
}
-
Util.checkSeedHashes(seedHash_, (short)extractSeedHash(skMem));
-
final int curCountIn;
final long thetaLongIn;
if (preLongs == 2) { //exact mode
curCountIn = extractCurCount(skMem);
- if (curCountIn == 0) { return; } //should be > 0, but if it is return empty anyway.
+ if (curCountIn == 0) { return; } //should be > 0, but if it is 0 return empty anyway.
thetaLongIn = Long.MAX_VALUE;
}
@@ -431,6 +426,10 @@ final class UnionImpl extends Union {
//has seedHash and p, could have 0 entries & theta,
// can only be compact, ordered, size >= 8
private void processVer2(final Memory skMem) {
+ final int famId = extractFamilyID(skMem);
+ if (famId != 3) {
+ throw new SketchesArgumentException("Invalid Family ID: " + famId + ". It should be 3");
+ }
final int preLongs = extractPreLongs(skMem);
if (preLongs == 1) { //does not change anything, return empty
@@ -476,6 +475,10 @@ final class UnionImpl extends Union {
//no seedHash, assumes given seed is correct. No p, no empty flag, no concept of direct
// can only be compact, ordered, size > 24
private void processVer1(final Memory skMem, final int cap) {
+ final int famId = extractFamilyID(skMem);
+ if (famId != 3) {
+ throw new SketchesArgumentException("Invalid Family ID: " + famId + ". It should be 3");
+ }
final long thetaLongIn = skMem.getLong(THETA_LONG);
final int curCountIn = extractCurCount(skMem);
if ((cap <= 24) || ((curCountIn == 0) && (unionThetaLong_ == Long.MAX_VALUE))) {
diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
index 308fee5..7236aa9 100644
--- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
@@ -34,10 +34,10 @@ import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER;
import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
import static org.apache.datasketches.theta.PreambleUtil.extractP;
-import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
import static org.apache.datasketches.theta.PreambleUtil.getMemBytes;
@@ -467,7 +467,7 @@ public abstract class UpdateSketch extends Sketch {
}
//Check seed hashes
- final short seedHash = (short)extractSeedHash(srcMem); //byte 6,7
+ final short seedHash = checkMemorySeedHash(srcMem, seed); //byte 6,7
Util.checkSeedHashes(seedHash, Util.computeSeedHash(seed));
//Check mem capacity, lgArrLongs
diff --git a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java
index ebb6ef5..f294edb 100644
--- a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java
+++ b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java
@@ -25,13 +25,12 @@ import static org.testng.Assert.assertNotNull;
import static org.testng.Assert.assertNull;
import static org.testng.Assert.assertTrue;
-import org.testng.annotations.Test;
-
+import org.apache.datasketches.Family;
+import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableDirectHandle;
import org.apache.datasketches.memory.WritableMemory;
-import org.apache.datasketches.Family;
-import org.apache.datasketches.SketchesArgumentException;
+import org.testng.annotations.Test;
/**
* @author Lee Rhodes
@@ -256,10 +255,10 @@ public class CompactSketchTest {
public void checkHeapifyEmptySketch() {
UpdateSketch sk = Sketches.updateSketchBuilder().build();
WritableMemory wmem = WritableMemory.allocate(16); //extra bytes
- sk.compact(false, wmem);
- PreambleUtil.clearEmpty(wmem); //corrupt to simulate missing empty flag
- Sketch csk = Sketch.heapify(wmem);
+ CompactSketch csk = sk.compact(false, wmem);
assertTrue(csk instanceof EmptyCompactSketch);
+ Sketch csk2 = Sketch.heapify(wmem);
+ assertTrue(csk2 instanceof EmptyCompactSketch);
}
@Test
diff --git a/src/test/java/org/apache/datasketches/theta/EmptyTest.java b/src/test/java/org/apache/datasketches/theta/EmptyTest.java
index ba8217b..001df72 100644
--- a/src/test/java/org/apache/datasketches/theta/EmptyTest.java
+++ b/src/test/java/org/apache/datasketches/theta/EmptyTest.java
@@ -19,18 +19,12 @@
package org.apache.datasketches.theta;
-import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
import org.testng.annotations.Test;
-import org.apache.datasketches.memory.Memory;
-import org.apache.datasketches.memory.WritableMemory;
-
/**
* Empty essentially means that the sketch has never seen data.
@@ -111,45 +105,6 @@ public class EmptyTest {
assertEquals(sk1.compact().toByteArray().length, 8);
}
- //These 3 tests reproduce a failure mode where an "old" empty sketch of 8 bytes without
- // its empty-flag bit set is read.
- @Test
- public void checkBackwardCompatibility1() {
- final int k = 16;
- final int bytes = Sketches.getMaxUnionBytes(k); //288
- Union union = SetOperation.builder().buildUnion(WritableMemory.allocate(bytes));
- Memory mem = badEmptySk();
- Sketch wsk = Sketches.wrapSketch(mem);
- union.update(wsk); //union has memory
- }
-
- @Test
- public void checkBackwardCompatibility2() {
- Union union = SetOperation.builder().setNominalEntries(16).buildUnion();
- Memory mem = badEmptySk();
- Sketch wsk = Sketches.wrapSketch(mem);
- union.update(wsk); //heap union
- }
-
- @Test
- public void checkBackwardCompatibility3() {
- Memory mem = badEmptySk();
- Sketches.heapifySketch(mem);
- }
-
- private static Memory badEmptySk() { //missing the empty bit
- final long preLongs = 1;
- final long serVer = 3;
- final long family = 3; //compact
- final long flags = (ORDERED_FLAG_MASK | COMPACT_FLAG_MASK | READ_ONLY_FLAG_MASK);
- final long seedHash = 0x93CC;
- final long badEmptySk = (seedHash << 48) | (flags << 40)
- | (family << 16) | (serVer << 8) | preLongs;
- WritableMemory wmem = WritableMemory.allocate(8);
- wmem.putLong(0, badEmptySk);
- return wmem;
- }
-
/**
* @param s value to print
*/
diff --git a/src/test/java/org/apache/datasketches/theta/SingleItemSketchTest.java b/src/test/java/org/apache/datasketches/theta/SingleItemSketchTest.java
index 76d3413..cb73501 100644
--- a/src/test/java/org/apache/datasketches/theta/SingleItemSketchTest.java
+++ b/src/test/java/org/apache/datasketches/theta/SingleItemSketchTest.java
@@ -23,18 +23,16 @@ import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
import static org.apache.datasketches.Util.computeSeedHash;
import static org.apache.datasketches.hash.MurmurHash3.hash;
import static org.apache.datasketches.theta.PreambleUtil.MAX_THETA_LONG_AS_DOUBLE;
-import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertNull;
import static org.testng.Assert.assertTrue;
import static org.testng.Assert.fail;
-import org.testng.annotations.Test;
-
+import org.apache.datasketches.SketchesArgumentException;
import org.apache.datasketches.memory.Memory;
import org.apache.datasketches.memory.WritableMemory;
-import org.apache.datasketches.SketchesArgumentException;
+import org.testng.annotations.Test;
/**
* @author Lee Rhodes
@@ -173,16 +171,6 @@ public class SingleItemSketchTest {
assertEquals(sis.getCurrentPreambleLongs(true), 1);
}
- @Test//(expectedExceptions = SketchesArgumentException.class)
- public void testPre0Seed() {
- long pre0_lo6 = 0X00_00_1A_00_00_03_03_01L; //no SI flag, requires not empty
- long pre0 = ((long)DEFAULT_SEED_HASH << 48) | pre0_lo6;
- assertTrue(SingleItemSketch.testPre0Seed(pre0, DEFAULT_UPDATE_SEED));
- //add SI flag
- pre0 |= ((long)SINGLEITEM_FLAG_MASK << 40);
- assertTrue(SingleItemSketch.testPre0Seed(pre0, DEFAULT_UPDATE_SEED));
- }
-
@Test
public void unionWrapped() {
Sketch sketch = SingleItemSketch.create(1);
@@ -328,6 +316,36 @@ public class SingleItemSketchTest {
}
@Test
+ public void checkDirectUnionSingleItem2() {
+ Sketch sk = Sketch.wrap(siSkWoutSiFlag24Bytes());
+ assertEquals(sk.getEstimate(), 1.0, 0.0);
+ //println(sk.toString());
+ sk = Sketch.wrap(siSkWithSiFlag24Bytes());
+ assertEquals(sk.getEstimate(), 1.0, 0.0);
+ //println(sk.toString());
+ }
+
+ static final long SiSkPre0WithSiFlag = 0x93cc3a0000030301L;
+ static final long SiSkPre0WoutSiFlag = 0x93cc1a0000030301L;
+ static final long Hash = 0x05a186bdcb7df915L;
+
+ static Memory siSkWithSiFlag24Bytes() {
+ int cap = 24;
+ WritableMemory wmem = WritableMemory.allocate(cap);
+ wmem.putLong(0, SiSkPre0WithSiFlag);
+ wmem.putLong(8, Hash);
+ return wmem;
+ }
+
+ static Memory siSkWoutSiFlag24Bytes() {
+ int cap = 24;
+ WritableMemory wmem = WritableMemory.allocate(cap);
+ wmem.putLong(0, SiSkPre0WoutSiFlag);
+ wmem.putLong(8, Hash);
+ return wmem;
+ }
+
+ @Test
public void printlnTest() {
println("PRINTING: "+this.getClass().getName());
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org