You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/06/09 05:36:05 UTC

[incubator-datasketches-java] 01/01: Improvements in Theta Sketch to (hopefully) eliminate the misinterpretation of SingleItemSketches.

This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch FixSingleItem
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit d90ab751dca0a73bad335fa45e81c0c534d3b3a3
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Mon Jun 8 22:35:36 2020 -0700

    Improvements in Theta Sketch to (hopefully) eliminate the
    misinterpretation of SingleItemSketches.
    
    We should try to test this in the field.
---
 .../theta/DirectCompactOrderedSketch.java          |  8 +-
 .../theta/DirectCompactUnorderedSketch.java        |  8 +-
 .../datasketches/theta/ForwardCompatibility.java   |  9 +--
 .../theta/HeapCompactOrderedSketch.java            |  9 +--
 .../theta/HeapCompactUnorderedSketch.java          |  9 +--
 .../apache/datasketches/theta/PreambleUtil.java    | 58 +++++++++++----
 .../datasketches/theta/SingleItemSketch.java       | 85 +++++++++-------------
 .../java/org/apache/datasketches/theta/Sketch.java | 23 +++---
 .../org/apache/datasketches/theta/UnionImpl.java   | 33 +++++----
 .../apache/datasketches/theta/UpdateSketch.java    |  4 +-
 .../datasketches/theta/CompactSketchTest.java      | 13 ++--
 .../org/apache/datasketches/theta/EmptyTest.java   | 45 ------------
 .../datasketches/theta/SingleItemSketchTest.java   | 46 ++++++++----
 13 files changed, 157 insertions(+), 193 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactOrderedSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactOrderedSketch.java
index b68bc08..6a88213 100644
--- a/src/main/java/org/apache/datasketches/theta/DirectCompactOrderedSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/DirectCompactOrderedSketch.java
@@ -19,13 +19,11 @@
 
 package org.apache.datasketches.theta;
 
-import static org.apache.datasketches.Util.checkSeedHashes;
-import static org.apache.datasketches.Util.computeSeedHash;
 import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
 import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
 import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
 import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.SEED_HASH_SHORT;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
 
 import org.apache.datasketches.memory.Memory;
 import org.apache.datasketches.memory.WritableMemory;
@@ -54,9 +52,7 @@ final class DirectCompactOrderedSketch extends DirectCompactSketch {
    * @return this sketch
    */
   static DirectCompactOrderedSketch wrapInstance(final Memory srcMem, final long seed) {
-    final short memSeedHash = srcMem.getShort(SEED_HASH_SHORT);
-    final short computedSeedHash = computeSeedHash(seed);
-    checkSeedHashes(memSeedHash, computedSeedHash);
+    checkMemorySeedHash(srcMem, seed);
     return new DirectCompactOrderedSketch(srcMem);
   }
 
diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactUnorderedSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactUnorderedSketch.java
index 9ea6965..eb9d425 100644
--- a/src/main/java/org/apache/datasketches/theta/DirectCompactUnorderedSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/DirectCompactUnorderedSketch.java
@@ -19,12 +19,10 @@
 
 package org.apache.datasketches.theta;
 
-import static org.apache.datasketches.Util.checkSeedHashes;
-import static org.apache.datasketches.Util.computeSeedHash;
 import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
 import static org.apache.datasketches.theta.PreambleUtil.EMPTY_FLAG_MASK;
 import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.SEED_HASH_SHORT;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
 
 import org.apache.datasketches.memory.Memory;
 import org.apache.datasketches.memory.WritableMemory;
@@ -53,9 +51,7 @@ final class DirectCompactUnorderedSketch extends DirectCompactSketch {
    * @return this sketch
    */
   static DirectCompactUnorderedSketch wrapInstance(final Memory srcMem, final long seed) {
-    final short memSeedHash = srcMem.getShort(SEED_HASH_SHORT);
-    final short computedSeedHash = computeSeedHash(seed);
-    checkSeedHashes(memSeedHash, computedSeedHash);
+    checkMemorySeedHash(srcMem, seed);
     return new DirectCompactUnorderedSketch(srcMem);
   }
 
diff --git a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java b/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java
index 825e925..535fba6 100644
--- a/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java
+++ b/src/main/java/org/apache/datasketches/theta/ForwardCompatibility.java
@@ -19,9 +19,9 @@
 
 package org.apache.datasketches.theta;
 
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
 import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
 import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
-import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
 import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
 
 import org.apache.datasketches.SketchesArgumentException;
@@ -38,7 +38,7 @@ import org.apache.datasketches.memory.Memory;
 final class ForwardCompatibility {
 
   /**
-   * Convert a serialization version (SerVer) 1 sketch to a SerVer 3 sketch.
+   * Convert a serialization version (SerVer) 1 sketch (~Feb 2014) to a SerVer 3 sketch.
    * Note: SerVer 1 sketches always have metadata-longs of 3 and are always stored
    * in a compact ordered form, but with 3 different sketch types.  All SerVer 1 sketches will
    * be converted to a SerVer 3 sketches. There is no concept of p-sampling, no empty bit.
@@ -91,10 +91,7 @@ final class ForwardCompatibility {
    * @return a SerVer 3 HeapCompactOrderedSketch
    */
   static final CompactSketch heapify2to3(final Memory srcMem, final long seed) {
-    final short seedHash = Util.computeSeedHash(seed);
-    final short memSeedHash = (short)extractSeedHash(srcMem);
-    Util.checkSeedHashes(seedHash, memSeedHash);
-
+    final short seedHash = checkMemorySeedHash(srcMem, seed);
     final int memCap = (int) srcMem.getCapacity();
     final int preLongs = extractPreLongs(srcMem); //1,2 or 3
     int reqBytesIn = 8;
diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactOrderedSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactOrderedSketch.java
index 9b55833..220f95c 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapCompactOrderedSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapCompactOrderedSketch.java
@@ -19,11 +19,9 @@
 
 package org.apache.datasketches.theta;
 
-import static org.apache.datasketches.Util.checkSeedHashes;
-import static org.apache.datasketches.Util.computeSeedHash;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
 import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
 import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
-import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
 import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
 
 import org.apache.datasketches.memory.Memory;
@@ -57,10 +55,7 @@ final class HeapCompactOrderedSketch extends HeapCompactSketch {
    * @return a CompactSketch
    */
   static CompactSketch heapifyInstance(final Memory srcMem, final long seed) {
-    final short memSeedHash = (short) extractSeedHash(srcMem);
-    final short computedSeedHash = computeSeedHash(seed);
-    checkSeedHashes(memSeedHash, computedSeedHash);
-
+    final short memSeedHash = checkMemorySeedHash(srcMem, seed);
     final int preLongs = extractPreLongs(srcMem);
     final boolean empty = PreambleUtil.isEmpty(srcMem); //checks for cap <= 8
     long thetaLong = Long.MAX_VALUE;
diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactUnorderedSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactUnorderedSketch.java
index 2c375bb..c9ce337 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapCompactUnorderedSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapCompactUnorderedSketch.java
@@ -19,11 +19,9 @@
 
 package org.apache.datasketches.theta;
 
-import static org.apache.datasketches.Util.checkSeedHashes;
-import static org.apache.datasketches.Util.computeSeedHash;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
 import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
 import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
-import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
 import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
 
 import org.apache.datasketches.memory.Memory;
@@ -57,10 +55,7 @@ final class HeapCompactUnorderedSketch extends HeapCompactSketch {
    * @return a CompactSketch
    */
   static CompactSketch heapifyInstance(final Memory srcMem, final long seed) {
-    final short memSeedHash = (short) extractSeedHash(srcMem);
-    final short computedSeedHash = computeSeedHash(seed);
-    checkSeedHashes(memSeedHash, computedSeedHash);
-
+    final short memSeedHash = checkMemorySeedHash(srcMem, seed);
     final int preLongs = extractPreLongs(srcMem); //must be > 1
     final boolean empty = PreambleUtil.isEmpty(srcMem); //checks for cap <= 8
     int curCount = 0;
diff --git a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
index 7fd6210..94345eb 100644
--- a/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
+++ b/src/main/java/org/apache/datasketches/theta/PreambleUtil.java
@@ -20,6 +20,8 @@
 package org.apache.datasketches.theta;
 
 import static org.apache.datasketches.Util.LS;
+import static org.apache.datasketches.Util.checkSeedHashes;
+import static org.apache.datasketches.Util.computeSeedHash;
 import static org.apache.datasketches.Util.zeroPad;
 
 import java.nio.ByteOrder;
@@ -227,22 +229,24 @@ final class PreambleUtil {
    */
   static String preambleToString(final Memory mem) {
     final int preLongs = getAndCheckPreLongs(mem);
-    final ResizeFactor rf = ResizeFactor.getRF(extractLgResizeFactor(mem));
+    final int rfId = extractLgResizeFactor(mem);
+    final ResizeFactor rf = ResizeFactor.getRF(rfId);
     final int serVer = extractSerVer(mem);
-    final Family family = Family.idToFamily(extractFamilyID(mem));
+    final int familyId = extractFamilyID(mem);
+    final Family family = Family.idToFamily(familyId);
     final int lgNomLongs = extractLgNomLongs(mem);
     final int lgArrLongs = extractLgArrLongs(mem);
 
     //Flags
     final int flags = extractFlags(mem);
-    final String flagsStr = zeroPad(Integer.toBinaryString(flags), 8) + ", " + (flags);
+    final String flagsStr = (flags) + ", " + zeroPad(Integer.toBinaryString(flags), 8);
     final String nativeOrder = ByteOrder.nativeOrder().toString();
     final boolean bigEndian = (flags & BIG_ENDIAN_FLAG_MASK) > 0;
     final boolean readOnly = (flags & READ_ONLY_FLAG_MASK) > 0;
     final boolean empty = (flags & EMPTY_FLAG_MASK) > 0;
     final boolean compact = (flags & COMPACT_FLAG_MASK) > 0;
     final boolean ordered = (flags & ORDERED_FLAG_MASK) > 0;
-    final boolean singleItem = !empty && (preLongs == 1);
+    final boolean singleItem = (flags & SINGLEITEM_FLAG_MASK) > 0; //!empty && (preLongs == 1);
 
     final int seedHash = extractSeedHash(mem);
 
@@ -278,23 +282,24 @@ final class PreambleUtil {
     final StringBuilder sb = new StringBuilder();
     sb.append(LS);
     sb.append("### SKETCH PREAMBLE SUMMARY:").append(LS);
+    sb.append("Native Byte Order             : ").append(nativeOrder).append(LS);
     sb.append("Byte  0: Preamble Longs       : ").append(preLongs).append(LS);
-    sb.append("Byte  0: ResizeFactor         : ").append(rf.toString()).append(LS);
+    sb.append("Byte  0: ResizeFactor         : ").append(rfId + ", " + rf.toString()).append(LS);
     sb.append("Byte  1: Serialization Version: ").append(serVer).append(LS);
-    sb.append("Byte  2: Family               : ").append(family.toString()).append(LS);
+    sb.append("Byte  2: Family               : ").append(familyId + ", " + family.toString()).append(LS);
     sb.append("Byte  3: LgNomLongs           : ").append(lgNomLongs).append(LS);
     sb.append("Byte  4: LgArrLongs           : ").append(lgArrLongs).append(LS);
     sb.append("Byte  5: Flags Field          : ").append(flagsStr).append(LS);
-    sb.append("  (Native Byte Order)         : ").append(nativeOrder).append(LS);
-    sb.append("  BIG_ENDIAN_STORAGE          : ").append(bigEndian).append(LS);
-    sb.append("  READ_ONLY                   : ").append(readOnly).append(LS);
-    sb.append("  EMPTY                       : ").append(empty).append(LS);
-    sb.append("  COMPACT                     : ").append(compact).append(LS);
-    sb.append("  ORDERED                     : ").append(ordered).append(LS);
-    sb.append("  SINGLEITEM  (derived)       : ").append(singleItem).append(LS);
-    sb.append("Bytes 6-7  : Seed Hash        : ").append(Integer.toHexString(seedHash)).append(LS);
+    sb.append("  Bit Flag Name               : State:").append(LS);
+    sb.append("    0 BIG_ENDIAN_STORAGE      : ").append(bigEndian).append(LS);
+    sb.append("    1 READ_ONLY               : ").append(readOnly).append(LS);
+    sb.append("    2 EMPTY                   : ").append(empty).append(LS);
+    sb.append("    3 COMPACT                 : ").append(compact).append(LS);
+    sb.append("    4 ORDERED                 : ").append(ordered).append(LS);
+    sb.append("    5 SINGLE_ITEM             : ").append(singleItem).append(LS);
+    sb.append("Bytes 6-7  : Seed Hash Hex    : ").append(Integer.toHexString(seedHash)).append(LS);
     if (preLongs == 1) {
-      sb.append(" --ABSENT, ASSUMED:").append(LS);
+      sb.append(" --ABSENT FIELDS, ASSUMED:").append(LS);
       sb.append("Bytes 8-11 : CurrentCount     : ").append(curCount).append(LS);
       sb.append("Bytes 12-15: P                : ").append(p).append(LS);
       sb.append("Bytes 16-23: Theta (double)   : ").append(thetaDbl).append(LS);
@@ -328,7 +333,8 @@ final class PreambleUtil {
     }
     sb.append(  "Preamble Bytes                : ").append(preLongs * 8).append(LS);
     sb.append(  "Data Bytes                    : ").append(curCount * 8).append(LS);
-    sb.append(  "TOTAL Sketch Bytes            : ").append(mem.getCapacity()).append(LS);
+    sb.append(  "TOTAL Sketch Bytes            : ").append((preLongs + curCount) * 8).append(LS);
+    sb.append(  "TOTAL Capacity Bytes          : ").append(mem.getCapacity()).append(LS);
     sb.append("### END SKETCH PREAMBLE SUMMARY").append(LS);
     return sb.toString();
   }
@@ -462,6 +468,20 @@ final class PreambleUtil {
     return emptyFlag || emptyCap;
   }
 
+  static boolean isSingleItem(final Memory mem) {
+    // Flags byte must be LittleEndian, ReadOnly, Not Empty, Compact, Ordered = 11010 = 0x1A.
+    // Flags mask will be 0x1F.
+    // SingleItem flag may not be set due to a historical bug, so we can't depend on it for now.
+    // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3,
+    // and the hash seed matches (not done here), it is virtually guaranteed that we have a
+    // SingleItem Sketch.
+    final boolean preLongs = extractPreLongs(mem) == 1;
+    final boolean serVer = extractSerVer(mem) >= 3;
+    final boolean famId = extractFamilyID(mem) == 3; //compact
+    final boolean flags =  (extractFlags(mem) & 0x1F) == 0x1A; //no SI, yet
+    return preLongs && serVer && famId && flags;
+  }
+
   /**
    * Checks Memory for capacity to hold the preamble and returns the extracted preLongs.
    * @param mem the given Memory
@@ -480,6 +500,12 @@ final class PreambleUtil {
     return preLongs;
   }
 
+  static final short checkMemorySeedHash(final Memory mem, final long seed) {
+    final short seedHashMem = (short) extractSeedHash(mem);
+    checkSeedHashes(seedHashMem, computeSeedHash(seed)); //throws if bad seedHash
+    return seedHashMem;
+  }
+
   private static void throwNotBigEnough(final long cap, final int required) {
     throw new SketchesArgumentException(
         "Possible Corruption: Size of byte array or Memory not large enough: Size: " + cap
diff --git a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
index 2d56a12..d82a5c2 100644
--- a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
@@ -20,64 +20,57 @@
 package org.apache.datasketches.theta;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
+import static org.apache.datasketches.ByteArrayUtil.putLongLE;
 import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
-import static org.apache.datasketches.Util.checkSeedHashes;
 import static org.apache.datasketches.Util.computeSeedHash;
 import static org.apache.datasketches.hash.MurmurHash3.hash;
 import static org.apache.datasketches.theta.PreambleUtil.MAX_THETA_LONG_AS_DOUBLE;
-import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
+import static org.apache.datasketches.theta.PreambleUtil.isSingleItem;
 
 import org.apache.datasketches.SketchesArgumentException;
 import org.apache.datasketches.Util;
 import org.apache.datasketches.memory.Memory;
 
 /**
+ * A CompactSketch that holds only one item hash.
+ *
  * @author Lee Rhodes
  */
 public final class SingleItemSketch extends CompactSketch {
   private static final long DEFAULT_SEED_HASH = computeSeedHash(DEFAULT_UPDATE_SEED) & 0xFFFFL;
-  private static final String LS = System.getProperty("line.separator");
 
-  //For backward compatibility, a candidate pre0 long must have Flags= compact, read-only, ordered,
-  //  empty=0; COMPACT-Family=3, SerVer=3, PreLongs=1, plus the relevant seedHash.
-  private static final long PRE0_LO6 =  0X00_00_1A_00_00_03_03_01L; //no SI flag, requires not empty
-  private static final long PRE0_MASK = 0XFF_FF_DF_FF_FF_FF_FF_FFL; //removes SI flag
+  // For backward compatibility, a candidate pre0_ long must have:
+  // Flags byte 5 must be Ordered, Compact, NOT Empty, Read Only, LittleEndian = 11010 = 0x1A.
+  // Flags mask will be 0x1F.
+  // SingleItem flag may not be set due to a historical bug, so we can't depend on it for now.
+  // However, if the above flags are correct, preLongs == 1, SerVer >= 3, FamilyID == 3,
+  // and the hash seed matches, it is virtually guaranteed that we have a SingleItem Sketch.
 
-  private final long[] arr = new long[2];
+  private static final long PRE0_LO6_SI   = 0X00_00_3A_00_00_03_03_01L; //with SI flag
 
-  //use to test a candidate pre0 given a seed
-  static boolean testPre0Seed(final long candidate, final long seed) {
-    final long seedHash = computeSeedHash(seed) & 0xFFFFL;
-    return testPre0SeedHash(candidate, seedHash);
-  }
+  private long pre0_ = 0;
+  private long hash_ = 0;
 
-  //use to test a candidate pre0 given a seedHash
-  static boolean testPre0SeedHash(final long candidate, final long seedHash) {
-    final long test1 = (seedHash << 48) | PRE0_LO6; //no SI bit
-    final long test2 = test1 | ((long)SINGLEITEM_FLAG_MASK << 40); //adds the SI bit
-    final long mask = PRE0_MASK; //ignores the SI flag
-    final long masked = candidate & mask;
-    return (masked == test1) || (candidate == test2);
-  }
 
-  //Internal Constructor. All checking has been done, assumes default seed
+  //Internal Constructor. All checking & hashing has been done, assumes default seed
   private SingleItemSketch(final long hash) {
-    arr[0] = (DEFAULT_SEED_HASH << 48) | PRE0_LO6 | ((long)SINGLEITEM_FLAG_MASK << 40);
-    arr[1] = hash;
+    pre0_ = (DEFAULT_SEED_HASH << 48) | PRE0_LO6_SI;
+    hash_ = hash;
   }
 
-  //Internal Constructor.All checking has been done, given the relevant seed
+  //All checking & hashing has been done, given the relevant seed
   SingleItemSketch(final long hash, final long seed) {
     final long seedHash = computeSeedHash(seed) & 0xFFFFL;
-    arr[0] = (seedHash << 48) | PRE0_LO6 | ((long)SINGLEITEM_FLAG_MASK << 40);
-    arr[1] = hash;
+    pre0_ = (seedHash << 48) | PRE0_LO6_SI;
+    hash_ = hash;
   }
 
-  //All checking has been done, given the relevant seedHash
+  //All checking & hashing has been done, given the relevant seedHash
   SingleItemSketch(final long hash, final short seedHash) {
     final long seedH = seedHash & 0xFFFFL;
-    arr[0] = (seedH << 48) | PRE0_LO6 | ((long)SINGLEITEM_FLAG_MASK << 40);
-    arr[1] = hash;
+    pre0_ = (seedH << 48) | PRE0_LO6_SI;
+    hash_ = hash;
   }
 
   /**
@@ -93,22 +86,16 @@ public final class SingleItemSketch extends CompactSketch {
   /**
    * Creates a SingleItemSketch on the heap given a SingleItemSketch Memory image and a seed.
    * Checks the seed hash of the given Memory against a hash of the given seed.
-   * @param srcMem the Memory to be heapified
+   * @param srcMem the Memory to be heapified.
    * @param seed a given hash seed
    * @return a SingleItemSketch
    */
   public static SingleItemSketch heapify(final Memory srcMem, final long seed) {
-    final long memPre0 = srcMem.getLong(0);
-    final short seedHashMem = srcMem.getShort(6);
-    final short seedHashCk = computeSeedHash(seed);
-    checkSeedHashes(seedHashMem, seedHashCk);
-    if (testPre0SeedHash(memPre0, seedHashCk)) {
-      return new SingleItemSketch(srcMem.getLong(8), seedHashCk);
+    final short seedHashMem = checkMemorySeedHash(srcMem, seed);
+    if (isSingleItem(srcMem)) {
+      return new SingleItemSketch(srcMem.getLong(8), seedHashMem);
     }
-    final long def = (((long)seedHashCk << 48) | PRE0_LO6);
-    throw new SketchesArgumentException("Input Memory does not match required Preamble. " + LS
-        + "Memory    Pre0 : " + Long.toHexString(memPre0) + LS
-        + "Should be Pre0 : " + Long.toHexString(def));
+    throw new SketchesArgumentException("Input Memory Preamble is not a SingleItemSketch.");
   }
 
   //Create methods using the default seed
@@ -302,7 +289,7 @@ public final class SingleItemSketch extends CompactSketch {
   }
 
   /**
-   * Create this sketch with the given long array and a seed.
+   * Create this sketch with the given long array (as an item) and a seed.
    * If the long array is null or empty no create attempt is made and the method returns null.
    *
    * @param data The given long array.
@@ -318,7 +305,7 @@ public final class SingleItemSketch extends CompactSketch {
 
   @Override
   public int getCountLessThanTheta(final double theta) {
-    return (arr[1] < (theta * MAX_THETA_LONG_AS_DOUBLE)) ? 1 : 0;
+    return (hash_ < (theta * MAX_THETA_LONG_AS_DOUBLE)) ? 1 : 0;
   }
 
   @Override
@@ -333,7 +320,7 @@ public final class SingleItemSketch extends CompactSketch {
 
   @Override
   public HashIterator iterator() {
-    return new HeapHashIterator(new long[] { arr[1] }, 1, Long.MAX_VALUE);
+    return new HeapHashIterator(new long[] { hash_ }, 1, Long.MAX_VALUE);
   }
 
   @Override
@@ -379,10 +366,8 @@ public final class SingleItemSketch extends CompactSketch {
   @Override
   public byte[] toByteArray() {
     final byte[] out = new byte[16];
-    for (int i = 0; i < 8; i++) {
-      out[i]     = (byte) (arr[0] >>> (i * 8));
-      out[i + 8] = (byte) (arr[1] >>> (i * 8));
-    }
+    putLongLE(out, 0, pre0_);
+    putLongLE(out, 8, hash_);
     return out;
   }
 
@@ -390,7 +375,7 @@ public final class SingleItemSketch extends CompactSketch {
 
   @Override
   long[] getCache() {
-    return new long[] { arr[1] };
+    return new long[] { hash_ };
   }
 
   @Override
@@ -405,7 +390,7 @@ public final class SingleItemSketch extends CompactSketch {
 
   @Override
   short getSeedHash() {
-    return (short) (arr[0] >>> 48);
+    return (short) (pre0_ >>> 48);
   }
 
 }
diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java
index 50a1544..75ded37 100644
--- a/src/main/java/org/apache/datasketches/theta/Sketch.java
+++ b/src/main/java/org/apache/datasketches/theta/Sketch.java
@@ -24,7 +24,6 @@ import static org.apache.datasketches.HashOperations.count;
 import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
 import static org.apache.datasketches.Util.LS;
 import static org.apache.datasketches.Util.ceilingPowerOf2;
-import static org.apache.datasketches.Util.computeSeedHash;
 import static org.apache.datasketches.Util.zeroPad;
 import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
 import static org.apache.datasketches.theta.PreambleUtil.FAMILY_BYTE;
@@ -34,7 +33,8 @@ import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
 import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
 import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
 import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
-import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
+import static org.apache.datasketches.theta.PreambleUtil.isSingleItem;
 
 import org.apache.datasketches.BinomialBoundsN;
 import org.apache.datasketches.Family;
@@ -136,14 +136,14 @@ public abstract class Sketch {
       }
       case COMPACT: { //serVer 1, 2, or 3, preLongs = 1, 2, or 3
         if (serVer == 3) {
-          final long cap = srcMem.getCapacity();
-          if (cap < 16) { //EMPTY?
+          if (srcMem.getCapacity() < 16) { //EMPTY
             return EmptyCompactSketch.getInstance(srcMem);
           }
-          if (cap == 16) { //SINGLEITEM?
+
+          if (isSingleItem(srcMem)) { //SINGLEITEM?
             return SingleItemSketch.heapify(srcMem, seed);
           }
-          //not empty & not singleItem, assume cap > 16
+          //not empty & not singleItem
           final int flags = srcMem.getByte(FLAGS_BYTE);
           final boolean orderedFlag = (flags & ORDERED_FLAG_MASK) > 0;
           final boolean compactFlag = (flags & COMPACT_FLAG_MASK) > 0;
@@ -677,15 +677,14 @@ public abstract class Sketch {
           throw new SketchesArgumentException(
               "Corrupted: COMPACT family sketch image must have Read-Only flag set");
         }
-        final boolean empty = PreambleUtil.isEmpty(srcMem); //also checks memCap < 16
-        if (empty) { //empty flag or < 16 bytes
+        final boolean empty = PreambleUtil.isEmpty(srcMem);
+        if (empty) { //TODO
           return EmptyCompactSketch.getInstance(srcMem);
         }
-        //cap >= 16 and not emptyFlag. Note older sketches may have missing empty flag.
+        checkMemorySeedHash(srcMem, seed);
+        //cap >= 16 and not emptyFlag. Note very old sketches (<2014) may have missing empty flag.
         if (preLongs == 1) {
-          final short memSeedHash = (short) extractSeedHash(srcMem);
-          final short computedSeedHash = computeSeedHash(seed);
-          if ((memSeedHash == computedSeedHash) && orderedFlag) { //SINGLE ITEM
+          if (isSingleItem(srcMem)) { //SINGLE ITEM
             return SingleItemSketch.heapify(srcMem, seed);
           } else { //EMPTY
             return EmptyCompactSketch.getInstance(srcMem);
diff --git a/src/main/java/org/apache/datasketches/theta/UnionImpl.java b/src/main/java/org/apache/datasketches/theta/UnionImpl.java
index dcb5ccd..ebaae45 100644
--- a/src/main/java/org/apache/datasketches/theta/UnionImpl.java
+++ b/src/main/java/org/apache/datasketches/theta/UnionImpl.java
@@ -38,6 +38,7 @@ import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
 import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
 import static org.apache.datasketches.theta.PreambleUtil.extractUnionThetaLong;
 import static org.apache.datasketches.theta.PreambleUtil.insertUnionThetaLong;
+import static org.apache.datasketches.theta.PreambleUtil.isSingleItem;
 
 import org.apache.datasketches.Family;
 import org.apache.datasketches.HashOperations;
@@ -276,6 +277,10 @@ final class UnionImpl extends Union {
     }
     //sketchIn is valid and not empty
     Util.checkSeedHashes(seedHash_, sketchIn.getSeedHash());
+    if (sketchIn instanceof SingleItemSketch) {
+      gadget_.hashUpdate(sketchIn.getCache()[0]);
+      return;
+    }
     Sketch.checkSketchAndMemoryFlags(sketchIn);
 
     unionThetaLong_ = min(min(unionThetaLong_, sketchIn.getThetaLong()), gadget_.getThetaLong()); //Theta rule
@@ -330,7 +335,7 @@ final class UnionImpl extends Union {
     final int serVer = extractSerVer(skMem);
     final int fam = extractFamilyID(skMem);
 
-    if (serVer == 3) { //The OpenSource sketches (Aug 4, 2015)
+    if (serVer >= 3) { //The OpenSource sketches (Aug 4, 2015) starts with serVer = 3
       if ((fam < 1) || (fam > 3)) {
         throw new SketchesArgumentException(
             "Family must be Alpha, QuickSelect, or Compact: " + Family.idToFamily(fam));
@@ -339,11 +344,6 @@ final class UnionImpl extends Union {
       return;
     }
 
-    if (fam != 3) { //In older sketches this family was called the SetSketch
-      throw new SketchesArgumentException(
-          "Family must be old SET_SKETCH (now COMPACT) = 3: " + Family.idToFamily(fam));
-    }
-
     if (serVer == 2) { //older Sketch, which is compact and ordered
       Util.checkSeedHashes(seedHash_, (short)extractSeedHash(skMem));
       processVer2(skMem);
@@ -359,31 +359,26 @@ final class UnionImpl extends Union {
   }
 
   //Has seedHash, p, could have 0 entries & theta < 1.0,
-  //could be unordered, ordered, compact, or not compact, size >= 16,
+  //could be unordered, ordered, compact, or not compact,
   //could be Alpha, QuickSelect, or Compact.
   private void processVer3(final Memory skMem) {
     final int preLongs = extractPreLongs(skMem);
 
-    if (preLongs == 1) { //we know cap >= 16
-      //This test requires compact, ordered, notEmpty, ReadOnly, LE, seedHash is OK;
-      // OR the above and the SI bit is set
-      if (SingleItemSketch.testPre0SeedHash(skMem.getLong(0), seedHash_)) {
+    if (preLongs == 1) {
+      if (isSingleItem(skMem)) {
         final long hash = skMem.getLong(8);
-        //backdoor update, hash function is bypassed. A hash < 1 will be rejected later
         gadget_.hashUpdate(hash);
         return;
       }
       return; //empty
     }
-
     Util.checkSeedHashes(seedHash_, (short)extractSeedHash(skMem));
-
     final int curCountIn;
     final long thetaLongIn;
 
     if (preLongs == 2) { //exact mode
       curCountIn = extractCurCount(skMem);
-      if (curCountIn == 0) { return; } //should be > 0, but if it is return empty anyway.
+      if (curCountIn == 0) { return; } //should be > 0, but if it is 0 return empty anyway.
       thetaLongIn = Long.MAX_VALUE;
     }
 
@@ -431,6 +426,10 @@ final class UnionImpl extends Union {
   //has seedHash and p, could have 0 entries & theta,
   // can only be compact, ordered, size >= 8
   private void processVer2(final Memory skMem) {
+    final int famId = extractFamilyID(skMem);
+    if (famId != 3) {
+      throw new SketchesArgumentException("Invalid Family ID: " + famId + ". It should be 3");
+    }
     final int preLongs = extractPreLongs(skMem);
 
     if (preLongs == 1) { //does not change anything, return empty
@@ -476,6 +475,10 @@ final class UnionImpl extends Union {
   //no seedHash, assumes given seed is correct. No p, no empty flag, no concept of direct
   // can only be compact, ordered, size > 24
   private void processVer1(final Memory skMem, final int cap) {
+    final int famId = extractFamilyID(skMem);
+    if (famId != 3) {
+      throw new SketchesArgumentException("Invalid Family ID: " + famId + ". It should be 3");
+    }
     final long thetaLongIn = skMem.getLong(THETA_LONG);
     final int curCountIn = extractCurCount(skMem);
     if ((cap <= 24) || ((curCountIn == 0) && (unionThetaLong_ == Long.MAX_VALUE))) {
diff --git a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
index 308fee5..7236aa9 100644
--- a/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/UpdateSketch.java
@@ -34,10 +34,10 @@ import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
 import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
 import static org.apache.datasketches.theta.PreambleUtil.SER_VER;
 import static org.apache.datasketches.theta.PreambleUtil.SER_VER_BYTE;
+import static org.apache.datasketches.theta.PreambleUtil.checkMemorySeedHash;
 import static org.apache.datasketches.theta.PreambleUtil.extractFamilyID;
 import static org.apache.datasketches.theta.PreambleUtil.extractFlags;
 import static org.apache.datasketches.theta.PreambleUtil.extractP;
-import static org.apache.datasketches.theta.PreambleUtil.extractSeedHash;
 import static org.apache.datasketches.theta.PreambleUtil.extractSerVer;
 import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
 import static org.apache.datasketches.theta.PreambleUtil.getMemBytes;
@@ -467,7 +467,7 @@ public abstract class UpdateSketch extends Sketch {
     }
 
     //Check seed hashes
-    final short seedHash = (short)extractSeedHash(srcMem);              //byte 6,7
+    final short seedHash = checkMemorySeedHash(srcMem, seed);              //byte 6,7
     Util.checkSeedHashes(seedHash, Util.computeSeedHash(seed));
 
     //Check mem capacity, lgArrLongs
diff --git a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java
index ebb6ef5..f294edb 100644
--- a/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java
+++ b/src/test/java/org/apache/datasketches/theta/CompactSketchTest.java
@@ -25,13 +25,12 @@ import static org.testng.Assert.assertNotNull;
 import static org.testng.Assert.assertNull;
 import static org.testng.Assert.assertTrue;
 
-import org.testng.annotations.Test;
-
+import org.apache.datasketches.Family;
+import org.apache.datasketches.SketchesArgumentException;
 import org.apache.datasketches.memory.Memory;
 import org.apache.datasketches.memory.WritableDirectHandle;
 import org.apache.datasketches.memory.WritableMemory;
-import org.apache.datasketches.Family;
-import org.apache.datasketches.SketchesArgumentException;
+import org.testng.annotations.Test;
 
 /**
  * @author Lee Rhodes
@@ -256,10 +255,10 @@ public class CompactSketchTest {
   public void checkHeapifyEmptySketch() {
     UpdateSketch sk = Sketches.updateSketchBuilder().build();
     WritableMemory wmem = WritableMemory.allocate(16); //extra bytes
-    sk.compact(false, wmem);
-    PreambleUtil.clearEmpty(wmem); //corrupt to simulate missing empty flag
-    Sketch csk = Sketch.heapify(wmem);
+    CompactSketch csk = sk.compact(false, wmem);
     assertTrue(csk instanceof EmptyCompactSketch);
+    Sketch csk2 = Sketch.heapify(wmem);
+    assertTrue(csk2 instanceof EmptyCompactSketch);
   }
 
   @Test
diff --git a/src/test/java/org/apache/datasketches/theta/EmptyTest.java b/src/test/java/org/apache/datasketches/theta/EmptyTest.java
index ba8217b..001df72 100644
--- a/src/test/java/org/apache/datasketches/theta/EmptyTest.java
+++ b/src/test/java/org/apache/datasketches/theta/EmptyTest.java
@@ -19,18 +19,12 @@
 
 package org.apache.datasketches.theta;
 
-import static org.apache.datasketches.theta.PreambleUtil.COMPACT_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.ORDERED_FLAG_MASK;
-import static org.apache.datasketches.theta.PreambleUtil.READ_ONLY_FLAG_MASK;
 import static org.testng.Assert.assertEquals;
 import static org.testng.Assert.assertFalse;
 import static org.testng.Assert.assertTrue;
 
 import org.testng.annotations.Test;
 
-import org.apache.datasketches.memory.Memory;
-import org.apache.datasketches.memory.WritableMemory;
-
 
 /**
  * Empty essentially means that the sketch has never seen data.
@@ -111,45 +105,6 @@ public class EmptyTest {
     assertEquals(sk1.compact().toByteArray().length, 8);
   }
 
-  //These 3 tests reproduce a failure mode where an "old" empty sketch of 8 bytes without
-  // its empty-flag bit set is read.
-  @Test
-  public void checkBackwardCompatibility1() {
-    final int k = 16;
-    final int bytes = Sketches.getMaxUnionBytes(k); //288
-    Union union = SetOperation.builder().buildUnion(WritableMemory.allocate(bytes));
-    Memory mem = badEmptySk();
-    Sketch wsk = Sketches.wrapSketch(mem);
-    union.update(wsk); //union has memory
-  }
-
-  @Test
-  public void checkBackwardCompatibility2() {
-    Union union = SetOperation.builder().setNominalEntries(16).buildUnion();
-    Memory mem = badEmptySk();
-    Sketch wsk = Sketches.wrapSketch(mem);
-    union.update(wsk); //heap union
-  }
-
-  @Test
-  public void checkBackwardCompatibility3() {
-    Memory mem = badEmptySk();
-    Sketches.heapifySketch(mem);
-  }
-
-  private static Memory badEmptySk() { //missing the empty bit
-    final long preLongs = 1;
-    final long serVer = 3;
-    final long family = 3; //compact
-    final long flags = (ORDERED_FLAG_MASK | COMPACT_FLAG_MASK | READ_ONLY_FLAG_MASK);
-    final long seedHash = 0x93CC;
-    final long badEmptySk = (seedHash << 48) | (flags << 40)
-        | (family << 16) | (serVer << 8) | preLongs;
-    WritableMemory wmem =  WritableMemory.allocate(8);
-    wmem.putLong(0, badEmptySk);
-    return wmem;
-  }
-
   /**
    * @param s value to print
    */
diff --git a/src/test/java/org/apache/datasketches/theta/SingleItemSketchTest.java b/src/test/java/org/apache/datasketches/theta/SingleItemSketchTest.java
index 76d3413..cb73501 100644
--- a/src/test/java/org/apache/datasketches/theta/SingleItemSketchTest.java
+++ b/src/test/java/org/apache/datasketches/theta/SingleItemSketchTest.java
@@ -23,18 +23,16 @@ import static org.apache.datasketches.Util.DEFAULT_UPDATE_SEED;
 import static org.apache.datasketches.Util.computeSeedHash;
 import static org.apache.datasketches.hash.MurmurHash3.hash;
 import static org.apache.datasketches.theta.PreambleUtil.MAX_THETA_LONG_AS_DOUBLE;
-import static org.apache.datasketches.theta.PreambleUtil.SINGLEITEM_FLAG_MASK;
 import static org.testng.Assert.assertEquals;
 import static org.testng.Assert.assertFalse;
 import static org.testng.Assert.assertNull;
 import static org.testng.Assert.assertTrue;
 import static org.testng.Assert.fail;
 
-import org.testng.annotations.Test;
-
+import org.apache.datasketches.SketchesArgumentException;
 import org.apache.datasketches.memory.Memory;
 import org.apache.datasketches.memory.WritableMemory;
-import org.apache.datasketches.SketchesArgumentException;
+import org.testng.annotations.Test;
 
 /**
  * @author Lee Rhodes
@@ -173,16 +171,6 @@ public class SingleItemSketchTest {
     assertEquals(sis.getCurrentPreambleLongs(true), 1);
   }
 
-  @Test//(expectedExceptions = SketchesArgumentException.class)
-  public void testPre0Seed() {
-    long pre0_lo6 = 0X00_00_1A_00_00_03_03_01L; //no SI flag, requires not empty
-    long pre0 = ((long)DEFAULT_SEED_HASH << 48) | pre0_lo6;
-    assertTrue(SingleItemSketch.testPre0Seed(pre0, DEFAULT_UPDATE_SEED));
-    //add SI flag
-    pre0 |= ((long)SINGLEITEM_FLAG_MASK << 40);
-    assertTrue(SingleItemSketch.testPre0Seed(pre0, DEFAULT_UPDATE_SEED));
-  }
-
   @Test
   public void unionWrapped() {
     Sketch sketch = SingleItemSketch.create(1);
@@ -328,6 +316,36 @@ public class SingleItemSketchTest {
   }
 
   @Test
+  public void checkDirectUnionSingleItem2() {
+    Sketch sk = Sketch.wrap(siSkWoutSiFlag24Bytes());
+    assertEquals(sk.getEstimate(), 1.0, 0.0);
+    //println(sk.toString());
+    sk = Sketch.wrap(siSkWithSiFlag24Bytes());
+    assertEquals(sk.getEstimate(), 1.0, 0.0);
+    //println(sk.toString());
+  }
+
+  static final long SiSkPre0WithSiFlag = 0x93cc3a0000030301L;
+  static final long SiSkPre0WoutSiFlag = 0x93cc1a0000030301L;
+  static final long Hash = 0x05a186bdcb7df915L;
+
+  static Memory siSkWithSiFlag24Bytes() {
+    int cap = 24;
+    WritableMemory wmem = WritableMemory.allocate(cap);
+    wmem.putLong(0, SiSkPre0WithSiFlag);
+    wmem.putLong(8, Hash);
+    return wmem;
+  }
+
+  static Memory siSkWoutSiFlag24Bytes() {
+    int cap = 24;
+    WritableMemory wmem = WritableMemory.allocate(cap);
+    wmem.putLong(0, SiSkPre0WoutSiFlag);
+    wmem.putLong(8, Hash);
+    return wmem;
+  }
+
+  @Test
   public void printlnTest() {
     println("PRINTING: "+this.getClass().getName());
   }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org