You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by le...@apache.org on 2020/02/13 22:46:51 UTC

[incubator-datasketches-java] branch ThetaGetEstimate created (now 806a514)

This is an automated email from the ASF dual-hosted git repository.

leerho pushed a change to branch ThetaGetEstimate
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git.


      at 806a514  Improved Theta getEstimate() speed performance.

This branch includes the following new commits:

     new 0d3bbc6  interim update
     new 806a514  Improved Theta getEstimate() speed performance.

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-java] 01/02: interim update

Posted by le...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch ThetaGetEstimate
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit 0d3bbc6c56c2ec882c9f868c33e29a3d3483c0c8
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Fri Feb 7 15:21:00 2020 -0800

    interim update
---
 .../java/org/apache/datasketches/WhichSketch.java  | 37 ++++++++++++++++++++++
 .../datasketches/theta/SingleItemSketch.java       |  4 +--
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/WhichSketch.java b/src/main/java/org/apache/datasketches/WhichSketch.java
new file mode 100644
index 0000000..e10bddc
--- /dev/null
+++ b/src/main/java/org/apache/datasketches/WhichSketch.java
@@ -0,0 +1,37 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches;
+
+/**
+ * @author Lee Rhodes
+ */
+public class WhichSketch {
+
+  public static String whichSketch(final byte[] sketchBytes) {
+
+
+
+    return null;
+  }
+
+
+
+
+}
diff --git a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
index ef08498..2d56a12 100644
--- a/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/SingleItemSketch.java
@@ -66,14 +66,14 @@ public final class SingleItemSketch extends CompactSketch {
     arr[1] = hash;
   }
 
-  //Internal Constructor.All checking has been done, give the relevant seed
+  //Internal Constructor.All checking has been done, given the relevant seed
   SingleItemSketch(final long hash, final long seed) {
     final long seedHash = computeSeedHash(seed) & 0xFFFFL;
     arr[0] = (seedHash << 48) | PRE0_LO6 | ((long)SINGLEITEM_FLAG_MASK << 40);
     arr[1] = hash;
   }
 
-  //All checking has been done, give the relevant seed
+  //All checking has been done, given the relevant seedHash
   SingleItemSketch(final long hash, final short seedHash) {
     final long seedH = seedHash & 0xFFFFL;
     arr[0] = (seedH << 48) | PRE0_LO6 | ((long)SINGLEITEM_FLAG_MASK << 40);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


[incubator-datasketches-java] 02/02: Improved Theta getEstimate() speed performance.

Posted by le...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

leerho pushed a commit to branch ThetaGetEstimate
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-java.git

commit 806a514ac100da15057338259233df965f933647
Author: Lee Rhodes <le...@users.noreply.github.com>
AuthorDate: Thu Feb 13 14:45:50 2020 -0800

    Improved Theta getEstimate() speed performance.
---
 .../org/apache/datasketches/theta/DirectCompactSketch.java  | 12 ++++++++++--
 .../apache/datasketches/theta/DirectQuickSelectSketchR.java |  9 +++++++++
 .../org/apache/datasketches/theta/EmptyCompactSketch.java   |  9 ++++++---
 .../java/org/apache/datasketches/theta/HeapAlphaSketch.java |  9 +++------
 .../org/apache/datasketches/theta/HeapCompactSketch.java    |  9 +++++++--
 .../apache/datasketches/theta/HeapQuickSelectSketch.java    | 13 +++++++++----
 src/main/java/org/apache/datasketches/theta/Sketch.java     | 12 +++---------
 src/main/java/org/apache/datasketches/theta/Sketches.java   |  2 +-
 .../java/org/apache/datasketches/theta/HeapAnotBTest.java   |  5 ++---
 9 files changed, 50 insertions(+), 30 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java
index 8a4b6bf..03543b5 100644
--- a/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/DirectCompactSketch.java
@@ -49,8 +49,11 @@ abstract class DirectCompactSketch extends CompactSketch {
   }
 
   @Override
-  public HashIterator iterator() {
-    return new MemoryHashIterator(mem_, getRetainedEntries(), getThetaLong());
+  public double getEstimate() {
+    final int curCount = extractCurCount(mem_);
+    final int preLongs = extractPreLongs(mem_);
+    final long thetaLong = (preLongs > 2) ? extractThetaLong(mem_) : Long.MAX_VALUE;
+    return Sketch.estimate(thetaLong, curCount);
   }
 
   //overidden by EmptyCompactSketch and SingleItemSketch
@@ -87,6 +90,11 @@ abstract class DirectCompactSketch extends CompactSketch {
   }
 
   @Override
+  public HashIterator iterator() {
+    return new MemoryHashIterator(mem_, getRetainedEntries(), getThetaLong());
+  }
+
+  @Override
   public byte[] toByteArray() {
     return
         compactMemoryToByteArray(mem_, getCurrentPreambleLongs(true), getRetainedEntries(true));
diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java
index c50e677..38421ff 100644
--- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java
+++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java
@@ -28,9 +28,11 @@ import static org.apache.datasketches.theta.PreambleUtil.PREAMBLE_LONGS_BYTE;
 import static org.apache.datasketches.theta.PreambleUtil.P_FLOAT;
 import static org.apache.datasketches.theta.PreambleUtil.RETAINED_ENTRIES_INT;
 import static org.apache.datasketches.theta.PreambleUtil.THETA_LONG;
+import static org.apache.datasketches.theta.PreambleUtil.extractCurCount;
 import static org.apache.datasketches.theta.PreambleUtil.extractLgArrLongs;
 import static org.apache.datasketches.theta.PreambleUtil.extractLgNomLongs;
 import static org.apache.datasketches.theta.PreambleUtil.extractPreLongs;
+import static org.apache.datasketches.theta.PreambleUtil.extractThetaLong;
 
 import org.apache.datasketches.Family;
 import org.apache.datasketches.ResizeFactor;
@@ -119,6 +121,13 @@ class DirectQuickSelectSketchR extends UpdateSketch {
   }
 
   @Override
+  public double getEstimate() {
+    final int curCount = extractCurCount(mem_);
+    final long thetaLong = extractThetaLong(mem_);
+    return Sketch.estimate(thetaLong, curCount);
+  }
+
+  @Override
   public Family getFamily() {
     final int familyID = mem_.getByte(FAMILY_BYTE) & 0XFF;
     return Family.idToFamily(familyID);
diff --git a/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java b/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java
index 7f3e6e0..81d80f1 100644
--- a/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/EmptyCompactSketch.java
@@ -66,9 +66,7 @@ final class EmptyCompactSketch extends CompactSketch {
   }
 
   @Override
-  public HashIterator iterator() {
-    return new HeapHashIterator(new long[0], 0, Long.MAX_VALUE);
-  }
+  public double getEstimate() { return 0; }
 
   @Override
   public int getRetainedEntries(final boolean valid) {
@@ -100,6 +98,11 @@ final class EmptyCompactSketch extends CompactSketch {
     return true;
   }
 
+  @Override
+  public HashIterator iterator() {
+    return new HeapHashIterator(new long[0], 0, Long.MAX_VALUE);
+  }
+
   /**
    * Returns 8 bytes representing a CompactSketch that the following flags set:
    * ordered, compact, empty, readOnly. The SerVer is 3, the Family is COMPACT(3),
diff --git a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java
index 77e800e..21057d2 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapAlphaSketch.java
@@ -166,12 +166,9 @@ final class HeapAlphaSketch extends HeapUpdateSketch {
 
   @Override
   public double getEstimate() {
-    if (isEstimationMode()) {
-      final int curCount = getRetainedEntries(true);
-      final double theta = getTheta();
-      return (thetaLong_ > split1_) ? curCount / theta : (1 << lgNomLongs_) / theta;
-    }
-    return curCount_;
+    return (thetaLong_ > split1_)
+        ? Sketch.estimate(thetaLong_, curCount_)
+        : (1 << lgNomLongs_) * (MAX_THETA_LONG_AS_DOUBLE / thetaLong_);
   }
 
   @Override
diff --git a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java
index eb4831d..be41215 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapCompactSketch.java
@@ -68,8 +68,8 @@ abstract class HeapCompactSketch extends CompactSketch {
   }
 
   @Override
-  public HashIterator iterator() {
-    return new HeapHashIterator(cache_, cache_.length, thetaLong_);
+  public double getEstimate() {
+    return Sketch.estimate(thetaLong_, curCount_);
   }
 
   @Override
@@ -97,6 +97,11 @@ abstract class HeapCompactSketch extends CompactSketch {
     return empty_;
   }
 
+  @Override
+  public HashIterator iterator() {
+    return new HeapHashIterator(cache_, cache_.length, thetaLong_);
+  }
+
   //restricted methods
 
   @Override
diff --git a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
index bff7d0b..cb40feb 100644
--- a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
+++ b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java
@@ -142,13 +142,13 @@ class HeapQuickSelectSketch extends HeapUpdateSketch {
   //Sketch
 
   @Override
-  public Family getFamily() {
-    return MY_FAMILY;
+  public double getEstimate() {
+    return Sketch.estimate(thetaLong_, curCount_);
   }
 
   @Override
-  public HashIterator iterator() {
-    return new HeapHashIterator(cache_, 1 << lgArrLongs_, thetaLong_);
+  public Family getFamily() {
+    return MY_FAMILY;
   }
 
   @Override
@@ -167,6 +167,11 @@ class HeapQuickSelectSketch extends HeapUpdateSketch {
   }
 
   @Override
+  public HashIterator iterator() {
+    return new HeapHashIterator(cache_, 1 << lgArrLongs_, thetaLong_);
+  }
+
+  @Override
   public byte[] toByteArray() {
     return toByteArray(preambleLongs_, (byte) MY_FAMILY.getID());
   }
diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java
index a1188ca..50a1544 100644
--- a/src/main/java/org/apache/datasketches/theta/Sketch.java
+++ b/src/main/java/org/apache/datasketches/theta/Sketch.java
@@ -229,9 +229,7 @@ public abstract class Sketch {
    * Gets the unique count estimate.
    * @return the sketch's best estimate of the cardinality of the input stream.
    */
-  public double getEstimate() {
-    return estimate(getThetaLong(), getRetainedEntries(true), isEmpty());
-  }
+  public abstract double getEstimate();
 
   /**
    * Returns the Family that this sketch belongs to
@@ -618,12 +616,8 @@ public abstract class Sketch {
     return ((curCount == 0) && (thetaLong == Long.MAX_VALUE));
   }
 
-  static final double estimate(final long thetaLong, final int curCount, final boolean empty) {
-    if (estMode(thetaLong, empty)) {
-      final double theta = thetaLong / MAX_THETA_LONG_AS_DOUBLE;
-      return curCount / theta;
-    }
-    return curCount;
+  static final double estimate(final long thetaLong, final int curCount) {
+    return curCount * (MAX_THETA_LONG_AS_DOUBLE / thetaLong);
   }
 
   static final double lowerBound(final int curCount, final long thetaLong, final int numStdDev,
diff --git a/src/main/java/org/apache/datasketches/theta/Sketches.java b/src/main/java/org/apache/datasketches/theta/Sketches.java
index 4f5ebe5..749cfbf 100644
--- a/src/main/java/org/apache/datasketches/theta/Sketches.java
+++ b/src/main/java/org/apache/datasketches/theta/Sketches.java
@@ -293,7 +293,7 @@ public final class Sketches {
    */
   public static double getEstimate(final Memory srcMem) {
     checkIfValidThetaSketch(srcMem);
-    return Sketch.estimate(getThetaLong(srcMem), getRetainedEntries(srcMem), getEmpty(srcMem));
+    return Sketch.estimate(getThetaLong(srcMem), getRetainedEntries(srcMem));
   }
 
   /**
diff --git a/src/test/java/org/apache/datasketches/theta/HeapAnotBTest.java b/src/test/java/org/apache/datasketches/theta/HeapAnotBTest.java
index f8ccdb6..2f1ae23 100644
--- a/src/test/java/org/apache/datasketches/theta/HeapAnotBTest.java
+++ b/src/test/java/org/apache/datasketches/theta/HeapAnotBTest.java
@@ -25,11 +25,10 @@ import static org.testng.Assert.assertFalse;
 import static org.testng.Assert.assertNull;
 import static org.testng.Assert.assertTrue;
 
-import org.testng.annotations.Test;
-
-import org.apache.datasketches.memory.WritableMemory;
 import org.apache.datasketches.Family;
 import org.apache.datasketches.Util;
+import org.apache.datasketches.memory.WritableMemory;
+import org.testng.annotations.Test;
 
 /**
  * @author Lee Rhodes


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org