You are viewing a plain text version of this content. The canonical link to the original message was not preserved in this plain text rendering.
Posted to commits@datasketches.apache.org by al...@apache.org on 2024/02/22 23:35:51 UTC

(datasketches-java) branch tdigest updated: renamed, added float deserialization and test

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tdigest
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git


The following commit(s) were added to refs/heads/tdigest by this push:
     new 1664cda8 renamed, added float deserialization and test
1664cda8 is described below

commit 1664cda88c739f9b5ba20160d0a04c4d918b32d4
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Thu Feb 22 15:35:41 2024 -0800

    renamed, added float deserialization and test
---
 .../tdigest/{TDigest.java => TDigestDouble.java}   | 42 +++++++++++++---------
 .../tdigest/TDigestCrossLanguageTest.java          | 27 ++++++++++++--
 .../{TDigestTest.java => TDigestDoubleTest.java}   | 28 +++++++--------
 3 files changed, 64 insertions(+), 33 deletions(-)

diff --git a/src/main/java/org/apache/datasketches/tdigest/TDigest.java b/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java
similarity index 94%
rename from src/main/java/org/apache/datasketches/tdigest/TDigest.java
rename to src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java
index 3a5dad93..0d0b3172 100644
--- a/src/main/java/org/apache/datasketches/tdigest/TDigest.java
+++ b/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java
@@ -38,7 +38,7 @@ import org.apache.datasketches.quantilescommon.QuantilesAPI;
  * https://github.com/tdunning/t-digest
  * This implementation is similar to MergingDigest in the above implementation
  */
-public final class TDigest {
+public final class TDigestDouble {
 
   public static final boolean USE_ALTERNATING_SORT = true;
   public static final boolean USE_TWO_LEVEL_COMPRESSION = true;
@@ -70,11 +70,11 @@ public final class TDigest {
 
   enum flags { IS_EMPTY, REVERSE_MERGE };
 
-  public TDigest(final int k) {
+  public TDigestDouble(final int k) {
     this(false, k, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, null, null, 0);
   }
 
-  private TDigest(final boolean reverseMerge, final int k, final double min, final double max,
+  private TDigestDouble(final boolean reverseMerge, final int k, final double min, final double max,
       final double[] means, final long[] weights, final long weight) {
     reverseMerge_ = reverseMerge; 
     k_ = k;
@@ -99,13 +99,12 @@ public final class TDigest {
     bufferWeights_ = new long[bufferCapacity_];
     numCentroids_ = 0;
     numBuffered_ = 0;
-    centroidsWeight_ = 0;
+    centroidsWeight_ = weight;
     bufferedWeight_ = 0;
     if (weight > 0) {
       System.arraycopy(means, 0, centroidMeans_, 0, means.length);
       System.arraycopy(weights, 0, centroidWeights_, 0, weights.length);
       numCentroids_ = means.length;
-      centroidsWeight_ = weight;
     }
   }
 
@@ -124,7 +123,7 @@ public final class TDigest {
     maxValue_ = Math.max(maxValue_, value);
   }
 
-  public void merge(final TDigest other) {
+  public void merge(final TDigestDouble other) {
     if (other.isEmpty()) return;
     int num = numCentroids_ + numBuffered_ + other.numCentroids_ + other.numBuffered_;
     if (num <= bufferCapacity_) {
@@ -305,7 +304,11 @@ public final class TDigest {
     return bytes;
   }
 
-  public static TDigest heapify(final Memory mem) {
+  public static TDigestDouble heapify(final Memory mem) {
+    return heapify(mem, false);
+  }
+
+  public static TDigestDouble heapify(final Memory mem, final boolean isFloat) {
     final Buffer buff = mem.asBuffer();
     final byte preambleLongs = buff.getByte();
     final byte serialVersion = buff.getByte();
@@ -325,26 +328,33 @@ public final class TDigest {
       throw new SketchesArgumentException("Preamble longs mismatch: expected " + expectedPreambleLongs + ", actual " + preambleLongs);
     }
     buff.getShort(); // unused
-    if (isEmpty) return new TDigest(k);
+    if (isEmpty) return new TDigestDouble(k);
     final int numCentroids = buff.getInt();
     buff.getInt(); // unused
-    final double min = buff.getDouble();
-    final double max = buff.getDouble();
+    final double min;
+    final double max;
+    if (isFloat) {
+      min = buff.getFloat();
+      max = buff.getFloat();
+    } else {
+      min = buff.getDouble();
+      max = buff.getDouble();
+    }
     final double[] means = new double[numCentroids];
     final long[] weights = new long[numCentroids];
     long totalWeight = 0;
     for (int i = 0; i < numCentroids; i++) {
-      means[i] = buff.getDouble();
-      weights[i] = buff.getLong();
+      means[i] = isFloat ? buff.getFloat() : buff.getDouble();
+      weights[i] = isFloat ? buff.getInt() : buff.getLong();
       totalWeight += weights[i];
     }
     final boolean reverseMerge = (flagsByte & (1 << flags.REVERSE_MERGE.ordinal())) > 0;
-    return new TDigest(reverseMerge, k, min, max, means, weights, totalWeight);
+    return new TDigestDouble(reverseMerge, k, min, max, means, weights, totalWeight);
   }
 
   // compatibility with the format of the reference implementation
   // default byte order of ByteBuffer is used there, which is big endian
-  private static TDigest heapifyCompat(final Memory mem) {
+  private static TDigestDouble heapifyCompat(final Memory mem) {
     final Buffer buff = mem.asBuffer(ByteOrder.BIG_ENDIAN);
     final int type = buff.getInt();
     if (type != COMPAT_DOUBLE && type != COMPAT_FLOAT) {
@@ -363,7 +373,7 @@ public final class TDigest {
         means[i] = buff.getDouble();
         totalWeight += weights[i];
       }
-      return new TDigest(false, k, min, max, means, weights, totalWeight);
+      return new TDigestDouble(false, k, min, max, means, weights, totalWeight);
     }
     // COMPAT_FLOAT: compatibility with asSmallBytes()
     final double min = buff.getDouble(); // reference implementation uses doubles for min and max
@@ -381,7 +391,7 @@ public final class TDigest {
       means[i] = buff.getFloat();
       totalWeight += weights[i];
     }
-    return new TDigest(false, k, min, max, means, weights, totalWeight);
+    return new TDigestDouble(false, k, min, max, means, weights, totalWeight);
   }
 
   /**
diff --git a/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java b/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java
index 19c48b57..38a56f63 100644
--- a/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java
+++ b/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java
@@ -30,7 +30,6 @@ import java.io.IOException;
 import java.nio.file.Files;
 
 import org.apache.datasketches.memory.Memory;
-import org.testng.Assert;
 import org.testng.annotations.Test;
 
 public class TDigestCrossLanguageTest {
@@ -40,7 +39,29 @@ public class TDigestCrossLanguageTest {
     final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000};
     for (int n: nArr) {
       final byte[] bytes = Files.readAllBytes(cppPath.resolve("tdigest_double_n" + n + "_cpp.sk"));
-      final TDigest td = TDigest.heapify(Memory.wrap(bytes));
+      final TDigestDouble td = TDigestDouble.heapify(Memory.wrap(bytes));
+      assertTrue(n == 0 ? td.isEmpty() : !td.isEmpty());
+      assertEquals(td.getTotalWeight(), n);
+      if (n > 0) {
+        assertEquals(td.getMinValue(), 1);
+        assertEquals(td.getMaxValue(), n);
+        assertEquals(td.getRank(0), 0);
+        assertEquals(td.getRank(n + 1), 1);
+        if (n == 1) {
+          assertEquals(td.getRank(n), 0.5);
+        } else {
+          assertEquals(td.getRank(n / 2), 0.5, n * 0.01);
+        }
+      }
+    }
+  }
+
+  @Test(groups = {CHECK_CPP_FILES})
+  public void deserializeFromCppFloat() throws IOException {
+    final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000};
+    for (int n: nArr) {
+      final byte[] bytes = Files.readAllBytes(cppPath.resolve("tdigest_float_n" + n + "_cpp.sk"));
+      final TDigestDouble td = TDigestDouble.heapify(Memory.wrap(bytes), true);
       assertTrue(n == 0 ? td.isEmpty() : !td.isEmpty());
       assertEquals(td.getTotalWeight(), n);
       if (n > 0) {
@@ -61,7 +82,7 @@ public class TDigestCrossLanguageTest {
   public void generateForCppDouble() throws IOException {
     final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000};
     for (int n: nArr) {
-      final TDigest td = new TDigest(100);
+      final TDigestDouble td = new TDigestDouble(100);
       for (int i = 1; i <= n; i++) td.update(i);
       Files.newOutputStream(javaPath.resolve("tdigest_double_n" + n + "_java.sk")).write(td.toByteArray());
     }
diff --git a/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java b/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java
similarity index 87%
rename from src/test/java/org/apache/datasketches/tdigest/TDigestTest.java
rename to src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java
index 40182644..0baf7963 100644
--- a/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java
+++ b/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java
@@ -29,11 +29,11 @@ import org.apache.datasketches.common.TestUtil;
 import org.apache.datasketches.memory.Memory;
 import org.testng.annotations.Test;
 
-public class TDigestTest {
+public class TDigestDoubleTest {
 
   @Test
   public void empty() {
-    final TDigest td = new TDigest(100);
+    final TDigestDouble td = new TDigestDouble(100);
     assertTrue(td.isEmpty());
     assertEquals(td.getK(), 100);
     assertEquals(td.getTotalWeight(), 0);
@@ -45,7 +45,7 @@ public class TDigestTest {
 
   @Test
   public void oneValue() {
-    final TDigest td = new TDigest(100);
+    final TDigestDouble td = new TDigestDouble(100);
     td.update(1);
     assertFalse(td.isEmpty());
     assertEquals(td.getK(), 100);
@@ -62,7 +62,7 @@ public class TDigestTest {
 
   @Test
   public void manyValues() {
-    final TDigest td = new TDigest(100);
+    final TDigestDouble td = new TDigestDouble(100);
     final int n = 10000;
     for (int i = 0; i < n; i++) td.update(i);
 //    System.out.println(td.toString(true));
@@ -86,10 +86,10 @@ public class TDigestTest {
 
   @Test
   public void mergeSmall() {
-    final TDigest td1 = new TDigest(100);
+    final TDigestDouble td1 = new TDigestDouble(100);
     td1.update(1);
     td1.update(2);
-    final TDigest td2 = new TDigest(100);
+    final TDigestDouble td2 = new TDigestDouble(100);
     td2.update(2);
     td2.update(3);
     td1.merge(td2);
@@ -101,8 +101,8 @@ public class TDigestTest {
   @Test
   public void mergeLarge() {
     final int n = 10000;
-    final TDigest td1 = new TDigest(100);
-    final TDigest td2 = new TDigest(100);
+    final TDigestDouble td1 = new TDigestDouble(100);
+    final TDigestDouble td2 = new TDigestDouble(100);
     for (int i = 0; i < n / 2; i++) {
       td1.update(i);
       td2.update(n / 2 + i);
@@ -116,9 +116,9 @@ public class TDigestTest {
 
   @Test
   public void serializeDeserializeEmpty() {
-    final TDigest td1 = new TDigest(100);
+    final TDigestDouble td1 = new TDigestDouble(100);
     final byte[] bytes = td1.toByteArray();
-    final TDigest td2 = TDigest.heapify(Memory.wrap(bytes));
+    final TDigestDouble td2 = TDigestDouble.heapify(Memory.wrap(bytes));
     assertEquals(td2.getK(), td1.getK());
     assertEquals(td2.getTotalWeight(), td1.getTotalWeight());
     assertEquals(td2.isEmpty(), td1.isEmpty());
@@ -126,10 +126,10 @@ public class TDigestTest {
 
   @Test
   public void serializeDeserializeNonEmpty() {
-    final TDigest td1 = new TDigest(100);
+    final TDigestDouble td1 = new TDigestDouble(100);
     for (int i = 0; i < 10000; i++) td1.update(i);
     final byte[] bytes = td1.toByteArray();
-    final TDigest td2 = TDigest.heapify(Memory.wrap(bytes));
+    final TDigestDouble td2 = TDigestDouble.heapify(Memory.wrap(bytes));
     assertEquals(td2.getK(), td1.getK());
     assertEquals(td2.getTotalWeight(), td1.getTotalWeight());
     assertEquals(td2.isEmpty(), td1.isEmpty());
@@ -142,7 +142,7 @@ public class TDigestTest {
   @Test
   public void deserializeFromReferenceImplementationDouble() {
     final byte[] bytes = TestUtil.getResourceBytes("tdigest_ref_k100_n10000_double.sk");
-    final TDigest td = TDigest.heapify(Memory.wrap(bytes));
+    final TDigestDouble td = TDigestDouble.heapify(Memory.wrap(bytes));
     final int n = 10000;
     assertEquals(td.getTotalWeight(), n);
     assertEquals(td.getMinValue(), 0);
@@ -157,7 +157,7 @@ public class TDigestTest {
   @Test
   public void deserializeFromReferenceImplementationFloat() {
     final byte[] bytes = TestUtil.getResourceBytes("tdigest_ref_k100_n10000_float.sk");
-    final TDigest td = TDigest.heapify(Memory.wrap(bytes));
+    final TDigestDouble td = TDigestDouble.heapify(Memory.wrap(bytes));
     final int n = 10000;
     assertEquals(td.getTotalWeight(), n);
     assertEquals(td.getMinValue(), 0);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org