You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2024/02/21 01:19:24 UTC

(datasketches-java) branch tdigest updated: deserialize compatibility with ref implementation, cross-language test

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tdigest
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git


The following commit(s) were added to refs/heads/tdigest by this push:
     new 8f489cae deserialize compatibility with ref implementation, cross-language test
8f489cae is described below

commit 8f489cae5e0e555510f2e46ce241afd272bb535d
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Feb 20 17:19:13 2024 -0800

    deserialize compatibility with ref implementation, cross-language test
---
 .../org/apache/datasketches/tdigest/TDigest.java   |  46 +++++++++++++-
 .../tdigest/TDigestCrossLanguageTest.java          |  70 +++++++++++++++++++++
 .../apache/datasketches/tdigest/TDigestTest.java   |  31 +++++++++
 .../resources/tdigest_ref_k100_n10000_double.sk    | Bin 0 -> 976 bytes
 .../resources/tdigest_ref_k100_n10000_float.sk     | Bin 0 -> 502 bytes
 5 files changed, 146 insertions(+), 1 deletion(-)

diff --git a/src/main/java/org/apache/datasketches/tdigest/TDigest.java b/src/main/java/org/apache/datasketches/tdigest/TDigest.java
index 4bb8a9f5..3a5dad93 100644
--- a/src/main/java/org/apache/datasketches/tdigest/TDigest.java
+++ b/src/main/java/org/apache/datasketches/tdigest/TDigest.java
@@ -19,6 +19,7 @@
 
 package org.apache.datasketches.tdigest;
 
+import java.nio.ByteOrder;
 import java.util.function.Function;
 
 import org.apache.datasketches.common.SketchesArgumentException;
@@ -280,7 +281,7 @@ public final class TDigest {
   public byte[] toByteArray() {
     mergeBuffered(); // side effect
     final byte preambleLongs = isEmpty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NON_EMPTY;
-    final int sizeBytes = preambleLongs * Long.BYTES + 2 * Double.BYTES + (Double.BYTES + Long.BYTES) * numCentroids_;
+    int sizeBytes = preambleLongs * Long.BYTES + (isEmpty() ? 0 : 2 * Double.BYTES + (Double.BYTES + Long.BYTES) * numCentroids_);
     final byte[] bytes = new byte[sizeBytes];
     final WritableBuffer wbuf = WritableMemory.writableWrap(bytes).asWritableBuffer();
     wbuf.putByte(preambleLongs);
@@ -310,6 +311,7 @@ public final class TDigest {
     final byte serialVersion = buff.getByte();
     final byte sketchType = buff.getByte();
     if (sketchType != SKETCH_TYPE) {
+      if (preambleLongs == 0 && serialVersion == 0 && sketchType == 0) return heapifyCompat(mem);
       throw new SketchesArgumentException("Sketch type mismatch: expected " + SKETCH_TYPE + ", actual " + sketchType);
     }
     if (serialVersion != SERIAL_VERSION) {
@@ -340,6 +342,48 @@ public final class TDigest {
     return new TDigest(reverseMerge, k, min, max, means, weights, totalWeight);
   }
 
+  // Compatibility with the serialized format of the reference implementation.
+  // The default byte order of ByteBuffer is used there, which is big endian.
+  private static TDigest heapifyCompat(final Memory mem) {
+    final Buffer buff = mem.asBuffer(ByteOrder.BIG_ENDIAN); // buffer over mem with big-endian byte order
+    final int type = buff.getInt(); // first int read selects which layout is parsed below
+    if (type != COMPAT_DOUBLE && type != COMPAT_FLOAT) {
+      throw new SketchesArgumentException("unexpected compatibility type " + type);
+    }
+    if (type == COMPAT_DOUBLE) { // compatibility with asBytes()
+      final double min = buff.getDouble();
+      final double max = buff.getDouble();
+      final int k = (int) buff.getDouble(); // read as a double, truncated to int
+      final int numCentroids = buff.getInt(); // NOTE(review): not range-checked before sizing the arrays
+      final double[] means = new double[numCentroids];
+      final long[] weights = new long[numCentroids];
+      long totalWeight = 0;
+      for (int i = 0; i < numCentroids; i++) {
+        weights[i] = (long) buff.getDouble(); // weight is read before the mean; double truncated to long
+        means[i] = buff.getDouble();
+        totalWeight += weights[i]; // total weight is recomputed from the centroids, not read
+      }
+      return new TDigest(false, k, min, max, means, weights, totalWeight);
+    }
+    // COMPAT_FLOAT: compatibility with asSmallBytes()
+    final double min = buff.getDouble(); // reference implementation uses doubles for min and max
+    final double max = buff.getDouble();
+    final int k = (int) buff.getFloat(); // read as a float, truncated to int
+    // The reference implementation stores the capacities of the array of centroids and of the buffer as shorts.
+    // They can be derived from k in the constructor, so they are not kept here.
+    buff.getInt(); // skips 4 bytes (the two short capacities described above); value unused
+    final int numCentroids = buff.getShort(); // short widened to int; NOTE(review): not range-checked
+    final double[] means = new double[numCentroids];
+    final long[] weights = new long[numCentroids];
+    long totalWeight = 0;
+    for (int i = 0; i < numCentroids; i++) {
+      weights[i] = (long) buff.getFloat(); // weight is read before the mean; float truncated to long
+      means[i] = buff.getFloat(); // float widened to double
+      totalWeight += weights[i]; // total weight is recomputed from the centroids, not read
+    }
+    return new TDigest(false, k, min, max, means, weights, totalWeight);
+  }
+
   /**
    * Returns summary information about this TDigest. Used for debugging.
    * @return summary of the TDigest
diff --git a/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java b/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java
new file mode 100644
index 00000000..19c48b57
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.tdigest;
+
+import static org.apache.datasketches.common.TestUtil.CHECK_CPP_FILES;
+import static org.apache.datasketches.common.TestUtil.GENERATE_JAVA_FILES;
+import static org.apache.datasketches.common.TestUtil.cppPath;
+import static org.apache.datasketches.common.TestUtil.javaPath;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+
+import java.io.IOException;
+import java.nio.file.Files;
+
+import org.apache.datasketches.memory.Memory;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class TDigestCrossLanguageTest {
+
+  @Test(groups = {CHECK_CPP_FILES}) // reads one serialized sketch per n from cppPath and checks it
+  public void deserializeFromCppDouble() throws IOException {
+    final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000};
+    for (int n: nArr) {
+      final byte[] bytes = Files.readAllBytes(cppPath.resolve("tdigest_double_n" + n + "_cpp.sk"));
+      final TDigest td = TDigest.heapify(Memory.wrap(bytes));
+      assertTrue(n == 0 ? td.isEmpty() : !td.isEmpty()); // empty exactly when n == 0
+      assertEquals(td.getTotalWeight(), n);
+      if (n > 0) { // min, max and rank are only checked for non-empty sketches
+        assertEquals(td.getMinValue(), 1);
+        assertEquals(td.getMaxValue(), n);
+        assertEquals(td.getRank(0), 0); // 0 is below the expected minimum of 1
+        assertEquals(td.getRank(n + 1), 1); // n + 1 is above the expected maximum of n
+        if (n == 1) {
+          assertEquals(td.getRank(n), 0.5); // single value: exact comparison, no delta
+        } else {
+          assertEquals(td.getRank(n / 2), 0.5, n * 0.01); // NOTE(review): delta is >= 1 once n >= 100
+        }
+      }
+    }
+  }
+
+  @Test(groups = {GENERATE_JAVA_FILES}) // writes one serialized sketch per n under javaPath
+  public void generateForCppDouble() throws IOException {
+    final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000};
+    for (int n: nArr) {
+      final TDigest td = new TDigest(100);
+      for (int i = 1; i <= n; i++) td.update(i); // values 1..n; nothing is added when n == 0
+      Files.write(javaPath.resolve("tdigest_double_n" + n + "_java.sk"), td.toByteArray()); // closes the file
+    }
+  }
+
+}
diff --git a/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java b/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java
index aad69c90..40182644 100644
--- a/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java
+++ b/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java
@@ -25,6 +25,7 @@ import static org.testng.Assert.assertThrows;
 import static org.testng.Assert.assertTrue;
 
 import org.apache.datasketches.common.SketchesStateException;
+import org.apache.datasketches.common.TestUtil;
 import org.apache.datasketches.memory.Memory;
 import org.testng.annotations.Test;
 
@@ -137,4 +138,34 @@ public class TDigestTest {
     assertEquals(td2.getRank(5000), td1.getRank(5000));
     assertEquals(td2.getQuantile(0.5), td1.getQuantile(0.5));
   }
+
+  @Test
+  public void deserializeFromReferenceImplementationDouble() {
+    final byte[] bytes = TestUtil.getResourceBytes("tdigest_ref_k100_n10000_double.sk");
+    final TDigest td = TDigest.heapify(Memory.wrap(bytes)); // deserialized through the public heapify()
+    final int n = 10000; // matches "n10000" in the resource file name
+    assertEquals(td.getTotalWeight(), n);
+    assertEquals(td.getMinValue(), 0);
+    assertEquals(td.getMaxValue(), n - 1); // presumably the file was built from values 0..n-1 - TODO confirm
+    assertEquals(td.getRank(0), 0, 0.0001); // rank checks at the quartiles allow an absolute error of 1e-4
+    assertEquals(td.getRank(n / 4), 0.25, 0.0001);
+    assertEquals(td.getRank(n / 2), 0.5, 0.0001);
+    assertEquals(td.getRank(n * 3 / 4), 0.75, 0.0001);
+    assertEquals(td.getRank(n), 1); // exact comparison: n is above the expected maximum of n - 1
+  }
+
+  @Test
+  public void deserializeFromReferenceImplementationFloat() {
+    final byte[] bytes = TestUtil.getResourceBytes("tdigest_ref_k100_n10000_float.sk");
+    final TDigest td = TDigest.heapify(Memory.wrap(bytes)); // deserialized through the public heapify()
+    final int n = 10000; // matches "n10000" in the resource file name
+    assertEquals(td.getTotalWeight(), n);
+    assertEquals(td.getMinValue(), 0);
+    assertEquals(td.getMaxValue(), n - 1); // presumably the file was built from values 0..n-1 - TODO confirm
+    assertEquals(td.getRank(0), 0, 0.0001); // rank checks at the quartiles allow an absolute error of 1e-4
+    assertEquals(td.getRank(n / 4), 0.25, 0.0001);
+    assertEquals(td.getRank(n / 2), 0.5, 0.0001);
+    assertEquals(td.getRank(n * 3 / 4), 0.75, 0.0001);
+    assertEquals(td.getRank(n), 1); // exact comparison: n is above the expected maximum of n - 1
+  }
 }
diff --git a/src/test/resources/tdigest_ref_k100_n10000_double.sk b/src/test/resources/tdigest_ref_k100_n10000_double.sk
new file mode 100644
index 00000000..f6f4510e
Binary files /dev/null and b/src/test/resources/tdigest_ref_k100_n10000_double.sk differ
diff --git a/src/test/resources/tdigest_ref_k100_n10000_float.sk b/src/test/resources/tdigest_ref_k100_n10000_float.sk
new file mode 100644
index 00000000..16d79811
Binary files /dev/null and b/src/test/resources/tdigest_ref_k100_n10000_float.sk differ


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org