You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2024/02/21 01:19:24 UTC
(datasketches-java) branch tdigest updated: deserialize compatibility with ref implementation, cross-language test
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch tdigest
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
The following commit(s) were added to refs/heads/tdigest by this push:
new 8f489cae deserialize compatibility with ref implementation, cross-language test
8f489cae is described below
commit 8f489cae5e0e555510f2e46ce241afd272bb535d
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Tue Feb 20 17:19:13 2024 -0800
deserialize compatibility with ref implementation, cross-language test
---
.../org/apache/datasketches/tdigest/TDigest.java | 46 +++++++++++++-
.../tdigest/TDigestCrossLanguageTest.java | 70 +++++++++++++++++++++
.../apache/datasketches/tdigest/TDigestTest.java | 31 +++++++++
.../resources/tdigest_ref_k100_n10000_double.sk | Bin 0 -> 976 bytes
.../resources/tdigest_ref_k100_n10000_float.sk | Bin 0 -> 502 bytes
5 files changed, 146 insertions(+), 1 deletion(-)
diff --git a/src/main/java/org/apache/datasketches/tdigest/TDigest.java b/src/main/java/org/apache/datasketches/tdigest/TDigest.java
index 4bb8a9f5..3a5dad93 100644
--- a/src/main/java/org/apache/datasketches/tdigest/TDigest.java
+++ b/src/main/java/org/apache/datasketches/tdigest/TDigest.java
@@ -19,6 +19,7 @@
package org.apache.datasketches.tdigest;
+import java.nio.ByteOrder;
import java.util.function.Function;
import org.apache.datasketches.common.SketchesArgumentException;
@@ -280,7 +281,7 @@ public final class TDigest {
public byte[] toByteArray() {
mergeBuffered(); // side effect
final byte preambleLongs = isEmpty() ? PREAMBLE_LONGS_EMPTY : PREAMBLE_LONGS_NON_EMPTY;
- final int sizeBytes = preambleLongs * Long.BYTES + 2 * Double.BYTES + (Double.BYTES + Long.BYTES) * numCentroids_;
+ int sizeBytes = preambleLongs * Long.BYTES + (isEmpty() ? 0 : 2 * Double.BYTES + (Double.BYTES + Long.BYTES) * numCentroids_);
final byte[] bytes = new byte[sizeBytes];
final WritableBuffer wbuf = WritableMemory.writableWrap(bytes).asWritableBuffer();
wbuf.putByte(preambleLongs);
@@ -310,6 +311,7 @@ public final class TDigest {
final byte serialVersion = buff.getByte();
final byte sketchType = buff.getByte();
if (sketchType != SKETCH_TYPE) {
+ if (preambleLongs == 0 && serialVersion == 0 && sketchType == 0) return heapifyCompat(mem);
throw new SketchesArgumentException("Sketch type mismatch: expected " + SKETCH_TYPE + ", actual " + sketchType);
}
if (serialVersion != SERIAL_VERSION) {
@@ -340,6 +342,48 @@ public final class TDigest {
return new TDigest(reverseMerge, k, min, max, means, weights, totalWeight);
}
+ // compatibility with the serialized format of the reference implementation,
+ // which writes through a ByteBuffer in its default byte order (big-endian)
+ private static TDigest heapifyCompat(final Memory mem) {
+ final Buffer buff = mem.asBuffer(ByteOrder.BIG_ENDIAN);
+ final int type = buff.getInt();
+ if (type != COMPAT_DOUBLE && type != COMPAT_FLOAT) {
+ throw new SketchesArgumentException("unexpected compatibility type " + type);
+ }
+ if (type == COMPAT_DOUBLE) { // compatibility with asBytes()
+ final double min = buff.getDouble();
+ final double max = buff.getDouble();
+ final int k = (int) buff.getDouble();
+ final int numCentroids = buff.getInt();
+ final double[] means = new double[numCentroids];
+ final long[] weights = new long[numCentroids];
+ long totalWeight = 0;
+ for (int i = 0; i < numCentroids; i++) {
+ weights[i] = (long) buff.getDouble();
+ means[i] = buff.getDouble();
+ totalWeight += weights[i];
+ }
+ return new TDigest(false, k, min, max, means, weights, totalWeight);
+ }
+ // COMPAT_FLOAT: compatibility with asSmallBytes()
+ final double min = buff.getDouble(); // reference implementation uses doubles for min and max
+ final double max = buff.getDouble();
+ final int k = (int) buff.getFloat();
+ // the reference implementation stores the capacities of the centroid array and of the buffer as two shorts;
+ // both can be derived from k in the constructor, so the 4 bytes are read as one int and discarded
+ buff.getInt(); // unused
+ final int numCentroids = buff.getShort();
+ final double[] means = new double[numCentroids];
+ final long[] weights = new long[numCentroids];
+ long totalWeight = 0;
+ for (int i = 0; i < numCentroids; i++) {
+ weights[i] = (long) buff.getFloat();
+ means[i] = buff.getFloat();
+ totalWeight += weights[i];
+ }
+ return new TDigest(false, k, min, max, means, weights, totalWeight);
+ }
+
/**
* Returns summary information about this TDigest. Used for debugging.
* @return summary of the TDigest
diff --git a/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java b/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java
new file mode 100644
index 00000000..19c48b57
--- /dev/null
+++ b/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.tdigest;
+
+import static org.apache.datasketches.common.TestUtil.CHECK_CPP_FILES;
+import static org.apache.datasketches.common.TestUtil.GENERATE_JAVA_FILES;
+import static org.apache.datasketches.common.TestUtil.cppPath;
+import static org.apache.datasketches.common.TestUtil.javaPath;
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+
+import java.io.IOException;
+import java.nio.file.Files;
+
+import org.apache.datasketches.memory.Memory;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class TDigestCrossLanguageTest {
+
+ @Test(groups = {CHECK_CPP_FILES})
+ public void deserializeFromCppDouble() throws IOException {
+ final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000};
+ for (int n: nArr) {
+ final byte[] bytes = Files.readAllBytes(cppPath.resolve("tdigest_double_n" + n + "_cpp.sk"));
+ final TDigest td = TDigest.heapify(Memory.wrap(bytes));
+ assertTrue(n == 0 ? td.isEmpty() : !td.isEmpty());
+ assertEquals(td.getTotalWeight(), n);
+ if (n > 0) {
+ assertEquals(td.getMinValue(), 1);
+ assertEquals(td.getMaxValue(), n);
+ assertEquals(td.getRank(0), 0);
+ assertEquals(td.getRank(n + 1), 1);
+ if (n == 1) {
+ assertEquals(td.getRank(n), 0.5);
+ } else {
+ assertEquals(td.getRank(n / 2), 0.5, n * 0.01);
+ }
+ }
+ }
+ }
+
+ @Test(groups = {GENERATE_JAVA_FILES})
+ public void generateForCppDouble() throws IOException {
+ final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000};
+ for (int n: nArr) {
+ final TDigest td = new TDigest(100);
+ for (int i = 1; i <= n; i++) td.update(i);
+ Files.newOutputStream(javaPath.resolve("tdigest_double_n" + n + "_java.sk")).write(td.toByteArray());
+ }
+ }
+
+}
diff --git a/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java b/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java
index aad69c90..40182644 100644
--- a/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java
+++ b/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java
@@ -25,6 +25,7 @@ import static org.testng.Assert.assertThrows;
import static org.testng.Assert.assertTrue;
import org.apache.datasketches.common.SketchesStateException;
+import org.apache.datasketches.common.TestUtil;
import org.apache.datasketches.memory.Memory;
import org.testng.annotations.Test;
@@ -137,4 +138,34 @@ public class TDigestTest {
assertEquals(td2.getRank(5000), td1.getRank(5000));
assertEquals(td2.getQuantile(0.5), td1.getQuantile(0.5));
}
+
+ @Test
+ public void deserializeFromReferenceImplementationDouble() {
+ final byte[] bytes = TestUtil.getResourceBytes("tdigest_ref_k100_n10000_double.sk");
+ final TDigest td = TDigest.heapify(Memory.wrap(bytes));
+ final int n = 10000;
+ assertEquals(td.getTotalWeight(), n);
+ assertEquals(td.getMinValue(), 0);
+ assertEquals(td.getMaxValue(), n - 1);
+ assertEquals(td.getRank(0), 0, 0.0001);
+ assertEquals(td.getRank(n / 4), 0.25, 0.0001);
+ assertEquals(td.getRank(n / 2), 0.5, 0.0001);
+ assertEquals(td.getRank(n * 3 / 4), 0.75, 0.0001);
+ assertEquals(td.getRank(n), 1);
+ }
+
+ @Test
+ public void deserializeFromReferenceImplementationFloat() {
+ final byte[] bytes = TestUtil.getResourceBytes("tdigest_ref_k100_n10000_float.sk");
+ final TDigest td = TDigest.heapify(Memory.wrap(bytes));
+ final int n = 10000;
+ assertEquals(td.getTotalWeight(), n);
+ assertEquals(td.getMinValue(), 0);
+ assertEquals(td.getMaxValue(), n - 1);
+ assertEquals(td.getRank(0), 0, 0.0001);
+ assertEquals(td.getRank(n / 4), 0.25, 0.0001);
+ assertEquals(td.getRank(n / 2), 0.5, 0.0001);
+ assertEquals(td.getRank(n * 3 / 4), 0.75, 0.0001);
+ assertEquals(td.getRank(n), 1);
+ }
}
diff --git a/src/test/resources/tdigest_ref_k100_n10000_double.sk b/src/test/resources/tdigest_ref_k100_n10000_double.sk
new file mode 100644
index 00000000..f6f4510e
Binary files /dev/null and b/src/test/resources/tdigest_ref_k100_n10000_double.sk differ
diff --git a/src/test/resources/tdigest_ref_k100_n10000_float.sk b/src/test/resources/tdigest_ref_k100_n10000_float.sk
new file mode 100644
index 00000000..16d79811
Binary files /dev/null and b/src/test/resources/tdigest_ref_k100_n10000_float.sk differ
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org