You are viewing a plain text version of this content. The hyperlink to the canonical version was not preserved in this plain-text copy.
Posted to commits@datasketches.apache.org by al...@apache.org on 2024/02/22 23:35:51 UTC
(datasketches-java) branch tdigest updated: renamed, added float deserialization and test
This is an automated email from the ASF dual-hosted git repository.
alsay pushed a commit to branch tdigest
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git
The following commit(s) were added to refs/heads/tdigest by this push:
new 1664cda8 renamed, added float deserialization and test
1664cda8 is described below
commit 1664cda88c739f9b5ba20160d0a04c4d918b32d4
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Thu Feb 22 15:35:41 2024 -0800
renamed, added float deserialization and test
---
.../tdigest/{TDigest.java => TDigestDouble.java} | 42 +++++++++++++---------
.../tdigest/TDigestCrossLanguageTest.java | 27 ++++++++++++--
.../{TDigestTest.java => TDigestDoubleTest.java} | 28 +++++++--------
3 files changed, 64 insertions(+), 33 deletions(-)
diff --git a/src/main/java/org/apache/datasketches/tdigest/TDigest.java b/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java
similarity index 94%
rename from src/main/java/org/apache/datasketches/tdigest/TDigest.java
rename to src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java
index 3a5dad93..0d0b3172 100644
--- a/src/main/java/org/apache/datasketches/tdigest/TDigest.java
+++ b/src/main/java/org/apache/datasketches/tdigest/TDigestDouble.java
@@ -38,7 +38,7 @@ import org.apache.datasketches.quantilescommon.QuantilesAPI;
* https://github.com/tdunning/t-digest
* This implementation is similar to MergingDigest in the above implementation
*/
-public final class TDigest {
+public final class TDigestDouble {
public static final boolean USE_ALTERNATING_SORT = true;
public static final boolean USE_TWO_LEVEL_COMPRESSION = true;
@@ -70,11 +70,11 @@ public final class TDigest {
enum flags { IS_EMPTY, REVERSE_MERGE };
- public TDigest(final int k) {
+ public TDigestDouble(final int k) {
this(false, k, Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY, null, null, 0);
}
- private TDigest(final boolean reverseMerge, final int k, final double min, final double max,
+ private TDigestDouble(final boolean reverseMerge, final int k, final double min, final double max,
final double[] means, final long[] weights, final long weight) {
reverseMerge_ = reverseMerge;
k_ = k;
@@ -99,13 +99,12 @@ public final class TDigest {
bufferWeights_ = new long[bufferCapacity_];
numCentroids_ = 0;
numBuffered_ = 0;
- centroidsWeight_ = 0;
+ centroidsWeight_ = weight;
bufferedWeight_ = 0;
if (weight > 0) {
System.arraycopy(means, 0, centroidMeans_, 0, means.length);
System.arraycopy(weights, 0, centroidWeights_, 0, weights.length);
numCentroids_ = means.length;
- centroidsWeight_ = weight;
}
}
@@ -124,7 +123,7 @@ public final class TDigest {
maxValue_ = Math.max(maxValue_, value);
}
- public void merge(final TDigest other) {
+ public void merge(final TDigestDouble other) {
if (other.isEmpty()) return;
int num = numCentroids_ + numBuffered_ + other.numCentroids_ + other.numBuffered_;
if (num <= bufferCapacity_) {
@@ -305,7 +304,11 @@ public final class TDigest {
return bytes;
}
- public static TDigest heapify(final Memory mem) {
+ public static TDigestDouble heapify(final Memory mem) {
+ return heapify(mem, false);
+ }
+
+ public static TDigestDouble heapify(final Memory mem, final boolean isFloat) {
final Buffer buff = mem.asBuffer();
final byte preambleLongs = buff.getByte();
final byte serialVersion = buff.getByte();
@@ -325,26 +328,33 @@ public final class TDigest {
throw new SketchesArgumentException("Preamble longs mismatch: expected " + expectedPreambleLongs + ", actual " + preambleLongs);
}
buff.getShort(); // unused
- if (isEmpty) return new TDigest(k);
+ if (isEmpty) return new TDigestDouble(k);
final int numCentroids = buff.getInt();
buff.getInt(); // unused
- final double min = buff.getDouble();
- final double max = buff.getDouble();
+ final double min;
+ final double max;
+ if (isFloat) {
+ min = buff.getFloat();
+ max = buff.getFloat();
+ } else {
+ min = buff.getDouble();
+ max = buff.getDouble();
+ }
final double[] means = new double[numCentroids];
final long[] weights = new long[numCentroids];
long totalWeight = 0;
for (int i = 0; i < numCentroids; i++) {
- means[i] = buff.getDouble();
- weights[i] = buff.getLong();
+ means[i] = isFloat ? buff.getFloat() : buff.getDouble();
+ weights[i] = isFloat ? buff.getInt() : buff.getLong();
totalWeight += weights[i];
}
final boolean reverseMerge = (flagsByte & (1 << flags.REVERSE_MERGE.ordinal())) > 0;
- return new TDigest(reverseMerge, k, min, max, means, weights, totalWeight);
+ return new TDigestDouble(reverseMerge, k, min, max, means, weights, totalWeight);
}
// compatibility with the format of the reference implementation
// default byte order of ByteBuffer is used there, which is big endian
- private static TDigest heapifyCompat(final Memory mem) {
+ private static TDigestDouble heapifyCompat(final Memory mem) {
final Buffer buff = mem.asBuffer(ByteOrder.BIG_ENDIAN);
final int type = buff.getInt();
if (type != COMPAT_DOUBLE && type != COMPAT_FLOAT) {
@@ -363,7 +373,7 @@ public final class TDigest {
means[i] = buff.getDouble();
totalWeight += weights[i];
}
- return new TDigest(false, k, min, max, means, weights, totalWeight);
+ return new TDigestDouble(false, k, min, max, means, weights, totalWeight);
}
// COMPAT_FLOAT: compatibility with asSmallBytes()
final double min = buff.getDouble(); // reference implementation uses doubles for min and max
@@ -381,7 +391,7 @@ public final class TDigest {
means[i] = buff.getFloat();
totalWeight += weights[i];
}
- return new TDigest(false, k, min, max, means, weights, totalWeight);
+ return new TDigestDouble(false, k, min, max, means, weights, totalWeight);
}
/**
diff --git a/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java b/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java
index 19c48b57..38a56f63 100644
--- a/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java
+++ b/src/test/java/org/apache/datasketches/tdigest/TDigestCrossLanguageTest.java
@@ -30,7 +30,6 @@ import java.io.IOException;
import java.nio.file.Files;
import org.apache.datasketches.memory.Memory;
-import org.testng.Assert;
import org.testng.annotations.Test;
public class TDigestCrossLanguageTest {
@@ -40,7 +39,29 @@ public class TDigestCrossLanguageTest {
final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000};
for (int n: nArr) {
final byte[] bytes = Files.readAllBytes(cppPath.resolve("tdigest_double_n" + n + "_cpp.sk"));
- final TDigest td = TDigest.heapify(Memory.wrap(bytes));
+ final TDigestDouble td = TDigestDouble.heapify(Memory.wrap(bytes));
+ assertTrue(n == 0 ? td.isEmpty() : !td.isEmpty());
+ assertEquals(td.getTotalWeight(), n);
+ if (n > 0) {
+ assertEquals(td.getMinValue(), 1);
+ assertEquals(td.getMaxValue(), n);
+ assertEquals(td.getRank(0), 0);
+ assertEquals(td.getRank(n + 1), 1);
+ if (n == 1) {
+ assertEquals(td.getRank(n), 0.5);
+ } else {
+ assertEquals(td.getRank(n / 2), 0.5, n * 0.01);
+ }
+ }
+ }
+ }
+
+ @Test(groups = {CHECK_CPP_FILES})
+ public void deserializeFromCppFloat() throws IOException {
+ final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000};
+ for (int n: nArr) {
+ final byte[] bytes = Files.readAllBytes(cppPath.resolve("tdigest_float_n" + n + "_cpp.sk"));
+ final TDigestDouble td = TDigestDouble.heapify(Memory.wrap(bytes), true);
assertTrue(n == 0 ? td.isEmpty() : !td.isEmpty());
assertEquals(td.getTotalWeight(), n);
if (n > 0) {
@@ -61,7 +82,7 @@ public class TDigestCrossLanguageTest {
public void generateForCppDouble() throws IOException {
final int[] nArr = {0, 1, 10, 100, 1000, 10_000, 100_000, 1_000_000};
for (int n: nArr) {
- final TDigest td = new TDigest(100);
+ final TDigestDouble td = new TDigestDouble(100);
for (int i = 1; i <= n; i++) td.update(i);
Files.newOutputStream(javaPath.resolve("tdigest_double_n" + n + "_java.sk")).write(td.toByteArray());
}
diff --git a/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java b/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java
similarity index 87%
rename from src/test/java/org/apache/datasketches/tdigest/TDigestTest.java
rename to src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java
index 40182644..0baf7963 100644
--- a/src/test/java/org/apache/datasketches/tdigest/TDigestTest.java
+++ b/src/test/java/org/apache/datasketches/tdigest/TDigestDoubleTest.java
@@ -29,11 +29,11 @@ import org.apache.datasketches.common.TestUtil;
import org.apache.datasketches.memory.Memory;
import org.testng.annotations.Test;
-public class TDigestTest {
+public class TDigestDoubleTest {
@Test
public void empty() {
- final TDigest td = new TDigest(100);
+ final TDigestDouble td = new TDigestDouble(100);
assertTrue(td.isEmpty());
assertEquals(td.getK(), 100);
assertEquals(td.getTotalWeight(), 0);
@@ -45,7 +45,7 @@ public class TDigestTest {
@Test
public void oneValue() {
- final TDigest td = new TDigest(100);
+ final TDigestDouble td = new TDigestDouble(100);
td.update(1);
assertFalse(td.isEmpty());
assertEquals(td.getK(), 100);
@@ -62,7 +62,7 @@ public class TDigestTest {
@Test
public void manyValues() {
- final TDigest td = new TDigest(100);
+ final TDigestDouble td = new TDigestDouble(100);
final int n = 10000;
for (int i = 0; i < n; i++) td.update(i);
// System.out.println(td.toString(true));
@@ -86,10 +86,10 @@ public class TDigestTest {
@Test
public void mergeSmall() {
- final TDigest td1 = new TDigest(100);
+ final TDigestDouble td1 = new TDigestDouble(100);
td1.update(1);
td1.update(2);
- final TDigest td2 = new TDigest(100);
+ final TDigestDouble td2 = new TDigestDouble(100);
td2.update(2);
td2.update(3);
td1.merge(td2);
@@ -101,8 +101,8 @@ public class TDigestTest {
@Test
public void mergeLarge() {
final int n = 10000;
- final TDigest td1 = new TDigest(100);
- final TDigest td2 = new TDigest(100);
+ final TDigestDouble td1 = new TDigestDouble(100);
+ final TDigestDouble td2 = new TDigestDouble(100);
for (int i = 0; i < n / 2; i++) {
td1.update(i);
td2.update(n / 2 + i);
@@ -116,9 +116,9 @@ public class TDigestTest {
@Test
public void serializeDeserializeEmpty() {
- final TDigest td1 = new TDigest(100);
+ final TDigestDouble td1 = new TDigestDouble(100);
final byte[] bytes = td1.toByteArray();
- final TDigest td2 = TDigest.heapify(Memory.wrap(bytes));
+ final TDigestDouble td2 = TDigestDouble.heapify(Memory.wrap(bytes));
assertEquals(td2.getK(), td1.getK());
assertEquals(td2.getTotalWeight(), td1.getTotalWeight());
assertEquals(td2.isEmpty(), td1.isEmpty());
@@ -126,10 +126,10 @@ public class TDigestTest {
@Test
public void serializeDeserializeNonEmpty() {
- final TDigest td1 = new TDigest(100);
+ final TDigestDouble td1 = new TDigestDouble(100);
for (int i = 0; i < 10000; i++) td1.update(i);
final byte[] bytes = td1.toByteArray();
- final TDigest td2 = TDigest.heapify(Memory.wrap(bytes));
+ final TDigestDouble td2 = TDigestDouble.heapify(Memory.wrap(bytes));
assertEquals(td2.getK(), td1.getK());
assertEquals(td2.getTotalWeight(), td1.getTotalWeight());
assertEquals(td2.isEmpty(), td1.isEmpty());
@@ -142,7 +142,7 @@ public class TDigestTest {
@Test
public void deserializeFromReferenceImplementationDouble() {
final byte[] bytes = TestUtil.getResourceBytes("tdigest_ref_k100_n10000_double.sk");
- final TDigest td = TDigest.heapify(Memory.wrap(bytes));
+ final TDigestDouble td = TDigestDouble.heapify(Memory.wrap(bytes));
final int n = 10000;
assertEquals(td.getTotalWeight(), n);
assertEquals(td.getMinValue(), 0);
@@ -157,7 +157,7 @@ public class TDigestTest {
@Test
public void deserializeFromReferenceImplementationFloat() {
final byte[] bytes = TestUtil.getResourceBytes("tdigest_ref_k100_n10000_float.sk");
- final TDigest td = TDigest.heapify(Memory.wrap(bytes));
+ final TDigestDouble td = TDigestDouble.heapify(Memory.wrap(bytes));
final int n = 10000;
assertEquals(td.getTotalWeight(), n);
assertEquals(td.getMinValue(), 0);
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org