You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by pl...@apache.org on 2024/03/17 12:26:13 UTC

(datasketches-go) branch float64-serde created (now bceb87d)

This is an automated email from the ASF dual-hosted git repository.

placave pushed a change to branch float64-serde
in repository https://gitbox.apache.org/repos/asf/datasketches-go.git


      at bceb87d  Fix serialization issue, bug and deterministism + Add double serde

This branch includes the following new commits:

     new bceb87d  Fix serialization issue, bug and deterministism + Add double serde

The 1 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org


(datasketches-go) 01/01: Fix serialization issue, bug and deterministism + Add double serde

Posted by pl...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

placave pushed a commit to branch float64-serde
in repository https://gitbox.apache.org/repos/asf/datasketches-go.git

commit bceb87d463c52c37b34b4cef5cc2af692e41afbc
Author: Pierre Lacave <pi...@datadoghq.com>
AuthorDate: Sun Mar 17 13:25:59 2024 +0100

    Fix serialization issue, bug and deterministism + Add double serde
---
 common/array_of_doubles_serde.go                   |  82 +++++++++++++++++++++
 kll/items_sketch.go                                |  47 ++++++++----
 kll/items_sketch_test.go                           |  55 +++++++++++++-
 kll/items_sletch_serialization_test.go             |  69 ++++++++++++++++-
 .../java_generated_files/kll_double_n0_java.sk     | Bin 0 -> 8 bytes
 .../kll_double_n1000000_java.sk                    | Bin 0 -> 5000 bytes
 .../kll_double_n100000_java.sk                     | Bin 0 -> 4728 bytes
 .../java_generated_files/kll_double_n10000_java.sk | Bin 0 -> 4372 bytes
 .../java_generated_files/kll_double_n1000_java.sk  | Bin 0 -> 2640 bytes
 .../java_generated_files/kll_double_n100_java.sk   | Bin 0 -> 840 bytes
 .../java_generated_files/kll_double_n10_java.sk    | Bin 0 -> 120 bytes
 .../java_generated_files/kll_double_n1_java.sk     | Bin 0 -> 16 bytes
 .../kll_string_n1000000_java.sk                    | Bin 6848 -> 6848 bytes
 .../kll_string_n100000_java.sk                     | Bin 5896 -> 5896 bytes
 .../java_generated_files/kll_string_n10000_java.sk | Bin 4913 -> 4913 bytes
 .../java_generated_files/kll_string_n1000_java.sk  | Bin 2640 -> 2640 bytes
 16 files changed, 234 insertions(+), 19 deletions(-)

diff --git a/common/array_of_doubles_serde.go b/common/array_of_doubles_serde.go
new file mode 100644
index 0000000..84d5872
--- /dev/null
+++ b/common/array_of_doubles_serde.go
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package common
+
+import (
+	"encoding/binary"
+	"github.com/twmb/murmur3"
+	"math"
+)
+
+type ArrayOfDoublesSerDe struct {
+	scratch [8]byte
+}
+
+func (f ArrayOfDoublesSerDe) Identity() float64 {
+	return 0
+}
+
+func (f ArrayOfDoublesSerDe) Hash(item float64) uint64 {
+	binary.LittleEndian.PutUint64(f.scratch[:], math.Float64bits(item))
+	return murmur3.SeedSum64(_DEFAULT_SERDE_HASH_SEED, f.scratch[:])
+}
+
+func (f ArrayOfDoublesSerDe) LessFn() LessFn[float64] {
+	return func(a float64, b float64) bool {
+		return a < b
+	}
+}
+
+func (f ArrayOfDoublesSerDe) SizeOf(item float64) int {
+	return 8
+}
+
+func (f ArrayOfDoublesSerDe) SizeOfMany(mem []byte, offsetBytes int, numItems int) (int, error) {
+	return numItems * 8, nil
+}
+
+func (f ArrayOfDoublesSerDe) SerializeOneToSlice(item float64) []byte {
+	bytes := make([]byte, 8)
+	binary.LittleEndian.PutUint64(bytes, math.Float64bits(item))
+	return bytes
+}
+
+func (f ArrayOfDoublesSerDe) SerializeManyToSlice(item []float64) []byte {
+	if len(item) == 0 {
+		return []byte{}
+	}
+	bytes := make([]byte, 8*len(item))
+	offset := 0
+	for i := 0; i < len(item); i++ {
+		binary.LittleEndian.PutUint64(bytes[offset:], math.Float64bits(item[i]))
+		offset += 8
+	}
+	return bytes
+}
+
+func (f ArrayOfDoublesSerDe) DeserializeManyFromSlice(mem []byte, offsetBytes int, numItems int) ([]float64, error) {
+	if numItems == 0 {
+		return []float64{}, nil
+	}
+	array := make([]float64, 0, numItems)
+	for i := 0; i < numItems; i++ {
+		array = append(array, math.Float64frombits(binary.LittleEndian.Uint64(mem[offsetBytes:])))
+		offsetBytes += 8
+	}
+	return array, nil
+}
diff --git a/kll/items_sketch.go b/kll/items_sketch.go
index 8728746..3cce1b9 100644
--- a/kll/items_sketch.go
+++ b/kll/items_sketch.go
@@ -31,8 +31,8 @@ import (
 	"fmt"
 	"github.com/apache/datasketches-go/common"
 	"github.com/apache/datasketches-go/internal"
+	"math/rand"
 	"sort"
-	"unsafe"
 )
 
 type ItemsSketch[C comparable] struct {
@@ -51,6 +51,9 @@ type ItemsSketch[C comparable] struct {
 	maxItem           *C
 	sortedView        *ItemsSketchSortedView[C]
 	itemsSketchOp     common.ItemSketchOp[C]
+
+	// Force deterministic offset for test, so that we can compare results across implementation.
+	deterministicOffsetForTest bool
 }
 
 const (
@@ -68,6 +71,9 @@ var (
 		3486784401, 10460353203, 31381059609, 94143178827, 282429536481,
 		847288609443, 2541865828329, 7625597484987, 22876792454961, 68630377364883,
 		205891132094649}
+
+	// Used for deterministic rand behavior in tests
+	nextOffsetForTest = 0
 )
 
 // NewKllItemsSketch create a new ItemsSketch with the given k and m.
@@ -498,7 +504,6 @@ func (s *ItemsSketch[C]) ToSlice() ([]byte, error) {
 			return nil, err
 		}
 		copy(bytesOut[_DATA_START_ADR_SINGLE_ITEM:], siByteArr)
-		//wbuf.incrementPosition(-len);
 		return bytesOut, nil
 	}
 
@@ -607,7 +612,7 @@ func (s *ItemsSketch[C]) getSingleItemSizeBytes() (int, error) {
 	if err != nil {
 		return 0, err
 	}
-	return s.itemsSketchOp.SizeOf(v) + int(unsafe.Sizeof(uint32(1))), nil
+	return s.itemsSketchOp.SizeOf(v), nil
 }
 
 func (s *ItemsSketch[C]) getSingleItemByteArr() ([]byte, error) {
@@ -736,7 +741,7 @@ func (s *ItemsSketch[C]) mergeItemsSketch(other *ItemsSketch[C]) {
 			otherNumLevels, otherLevelsArr, otherItemsArr, s.itemsSketchOp.LessFn())
 
 		// notice that workbuf is being used as both the input and output
-		result := generalItemsCompress(s.k, s.m, provisionalNumLevels, workbuf, worklevels, workbuf, outlevels, s.isLevelZeroSorted, s.itemsSketchOp.LessFn())
+		result := generalItemsCompress(s.k, s.m, provisionalNumLevels, workbuf, worklevels, workbuf, outlevels, s.isLevelZeroSorted, s.itemsSketchOp.LessFn(), s.deterministicOffsetForTest)
 		targetItemCount := result[1] //was finalCapacity. Max size given k, m, numLevels
 		curItemCount := result[2]    //was finalPop
 
@@ -844,9 +849,9 @@ func (s *ItemsSketch[C]) compressWhileUpdatingSketch() {
 		})
 	}
 	if popAbove == 0 {
-		randomlyHalveUpItems(myItemsArr, adjBeg, adjPop)
+		randomlyHalveUpItems(myItemsArr, adjBeg, adjPop, s.deterministicOffsetForTest)
 	} else {
-		randomlyHalveDownItems(myItemsArr, adjBeg, adjPop)
+		randomlyHalveDownItems(myItemsArr, adjBeg, adjPop, s.deterministicOffsetForTest)
 		mergeSortedItemsArrays(
 			myItemsArr, adjBeg, halfAdjPop,
 			myItemsArr, rawEnd, popAbove,
@@ -980,10 +985,12 @@ func intCapAuxAux(k uint16, depth uint8) uint32 {
 	return uint32(k)
 }
 
-func randomlyHalveUpItems[C comparable](buf []C, start uint32, length uint32) {
+func randomlyHalveUpItems[C comparable](buf []C, start uint32, length uint32, deterministicOffsetForTest bool) {
 	halfLength := length / 2
-	//offset := rand.Intn(2)
-	offset := 1
+	offset := rand.Intn(2)
+	if deterministicOffsetForTest {
+		offset = deterministicOffset()
+	}
 	j := (start + length) - 1 - uint32(offset)
 	for i := (start + length) - 1; i >= (start + halfLength); i-- {
 		buf[i] = buf[j]
@@ -991,10 +998,12 @@ func randomlyHalveUpItems[C comparable](buf []C, start uint32, length uint32) {
 	}
 }
 
-func randomlyHalveDownItems[C comparable](buf []C, start uint32, length uint32) {
+func randomlyHalveDownItems[C comparable](buf []C, start uint32, length uint32, deterministicOffsetForTest bool) {
 	halfLength := length / 2
-	//offset := rand.Intn(2)
-	offset := 1
+	offset := rand.Intn(2)
+	if deterministicOffsetForTest {
+		offset = deterministicOffset()
+	}
 	j := start + uint32(offset)
 	for i := start; i < (start + halfLength); i++ {
 		buf[i] = buf[j]
@@ -1074,7 +1083,9 @@ func generalItemsCompress[C comparable](
 	outBuf []C,
 	outLevels []uint32,
 	isLevelZeroSorted bool,
-	lessFn common.LessFn[C]) []uint32 {
+	lessFn common.LessFn[C],
+	deterministicOffsetForTest bool,
+) []uint32 {
 	numLevels := numLevelsIn
 	currentItemCount := inLevels[numLevels] - inLevels[0]        // decreases with each compaction
 	targetItemCount := computeTotalItemCapacity(k, m, numLevels) // increases if we add levels
@@ -1131,9 +1142,9 @@ func generalItemsCompress[C comparable](
 			}
 
 			if popAbove == 0 {
-				randomlyHalveUpItems(inBuf, adjBeg, adjPop)
+				randomlyHalveUpItems(inBuf, adjBeg, adjPop, deterministicOffsetForTest)
 			} else {
-				randomlyHalveDownItems(inBuf, adjBeg, adjPop)
+				randomlyHalveDownItems(inBuf, adjBeg, adjPop, deterministicOffsetForTest)
 				mergeSortedItemsArrays(
 					inBuf, adjBeg, halfAdjPop,
 					inBuf, rawLim, popAbove,
@@ -1162,3 +1173,9 @@ func generalItemsCompress[C comparable](
 
 	return []uint32{uint32(numLevels), targetItemCount, currentItemCount}
 }
+
+func deterministicOffset() int {
+	result := nextOffsetForTest
+	nextOffsetForTest = 1 - nextOffsetForTest
+	return result
+}
diff --git a/kll/items_sketch_test.go b/kll/items_sketch_test.go
index 0d36c95..473aeee 100644
--- a/kll/items_sketch_test.go
+++ b/kll/items_sketch_test.go
@@ -835,7 +835,7 @@ func TestItemsSketch_SerializeDeserializeMultipleValue(t *testing.T) {
 	assert.Equal(t, mem, mem2)
 }
 
-func TestSerializeDeserialize(t *testing.T) {
+func TestSerializeDeserializeString(t *testing.T) {
 	nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000}
 	serde := common.ArrayOfStringsSerDe{}
 	for _, n := range nArr {
@@ -888,3 +888,56 @@ func TestSerializeDeserialize(t *testing.T) {
 		}
 	}
 }
+
+func TestSerializeDeserializeFloat(t *testing.T) {
+	nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000}
+	serde := common.ArrayOfDoublesSerDe{}
+	for _, n := range nArr {
+		sk, err := NewKllItemsSketchWithDefault[float64](serde)
+		assert.NoError(t, err)
+		for i := 1; i <= n; i++ {
+			sk.Update(float64(i))
+		}
+		slc, err := sk.ToSlice()
+		assert.NoError(t, err)
+
+		sketch, err := NewKllItemsSketchFromSlice[float64](slc, serde)
+		if err != nil {
+			return
+		}
+
+		assert.Equal(t, sketch.GetK(), uint16(200))
+		if n == 0 {
+			assert.True(t, sketch.IsEmpty())
+		} else {
+			assert.False(t, sketch.IsEmpty())
+		}
+
+		if n > 100 {
+			assert.True(t, sketch.IsEstimationMode())
+		} else {
+			assert.False(t, sketch.IsEstimationMode())
+		}
+
+		if n > 0 {
+			minV, err := sketch.GetMinItem()
+			assert.NoError(t, err)
+			assert.Equal(t, minV, float64(1))
+
+			maxV, err := sketch.GetMaxItem()
+			assert.NoError(t, err)
+			assert.Equal(t, maxV, float64(n))
+
+			weight := int64(0)
+			it := sketch.GetIterator()
+			lessFn := serde.LessFn()
+			for it.Next() {
+				qut := it.GetQuantile()
+				assert.True(t, lessFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut))
+				assert.True(t, !lessFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut))
+				weight += it.GetWeight()
+			}
+			assert.Equal(t, weight, int64(n))
+		}
+	}
+}
diff --git a/kll/items_sletch_serialization_test.go b/kll/items_sletch_serialization_test.go
index 16ff817..4d01a7f 100644
--- a/kll/items_sletch_serialization_test.go
+++ b/kll/items_sletch_serialization_test.go
@@ -27,14 +27,17 @@ import (
 )
 
 func TestGenerateGoFiles(t *testing.T) {
-	if len(os.Getenv(internal.DSketchTestGenerateGo)) == 0 {
-		t.Skipf("%s not set", internal.DSketchTestGenerateGo)
-	}
+	//if len(os.Getenv(internal.DSketchTestGenerateGo)) == 0 {
+	//	t.Skipf("%s not set", internal.DSketchTestGenerateGo)
+	//}
+
+	os.Mkdir(internal.GoPath, 0755)
 
 	nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000}
 	for _, n := range nArr {
 		digits := numDigits(n)
 		sk, err := NewKllItemsSketchWithDefault[string](common.ArrayOfStringsSerDe{})
+		sk.deterministicOffsetForTest = true
 		assert.NoError(t, err)
 		for i := 1; i <= n; i++ {
 			sk.Update(intToFixedLengthString(i, digits))
@@ -44,6 +47,19 @@ func TestGenerateGoFiles(t *testing.T) {
 		err = os.WriteFile(fmt.Sprintf("%s/kll_string_n%d_go.sk", internal.GoPath, n), slc, 0644)
 		assert.NoError(t, err)
 	}
+
+	for _, n := range nArr {
+		sk, err := NewKllItemsSketchWithDefault[float64](common.ArrayOfDoublesSerDe{})
+		sk.deterministicOffsetForTest = true
+		assert.NoError(t, err)
+		for i := 1; i <= n; i++ {
+			sk.Update(float64(i))
+		}
+		slc, err := sk.ToSlice()
+		assert.NoError(t, err)
+		err = os.WriteFile(fmt.Sprintf("%s/kll_double_n%d_go.sk", internal.GoPath, n), slc, 0644)
+		assert.NoError(t, err)
+	}
 }
 
 func TestJavaCompat(t *testing.T) {
@@ -94,4 +110,51 @@ func TestJavaCompat(t *testing.T) {
 			}
 		}
 	})
+
+	t.Run("Java KLL Double", func(t *testing.T) {
+		nArr := []int{0, 1, 10, 100, 1000, 10000, 100000, 1000000}
+		serde := common.ArrayOfDoublesSerDe{}
+		for _, n := range nArr {
+			bytes, err := os.ReadFile(fmt.Sprintf("%s/kll_double_n%d_java.sk", internal.JavaPath, n))
+			assert.NoError(t, err)
+			sketch, err := NewKllItemsSketchFromSlice[float64](bytes, serde)
+			if err != nil {
+				return
+			}
+
+			assert.Equal(t, sketch.GetK(), uint16(200))
+			if n == 0 {
+				assert.True(t, sketch.IsEmpty())
+			} else {
+				assert.False(t, sketch.IsEmpty())
+			}
+
+			if n > 100 {
+				assert.True(t, sketch.IsEstimationMode())
+			} else {
+				assert.False(t, sketch.IsEstimationMode())
+			}
+
+			if n > 0 {
+				minV, err := sketch.GetMinItem()
+				assert.NoError(t, err)
+				assert.Equal(t, minV, float64(1))
+
+				maxV, err := sketch.GetMaxItem()
+				assert.NoError(t, err)
+				assert.Equal(t, maxV, float64(n))
+
+				weight := int64(0)
+				it := sketch.GetIterator()
+				lessFn := serde.LessFn()
+				for it.Next() {
+					qut := it.GetQuantile()
+					assert.True(t, lessFn(minV, qut) || minV == qut, fmt.Sprintf("min: \"%v\" \"%v\"", minV, qut))
+					assert.True(t, !lessFn(maxV, qut) || maxV == qut, fmt.Sprintf("max: \"%v\" \"%v\"", maxV, qut))
+					weight += it.GetWeight()
+				}
+				assert.Equal(t, weight, int64(n))
+			}
+		}
+	})
 }
diff --git a/serialization_test_data/java_generated_files/kll_double_n0_java.sk b/serialization_test_data/java_generated_files/kll_double_n0_java.sk
new file mode 100644
index 0000000..afd2209
Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n0_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_double_n1000000_java.sk b/serialization_test_data/java_generated_files/kll_double_n1000000_java.sk
new file mode 100644
index 0000000..171a3a0
Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n1000000_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_double_n100000_java.sk b/serialization_test_data/java_generated_files/kll_double_n100000_java.sk
new file mode 100644
index 0000000..5b57a7a
Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n100000_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_double_n10000_java.sk b/serialization_test_data/java_generated_files/kll_double_n10000_java.sk
new file mode 100644
index 0000000..0de0b17
Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n10000_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_double_n1000_java.sk b/serialization_test_data/java_generated_files/kll_double_n1000_java.sk
new file mode 100644
index 0000000..a2737f4
Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n1000_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_double_n100_java.sk b/serialization_test_data/java_generated_files/kll_double_n100_java.sk
new file mode 100644
index 0000000..b0d578a
Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n100_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_double_n10_java.sk b/serialization_test_data/java_generated_files/kll_double_n10_java.sk
new file mode 100644
index 0000000..fb526ef
Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n10_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_double_n1_java.sk b/serialization_test_data/java_generated_files/kll_double_n1_java.sk
new file mode 100644
index 0000000..5eceb40
Binary files /dev/null and b/serialization_test_data/java_generated_files/kll_double_n1_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk b/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk
index 6ba10e5..e5d0a29 100644
Binary files a/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk and b/serialization_test_data/java_generated_files/kll_string_n1000000_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_string_n100000_java.sk b/serialization_test_data/java_generated_files/kll_string_n100000_java.sk
index 73fa4bb..e3382f2 100644
Binary files a/serialization_test_data/java_generated_files/kll_string_n100000_java.sk and b/serialization_test_data/java_generated_files/kll_string_n100000_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_string_n10000_java.sk b/serialization_test_data/java_generated_files/kll_string_n10000_java.sk
index 644a3e4..8c4ba2d 100644
Binary files a/serialization_test_data/java_generated_files/kll_string_n10000_java.sk and b/serialization_test_data/java_generated_files/kll_string_n10000_java.sk differ
diff --git a/serialization_test_data/java_generated_files/kll_string_n1000_java.sk b/serialization_test_data/java_generated_files/kll_string_n1000_java.sk
index f16897c..858cac2 100644
Binary files a/serialization_test_data/java_generated_files/kll_string_n1000_java.sk and b/serialization_test_data/java_generated_files/kll_string_n1000_java.sk differ


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org