You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2021/04/26 15:46:48 UTC
[impala] 04/04: IMPALA-10631: Upgrade DataSketches to version 3.0.0
This is an automated email from the ASF dual-hosted git repository.
joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
commit 75fa056dc04c6cf2c153de29595d5568db6a074f
Author: Fucun Chu <ch...@hotmail.com>
AuthorDate: Fri Apr 2 00:35:26 2021 +0800
IMPALA-10631: Upgrade DataSketches to version 3.0.0
Upgrade the external DataSketches files CPC/HLL/KLL/Theta to version
3.0.0
Tests:
- Ran the tests from tests/query_test/test_datasketches.py
Change-Id: I37622a7643d015b80f55b802421eae826aa7a4f9
Reviewed-on: http://gerrit.cloudera.org:8080/17294
Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
be/src/exprs/datasketches-test.cc | 12 +-
.../datasketches/AuxHashMap-internal.hpp | 89 +--
be/src/thirdparty/datasketches/AuxHashMap.hpp | 17 +-
.../datasketches/CompositeInterpolationXTable.hpp | 4 +-
.../datasketches/CouponHashSet-internal.hpp | 82 +-
be/src/thirdparty/datasketches/CouponHashSet.hpp | 20 +-
.../datasketches/CouponList-internal.hpp | 122 ++-
be/src/thirdparty/datasketches/CouponList.hpp | 23 +-
.../thirdparty/datasketches/CubicInterpolation.hpp | 4 +-
be/src/thirdparty/datasketches/HarmonicNumbers.hpp | 4 +-
.../thirdparty/datasketches/Hll4Array-internal.hpp | 29 +-
be/src/thirdparty/datasketches/Hll4Array.hpp | 2 +-
.../thirdparty/datasketches/Hll6Array-internal.hpp | 31 +-
be/src/thirdparty/datasketches/Hll6Array.hpp | 5 +-
.../thirdparty/datasketches/Hll8Array-internal.hpp | 31 +-
be/src/thirdparty/datasketches/Hll8Array.hpp | 5 +-
.../thirdparty/datasketches/HllArray-internal.hpp | 83 +-
be/src/thirdparty/datasketches/HllArray.hpp | 16 +-
.../thirdparty/datasketches/HllSketch-internal.hpp | 20 +-
be/src/thirdparty/datasketches/HllSketchImpl.hpp | 3 +-
.../datasketches/HllSketchImplFactory.hpp | 65 +-
.../thirdparty/datasketches/HllUnion-internal.hpp | 31 +-
be/src/thirdparty/datasketches/HllUtil.hpp | 2 +-
be/src/thirdparty/datasketches/MurmurHash3.h | 7 +
be/src/thirdparty/datasketches/README.md | 6 +-
.../datasketches/RelativeErrorTables.hpp | 2 +-
.../bounds_on_ratios_in_sampled_sets.hpp | 136 ++++
.../bounds_on_ratios_in_theta_sketched_sets.hpp | 135 ++++
be/src/thirdparty/datasketches/cpc_common.hpp | 3 +
be/src/thirdparty/datasketches/cpc_compressor.hpp | 4 +-
.../datasketches/cpc_compressor_impl.hpp | 47 +-
be/src/thirdparty/datasketches/cpc_sketch.hpp | 13 +-
be/src/thirdparty/datasketches/cpc_sketch_impl.hpp | 33 +-
be/src/thirdparty/datasketches/cpc_union.hpp | 4 +-
be/src/thirdparty/datasketches/cpc_union_impl.hpp | 12 +-
be/src/thirdparty/datasketches/cpc_util.hpp | 6 -
be/src/thirdparty/datasketches/hll.hpp | 40 +-
be/src/thirdparty/datasketches/icon_estimator.hpp | 6 +-
.../datasketches/kll_quantile_calculator.hpp | 2 +-
.../datasketches/kll_quantile_calculator_impl.hpp | 6 +-
be/src/thirdparty/datasketches/kll_sketch.hpp | 13 +-
be/src/thirdparty/datasketches/kll_sketch_impl.hpp | 168 ++--
.../thirdparty/datasketches/memory_operations.hpp | 12 +
be/src/thirdparty/datasketches/theta_a_not_b.hpp | 41 +-
.../thirdparty/datasketches/theta_a_not_b_impl.hpp | 51 +-
...ubicInterpolation.hpp => theta_comparators.hpp} | 39 +-
...InterpolationXTable.hpp => theta_constants.hpp} | 24 +-
be/src/thirdparty/datasketches/theta_helpers.hpp | 54 ++
.../thirdparty/datasketches/theta_intersection.hpp | 51 +-
.../datasketches/theta_intersection_base.hpp | 59 ++
.../datasketches/theta_intersection_base_impl.hpp | 121 +++
.../datasketches/theta_intersection_impl.hpp | 98 +--
...tionXTable.hpp => theta_jaccard_similarity.hpp} | 25 +-
.../datasketches/theta_jaccard_similarity_base.hpp | 156 ++++
.../datasketches/theta_set_difference_base.hpp | 54 ++
.../theta_set_difference_base_impl.hpp | 85 +++
be/src/thirdparty/datasketches/theta_sketch.hpp | 398 ++++------
.../thirdparty/datasketches/theta_sketch_impl.hpp | 850 +++++----------------
be/src/thirdparty/datasketches/theta_union.hpp | 87 +--
.../thirdparty/datasketches/theta_union_base.hpp | 60 ++
.../datasketches/theta_union_base_impl.hpp | 89 +++
.../thirdparty/datasketches/theta_union_impl.hpp | 82 +-
.../datasketches/theta_update_sketch_base.hpp | 243 ++++++
.../datasketches/theta_update_sketch_base_impl.hpp | 394 ++++++++++
be/src/thirdparty/datasketches/u32_table.hpp | 6 +-
be/src/thirdparty/datasketches/u32_table_impl.hpp | 18 +-
66 files changed, 2545 insertions(+), 1895 deletions(-)
diff --git a/be/src/exprs/datasketches-test.cc b/be/src/exprs/datasketches-test.cc
index 687f070..368050d 100644
--- a/be/src/exprs/datasketches-test.cc
+++ b/be/src/exprs/datasketches-test.cc
@@ -174,22 +174,22 @@ TEST(TestDataSketchesTheta, UseDataSketchesInterface) {
datasketches::update_theta_sketch sketch1 =
datasketches::update_theta_sketch::builder().build();
for (int key = 0; key < 100000; key++) sketch1.update(key);
- sketch1.serialize(sketch_stream1);
+ sketch1.compact().serialize(sketch_stream1);
// 100000 distinct keys
datasketches::update_theta_sketch sketch2 =
datasketches::update_theta_sketch::builder().build();
for (int key = 50000; key < 150000; key++) sketch2.update(key);
- sketch2.serialize(sketch_stream2);
+ sketch2.compact().serialize(sketch_stream2);
}
// this section deserializes the sketches, produces union and intersection
{
- datasketches::update_theta_sketch sketch1 =
- datasketches::update_theta_sketch::deserialize(sketch_stream1);
+ datasketches::compact_theta_sketch sketch1 =
+ datasketches::compact_theta_sketch::deserialize(sketch_stream1);
- datasketches::update_theta_sketch sketch2 =
- datasketches::update_theta_sketch::deserialize(sketch_stream2);
+ datasketches::compact_theta_sketch sketch2 =
+ datasketches::compact_theta_sketch::deserialize(sketch_stream2);
// union opertion
datasketches::theta_union u = datasketches::theta_union::builder().build();
diff --git a/be/src/thirdparty/datasketches/AuxHashMap-internal.hpp b/be/src/thirdparty/datasketches/AuxHashMap-internal.hpp
index 9a8e135..60142ec 100644
--- a/be/src/thirdparty/datasketches/AuxHashMap-internal.hpp
+++ b/be/src/thirdparty/datasketches/AuxHashMap-internal.hpp
@@ -26,42 +26,28 @@
namespace datasketches {
template<typename A>
-AuxHashMap<A>::AuxHashMap(int lgAuxArrInts, int lgConfigK)
- : lgConfigK(lgConfigK),
- lgAuxArrInts(lgAuxArrInts),
- auxCount(0) {
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- const int numItems = 1 << lgAuxArrInts;
- auxIntArr = intAlloc().allocate(numItems);
- std::fill(auxIntArr, auxIntArr + numItems, 0);
-}
-
-template<typename A>
-AuxHashMap<A>* AuxHashMap<A>::newAuxHashMap(int lgAuxArrInts, int lgConfigK) {
- return new (ahmAlloc().allocate(1)) AuxHashMap<A>(lgAuxArrInts, lgConfigK);
-}
+AuxHashMap<A>::AuxHashMap(int lgAuxArrInts, int lgConfigK, const A& allocator):
+lgConfigK(lgConfigK),
+lgAuxArrInts(lgAuxArrInts),
+auxCount(0),
+entries(1 << lgAuxArrInts, 0, allocator)
+{}
template<typename A>
-AuxHashMap<A>::AuxHashMap(const AuxHashMap& that)
- : lgConfigK(that.lgConfigK),
- lgAuxArrInts(that.lgAuxArrInts),
- auxCount(that.auxCount) {
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- const int numItems = 1 << lgAuxArrInts;
- auxIntArr = intAlloc().allocate(numItems);
- std::copy(that.auxIntArr, that.auxIntArr + numItems, auxIntArr);
+AuxHashMap<A>* AuxHashMap<A>::newAuxHashMap(int lgAuxArrInts, int lgConfigK, const A& allocator) {
+ return new (ahmAlloc(allocator).allocate(1)) AuxHashMap<A>(lgAuxArrInts, lgConfigK, allocator);
}
template<typename A>
AuxHashMap<A>* AuxHashMap<A>::newAuxHashMap(const AuxHashMap& that) {
- return new (ahmAlloc().allocate(1)) AuxHashMap<A>(that);
+ return new (ahmAlloc(that.entries.get_allocator()).allocate(1)) AuxHashMap<A>(that);
}
template<typename A>
AuxHashMap<A>* AuxHashMap<A>::deserialize(const void* bytes, size_t len,
int lgConfigK,
int auxCount, int lgAuxArrInts,
- bool srcCompact) {
+ bool srcCompact, const A& allocator) {
int lgArrInts = lgAuxArrInts;
if (srcCompact) { // early compact versions didn't use LgArr byte field so ignore input
lgArrInts = HllUtil<A>::computeLgArrInts(HLL, auxCount, lgConfigK);
@@ -77,7 +63,7 @@ AuxHashMap<A>* AuxHashMap<A>::deserialize(const void* bytes, size_t len,
if (len < auxCount * sizeof(int)) {
throw std::out_of_range("Input array too small to hold AuxHashMap image");
}
- auxHashMap = new (ahmAlloc().allocate(1)) AuxHashMap<A>(lgArrInts, lgConfigK);
+ auxHashMap = new (ahmAlloc(allocator).allocate(1)) AuxHashMap<A>(lgArrInts, lgConfigK, allocator);
for (int i = 0; i < auxCount; ++i) {
int pair = auxPtr[i];
int slotNo = HllUtil<A>::getLow26(pair) & configKmask;
@@ -89,7 +75,7 @@ AuxHashMap<A>* AuxHashMap<A>::deserialize(const void* bytes, size_t len,
if (len < itemsToRead * sizeof(int)) {
throw std::out_of_range("Input array too small to hold AuxHashMap image");
}
- auxHashMap = new (ahmAlloc().allocate(1)) AuxHashMap<A>(lgArrInts, lgConfigK);
+ auxHashMap = new (ahmAlloc(allocator).allocate(1)) AuxHashMap<A>(lgArrInts, lgConfigK, allocator);
for (int i = 0; i < itemsToRead; ++i) {
int pair = auxPtr[i];
if (pair == HllUtil<A>::EMPTY) { continue; }
@@ -110,7 +96,7 @@ AuxHashMap<A>* AuxHashMap<A>::deserialize(const void* bytes, size_t len,
template<typename A>
AuxHashMap<A>* AuxHashMap<A>::deserialize(std::istream& is, const int lgConfigK,
const int auxCount, const int lgAuxArrInts,
- const bool srcCompact) {
+ const bool srcCompact, const A& allocator) {
int lgArrInts = lgAuxArrInts;
if (srcCompact) { // early compact versions didn't use LgArr byte field so ignore input
lgArrInts = HllUtil<A>::computeLgArrInts(HLL, auxCount, lgConfigK);
@@ -118,7 +104,7 @@ AuxHashMap<A>* AuxHashMap<A>::deserialize(std::istream& is, const int lgConfigK,
lgArrInts = lgAuxArrInts;
}
- AuxHashMap<A>* auxHashMap = new (ahmAlloc().allocate(1)) AuxHashMap<A>(lgArrInts, lgConfigK);
+ AuxHashMap<A>* auxHashMap = new (ahmAlloc(allocator).allocate(1)) AuxHashMap<A>(lgArrInts, lgConfigK, allocator);
typedef std::unique_ptr<AuxHashMap<A>, std::function<void(AuxHashMap<A>*)>> aux_hash_map_ptr;
aux_hash_map_ptr aux_ptr(auxHashMap, auxHashMap->make_deleter());
@@ -153,23 +139,17 @@ AuxHashMap<A>* AuxHashMap<A>::deserialize(std::istream& is, const int lgConfigK,
}
template<typename A>
-AuxHashMap<A>::~AuxHashMap<A>() {
- // should be no way to have an object without a valid array
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- intAlloc().deallocate(auxIntArr, 1 << lgAuxArrInts);
-}
-
-template<typename A>
std::function<void(AuxHashMap<A>*)> AuxHashMap<A>::make_deleter() {
return [](AuxHashMap<A>* ptr) {
+ ahmAlloc alloc(ptr->entries.get_allocator());
ptr->~AuxHashMap();
- ahmAlloc().deallocate(ptr, 1);
+ alloc.deallocate(ptr, 1);
};
}
template<typename A>
AuxHashMap<A>* AuxHashMap<A>::copy() const {
- return new (ahmAlloc().allocate(1)) AuxHashMap<A>(*this);
+ return new (ahmAlloc(entries.get_allocator()).allocate(1)) AuxHashMap<A>(*this);
}
template<typename A>
@@ -179,7 +159,7 @@ int AuxHashMap<A>::getAuxCount() const {
template<typename A>
int* AuxHashMap<A>::getAuxIntArr(){
- return auxIntArr;
+ return entries.data();
}
template<typename A>
@@ -199,7 +179,7 @@ int AuxHashMap<A>::getUpdatableSizeBytes() const {
template<typename A>
void AuxHashMap<A>::mustAdd(const int slotNo, const int value) {
- const int index = find(auxIntArr, lgAuxArrInts, lgConfigK, slotNo);
+ const int index = find(entries.data(), lgAuxArrInts, lgConfigK, slotNo);
const int entry_pair = HllUtil<A>::pair(slotNo, value);
if (index >= 0) {
throw std::invalid_argument("Found a slotNo that should not be there: SlotNo: "
@@ -207,16 +187,16 @@ void AuxHashMap<A>::mustAdd(const int slotNo, const int value) {
}
// found empty entry
- auxIntArr[~index] = entry_pair;
+ entries[~index] = entry_pair;
++auxCount;
checkGrow();
}
template<typename A>
int AuxHashMap<A>::mustFindValueFor(const int slotNo) const {
- const int index = find(auxIntArr, lgAuxArrInts, lgConfigK, slotNo);
+ const int index = find(entries.data(), lgAuxArrInts, lgConfigK, slotNo);
if (index >= 0) {
- return HllUtil<A>::getValue(auxIntArr[index]);
+ return HllUtil<A>::getValue(entries[index]);
}
throw std::invalid_argument("slotNo not found: " + std::to_string(slotNo));
@@ -224,9 +204,9 @@ int AuxHashMap<A>::mustFindValueFor(const int slotNo) const {
template<typename A>
void AuxHashMap<A>::mustReplace(const int slotNo, const int value) {
- const int idx = find(auxIntArr, lgAuxArrInts, lgConfigK, slotNo);
+ const int idx = find(entries.data(), lgAuxArrInts, lgConfigK, slotNo);
if (idx >= 0) {
- auxIntArr[idx] = HllUtil<A>::pair(slotNo, value);
+ entries[idx] = HllUtil<A>::pair(slotNo, value);
return;
}
@@ -243,23 +223,18 @@ void AuxHashMap<A>::checkGrow() {
template<typename A>
void AuxHashMap<A>::growAuxSpace() {
- int* oldArray = auxIntArr;
- const int oldArrLen = 1 << lgAuxArrInts;
const int configKmask = (1 << lgConfigK) - 1;
const int newArrLen = 1 << ++lgAuxArrInts;
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- auxIntArr = intAlloc().allocate(newArrLen);
- std::fill(auxIntArr, auxIntArr + newArrLen, 0);
- for (int i = 0; i < oldArrLen; ++i) {
- const int fetched = oldArray[i];
+ vector_int entries_new(newArrLen, 0, entries.get_allocator());
+ for (size_t i = 0; i < entries.size(); ++i) {
+ const int fetched = entries[i];
if (fetched != HllUtil<A>::EMPTY) {
// find empty in new array
- const int idx = find(auxIntArr, lgAuxArrInts, lgConfigK, fetched & configKmask);
- auxIntArr[~idx] = fetched;
+ const int idx = find(entries_new.data(), lgAuxArrInts, lgConfigK, fetched & configKmask);
+ entries_new[~idx] = fetched;
}
}
-
- intAlloc().deallocate(oldArray, oldArrLen);
+ entries = std::move(entries_new);
}
//Searches the Aux arr hash table for an empty or a matching slotNo depending on the context.
@@ -290,12 +265,12 @@ int AuxHashMap<A>::find(const int* auxArr, const int lgAuxArrInts, const int lgC
template<typename A>
coupon_iterator<A> AuxHashMap<A>::begin(bool all) const {
- return coupon_iterator<A>(auxIntArr, 1 << lgAuxArrInts, 0, all);
+ return coupon_iterator<A>(entries.data(), 1 << lgAuxArrInts, 0, all);
}
template<typename A>
coupon_iterator<A> AuxHashMap<A>::end() const {
- return coupon_iterator<A>(auxIntArr, 1 << lgAuxArrInts, 1 << lgAuxArrInts, false);
+ return coupon_iterator<A>(entries.data(), 1 << lgAuxArrInts, 1 << lgAuxArrInts, false);
}
}
diff --git a/be/src/thirdparty/datasketches/AuxHashMap.hpp b/be/src/thirdparty/datasketches/AuxHashMap.hpp
index b37e85c..e18f15d 100644
--- a/be/src/thirdparty/datasketches/AuxHashMap.hpp
+++ b/be/src/thirdparty/datasketches/AuxHashMap.hpp
@@ -28,22 +28,21 @@
namespace datasketches {
-template<typename A = std::allocator<char>>
+template<typename A>
class AuxHashMap final {
public:
- explicit AuxHashMap(int lgAuxArrInts, int lgConfigK);
- explicit AuxHashMap(const AuxHashMap<A>& that);
- static AuxHashMap* newAuxHashMap(int lgAuxArrInts, int lgConfigK);
+ AuxHashMap(int lgAuxArrInts, int lgConfigK, const A& allocator);
+ static AuxHashMap* newAuxHashMap(int lgAuxArrInts, int lgConfigK, const A& allocator);
static AuxHashMap* newAuxHashMap(const AuxHashMap<A>& that);
static AuxHashMap* deserialize(const void* bytes, size_t len,
int lgConfigK,
int auxCount, int lgAuxArrInts,
- bool srcCompact);
+ bool srcCompact, const A& allocator);
static AuxHashMap* deserialize(std::istream& is, int lgConfigK,
int auxCount, int lgAuxArrInts,
- bool srcCompact);
- virtual ~AuxHashMap();
+ bool srcCompact, const A& allocator);
+ virtual ~AuxHashMap() = default;
static std::function<void(AuxHashMap<A>*)> make_deleter();
AuxHashMap* copy() const;
@@ -64,6 +63,8 @@ class AuxHashMap final {
private:
typedef typename std::allocator_traits<A>::template rebind_alloc<AuxHashMap<A>> ahmAlloc;
+ using vector_int = std::vector<int, typename std::allocator_traits<A>::template rebind_alloc<int>>;
+
// static so it can be used when resizing
static int find(const int* auxArr, int lgAuxArrInts, int lgConfigK, int slotNo);
@@ -73,7 +74,7 @@ class AuxHashMap final {
const int lgConfigK;
int lgAuxArrInts;
int auxCount;
- int* auxIntArr;
+ vector_int entries;
};
}
diff --git a/be/src/thirdparty/datasketches/CompositeInterpolationXTable.hpp b/be/src/thirdparty/datasketches/CompositeInterpolationXTable.hpp
index 8baecbe..0fa0af8 100644
--- a/be/src/thirdparty/datasketches/CompositeInterpolationXTable.hpp
+++ b/be/src/thirdparty/datasketches/CompositeInterpolationXTable.hpp
@@ -24,7 +24,7 @@
namespace datasketches {
-template<typename A = std::allocator<char>>
+template<typename A = std::allocator<uint8_t>>
class CompositeInterpolationXTable {
public:
static int get_y_stride(int logK);
@@ -37,4 +37,4 @@ class CompositeInterpolationXTable {
#include "CompositeInterpolationXTable-internal.hpp"
-#endif /* _COMPOSITEINTERPOLATIONXTABLE_HPP_ */
\ No newline at end of file
+#endif /* _COMPOSITEINTERPOLATIONXTABLE_HPP_ */
diff --git a/be/src/thirdparty/datasketches/CouponHashSet-internal.hpp b/be/src/thirdparty/datasketches/CouponHashSet-internal.hpp
index 35facfe..29a3ea7 100644
--- a/be/src/thirdparty/datasketches/CouponHashSet-internal.hpp
+++ b/be/src/thirdparty/datasketches/CouponHashSet-internal.hpp
@@ -31,8 +31,8 @@ template<typename A>
static int find(const int* array, const int lgArrInts, const int coupon);
template<typename A>
-CouponHashSet<A>::CouponHashSet(const int lgConfigK, const target_hll_type tgtHllType)
- : CouponList<A>(lgConfigK, tgtHllType, hll_mode::SET)
+CouponHashSet<A>::CouponHashSet(const int lgConfigK, const target_hll_type tgtHllType, const A& allocator)
+ : CouponList<A>(lgConfigK, tgtHllType, hll_mode::SET, allocator)
{
if (lgConfigK <= 7) {
throw std::invalid_argument("CouponHashSet must be initialized with lgConfigK > 7. Found: "
@@ -41,27 +41,21 @@ CouponHashSet<A>::CouponHashSet(const int lgConfigK, const target_hll_type tgtHl
}
template<typename A>
-CouponHashSet<A>::CouponHashSet(const CouponHashSet<A>& that)
- : CouponList<A>(that) {}
-
-template<typename A>
CouponHashSet<A>::CouponHashSet(const CouponHashSet<A>& that, const target_hll_type tgtHllType)
: CouponList<A>(that, tgtHllType) {}
template<typename A>
-CouponHashSet<A>::~CouponHashSet() {}
-
-template<typename A>
std::function<void(HllSketchImpl<A>*)> CouponHashSet<A>::get_deleter() const {
return [](HllSketchImpl<A>* ptr) {
CouponHashSet<A>* chs = static_cast<CouponHashSet<A>*>(ptr);
+ ChsAlloc chsa(chs->getAllocator());
chs->~CouponHashSet();
- chsAlloc().deallocate(chs, 1);
+ chsa.deallocate(chs, 1);
};
}
template<typename A>
-CouponHashSet<A>* CouponHashSet<A>::newSet(const void* bytes, size_t len) {
+CouponHashSet<A>* CouponHashSet<A>::newSet(const void* bytes, size_t len, const A& allocator) {
if (len < HllUtil<A>::HASH_SET_INT_ARR_START) { // hard-coded
throw std::out_of_range("Input data length insufficient to hold CouponHashSet");
}
@@ -79,7 +73,7 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(const void* bytes, size_t len) {
const hll_mode mode = HllSketchImpl<A>::extractCurMode(data[HllUtil<A>::MODE_BYTE]);
if (mode != SET) {
- throw std::invalid_argument("Calling set construtor with non-set mode data");
+ throw std::invalid_argument("Calling set constructor with non-set mode data");
}
const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[HllUtil<A>::MODE_BYTE]);
@@ -106,7 +100,8 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(const void* bytes, size_t len) {
+ ", found: " + std::to_string(len));
}
- CouponHashSet<A>* sketch = new (chsAlloc().allocate(1)) CouponHashSet<A>(lgK, tgtHllType);
+ ChsAlloc chsa(allocator);
+ CouponHashSet<A>* sketch = new (chsa.allocate(1)) CouponHashSet<A>(lgK, tgtHllType, allocator);
if (compactFlag) {
const uint8_t* curPos = data + HllUtil<A>::HASH_SET_INT_ARR_START;
@@ -116,24 +111,19 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(const void* bytes, size_t len) {
sketch->couponUpdate(coupon);
}
} else {
- int* oldArr = sketch->couponIntArr;
- const size_t oldArrLen = 1 << sketch->lgCouponArrInts;
- sketch->lgCouponArrInts = lgArrInts;
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- sketch->couponIntArr = intAlloc().allocate(1 << lgArrInts);
+ sketch->coupons.resize(1 << lgArrInts);
sketch->couponCount = couponCount;
// only need to read valid coupons, unlike in stream case
- std::memcpy(sketch->couponIntArr,
+ std::memcpy(sketch->coupons.data(),
data + HllUtil<A>::HASH_SET_INT_ARR_START,
couponCount * sizeof(int));
- intAlloc().deallocate(oldArr, oldArrLen);
}
return sketch;
}
template<typename A>
-CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is) {
+CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is, const A& allocator) {
uint8_t listHeader[8];
is.read((char*)listHeader, 8 * sizeof(uint8_t));
@@ -149,7 +139,7 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is) {
hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[HllUtil<A>::MODE_BYTE]);
if (mode != SET) {
- throw std::invalid_argument("Calling set construtor with non-set mode data");
+ throw std::invalid_argument("Calling set constructor with non-set mode data");
}
target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[HllUtil<A>::MODE_BYTE]);
@@ -168,7 +158,8 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is) {
lgArrInts = HllUtil<A>::computeLgArrInts(SET, couponCount, lgK);
}
- CouponHashSet<A>* sketch = new (chsAlloc().allocate(1)) CouponHashSet<A>(lgK, tgtHllType);
+ ChsAlloc chsa(allocator);
+ CouponHashSet<A>* sketch = new (chsa.allocate(1)) CouponHashSet<A>(lgK, tgtHllType, allocator);
typedef std::unique_ptr<CouponHashSet<A>, std::function<void(HllSketchImpl<A>*)>> coupon_hash_set_ptr;
coupon_hash_set_ptr ptr(sketch, sketch->get_deleter());
@@ -181,13 +172,10 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is) {
sketch->couponUpdate(coupon);
}
} else {
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- intAlloc().deallocate(sketch->couponIntArr, 1 << sketch->lgCouponArrInts);
- sketch->lgCouponArrInts = lgArrInts;
- sketch->couponIntArr = intAlloc().allocate(1 << lgArrInts);
+ sketch->coupons.resize(1 << lgArrInts);
sketch->couponCount = couponCount;
// for stream processing, read entire list so read pointer ends up set correctly
- is.read((char*)sketch->couponIntArr, (1 << sketch->lgCouponArrInts) * sizeof(int));
+ is.read((char*)sketch->coupons.data(), sketch->coupons.size() * sizeof(int));
}
if (!is.good())
@@ -198,21 +186,24 @@ CouponHashSet<A>* CouponHashSet<A>::newSet(std::istream& is) {
template<typename A>
CouponHashSet<A>* CouponHashSet<A>::copy() const {
- return new (chsAlloc().allocate(1)) CouponHashSet<A>(*this);
+ ChsAlloc chsa(this->coupons.get_allocator());
+ return new (chsa.allocate(1)) CouponHashSet<A>(*this);
}
template<typename A>
CouponHashSet<A>* CouponHashSet<A>::copyAs(const target_hll_type tgtHllType) const {
- return new (chsAlloc().allocate(1)) CouponHashSet<A>(*this, tgtHllType);
+ ChsAlloc chsa(this->coupons.get_allocator());
+ return new (chsa.allocate(1)) CouponHashSet<A>(*this, tgtHllType);
}
template<typename A>
HllSketchImpl<A>* CouponHashSet<A>::couponUpdate(int coupon) {
- const int index = find<A>(this->couponIntArr, this->lgCouponArrInts, coupon);
+ const uint8_t lgCouponArrInts = count_trailing_zeros_in_u32(this->coupons.size());
+ const int index = find<A>(this->coupons.data(), lgCouponArrInts, coupon);
if (index >= 0) {
return this; // found duplicate, ignore
}
- this->couponIntArr[~index] = coupon; // found empty
+ this->coupons[~index] = coupon; // found empty
++this->couponCount;
if (checkGrowOrPromote()) {
return this->promoteHeapListOrSetToHll(*this);
@@ -232,39 +223,34 @@ int CouponHashSet<A>::getPreInts() const {
template<typename A>
bool CouponHashSet<A>::checkGrowOrPromote() {
- if ((HllUtil<A>::RESIZE_DENOM * this->couponCount) > (HllUtil<A>::RESIZE_NUMER * (1 << this->lgCouponArrInts))) {
- if (this->lgCouponArrInts == (this->lgConfigK - 3)) { // at max size
+ if (static_cast<size_t>(HllUtil<A>::RESIZE_DENOM * this->couponCount) > (HllUtil<A>::RESIZE_NUMER * this->coupons.size())) {
+ const uint8_t lgCouponArrInts = count_trailing_zeros_in_u32(this->coupons.size());
+ if (lgCouponArrInts == (this->lgConfigK - 3)) { // at max size
return true; // promote to HLL
}
- int tgtLgCoupArrSize = this->lgCouponArrInts + 1;
- growHashSet(this->lgCouponArrInts, tgtLgCoupArrSize);
+ growHashSet(lgCouponArrInts + 1);
}
return false;
}
template<typename A>
-void CouponHashSet<A>::growHashSet(const int srcLgCoupArrSize, const int tgtLgCoupArrSize) {
+void CouponHashSet<A>::growHashSet(int tgtLgCoupArrSize) {
const int tgtLen = 1 << tgtLgCoupArrSize;
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- int* tgtCouponIntArr = intAlloc().allocate(tgtLen);
- std::fill(tgtCouponIntArr, tgtCouponIntArr + tgtLen, 0);
+ vector_int coupons_new(tgtLen, 0, this->coupons.get_allocator());
- const int srcLen = 1 << srcLgCoupArrSize;
+ const int srcLen = this->coupons.size();
for (int i = 0; i < srcLen; ++i) { // scan existing array for non-zero values
- const int fetched = this->couponIntArr[i];
+ const int fetched = this->coupons[i];
if (fetched != HllUtil<A>::EMPTY) {
- const int idx = find<A>(tgtCouponIntArr, tgtLgCoupArrSize, fetched); // search TGT array
+ const int idx = find<A>(coupons_new.data(), tgtLgCoupArrSize, fetched); // search TGT array
if (idx < 0) { // found EMPTY
- tgtCouponIntArr[~idx] = fetched; // insert
+ coupons_new[~idx] = fetched; // insert
continue;
}
throw std::runtime_error("Error: Found duplicate coupon");
}
}
-
- intAlloc().deallocate(this->couponIntArr, 1 << this->lgCouponArrInts);
- this->couponIntArr = tgtCouponIntArr;
- this->lgCouponArrInts = tgtLgCoupArrSize;
+ this->coupons = std::move(coupons_new);
}
template<typename A>
diff --git a/be/src/thirdparty/datasketches/CouponHashSet.hpp b/be/src/thirdparty/datasketches/CouponHashSet.hpp
index 7aaffc3..b9b99b7 100644
--- a/be/src/thirdparty/datasketches/CouponHashSet.hpp
+++ b/be/src/thirdparty/datasketches/CouponHashSet.hpp
@@ -24,20 +24,20 @@
namespace datasketches {
-template<typename A = std::allocator<char>>
+template<typename A>
class CouponHashSet : public CouponList<A> {
public:
- static CouponHashSet* newSet(const void* bytes, size_t len);
- static CouponHashSet* newSet(std::istream& is);
- explicit CouponHashSet(int lgConfigK, target_hll_type tgtHllType);
- explicit CouponHashSet(const CouponHashSet& that, target_hll_type tgtHllType);
- explicit CouponHashSet(const CouponHashSet& that);
+ static CouponHashSet* newSet(const void* bytes, size_t len, const A& allocator);
+ static CouponHashSet* newSet(std::istream& is, const A& allocator);
+ CouponHashSet(int lgConfigK, target_hll_type tgtHllType, const A& allocator);
+ CouponHashSet(const CouponHashSet& that, target_hll_type tgtHllType);
- virtual ~CouponHashSet();
+ virtual ~CouponHashSet() = default;
virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
protected:
-
+ using vector_int = std::vector<int, typename std::allocator_traits<A>::template rebind_alloc<int>>;
+
virtual CouponHashSet* copy() const;
virtual CouponHashSet* copyAs(target_hll_type tgtHllType) const;
@@ -49,9 +49,9 @@ class CouponHashSet : public CouponList<A> {
friend class HllSketchImplFactory<A>;
private:
- typedef typename std::allocator_traits<A>::template rebind_alloc<CouponHashSet<A>> chsAlloc;
+ using ChsAlloc = typename std::allocator_traits<A>::template rebind_alloc<CouponHashSet<A>>;
bool checkGrowOrPromote();
- void growHashSet(int srcLgCoupArrSize, int tgtLgCoupArrSize);
+ void growHashSet(int tgtLgCoupArrSize);
};
}
diff --git a/be/src/thirdparty/datasketches/CouponList-internal.hpp b/be/src/thirdparty/datasketches/CouponList-internal.hpp
index 1800a37..fd304c8 100644
--- a/be/src/thirdparty/datasketches/CouponList-internal.hpp
+++ b/be/src/thirdparty/datasketches/CouponList-internal.hpp
@@ -23,6 +23,7 @@
#include "CouponList.hpp"
#include "CubicInterpolation.hpp"
#include "HllUtil.hpp"
+#include "count_zeros.hpp"
#include <algorithm>
#include <cmath>
@@ -30,74 +31,45 @@
namespace datasketches {
template<typename A>
-CouponList<A>::CouponList(const int lgConfigK, const target_hll_type tgtHllType, const hll_mode mode)
- : HllSketchImpl<A>(lgConfigK, tgtHllType, mode, false) {
- if (mode == hll_mode::LIST) {
- lgCouponArrInts = HllUtil<A>::LG_INIT_LIST_SIZE;
- } else { // mode == SET
- lgCouponArrInts = HllUtil<A>::LG_INIT_SET_SIZE;
- }
- oooFlag = false;
- const int arrayLen = 1 << lgCouponArrInts;
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- couponIntArr = intAlloc().allocate(arrayLen);
- std::fill(couponIntArr, couponIntArr + arrayLen, 0);
- couponCount = 0;
-}
-
-template<typename A>
-CouponList<A>::CouponList(const CouponList& that)
- : HllSketchImpl<A>(that.lgConfigK, that.tgtHllType, that.mode, false),
- lgCouponArrInts(that.lgCouponArrInts),
- couponCount(that.couponCount),
- oooFlag(that.oooFlag) {
-
- const int numItems = 1 << lgCouponArrInts;
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- couponIntArr = intAlloc().allocate(numItems);
- std::copy(that.couponIntArr, that.couponIntArr + numItems, couponIntArr);
-}
-
-template<typename A>
-CouponList<A>::CouponList(const CouponList& that, const target_hll_type tgtHllType)
- : HllSketchImpl<A>(that.lgConfigK, tgtHllType, that.mode, false),
- lgCouponArrInts(that.lgCouponArrInts),
- couponCount(that.couponCount),
- oooFlag(that.oooFlag) {
-
- const int numItems = 1 << lgCouponArrInts;
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- couponIntArr = intAlloc().allocate(numItems);
- std::copy(that.couponIntArr, that.couponIntArr + numItems, couponIntArr);
-}
+CouponList<A>::CouponList(const int lgConfigK, const target_hll_type tgtHllType, const hll_mode mode, const A& allocator):
+HllSketchImpl<A>(lgConfigK, tgtHllType, mode, false),
+couponCount(0),
+oooFlag(false),
+coupons(1 << (mode == hll_mode::LIST ? HllUtil<A>::LG_INIT_LIST_SIZE : HllUtil<A>::LG_INIT_SET_SIZE), 0, allocator)
+{}
template<typename A>
-CouponList<A>::~CouponList() {
- typedef typename std::allocator_traits<A>::template rebind_alloc<int> intAlloc;
- intAlloc().deallocate(couponIntArr, 1 << lgCouponArrInts);
-}
+CouponList<A>::CouponList(const CouponList& that, const target_hll_type tgtHllType):
+HllSketchImpl<A>(that.lgConfigK, tgtHllType, that.mode, false),
+couponCount(that.couponCount),
+oooFlag(that.oooFlag),
+coupons(that.coupons)
+{}
template<typename A>
std::function<void(HllSketchImpl<A>*)> CouponList<A>::get_deleter() const {
return [](HllSketchImpl<A>* ptr) {
CouponList<A>* cl = static_cast<CouponList<A>*>(ptr);
+ ClAlloc cla(cl->getAllocator());
cl->~CouponList();
- clAlloc().deallocate(cl, 1);
+ cla.deallocate(cl, 1);
};
}
template<typename A>
CouponList<A>* CouponList<A>::copy() const {
- return new (clAlloc().allocate(1)) CouponList<A>(*this);
+ ClAlloc cla(coupons.get_allocator());
+ return new (cla.allocate(1)) CouponList<A>(*this);
}
template<typename A>
CouponList<A>* CouponList<A>::copyAs(target_hll_type tgtHllType) const {
- return new (clAlloc().allocate(1)) CouponList<A>(*this, tgtHllType);
+ ClAlloc cla(coupons.get_allocator());
+ return new (cla.allocate(1)) CouponList<A>(*this, tgtHllType);
}
template<typename A>
-CouponList<A>* CouponList<A>::newList(const void* bytes, size_t len) {
+CouponList<A>* CouponList<A>::newList(const void* bytes, size_t len, const A& allocator) {
if (len < HllUtil<A>::LIST_INT_ARR_START) {
throw std::out_of_range("Input data length insufficient to hold CouponHashSet");
}
@@ -115,7 +87,7 @@ CouponList<A>* CouponList<A>::newList(const void* bytes, size_t len) {
hll_mode mode = HllSketchImpl<A>::extractCurMode(data[HllUtil<A>::MODE_BYTE]);
if (mode != LIST) {
- throw std::invalid_argument("Calling set construtor with non-list mode data");
+ throw std::invalid_argument("Calling list constructor with non-list mode data");
}
target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(data[HllUtil<A>::MODE_BYTE]);
@@ -133,20 +105,21 @@ CouponList<A>* CouponList<A>::newList(const void* bytes, size_t len) {
+ ", found: " + std::to_string(len));
}
- CouponList<A>* sketch = new (clAlloc().allocate(1)) CouponList<A>(lgK, tgtHllType, mode);
+ ClAlloc cla(allocator);
+ CouponList<A>* sketch = new (cla.allocate(1)) CouponList<A>(lgK, tgtHllType, mode, allocator);
sketch->couponCount = couponCount;
sketch->putOutOfOrderFlag(oooFlag); // should always be false for LIST
if (!emptyFlag) {
// only need to read valid coupons, unlike in stream case
- std::memcpy(sketch->couponIntArr, data + HllUtil<A>::LIST_INT_ARR_START, couponCount * sizeof(int));
+ std::memcpy(sketch->coupons.data(), data + HllUtil<A>::LIST_INT_ARR_START, couponCount * sizeof(int));
}
return sketch;
}
template<typename A>
-CouponList<A>* CouponList<A>::newList(std::istream& is) {
+CouponList<A>* CouponList<A>::newList(std::istream& is, const A& allocator) {
uint8_t listHeader[8];
is.read((char*)listHeader, 8 * sizeof(uint8_t));
@@ -162,7 +135,7 @@ CouponList<A>* CouponList<A>::newList(std::istream& is) {
hll_mode mode = HllSketchImpl<A>::extractCurMode(listHeader[HllUtil<A>::MODE_BYTE]);
if (mode != LIST) {
- throw std::invalid_argument("Calling list construtor with non-list mode data");
+ throw std::invalid_argument("Calling list constructor with non-list mode data");
}
const target_hll_type tgtHllType = HllSketchImpl<A>::extractTgtHllType(listHeader[HllUtil<A>::MODE_BYTE]);
@@ -172,8 +145,9 @@ CouponList<A>* CouponList<A>::newList(std::istream& is) {
const bool oooFlag = ((listHeader[HllUtil<A>::FLAGS_BYTE] & HllUtil<A>::OUT_OF_ORDER_FLAG_MASK) ? true : false);
const bool emptyFlag = ((listHeader[HllUtil<A>::FLAGS_BYTE] & HllUtil<A>::EMPTY_FLAG_MASK) ? true : false);
- CouponList<A>* sketch = new (clAlloc().allocate(1)) CouponList<A>(lgK, tgtHllType, mode);
- typedef std::unique_ptr<CouponList<A>, std::function<void(HllSketchImpl<A>*)>> coupon_list_ptr;
+ ClAlloc cla(allocator);
+ CouponList<A>* sketch = new (cla.allocate(1)) CouponList<A>(lgK, tgtHllType, mode, allocator);
+ using coupon_list_ptr = std::unique_ptr<CouponList<A>, std::function<void(HllSketchImpl<A>*)>>;
coupon_list_ptr ptr(sketch, sketch->get_deleter());
const int couponCount = listHeader[HllUtil<A>::LIST_COUNT_BYTE];
sketch->couponCount = couponCount;
@@ -183,8 +157,8 @@ CouponList<A>* CouponList<A>::newList(std::istream& is) {
// For stream processing, need to read entire number written to stream so read
// pointer ends up set correctly.
// If not compact, still need to read empty items even though in order.
- const int numToRead = (compact ? couponCount : (1 << sketch->lgCouponArrInts));
- is.read((char*)sketch->couponIntArr, numToRead * sizeof(int));
+ const int numToRead = (compact ? couponCount : sketch->coupons.size());
+ is.read((char*)sketch->coupons.data(), numToRead * sizeof(int));
}
if (!is.good())
@@ -196,14 +170,14 @@ CouponList<A>* CouponList<A>::newList(std::istream& is) {
template<typename A>
vector_u8<A> CouponList<A>::serialize(bool compact, unsigned header_size_bytes) const {
const size_t sketchSizeBytes = (compact ? getCompactSerializationBytes() : getUpdatableSerializationBytes()) + header_size_bytes;
- vector_u8<A> byteArr(sketchSizeBytes);
+ vector_u8<A> byteArr(sketchSizeBytes, 0, getAllocator());
uint8_t* bytes = byteArr.data() + header_size_bytes;
bytes[HllUtil<A>::PREAMBLE_INTS_BYTE] = static_cast<uint8_t>(getPreInts());
bytes[HllUtil<A>::SER_VER_BYTE] = static_cast<uint8_t>(HllUtil<A>::SER_VER);
bytes[HllUtil<A>::FAMILY_BYTE] = static_cast<uint8_t>(HllUtil<A>::FAMILY_ID);
bytes[HllUtil<A>::LG_K_BYTE] = static_cast<uint8_t>(this->lgConfigK);
- bytes[HllUtil<A>::LG_ARR_BYTE] = static_cast<uint8_t>(lgCouponArrInts);
+ bytes[HllUtil<A>::LG_ARR_BYTE] = count_trailing_zeros_in_u32(coupons.size());
bytes[HllUtil<A>::FLAGS_BYTE] = this->makeFlagsByte(compact);
bytes[HllUtil<A>::LIST_COUNT_BYTE] = static_cast<uint8_t>(this->mode == LIST ? couponCount : 0);
bytes[HllUtil<A>::MODE_BYTE] = this->makeModeByte();
@@ -217,7 +191,7 @@ vector_u8<A> CouponList<A>::serialize(bool compact, unsigned header_size_bytes)
const int sw = (isCompact() ? 2 : 0) | (compact ? 1 : 0);
switch (sw) {
case 0: { // src updatable, dst updatable
- std::memcpy(bytes + getMemDataStart(), getCouponIntArr(), (1 << lgCouponArrInts) * sizeof(int));
+ std::memcpy(bytes + getMemDataStart(), coupons.data(), coupons.size() * sizeof(int));
break;
}
case 1: { // src updatable, dst compact
@@ -247,7 +221,7 @@ void CouponList<A>::serialize(std::ostream& os, const bool compact) const {
os.write((char*)&familyId, sizeof(familyId));
const uint8_t lgKByte((uint8_t) this->lgConfigK);
os.write((char*)&lgKByte, sizeof(lgKByte));
- const uint8_t lgArrIntsByte((uint8_t) lgCouponArrInts);
+ const uint8_t lgArrIntsByte(count_trailing_zeros_in_u32(coupons.size()));
os.write((char*)&lgArrIntsByte, sizeof(lgArrIntsByte));
const uint8_t flagsByte(this->makeFlagsByte(compact));
os.write((char*)&flagsByte, sizeof(flagsByte));
@@ -273,7 +247,7 @@ void CouponList<A>::serialize(std::ostream& os, const bool compact) const {
const int sw = (isCompact() ? 2 : 0) | (compact ? 1 : 0);
switch (sw) {
case 0: { // src updatable, dst updatable
- os.write((char*)getCouponIntArr(), (1 << lgCouponArrInts) * sizeof(int));
+ os.write((char*)coupons.data(), coupons.size() * sizeof(int));
break;
}
case 1: { // src updatable, dst compact
@@ -292,13 +266,12 @@ void CouponList<A>::serialize(std::ostream& os, const bool compact) const {
template<typename A>
HllSketchImpl<A>* CouponList<A>::couponUpdate(int coupon) {
- const int len = 1 << lgCouponArrInts;
- for (int i = 0; i < len; ++i) { // search for empty slot
- const int couponAtIdx = couponIntArr[i];
+ for (size_t i = 0; i < coupons.size(); ++i) { // search for empty slot
+ const int couponAtIdx = coupons[i];
if (couponAtIdx == HllUtil<A>::EMPTY) {
- couponIntArr[i] = coupon; // the actual update
+ coupons[i] = coupon; // the actual update
++couponCount;
- if (couponCount >= len) { // array full
+ if (couponCount == static_cast<int>(coupons.size())) { // array full
if (this->lgConfigK < 8) {
return promoteHeapListOrSetToHll(*this);
}
@@ -348,7 +321,7 @@ bool CouponList<A>::isEmpty() const { return getCouponCount() == 0; }
template<typename A>
int CouponList<A>::getUpdatableSerializationBytes() const {
- return getMemDataStart() + (4 << getLgCouponArrInts());
+ return getMemDataStart() + coupons.size() * sizeof(int);
}
template<typename A>
@@ -383,13 +356,8 @@ void CouponList<A>::putOutOfOrderFlag(bool oooFlag) {
}
template<typename A>
-int CouponList<A>::getLgCouponArrInts() const {
- return lgCouponArrInts;
-}
-
-template<typename A>
-int* CouponList<A>::getCouponIntArr() const {
- return couponIntArr;
+A CouponList<A>::getAllocator() const {
+ return coupons.get_allocator();
}
template<typename A>
@@ -404,12 +372,12 @@ HllSketchImpl<A>* CouponList<A>::promoteHeapListOrSetToHll(CouponList& src) {
template<typename A>
coupon_iterator<A> CouponList<A>::begin(bool all) const {
- return coupon_iterator<A>(couponIntArr, 1 << lgCouponArrInts, 0, all);
+ return coupon_iterator<A>(coupons.data(), coupons.size(), 0, all);
}
template<typename A>
coupon_iterator<A> CouponList<A>::end() const {
- return coupon_iterator<A>(couponIntArr, 1 << lgCouponArrInts, 1 << lgCouponArrInts, false);
+ return coupon_iterator<A>(coupons.data(), coupons.size(), coupons.size(), false);
}
}
diff --git a/be/src/thirdparty/datasketches/CouponList.hpp b/be/src/thirdparty/datasketches/CouponList.hpp
index 063805b..c19569e 100644
--- a/be/src/thirdparty/datasketches/CouponList.hpp
+++ b/be/src/thirdparty/datasketches/CouponList.hpp
@@ -30,19 +30,18 @@ namespace datasketches {
template<typename A>
class HllSketchImplFactory;
-template<typename A = std::allocator<char>>
+template<typename A>
class CouponList : public HllSketchImpl<A> {
public:
- explicit CouponList(int lgConfigK, target_hll_type tgtHllType, hll_mode mode);
- explicit CouponList(const CouponList& that);
- explicit CouponList(const CouponList& that, target_hll_type tgtHllType);
+ CouponList(int lgConfigK, target_hll_type tgtHllType, hll_mode mode, const A& allocator);
+ CouponList(const CouponList& that, target_hll_type tgtHllType);
- static CouponList* newList(const void* bytes, size_t len);
- static CouponList* newList(std::istream& is);
+ static CouponList* newList(const void* bytes, size_t len, const A& allocator);
+ static CouponList* newList(std::istream& is, const A& allocator);
virtual vector_u8<A> serialize(bool compact, unsigned header_size_bytes) const;
virtual void serialize(std::ostream& os, bool compact) const;
- virtual ~CouponList();
+ virtual ~CouponList() = default;
virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
virtual CouponList* copy() const;
@@ -62,7 +61,9 @@ class CouponList : public HllSketchImpl<A> {
coupon_iterator<A> end() const;
protected:
- typedef typename std::allocator_traits<A>::template rebind_alloc<CouponList<A>> clAlloc;
+ using ClAlloc = typename std::allocator_traits<A>::template rebind_alloc<CouponList<A>>;
+
+ using vector_int = std::vector<int, typename std::allocator_traits<A>::template rebind_alloc<int>>;
HllSketchImpl<A>* promoteHeapListToSet(CouponList& list);
HllSketchImpl<A>* promoteHeapListOrSetToHll(CouponList& src);
@@ -75,13 +76,11 @@ class CouponList : public HllSketchImpl<A> {
virtual bool isOutOfOrderFlag() const;
virtual void putOutOfOrderFlag(bool oooFlag);
- virtual int getLgCouponArrInts() const;
- virtual int* getCouponIntArr() const;
+ virtual A getAllocator() const;
- int lgCouponArrInts;
int couponCount;
bool oooFlag;
- int* couponIntArr;
+ vector_int coupons;
friend class HllSketchImplFactory<A>;
};
diff --git a/be/src/thirdparty/datasketches/CubicInterpolation.hpp b/be/src/thirdparty/datasketches/CubicInterpolation.hpp
index b9cdfe7..58fb7d7 100644
--- a/be/src/thirdparty/datasketches/CubicInterpolation.hpp
+++ b/be/src/thirdparty/datasketches/CubicInterpolation.hpp
@@ -24,7 +24,7 @@
namespace datasketches {
-template<typename A = std::allocator<char>>
+template<typename A = std::allocator<uint8_t>>
class CubicInterpolation {
public:
static double usingXAndYTables(const double xArr[], const double yArr[],
@@ -40,4 +40,4 @@ class CubicInterpolation {
#include "CubicInterpolation-internal.hpp"
-#endif /* _CUBICINTERPOLATION_HPP_ */
\ No newline at end of file
+#endif /* _CUBICINTERPOLATION_HPP_ */
diff --git a/be/src/thirdparty/datasketches/HarmonicNumbers.hpp b/be/src/thirdparty/datasketches/HarmonicNumbers.hpp
index 501ce0c..34b830a 100644
--- a/be/src/thirdparty/datasketches/HarmonicNumbers.hpp
+++ b/be/src/thirdparty/datasketches/HarmonicNumbers.hpp
@@ -25,7 +25,7 @@
namespace datasketches {
-template<typename A = std::allocator<char>>
+template<typename A = std::allocator<uint8_t>>
class HarmonicNumbers {
public:
/**
@@ -45,4 +45,4 @@ class HarmonicNumbers {
#include "HarmonicNumbers-internal.hpp"
-#endif /* _HARMONICNUMBERS_HPP_ */
\ No newline at end of file
+#endif /* _HARMONICNUMBERS_HPP_ */
diff --git a/be/src/thirdparty/datasketches/Hll4Array-internal.hpp b/be/src/thirdparty/datasketches/Hll4Array-internal.hpp
index 8498bb8..f93014a 100644
--- a/be/src/thirdparty/datasketches/Hll4Array-internal.hpp
+++ b/be/src/thirdparty/datasketches/Hll4Array-internal.hpp
@@ -30,13 +30,12 @@
namespace datasketches {
template<typename A>
-Hll4Array<A>::Hll4Array(const int lgConfigK, const bool startFullSize) :
- HllArray<A>(lgConfigK, target_hll_type::HLL_4, startFullSize) {
+Hll4Array<A>::Hll4Array(const int lgConfigK, const bool startFullSize, const A& allocator):
+HllArray<A>(lgConfigK, target_hll_type::HLL_4, startFullSize, allocator),
+auxHashMap(nullptr)
+{
const int numBytes = this->hll4ArrBytes(lgConfigK);
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint8_t> uint8Alloc;
- this->hllByteArr = uint8Alloc().allocate(numBytes);
- std::fill(this->hllByteArr, this->hllByteArr + numBytes, 0);
- auxHashMap = nullptr;
+ this->hllByteArr.resize(numBytes, 0);
}
template<typename A>
@@ -63,17 +62,19 @@ Hll4Array<A>::~Hll4Array() {
template<typename A>
std::function<void(HllSketchImpl<A>*)> Hll4Array<A>::get_deleter() const {
return [](HllSketchImpl<A>* ptr) {
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll4Array<A>> hll4Alloc;
Hll4Array<A>* hll = static_cast<Hll4Array<A>*>(ptr);
+ using Hll4Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll4Array<A>>;
+ Hll4Alloc hll4Alloc(hll->getAllocator());
hll->~Hll4Array();
- hll4Alloc().deallocate(hll, 1);
+ hll4Alloc.deallocate(hll, 1);
};
}
template<typename A>
Hll4Array<A>* Hll4Array<A>::copy() const {
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll4Array<A>> hll4Alloc;
- return new (hll4Alloc().allocate(1)) Hll4Array<A>(*this);
+ using Hll4Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll4Array<A>>;
+ Hll4Alloc hll4Alloc(this->getAllocator());
+ return new (hll4Alloc.allocate(1)) Hll4Array<A>(*this);
}
template<typename A>
@@ -195,7 +196,7 @@ void Hll4Array<A>::internalHll4Update(const int slotNo, const int newVal) {
// added to the exception table
putSlot(slotNo, HllUtil<A>::AUX_TOKEN);
if (auxHashMap == nullptr) {
- auxHashMap = AuxHashMap<A>::newAuxHashMap(HllUtil<A>::LG_AUX_ARR_INTS[this->lgConfigK], this->lgConfigK);
+ auxHashMap = AuxHashMap<A>::newAuxHashMap(HllUtil<A>::LG_AUX_ARR_INTS[this->lgConfigK], this->lgConfigK, this->getAllocator());
}
auxHashMap->mustAdd(slotNo, newVal);
}
@@ -285,7 +286,7 @@ void Hll4Array<A>::shiftToBiggerCurMin() {
} else { //newShiftedVal >= AUX_TOKEN
// the former exception remains an exception, so must be added to the newAuxMap
if (newAuxMap == nullptr) {
- newAuxMap = AuxHashMap<A>::newAuxHashMap(HllUtil<A>::LG_AUX_ARR_INTS[this->lgConfigK], this->lgConfigK);
+ newAuxMap = AuxHashMap<A>::newAuxHashMap(HllUtil<A>::LG_AUX_ARR_INTS[this->lgConfigK], this->lgConfigK, this->getAllocator());
}
newAuxMap->mustAdd(slotNum, oldActualVal);
}
@@ -315,12 +316,12 @@ void Hll4Array<A>::shiftToBiggerCurMin() {
template<typename A>
typename HllArray<A>::const_iterator Hll4Array<A>::begin(bool all) const {
- return typename HllArray<A>::const_iterator(this->hllByteArr, 1 << this->lgConfigK, 0, this->tgtHllType, auxHashMap, this->curMin, all);
+ return typename HllArray<A>::const_iterator(this->hllByteArr.data(), 1 << this->lgConfigK, 0, this->tgtHllType, auxHashMap, this->curMin, all);
}
template<typename A>
typename HllArray<A>::const_iterator Hll4Array<A>::end() const {
- return typename HllArray<A>::const_iterator(this->hllByteArr, 1 << this->lgConfigK, 1 << this->lgConfigK, this->tgtHllType, auxHashMap, this->curMin, false);
+ return typename HllArray<A>::const_iterator(this->hllByteArr.data(), 1 << this->lgConfigK, 1 << this->lgConfigK, this->tgtHllType, auxHashMap, this->curMin, false);
}
template<typename A>
diff --git a/be/src/thirdparty/datasketches/Hll4Array.hpp b/be/src/thirdparty/datasketches/Hll4Array.hpp
index ff56c86..38b2c94 100644
--- a/be/src/thirdparty/datasketches/Hll4Array.hpp
+++ b/be/src/thirdparty/datasketches/Hll4Array.hpp
@@ -31,7 +31,7 @@ class Hll4Iterator;
template<typename A>
class Hll4Array final : public HllArray<A> {
public:
- explicit Hll4Array(int lgConfigK, bool startFullSize);
+ explicit Hll4Array(int lgConfigK, bool startFullSize, const A& allocator);
explicit Hll4Array(const Hll4Array<A>& that);
virtual ~Hll4Array();
diff --git a/be/src/thirdparty/datasketches/Hll6Array-internal.hpp b/be/src/thirdparty/datasketches/Hll6Array-internal.hpp
index a318564..e9f6e9f 100644
--- a/be/src/thirdparty/datasketches/Hll6Array-internal.hpp
+++ b/be/src/thirdparty/datasketches/Hll6Array-internal.hpp
@@ -27,40 +27,29 @@
namespace datasketches {
template<typename A>
-Hll6Array<A>::Hll6Array(const int lgConfigK, const bool startFullSize) :
- HllArray<A>(lgConfigK, target_hll_type::HLL_6, startFullSize) {
- const int numBytes = this->hll6ArrBytes(lgConfigK);
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint8_t> uint8Alloc;
- this->hllByteArr = uint8Alloc().allocate(numBytes);
- std::fill(this->hllByteArr, this->hllByteArr + numBytes, 0);
-}
-
-template<typename A>
-Hll6Array<A>::Hll6Array(const Hll6Array<A>& that) :
- HllArray<A>(that)
+Hll6Array<A>::Hll6Array(const int lgConfigK, const bool startFullSize, const A& allocator):
+HllArray<A>(lgConfigK, target_hll_type::HLL_6, startFullSize, allocator)
{
- // can determine hllByteArr size in parent class, no need to allocate here
-}
-
-template<typename A>
-Hll6Array<A>::~Hll6Array() {
- // hllByteArr deleted in parent
+ const int numBytes = this->hll6ArrBytes(lgConfigK);
+ this->hllByteArr.resize(numBytes, 0);
}
template<typename A>
std::function<void(HllSketchImpl<A>*)> Hll6Array<A>::get_deleter() const {
return [](HllSketchImpl<A>* ptr) {
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll6Array<A>> hll6Alloc;
+ using Hll6Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll6Array<A>>;
Hll6Array<A>* hll = static_cast<Hll6Array<A>*>(ptr);
+ Hll6Alloc hll6Alloc(hll->getAllocator());
hll->~Hll6Array();
- hll6Alloc().deallocate(hll, 1);
+ hll6Alloc.deallocate(hll, 1);
};
}
template<typename A>
Hll6Array<A>* Hll6Array<A>::copy() const {
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll6Array<A>> hll6Alloc;
- return new (hll6Alloc().allocate(1)) Hll6Array<A>(*this);
+ using Hll6Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll6Array<A>>;
+ Hll6Alloc hll6Alloc(this->getAllocator());
+ return new (hll6Alloc.allocate(1)) Hll6Array<A>(*this);
}
template<typename A>
diff --git a/be/src/thirdparty/datasketches/Hll6Array.hpp b/be/src/thirdparty/datasketches/Hll6Array.hpp
index 5178de8..03370b2 100644
--- a/be/src/thirdparty/datasketches/Hll6Array.hpp
+++ b/be/src/thirdparty/datasketches/Hll6Array.hpp
@@ -30,10 +30,9 @@ class Hll6Iterator;
template<typename A>
class Hll6Array final : public HllArray<A> {
public:
- explicit Hll6Array(int lgConfigK, bool startFullSize);
- explicit Hll6Array(const Hll6Array<A>& that);
+ Hll6Array(int lgConfigK, bool startFullSize, const A& allocator);
- virtual ~Hll6Array();
+ virtual ~Hll6Array() = default;
virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
virtual Hll6Array* copy() const;
diff --git a/be/src/thirdparty/datasketches/Hll8Array-internal.hpp b/be/src/thirdparty/datasketches/Hll8Array-internal.hpp
index cb14a0f..f27a796 100644
--- a/be/src/thirdparty/datasketches/Hll8Array-internal.hpp
+++ b/be/src/thirdparty/datasketches/Hll8Array-internal.hpp
@@ -25,40 +25,29 @@
namespace datasketches {
template<typename A>
-Hll8Array<A>::Hll8Array(const int lgConfigK, const bool startFullSize) :
- HllArray<A>(lgConfigK, target_hll_type::HLL_8, startFullSize) {
- const int numBytes = this->hll8ArrBytes(lgConfigK);
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint8_t> uint8Alloc;
- this->hllByteArr = uint8Alloc().allocate(numBytes);
- std::fill(this->hllByteArr, this->hllByteArr + numBytes, 0);
-}
-
-template<typename A>
-Hll8Array<A>::Hll8Array(const Hll8Array<A>& that) :
- HllArray<A>(that)
+Hll8Array<A>::Hll8Array(const int lgConfigK, const bool startFullSize, const A& allocator):
+HllArray<A>(lgConfigK, target_hll_type::HLL_8, startFullSize, allocator)
{
- // can determine hllByteArr size in parent class, no need to allocate here
-}
-
-template<typename A>
-Hll8Array<A>::~Hll8Array() {
- // hllByteArr deleted in parent
+ const int numBytes = this->hll8ArrBytes(lgConfigK);
+ this->hllByteArr.resize(numBytes, 0);
}
template<typename A>
std::function<void(HllSketchImpl<A>*)> Hll8Array<A>::get_deleter() const {
return [](HllSketchImpl<A>* ptr) {
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll8Array<A>> hll8Alloc;
Hll8Array<A>* hll = static_cast<Hll8Array<A>*>(ptr);
+ using Hll8Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll8Array<A>>;
+ Hll8Alloc hll8Alloc(hll->getAllocator());
hll->~Hll8Array();
- hll8Alloc().deallocate(hll, 1);
+ hll8Alloc.deallocate(hll, 1);
};
}
template<typename A>
Hll8Array<A>* Hll8Array<A>::copy() const {
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll8Array<A>> hll8Alloc;
- return new (hll8Alloc().allocate(1)) Hll8Array<A>(*this);
+ using Hll8Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll8Array<A>>;
+ Hll8Alloc hll8Alloc(this->getAllocator());
+ return new (hll8Alloc.allocate(1)) Hll8Array<A>(*this);
}
template<typename A>
diff --git a/be/src/thirdparty/datasketches/Hll8Array.hpp b/be/src/thirdparty/datasketches/Hll8Array.hpp
index 2b0aefc..ea9a5bd 100644
--- a/be/src/thirdparty/datasketches/Hll8Array.hpp
+++ b/be/src/thirdparty/datasketches/Hll8Array.hpp
@@ -30,10 +30,9 @@ class Hll8Iterator;
template<typename A>
class Hll8Array final : public HllArray<A> {
public:
- explicit Hll8Array(int lgConfigK, bool startFullSize);
- explicit Hll8Array(const Hll8Array& that);
+ Hll8Array(int lgConfigK, bool startFullSize, const A& allocator);
- virtual ~Hll8Array();
+ virtual ~Hll8Array() = default;
virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const;
virtual Hll8Array<A>* copy() const;
diff --git a/be/src/thirdparty/datasketches/HllArray-internal.hpp b/be/src/thirdparty/datasketches/HllArray-internal.hpp
index 0a4bdce..4479417 100644
--- a/be/src/thirdparty/datasketches/HllArray-internal.hpp
+++ b/be/src/thirdparty/datasketches/HllArray-internal.hpp
@@ -35,48 +35,16 @@
namespace datasketches {
template<typename A>
-HllArray<A>::HllArray(const int lgConfigK, const target_hll_type tgtHllType, bool startFullSize)
- : HllSketchImpl<A>(lgConfigK, tgtHllType, hll_mode::HLL, startFullSize) {
- hipAccum = 0.0;
- kxq0 = 1 << lgConfigK;
- kxq1 = 0.0;
- curMin = 0;
- numAtCurMin = 1 << lgConfigK;
- oooFlag = false;
- hllByteArr = nullptr; // allocated in derived class
-}
-
-template<typename A>
-HllArray<A>::HllArray(const HllArray<A>& that):
-HllSketchImpl<A>(that.lgConfigK, that.tgtHllType, hll_mode::HLL, that.startFullSize),
-hipAccum(that.hipAccum),
-kxq0(that.kxq0),
-kxq1(that.kxq1),
-hllByteArr(nullptr),
-curMin(that.curMin),
-numAtCurMin(that.numAtCurMin),
-oooFlag(that.oooFlag)
-{
- const int arrayLen = that.getHllByteArrBytes();
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint8_t> uint8Alloc;
- hllByteArr = uint8Alloc().allocate(arrayLen);
- std::copy(that.hllByteArr, that.hllByteArr + arrayLen, hllByteArr);
-}
-
-template<typename A>
-HllArray<A>::~HllArray() {
- // need to determine number of bytes to deallocate
- int hllArrBytes = 0;
- if (this->tgtHllType == target_hll_type::HLL_4) {
- hllArrBytes = hll4ArrBytes(this->lgConfigK);
- } else if (this->tgtHllType == target_hll_type::HLL_6) {
- hllArrBytes = hll6ArrBytes(this->lgConfigK);
- } else { // tgtHllType == HLL_8
- hllArrBytes = hll8ArrBytes(this->lgConfigK);
- }
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint8_t> uint8Alloc;
- uint8Alloc().deallocate(hllByteArr, hllArrBytes);
-}
+HllArray<A>::HllArray(const int lgConfigK, const target_hll_type tgtHllType, bool startFullSize, const A& allocator):
+HllSketchImpl<A>(lgConfigK, tgtHllType, hll_mode::HLL, startFullSize),
+hipAccum(0.0),
+kxq0(1 << lgConfigK),
+kxq1(0.0),
+hllByteArr(allocator),
+curMin(0),
+numAtCurMin(1 << lgConfigK),
+oooFlag(false)
+{}
template<typename A>
HllArray<A>* HllArray<A>::copyAs(const target_hll_type tgtHllType) const {
@@ -93,7 +61,7 @@ HllArray<A>* HllArray<A>::copyAs(const target_hll_type tgtHllType) const {
}
template<typename A>
-HllArray<A>* HllArray<A>::newHll(const void* bytes, size_t len) {
+HllArray<A>* HllArray<A>::newHll(const void* bytes, size_t len, const A& allocator) {
if (len < HllUtil<A>::HLL_BYTE_ARR_START) {
throw std::out_of_range("Input data length insufficient to hold HLL array");
}
@@ -143,11 +111,11 @@ HllArray<A>* HllArray<A>::newHll(const void* bytes, size_t len) {
int auxLgIntArrSize = (int) data[4];
const size_t offset = HllUtil<A>::HLL_BYTE_ARR_START + arrayBytes;
const uint8_t* auxDataStart = data + offset;
- auxHashMap = AuxHashMap<A>::deserialize(auxDataStart, len - offset, lgK, auxCount, auxLgIntArrSize, comapctFlag);
+ auxHashMap = AuxHashMap<A>::deserialize(auxDataStart, len - offset, lgK, auxCount, auxLgIntArrSize, comapctFlag, allocator);
aux_ptr = aux_hash_map_ptr(auxHashMap, auxHashMap->make_deleter());
}
- HllArray<A>* sketch = HllSketchImplFactory<A>::newHll(lgK, tgtHllType, startFullSizeFlag);
+ HllArray<A>* sketch = HllSketchImplFactory<A>::newHll(lgK, tgtHllType, startFullSizeFlag, allocator);
sketch->putCurMin(curMin);
sketch->putOutOfOrderFlag(oooFlag);
if (!oooFlag) sketch->putHipAccum(hip);
@@ -155,7 +123,7 @@ HllArray<A>* HllArray<A>::newHll(const void* bytes, size_t len) {
sketch->putKxQ1(kxq1);
sketch->putNumAtCurMin(numAtCurMin);
- std::memcpy(sketch->hllByteArr, data + HllUtil<A>::HLL_BYTE_ARR_START, arrayBytes);
+ std::memcpy(sketch->hllByteArr.data(), data + HllUtil<A>::HLL_BYTE_ARR_START, arrayBytes);
if (auxHashMap != nullptr)
((Hll4Array<A>*)sketch)->putAuxHashMap(auxHashMap);
@@ -165,7 +133,7 @@ HllArray<A>* HllArray<A>::newHll(const void* bytes, size_t len) {
}
template<typename A>
-HllArray<A>* HllArray<A>::newHll(std::istream& is) {
+HllArray<A>* HllArray<A>::newHll(std::istream& is, const A& allocator) {
uint8_t listHeader[8];
is.read((char*)listHeader, 8 * sizeof(uint8_t));
@@ -192,7 +160,7 @@ HllArray<A>* HllArray<A>::newHll(std::istream& is) {
const int lgK = (int) listHeader[HllUtil<A>::LG_K_BYTE];
const int curMin = (int) listHeader[HllUtil<A>::HLL_CUR_MIN_BYTE];
- HllArray* sketch = HllSketchImplFactory<A>::newHll(lgK, tgtHllType, startFullSizeFlag);
+ HllArray* sketch = HllSketchImplFactory<A>::newHll(lgK, tgtHllType, startFullSizeFlag, allocator);
typedef std::unique_ptr<HllArray<A>, std::function<void(HllSketchImpl<A>*)>> hll_array_ptr;
hll_array_ptr sketch_ptr(sketch, sketch->get_deleter());
sketch->putCurMin(curMin);
@@ -211,11 +179,11 @@ HllArray<A>* HllArray<A>::newHll(std::istream& is) {
is.read((char*)&auxCount, sizeof(auxCount));
sketch->putNumAtCurMin(numAtCurMin);
- is.read((char*)sketch->hllByteArr, sketch->getHllByteArrBytes());
+ is.read((char*)sketch->hllByteArr.data(), sketch->getHllByteArrBytes());
if (auxCount > 0) { // necessarily TgtHllType == HLL_4
int auxLgIntArrSize = listHeader[4];
- AuxHashMap<A>* auxHashMap = AuxHashMap<A>::deserialize(is, lgK, auxCount, auxLgIntArrSize, comapctFlag);
+ AuxHashMap<A>* auxHashMap = AuxHashMap<A>::deserialize(is, lgK, auxCount, auxLgIntArrSize, comapctFlag, allocator);
((Hll4Array<A>*)sketch)->putAuxHashMap(auxHashMap);
}
@@ -228,7 +196,7 @@ HllArray<A>* HllArray<A>::newHll(std::istream& is) {
template<typename A>
vector_u8<A> HllArray<A>::serialize(bool compact, unsigned header_size_bytes) const {
const size_t sketchSizeBytes = (compact ? getCompactSerializationBytes() : getUpdatableSerializationBytes()) + header_size_bytes;
- vector_u8<A> byteArr(sketchSizeBytes);
+ vector_u8<A> byteArr(sketchSizeBytes, 0, getAllocator());
uint8_t* bytes = byteArr.data() + header_size_bytes;
AuxHashMap<A>* auxHashMap = getAuxHashMap();
@@ -249,7 +217,7 @@ vector_u8<A> HllArray<A>::serialize(bool compact, unsigned header_size_bytes) co
std::memcpy(bytes + HllUtil<A>::AUX_COUNT_INT, &auxCount, sizeof(int));
const int hllByteArrBytes = getHllByteArrBytes();
- std::memcpy(bytes + getMemDataStart(), hllByteArr, hllByteArrBytes);
+ std::memcpy(bytes + getMemDataStart(), hllByteArr.data(), hllByteArrBytes);
// aux map if HLL_4
if (this->tgtHllType == HLL_4) {
@@ -309,7 +277,7 @@ void HllArray<A>::serialize(std::ostream& os, const bool compact) const {
const int auxCount = (auxHashMap == nullptr ? 0 : auxHashMap->getAuxCount());
os.write((char*)&auxCount, sizeof(auxCount));
- os.write((char*)hllByteArr, getHllByteArrBytes());
+ os.write((char*)hllByteArr.data(), getHllByteArrBytes());
// aux map if HLL_4
if (this->tgtHllType == HLL_4) {
@@ -639,12 +607,12 @@ double HllArray<A>::getHllRawEstimate(const int lgConfigK, const double kxqSum)
template<typename A>
typename HllArray<A>::const_iterator HllArray<A>::begin(bool all) const {
- return const_iterator(hllByteArr, 1 << this->lgConfigK, 0, this->tgtHllType, nullptr, 0, all);
+ return const_iterator(hllByteArr.data(), 1 << this->lgConfigK, 0, this->tgtHllType, nullptr, 0, all);
}
template<typename A>
typename HllArray<A>::const_iterator HllArray<A>::end() const {
- return const_iterator(hllByteArr, 1 << this->lgConfigK, 1 << this->lgConfigK, this->tgtHllType, nullptr, 0, false);
+ return const_iterator(hllByteArr.data(), 1 << this->lgConfigK, 1 << this->lgConfigK, this->tgtHllType, nullptr, 0, false);
}
template<typename A>
@@ -701,6 +669,11 @@ uint8_t HllArray<A>::const_iterator::get_value(const uint8_t* array, size_t inde
return array[index];
}
+template<typename A>
+A HllArray<A>::getAllocator() const {
+ return hllByteArr.get_allocator();
+}
+
}
#endif // _HLLARRAY_INTERNAL_HPP_
diff --git a/be/src/thirdparty/datasketches/HllArray.hpp b/be/src/thirdparty/datasketches/HllArray.hpp
index 1cc64ea..e7be8c1 100644
--- a/be/src/thirdparty/datasketches/HllArray.hpp
+++ b/be/src/thirdparty/datasketches/HllArray.hpp
@@ -28,19 +28,18 @@ namespace datasketches {
template<typename A>
class AuxHashMap;
-template<typename A = std::allocator<char>>
+template<typename A>
class HllArray : public HllSketchImpl<A> {
public:
- explicit HllArray(int lgConfigK, target_hll_type tgtHllType, bool startFullSize);
- explicit HllArray(const HllArray<A>& that);
+ HllArray(int lgConfigK, target_hll_type tgtHllType, bool startFullSize, const A& allocator);
- static HllArray* newHll(const void* bytes, size_t len);
- static HllArray* newHll(std::istream& is);
+ static HllArray* newHll(const void* bytes, size_t len, const A& allocator);
+ static HllArray* newHll(std::istream& is, const A& allocator);
virtual vector_u8<A> serialize(bool compact, unsigned header_size_bytes) const;
virtual void serialize(std::ostream& os, bool compact) const;
- virtual ~HllArray();
+ virtual ~HllArray() = default;
virtual std::function<void(HllSketchImpl<A>*)> get_deleter() const = 0;
virtual HllArray* copy() const = 0;
@@ -95,6 +94,8 @@ class HllArray : public HllSketchImpl<A> {
virtual const_iterator begin(bool all = false) const;
virtual const_iterator end() const;
+ virtual A getAllocator() const;
+
protected:
void hipAndKxQIncrementalUpdate(uint8_t oldValue, uint8_t newValue);
double getHllBitMapEstimate(int lgConfigK, int curMin, int numAtCurMin) const;
@@ -103,7 +104,7 @@ class HllArray : public HllSketchImpl<A> {
double hipAccum;
double kxq0;
double kxq1;
- uint8_t* hllByteArr; //init by sub-classes
+ vector_u8<A> hllByteArr; //init by sub-classes
int curMin; //always zero for Hll6 and Hll8, only tracked by Hll4Array
int numAtCurMin; //interpreted as num zeros when curMin == 0
bool oooFlag; //Out-Of-Order Flag
@@ -115,7 +116,6 @@ template<typename A>
class HllArray<A>::const_iterator: public std::iterator<std::input_iterator_tag, uint32_t> {
public:
const_iterator(const uint8_t* array, size_t array_slze, size_t index, target_hll_type hll_type, const AuxHashMap<A>* exceptions, uint8_t offset, bool all);
- //const_iterator(const uint8_t* array, size_t array_slze, size_t index, target_hll_type hll_type, const AuxHashMap<A>* exceptions, uint8_t offset);
const_iterator& operator++();
bool operator!=(const const_iterator& other) const;
uint32_t operator*() const;
diff --git a/be/src/thirdparty/datasketches/HllSketch-internal.hpp b/be/src/thirdparty/datasketches/HllSketch-internal.hpp
index dd16955..8f7d1f4 100644
--- a/be/src/thirdparty/datasketches/HllSketch-internal.hpp
+++ b/be/src/thirdparty/datasketches/HllSketch-internal.hpp
@@ -42,28 +42,26 @@ typedef union {
} longDoubleUnion;
template<typename A>
-hll_sketch_alloc<A>::hll_sketch_alloc(int lg_config_k, target_hll_type tgt_type, bool start_full_size) {
+hll_sketch_alloc<A>::hll_sketch_alloc(int lg_config_k, target_hll_type tgt_type, bool start_full_size, const A& allocator) {
HllUtil<A>::checkLgK(lg_config_k);
if (start_full_size) {
- sketch_impl = HllSketchImplFactory<A>::newHll(lg_config_k, tgt_type, start_full_size);
+ sketch_impl = HllSketchImplFactory<A>::newHll(lg_config_k, tgt_type, start_full_size, allocator);
} else {
typedef typename std::allocator_traits<A>::template rebind_alloc<CouponList<A>> clAlloc;
- sketch_impl = new (clAlloc().allocate(1)) CouponList<A>(lg_config_k, tgt_type, hll_mode::LIST);
+ sketch_impl = new (clAlloc(allocator).allocate(1)) CouponList<A>(lg_config_k, tgt_type, hll_mode::LIST, allocator);
}
}
template<typename A>
-hll_sketch_alloc<A> hll_sketch_alloc<A>::deserialize(std::istream& is) {
- HllSketchImpl<A>* impl = HllSketchImplFactory<A>::deserialize(is);
- hll_sketch_alloc<A> sketch(impl);
- return sketch;
+hll_sketch_alloc<A> hll_sketch_alloc<A>::deserialize(std::istream& is, const A& allocator) {
+ HllSketchImpl<A>* impl = HllSketchImplFactory<A>::deserialize(is, allocator);
+ return hll_sketch_alloc<A>(impl);
}
template<typename A>
-hll_sketch_alloc<A> hll_sketch_alloc<A>::deserialize(const void* bytes, size_t len) {
- HllSketchImpl<A>* impl = HllSketchImplFactory<A>::deserialize(bytes, len);
- hll_sketch_alloc<A> sketch(impl);
- return sketch;
+hll_sketch_alloc<A> hll_sketch_alloc<A>::deserialize(const void* bytes, size_t len, const A& allocator) {
+ HllSketchImpl<A>* impl = HllSketchImplFactory<A>::deserialize(bytes, len, allocator);
+ return hll_sketch_alloc<A>(impl);
}
template<typename A>
diff --git a/be/src/thirdparty/datasketches/HllSketchImpl.hpp b/be/src/thirdparty/datasketches/HllSketchImpl.hpp
index 82180b4..9f53705 100644
--- a/be/src/thirdparty/datasketches/HllSketchImpl.hpp
+++ b/be/src/thirdparty/datasketches/HllSketchImpl.hpp
@@ -27,7 +27,7 @@
namespace datasketches {
-template<typename A = std::allocator<char>>
+template<typename A>
class HllSketchImpl {
public:
HllSketchImpl(int lgConfigK, target_hll_type tgtHllType, hll_mode mode, bool startFullSize);
@@ -66,6 +66,7 @@ class HllSketchImpl {
virtual bool isEmpty() const = 0;
virtual bool isOutOfOrderFlag() const = 0;
virtual void putOutOfOrderFlag(bool oooFlag) = 0;
+ virtual A getAllocator() const = 0;
bool isStartFullSize() const;
protected:
diff --git a/be/src/thirdparty/datasketches/HllSketchImplFactory.hpp b/be/src/thirdparty/datasketches/HllSketchImplFactory.hpp
index eb8dd77..85f9618 100644
--- a/be/src/thirdparty/datasketches/HllSketchImplFactory.hpp
+++ b/be/src/thirdparty/datasketches/HllSketchImplFactory.hpp
@@ -31,15 +31,15 @@
namespace datasketches {
-template<typename A = std::allocator<char>>
+template<typename A>
class HllSketchImplFactory final {
public:
- static HllSketchImpl<A>* deserialize(std::istream& os);
- static HllSketchImpl<A>* deserialize(const void* bytes, size_t len);
+ static HllSketchImpl<A>* deserialize(std::istream& os, const A& allocator);
+ static HllSketchImpl<A>* deserialize(const void* bytes, size_t len, const A& allocator);
static CouponHashSet<A>* promoteListToSet(const CouponList<A>& list);
static HllArray<A>* promoteListOrSetToHll(const CouponList<A>& list);
- static HllArray<A>* newHll(int lgConfigK, target_hll_type tgtHllType, bool startFullSize = false);
+ static HllArray<A>* newHll(int lgConfigK, target_hll_type tgtHllType, bool startFullSize, const A& allocator);
// resets the input impl, deleting the input pointer and returning a new pointer
static HllSketchImpl<A>* reset(HllSketchImpl<A>* impl, bool startFullSize);
@@ -51,8 +51,8 @@ public:
template<typename A>
CouponHashSet<A>* HllSketchImplFactory<A>::promoteListToSet(const CouponList<A>& list) {
- typedef typename std::allocator_traits<A>::template rebind_alloc<CouponHashSet<A>> chsAlloc;
- CouponHashSet<A>* chSet = new (chsAlloc().allocate(1)) CouponHashSet<A>(list.getLgConfigK(), list.getTgtHllType());
+ using ChsAlloc = typename std::allocator_traits<A>::template rebind_alloc<CouponHashSet<A>>;
+ CouponHashSet<A>* chSet = new (ChsAlloc(list.getAllocator()).allocate(1)) CouponHashSet<A>(list.getLgConfigK(), list.getTgtHllType(), list.getAllocator());
for (auto coupon: list) {
chSet->couponUpdate(coupon);
}
@@ -61,7 +61,7 @@ CouponHashSet<A>* HllSketchImplFactory<A>::promoteListToSet(const CouponList<A>&
template<typename A>
HllArray<A>* HllSketchImplFactory<A>::promoteListOrSetToHll(const CouponList<A>& src) {
- HllArray<A>* tgtHllArr = HllSketchImplFactory<A>::newHll(src.getLgConfigK(), src.getTgtHllType());
+ HllArray<A>* tgtHllArr = HllSketchImplFactory<A>::newHll(src.getLgConfigK(), src.getTgtHllType(), false, src.getAllocator());
tgtHllArr->putKxQ0(1 << src.getLgConfigK());
for (auto coupon: src) {
tgtHllArr->couponUpdate(coupon);
@@ -72,48 +72,48 @@ HllArray<A>* HllSketchImplFactory<A>::promoteListOrSetToHll(const CouponList<A>&
}
template<typename A>
-HllSketchImpl<A>* HllSketchImplFactory<A>::deserialize(std::istream& is) {
+HllSketchImpl<A>* HllSketchImplFactory<A>::deserialize(std::istream& is, const A& allocator) {
// we'll hand off the sketch based on PreInts so we don't need
// to move the stream pointer back and forth -- perhaps somewhat fragile?
const int preInts = is.peek();
if (preInts == HllUtil<A>::HLL_PREINTS) {
- return HllArray<A>::newHll(is);
+ return HllArray<A>::newHll(is, allocator);
} else if (preInts == HllUtil<A>::HASH_SET_PREINTS) {
- return CouponHashSet<A>::newSet(is);
+ return CouponHashSet<A>::newSet(is, allocator);
} else if (preInts == HllUtil<A>::LIST_PREINTS) {
- return CouponList<A>::newList(is);
+ return CouponList<A>::newList(is, allocator);
} else {
throw std::invalid_argument("Attempt to deserialize unknown object type");
}
}
template<typename A>
-HllSketchImpl<A>* HllSketchImplFactory<A>::deserialize(const void* bytes, size_t len) {
+HllSketchImpl<A>* HllSketchImplFactory<A>::deserialize(const void* bytes, size_t len, const A& allocator) {
// read current mode directly
const int preInts = static_cast<const uint8_t*>(bytes)[0];
if (preInts == HllUtil<A>::HLL_PREINTS) {
- return HllArray<A>::newHll(bytes, len);
+ return HllArray<A>::newHll(bytes, len, allocator);
} else if (preInts == HllUtil<A>::HASH_SET_PREINTS) {
- return CouponHashSet<A>::newSet(bytes, len);
+ return CouponHashSet<A>::newSet(bytes, len, allocator);
} else if (preInts == HllUtil<A>::LIST_PREINTS) {
- return CouponList<A>::newList(bytes, len);
+ return CouponList<A>::newList(bytes, len, allocator);
} else {
throw std::invalid_argument("Attempt to deserialize unknown object type");
}
}
template<typename A>
-HllArray<A>* HllSketchImplFactory<A>::newHll(int lgConfigK, target_hll_type tgtHllType, bool startFullSize) {
+HllArray<A>* HllSketchImplFactory<A>::newHll(int lgConfigK, target_hll_type tgtHllType, bool startFullSize, const A& allocator) {
switch (tgtHllType) {
case HLL_8:
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll8Array<A>> hll8Alloc;
- return new (hll8Alloc().allocate(1)) Hll8Array<A>(lgConfigK, startFullSize);
+ using Hll8Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll8Array<A>>;
+ return new (Hll8Alloc(allocator).allocate(1)) Hll8Array<A>(lgConfigK, startFullSize, allocator);
case HLL_6:
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll6Array<A>> hll6Alloc;
- return new (hll6Alloc().allocate(1)) Hll6Array<A>(lgConfigK, startFullSize);
+ using Hll6Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll6Array<A>>;
+ return new (Hll6Alloc(allocator).allocate(1)) Hll6Array<A>(lgConfigK, startFullSize, allocator);
case HLL_4:
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll4Array<A>> hll4Alloc;
- return new (hll4Alloc().allocate(1)) Hll4Array<A>(lgConfigK, startFullSize);
+ using Hll4Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll4Array<A>>;
+ return new (Hll4Alloc(allocator).allocate(1)) Hll4Array<A>(lgConfigK, startFullSize, allocator);
}
throw std::logic_error("Invalid target_hll_type");
}
@@ -121,12 +121,12 @@ HllArray<A>* HllSketchImplFactory<A>::newHll(int lgConfigK, target_hll_type tgtH
template<typename A>
HllSketchImpl<A>* HllSketchImplFactory<A>::reset(HllSketchImpl<A>* impl, bool startFullSize) {
if (startFullSize) {
- HllArray<A>* hll = newHll(impl->getLgConfigK(), impl->getTgtHllType(), startFullSize);
+ HllArray<A>* hll = newHll(impl->getLgConfigK(), impl->getTgtHllType(), startFullSize, impl->getAllocator());
impl->get_deleter()(impl);
return hll;
} else {
- typedef typename std::allocator_traits<A>::template rebind_alloc<CouponList<A>> clAlloc;
- CouponList<A>* cl = new (clAlloc().allocate(1)) CouponList<A>(impl->getLgConfigK(), impl->getTgtHllType(), hll_mode::LIST);
+ using ClAlloc = typename std::allocator_traits<A>::template rebind_alloc<CouponList<A>>;
+ CouponList<A>* cl = new (ClAlloc(impl->getAllocator()).allocate(1)) CouponList<A>(impl->getLgConfigK(), impl->getTgtHllType(), hll_mode::LIST, impl->getAllocator());
impl->get_deleter()(impl);
return cl;
}
@@ -135,8 +135,9 @@ HllSketchImpl<A>* HllSketchImplFactory<A>::reset(HllSketchImpl<A>* impl, bool st
template<typename A>
Hll4Array<A>* HllSketchImplFactory<A>::convertToHll4(const HllArray<A>& srcHllArr) {
const int lgConfigK = srcHllArr.getLgConfigK();
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll4Array<A>> hll4Alloc;
- Hll4Array<A>* hll4Array = new (hll4Alloc().allocate(1)) Hll4Array<A>(lgConfigK, srcHllArr.isStartFullSize());
+ using Hll4Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll4Array<A>>;
+ Hll4Array<A>* hll4Array = new (Hll4Alloc(srcHllArr.getAllocator()).allocate(1))
+ Hll4Array<A>(lgConfigK, srcHllArr.isStartFullSize(), srcHllArr.getAllocator());
hll4Array->putOutOfOrderFlag(srcHllArr.isOutOfOrderFlag());
hll4Array->mergeHll(srcHllArr);
hll4Array->putHipAccum(srcHllArr.getHipAccum());
@@ -146,8 +147,9 @@ Hll4Array<A>* HllSketchImplFactory<A>::convertToHll4(const HllArray<A>& srcHllAr
template<typename A>
Hll6Array<A>* HllSketchImplFactory<A>::convertToHll6(const HllArray<A>& srcHllArr) {
const int lgConfigK = srcHllArr.getLgConfigK();
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll6Array<A>> hll6Alloc;
- Hll6Array<A>* hll6Array = new (hll6Alloc().allocate(1)) Hll6Array<A>(lgConfigK, srcHllArr.isStartFullSize());
+ using Hll6Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll6Array<A>>;
+ Hll6Array<A>* hll6Array = new (Hll6Alloc(srcHllArr.getAllocator()).allocate(1))
+ Hll6Array<A>(lgConfigK, srcHllArr.isStartFullSize(), srcHllArr.getAllocator());
hll6Array->putOutOfOrderFlag(srcHllArr.isOutOfOrderFlag());
hll6Array->mergeHll(srcHllArr);
hll6Array->putHipAccum(srcHllArr.getHipAccum());
@@ -157,8 +159,9 @@ Hll6Array<A>* HllSketchImplFactory<A>::convertToHll6(const HllArray<A>& srcHllAr
template<typename A>
Hll8Array<A>* HllSketchImplFactory<A>::convertToHll8(const HllArray<A>& srcHllArr) {
const int lgConfigK = srcHllArr.getLgConfigK();
- typedef typename std::allocator_traits<A>::template rebind_alloc<Hll8Array<A>> hll8Alloc;
- Hll8Array<A>* hll8Array = new (hll8Alloc().allocate(1)) Hll8Array<A>(lgConfigK, srcHllArr.isStartFullSize());
+ using Hll8Alloc = typename std::allocator_traits<A>::template rebind_alloc<Hll8Array<A>>;
+ Hll8Array<A>* hll8Array = new (Hll8Alloc(srcHllArr.getAllocator()).allocate(1))
+ Hll8Array<A>(lgConfigK, srcHllArr.isStartFullSize(), srcHllArr.getAllocator());
hll8Array->putOutOfOrderFlag(srcHllArr.isOutOfOrderFlag());
hll8Array->mergeHll(srcHllArr);
hll8Array->putHipAccum(srcHllArr.getHipAccum());
diff --git a/be/src/thirdparty/datasketches/HllUnion-internal.hpp b/be/src/thirdparty/datasketches/HllUnion-internal.hpp
index 0d12fd3..716fab6 100644
--- a/be/src/thirdparty/datasketches/HllUnion-internal.hpp
+++ b/be/src/thirdparty/datasketches/HllUnion-internal.hpp
@@ -32,9 +32,9 @@
namespace datasketches {
template<typename A>
-hll_union_alloc<A>::hll_union_alloc(const int lg_max_k):
+hll_union_alloc<A>::hll_union_alloc(const int lg_max_k, const A& allocator):
lg_max_k(HllUtil<A>::checkLgK(lg_max_k)),
- gadget(lg_max_k, target_hll_type::HLL_8)
+ gadget(lg_max_k, target_hll_type::HLL_8, false, allocator)
{}
template<typename A>
@@ -150,16 +150,6 @@ double hll_union_alloc<A>::get_upper_bound(const int num_std_dev) const {
}
template<typename A>
-int hll_union_alloc<A>::get_compact_serialization_bytes() const {
- return gadget.get_compact_serialization_bytes();
-}
-
-template<typename A>
-int hll_union_alloc<A>::get_updatable_serialization_bytes() const {
- return gadget.get_updatable_serialization_bytes();
-}
-
-template<typename A>
int hll_union_alloc<A>::get_lg_config_k() const {
return gadget.get_lg_config_k();
}
@@ -170,11 +160,6 @@ void hll_union_alloc<A>::reset() {
}
template<typename A>
-bool hll_union_alloc<A>::is_compact() const {
- return gadget.is_compact();
-}
-
-template<typename A>
bool hll_union_alloc<A>::is_empty() const {
return gadget.is_empty();
}
@@ -195,21 +180,11 @@ bool hll_union_alloc<A>::is_estimation_mode() const {
}
template<typename A>
-int hll_union_alloc<A>::get_serialization_version() const {
- return HllUtil<A>::SER_VER;
-}
-
-template<typename A>
target_hll_type hll_union_alloc<A>::get_target_type() const {
return target_hll_type::HLL_8;
}
template<typename A>
-int hll_union_alloc<A>::get_max_serialization_bytes(const int lg_k) {
- return hll_sketch_alloc<A>::get_max_updatable_serialization_bytes(lg_k, target_hll_type::HLL_8);
-}
-
-template<typename A>
double hll_union_alloc<A>::get_rel_err(const bool upper_bound, const bool unioned,
const int lg_config_k, const int num_std_dev) {
return HllUtil<A>::getRelErr(upper_bound, unioned, lg_config_k, num_std_dev);
@@ -226,7 +201,7 @@ HllSketchImpl<A>* hll_union_alloc<A>::copy_or_downsample(const HllSketchImpl<A>*
return src->copyAs(HLL_8);
}
typedef typename std::allocator_traits<A>::template rebind_alloc<Hll8Array<A>> hll8Alloc;
- Hll8Array<A>* tgtHllArr = new (hll8Alloc().allocate(1)) Hll8Array<A>(tgt_lg_k, false);
+ Hll8Array<A>* tgtHllArr = new (hll8Alloc(src->getAllocator()).allocate(1)) Hll8Array<A>(tgt_lg_k, false, src->getAllocator());
tgtHllArr->mergeHll(*src);
//both of these are required for isomorphism
tgtHllArr->putHipAccum(src->getHipAccum());
diff --git a/be/src/thirdparty/datasketches/HllUtil.hpp b/be/src/thirdparty/datasketches/HllUtil.hpp
index ec0ddf2..3a1ebe2 100644
--- a/be/src/thirdparty/datasketches/HllUtil.hpp
+++ b/be/src/thirdparty/datasketches/HllUtil.hpp
@@ -36,7 +36,7 @@ enum hll_mode { LIST = 0, SET, HLL };
// template provides internal consistency and allows static float values
// but we don't use the template parameter anywhere
-template<typename A = std::allocator<char> >
+template<typename A = std::allocator<uint8_t> >
class HllUtil final {
public:
// preamble stuff
diff --git a/be/src/thirdparty/datasketches/MurmurHash3.h b/be/src/thirdparty/datasketches/MurmurHash3.h
index b438c7d..c1cbeab 100644
--- a/be/src/thirdparty/datasketches/MurmurHash3.h
+++ b/be/src/thirdparty/datasketches/MurmurHash3.h
@@ -3,6 +3,7 @@
// * Changed input seed in MurmurHash3_x64_128 to uint64_t
// * Define and use HashState reference to return result
// * Made entire hash function defined inline
+// * Added compute_seed_hash
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
@@ -170,4 +171,10 @@ FORCE_INLINE void MurmurHash3_x64_128(const void* key, int lenBytes, uint64_t se
//-----------------------------------------------------------------------------
+// Derives a 16-bit hash of a seed value: the 8 bytes of `seed` are hashed with
+// MurmurHash3_x64_128 (using 0 as the hash seed) and the low 16 bits of the
+// first result word (h1) are returned.
+FORCE_INLINE uint16_t compute_seed_hash(uint64_t seed) {
+  HashState seed_digest;
+  MurmurHash3_x64_128(&seed, sizeof(seed), 0, seed_digest);
+  const auto low16 = seed_digest.h1 & 0xffff;
+  return static_cast<uint16_t>(low16);
+}
+
#endif // _MURMURHASH3_H_
diff --git a/be/src/thirdparty/datasketches/README.md b/be/src/thirdparty/datasketches/README.md
index 1c0433d..adecb97 100644
--- a/be/src/thirdparty/datasketches/README.md
+++ b/be/src/thirdparty/datasketches/README.md
@@ -10,8 +10,8 @@ changed during this process as originally the following folders were affected:
I copied the content of these folders into the same directory so that Impala
can compile them without rewriting the include paths in the files themselves.
-The git branch of the snapshot I used as a source for the files:
-The hash: b2f749ed5ce6ba650f4259602b133c310c3a5ee4
+The git branch of the snapshot I used as a source for the files: 3.0.0
+The hash: 45885c0c8c0807bb9480886d60ca7042000a4c43
Browse the source files here:
-https://github.com/apache/datasketches-cpp/tree/b2f749ed5ce6ba650f4259602b133c310c3a5ee4
+https://github.com/apache/datasketches-cpp/tree/3.0.0
\ No newline at end of file
diff --git a/be/src/thirdparty/datasketches/RelativeErrorTables.hpp b/be/src/thirdparty/datasketches/RelativeErrorTables.hpp
index da8bebf..5e0a3c7 100644
--- a/be/src/thirdparty/datasketches/RelativeErrorTables.hpp
+++ b/be/src/thirdparty/datasketches/RelativeErrorTables.hpp
@@ -24,7 +24,7 @@
namespace datasketches {
-template<typename A = std::allocator<char>>
+template<typename A = std::allocator<uint8_t>>
class RelativeErrorTables {
public:
/**
diff --git a/be/src/thirdparty/datasketches/bounds_on_ratios_in_sampled_sets.hpp b/be/src/thirdparty/datasketches/bounds_on_ratios_in_sampled_sets.hpp
new file mode 100644
index 0000000..e2c5433
--- /dev/null
+++ b/be/src/thirdparty/datasketches/bounds_on_ratios_in_sampled_sets.hpp
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef BOUNDS_ON_RATIOS_IN_SAMPLED_SETS_HPP_
+#define BOUNDS_ON_RATIOS_IN_SAMPLED_SETS_HPP_
+
+#include <cstdint>
+#include <string>
+
+#include "bounds_binomial_proportions.hpp"
+
+namespace datasketches {
+
+/**
+ * This class is used to compute the bounds on the estimate of the ratio <i>|B| / |A|</i>, where:
+ * <ul>
+ * <li><i>|A|</i> is the unknown size of a set <i>A</i> of unique identifiers.</li>
+ * <li><i>|B|</i> is the unknown size of a subset <i>B</i> of <i>A</i>.</li>
+ * <li><i>a</i> = <i>|S<sub>A</sub>|</i> is the observed size of a sample of <i>A</i>
+ * that was obtained by Bernoulli sampling with a known inclusion probability <i>f</i>.</li>
+ * <li><i>b</i> = <i>|S<sub>A</sub> ∩ B|</i> is the observed size of a subset
+ * of <i>S<sub>A</sub></i>.</li>
+ * </ul>
+ */
+class bounds_on_ratios_in_sampled_sets {
+public:
+  // Number of standard deviations used for every bound computed by this class.
+  static constexpr double NUM_STD_DEVS = 2.0;
+
+  /**
+   * Return the approximate lower bound based on a 95% confidence interval
+   * @param a See class javadoc
+   * @param b See class javadoc
+   * @param f the inclusion probability used to produce the set with size <i>a</i> and should
+   * generally be less than 0.5. Above this value, the results may not be reliable.
+   * When <i>f</i> = 1.0 this returns the estimate.
+   * @return the approximate lower bound (0.0 when <i>a</i> is zero)
+   * @throws std::invalid_argument if a < b or f is outside (0, 1]
+   */
+  static double lower_bound_for_b_over_a(uint64_t a, uint64_t b, double f) {
+    check_inputs(a, b, f);
+    if (a == 0) return 0.0;
+    if (f == 1.0) return static_cast<double>(b) / static_cast<double>(a);
+    return bounds_binomial_proportions::approximate_lower_bound_on_p(a, b, NUM_STD_DEVS * hacky_adjuster(f));
+  }
+
+  /**
+   * Return the approximate upper bound based on a 95% confidence interval
+   * @param a See class javadoc
+   * @param b See class javadoc
+   * @param f the inclusion probability used to produce the set with size <i>a</i>.
+   * When <i>f</i> = 1.0 this returns the estimate.
+   * @return the approximate upper bound (1.0 when <i>a</i> is zero)
+   * @throws std::invalid_argument if a < b or f is outside (0, 1]
+   */
+  static double upper_bound_for_b_over_a(uint64_t a, uint64_t b, double f) {
+    check_inputs(a, b, f);
+    if (a == 0) return 1.0;
+    if (f == 1.0) return static_cast<double>(b) / static_cast<double>(a);
+    return bounds_binomial_proportions::approximate_upper_bound_on_p(a, b, NUM_STD_DEVS * hacky_adjuster(f));
+  }
+
+  /**
+   * Return the estimate of b over a
+   * @param a See class javadoc
+   * @param b See class javadoc
+   * @return the estimate of b over a (0.5 when <i>a</i> is zero)
+   * @throws std::invalid_argument if a < b
+   */
+  static double get_estimate_of_b_over_a(uint64_t a, uint64_t b) {
+    // 0.3 is only a placeholder that passes the range check on f; just a >= b is validated here.
+    check_inputs(a, b, 0.3);
+    if (a == 0) return 0.5;
+    return static_cast<double>(b) / static_cast<double>(a);
+  }
+
+  /**
+   * Return the estimate of A. See class javadoc.
+   * @param a See class javadoc
+   * @param f the inclusion probability used to produce the set with size <i>a</i>.
+   * @return the estimate of A, computed as a / f
+   * @throws std::invalid_argument if a is zero or f is outside (0, 1]
+   */
+  static double estimate_of_a(uint64_t a, double f) {
+    // f is a probability and must be a floating-point value: declared as an integer,
+    // every fractional f truncated to 0 (and was rejected) and a / f was an integer division.
+    check_inputs(a, 1, f);
+    return static_cast<double>(a) / f;
+  }
+
+  /**
+   * Return the estimate of B. See class javadoc.
+   * @param b See class javadoc
+   * @param f the inclusion probability used to produce the set with size <i>b</i>.
+   * @return the estimate of B, computed as b / f
+   * @throws std::invalid_argument if f is outside (0, 1]
+   */
+  static double estimate_of_b(uint64_t b, double f) {
+    // Only f needs validating. Passing b for both sizes always satisfies a >= b and,
+    // unlike b + 1, cannot wrap around to zero when b is the maximum uint64_t value.
+    check_inputs(b, b, f);
+    return static_cast<double>(b) / f;
+  }
+
+private:
+  /**
+   * This hackyAdjuster is tightly coupled with the width of the confidence interval normally
+   * specified with number of standard deviations. To simplify this interface the number of
+   * standard deviations has been fixed to 2.0, which corresponds to a confidence interval of
+   * 95%.
+   * @param f the inclusion probability used to produce the set with size <i>a</i>.
+   * @return the hacky Adjuster
+   */
+  static double hacky_adjuster(double f) {
+    const double tmp = sqrt(1.0 - f);
+    return (f <= 0.5) ? tmp : tmp + (0.01 * (f - 0.5));
+  }
+
+  /**
+   * Validates the arguments shared by the public functions.
+   * @param a size of the sample of A; must be >= b
+   * @param b size of the subset of the sample; must be <= a
+   * @param f inclusion probability; must satisfy 0.0 < f <= 1.0
+   * @throws std::invalid_argument if either condition is violated
+   */
+  static void check_inputs(uint64_t a, uint64_t b, double f) {
+    if (a < b) {
+      throw std::invalid_argument("a must be >= b: a = " + std::to_string(a) + ", b = " + std::to_string(b));
+    }
+    if ((f > 1.0) || (f <= 0.0)) {
+      throw std::invalid_argument("Required: ((f <= 1.0) && (f > 0.0)): " + std::to_string(f));
+    }
+  }
+
+};
+
+} /* namespace datasketches */
+
+# endif
diff --git a/be/src/thirdparty/datasketches/bounds_on_ratios_in_theta_sketched_sets.hpp b/be/src/thirdparty/datasketches/bounds_on_ratios_in_theta_sketched_sets.hpp
new file mode 100644
index 0000000..1779ec1
--- /dev/null
+++ b/be/src/thirdparty/datasketches/bounds_on_ratios_in_theta_sketched_sets.hpp
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef BOUNDS_ON_RATIOS_IN_THETA_SKETCHED_SETS_HPP_
+#define BOUNDS_ON_RATIOS_IN_THETA_SKETCHED_SETS_HPP_
+
+#include <cstdint>
+#include <stdexcept>
+
+#include "bounds_on_ratios_in_sampled_sets.hpp"
+
+namespace datasketches {
+
+/**
+ * This is to compute the bounds on the estimate of the ratio <i>B / A</i>, where:
+ * <ul>
+ * <li><i>A</i> is a Theta Sketch of population <i>PopA</i>.</li>
+ * <li><i>B</i> is a Theta Sketch of population <i>PopB</i> that is a subset of <i>A</i>,
+ * obtained by an intersection of <i>A</i> with some other Theta Sketch <i>C</i>,
+ * which acts like a predicate or selection clause.</li>
+ * <li>The estimate of the ratio <i>PopB/PopA</i> is
+ * estimate_of_b_over_a(<i>A, B</i>).</li>
+ * <li>The Upper Bound estimate on the ratio PopB/PopA is
+ * upper_bound_for_b_over_a(<i>A, B</i>).</li>
+ * <li>The Lower Bound estimate on the ratio PopB/PopA is
+ * lower_bound_for_b_over_a(<i>A, B</i>).</li>
+ * </ul>
+ * Note: The theta of <i>B</i> cannot be greater than the theta of <i>A</i>.
+ * If <i>B</i> is formed as an intersection of <i>A</i> and some other set <i>C</i>,
+ * then the theta of <i>B</i> is guaranteed to be less than or equal to the theta of <i>A</i>.
+ *
+ * @tparam ExtractKey default-constructible functor; ExtractKey()(entry) must yield the
+ * value of a sketch entry that is compared against a 64-bit theta.
+ */
+template<typename ExtractKey>
+class bounds_on_ratios_in_theta_sketched_sets {
+public:
+  /**
+   * Gets the approximate lower bound for B over A based on a 95% confidence interval
+   * @param sketch_a the sketch A
+   * @param sketch_b the sketch B
+   * @return the approximate lower bound for B over A (0 when no entry of A is counted)
+   * @throws std::invalid_argument if the theta of B is greater than the theta of A
+   */
+  template<typename SketchA, typename SketchB>
+  static double lower_bound_for_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) {
+    const uint64_t theta64_a = sketch_a.get_theta64();
+    const uint64_t theta64_b = sketch_b.get_theta64();
+    check_thetas(theta64_a, theta64_b);
+
+    const uint64_t count_b = sketch_b.get_num_retained();
+    // With equal thetas every retained entry of A counts; otherwise only those below B's theta.
+    const uint64_t count_a = (theta64_a == theta64_b)
+        ? sketch_a.get_num_retained()
+        : count_less_than_theta64(sketch_a, theta64_b);
+
+    if (count_a == 0) return 0;
+    const double f = sketch_b.get_theta();
+    return bounds_on_ratios_in_sampled_sets::lower_bound_for_b_over_a(count_a, count_b, f);
+  }
+
+  /**
+   * Gets the approximate upper bound for B over A based on a 95% confidence interval
+   * @param sketch_a the sketch A
+   * @param sketch_b the sketch B
+   * @return the approximate upper bound for B over A (1 when no entry of A is counted)
+   * @throws std::invalid_argument if the theta of B is greater than the theta of A
+   */
+  template<typename SketchA, typename SketchB>
+  static double upper_bound_for_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) {
+    const uint64_t theta64_a = sketch_a.get_theta64();
+    const uint64_t theta64_b = sketch_b.get_theta64();
+    check_thetas(theta64_a, theta64_b);
+
+    const uint64_t count_b = sketch_b.get_num_retained();
+    // With equal thetas every retained entry of A counts; otherwise only those below B's theta.
+    const uint64_t count_a = (theta64_a == theta64_b)
+        ? sketch_a.get_num_retained()
+        : count_less_than_theta64(sketch_a, theta64_b);
+
+    if (count_a == 0) return 1;
+    const double f = sketch_b.get_theta();
+    return bounds_on_ratios_in_sampled_sets::upper_bound_for_b_over_a(count_a, count_b, f);
+  }
+
+  /**
+   * Gets the estimate for B over A
+   * @param sketch_a the sketch A
+   * @param sketch_b the sketch B
+   * @return the estimate for B over A (0.5 when no entry of A is counted)
+   * @throws std::invalid_argument if the theta of B is greater than the theta of A
+   */
+  template<typename SketchA, typename SketchB>
+  static double estimate_of_b_over_a(const SketchA& sketch_a, const SketchB& sketch_b) {
+    const uint64_t theta64_a = sketch_a.get_theta64();
+    const uint64_t theta64_b = sketch_b.get_theta64();
+    check_thetas(theta64_a, theta64_b);
+
+    const uint64_t count_b = sketch_b.get_num_retained();
+    // With equal thetas every retained entry of A counts; otherwise only those below B's theta.
+    const uint64_t count_a = (theta64_a == theta64_b)
+        ? sketch_a.get_num_retained()
+        : count_less_than_theta64(sketch_a, theta64_b);
+
+    if (count_a == 0) return 0.5;
+    return static_cast<double>(count_b) / static_cast<double>(count_a);
+  }
+
+private:
+
+  /**
+   * Rejects a pair of thetas where the theta of B exceeds the theta of A.
+   * @param theta_a 64-bit theta of sketch A
+   * @param theta_b 64-bit theta of sketch B
+   * @throws std::invalid_argument if theta_b > theta_a
+   */
+  static inline void check_thetas(uint64_t theta_a, uint64_t theta_b) {
+    if (theta_b > theta_a) {
+      // The message states the condition actually enforced (it previously read the other way round).
+      throw std::invalid_argument("theta_b must be <= theta_a");
+    }
+  }
+
+  /**
+   * Counts the entries of a sketch whose extracted key is strictly below the given theta.
+   * @param sketch the sketch to iterate over
+   * @param theta the 64-bit threshold
+   * @return the number of entries with ExtractKey()(entry) < theta
+   */
+  template<typename Sketch>
+  static uint64_t count_less_than_theta64(const Sketch& sketch, uint64_t theta) {
+    uint64_t count = 0;
+    for (const auto& entry: sketch) if (ExtractKey()(entry) < theta) ++count;
+    return count;
+  }
+
+};
+
+} /* namespace datasketches */
+
+# endif
diff --git a/be/src/thirdparty/datasketches/cpc_common.hpp b/be/src/thirdparty/datasketches/cpc_common.hpp
index 9a766b8..cde110f 100644
--- a/be/src/thirdparty/datasketches/cpc_common.hpp
+++ b/be/src/thirdparty/datasketches/cpc_common.hpp
@@ -44,6 +44,8 @@ template<typename A> class u32_table;
template<typename A>
struct compressed_state {
+ explicit compressed_state(const A& allocator): table_data(allocator), table_data_words(0), table_num_entries(0),
+ window_data(allocator), window_data_words(0) {}
vector_u32<A> table_data;
uint32_t table_data_words;
uint32_t table_num_entries; // can be different from the number of entries in the sketch in hybrid mode
@@ -53,6 +55,7 @@ struct compressed_state {
template<typename A>
struct uncompressed_state {
+ explicit uncompressed_state(const A& allocator): table(allocator), window(allocator) {}
u32_table<A> table;
vector_u8<A> window;
};
diff --git a/be/src/thirdparty/datasketches/cpc_compressor.hpp b/be/src/thirdparty/datasketches/cpc_compressor.hpp
index 55fa3b8..73db797 100644
--- a/be/src/thirdparty/datasketches/cpc_compressor.hpp
+++ b/be/src/thirdparty/datasketches/cpc_compressor.hpp
@@ -129,14 +129,14 @@ private:
void compress_surprising_values(const vector_u32<A>& pairs, uint8_t lg_k, compressed_state<A>& result) const;
void compress_sliding_window(const uint8_t* window, uint8_t lg_k, uint32_t num_coupons, compressed_state<A>& target) const;
- vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const;
+ vector_u32<A> uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k, const A& allocator) const;
void uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const;
static size_t safe_length_for_compressed_pair_buf(uint64_t k, size_t num_pairs, size_t num_base_bits);
static size_t safe_length_for_compressed_window_buf(uint64_t k);
static uint8_t determine_pseudo_phase(uint8_t lg_k, uint64_t c);
- static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space);
+ static inline vector_u32<A> tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space, const A& allocator);
static inline uint64_t golomb_choose_number_of_base_bits(uint64_t k, uint64_t count);
};
diff --git a/be/src/thirdparty/datasketches/cpc_compressor_impl.hpp b/be/src/thirdparty/datasketches/cpc_compressor_impl.hpp
index b951b05..e3398c8 100644
--- a/be/src/thirdparty/datasketches/cpc_compressor_impl.hpp
+++ b/be/src/thirdparty/datasketches/cpc_compressor_impl.hpp
@@ -160,7 +160,7 @@ template<typename A>
void cpc_compressor<A>::uncompress(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint64_t num_coupons) const {
switch (cpc_sketch_alloc<A>::determine_flavor(lg_k, num_coupons)) {
case cpc_sketch_alloc<A>::flavor::EMPTY:
- target.table = u32_table<A>(2, 6 + lg_k);
+ target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
break;
case cpc_sketch_alloc<A>::flavor::SPARSE:
uncompress_sparse_flavor(source, target, lg_k);
@@ -191,8 +191,9 @@ template<typename A>
void cpc_compressor<A>::uncompress_sparse_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
if (source.window_data.size() > 0) throw std::logic_error("unexpected sliding window");
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries, lg_k);
- target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k);
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
+ lg_k, source.table_data.get_allocator());
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), source.table_num_entries, lg_k, pairs.get_allocator());
}
// This is complicated because it effectively builds a Sparse version
@@ -206,7 +207,7 @@ void cpc_compressor<A>::compress_hybrid_flavor(const cpc_sketch_alloc<A>& source
if (pairs_from_table.size() > 0) u32_table<A>::introspective_insertion_sort(pairs_from_table.data(), 0, pairs_from_table.size());
const size_t num_pairs_from_window = source.get_num_coupons() - pairs_from_table.size(); // because the window offset is zero
- vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size());
+ vector_u32<A> all_pairs = tricky_get_pairs_from_window(source.sliding_window.data(), k, num_pairs_from_window, pairs_from_table.size(), source.get_allocator());
u32_table<A>::merge(
pairs_from_table.data(), 0, pairs_from_table.size(),
@@ -221,7 +222,8 @@ template<typename A>
void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k) const {
if (source.window_data.size() > 0) throw std::logic_error("window is not expected");
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries, lg_k);
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, source.table_num_entries,
+ lg_k, source.table_data.get_allocator());
// In the hybrid flavor, some of these pairs actually
// belong in the window, so we will separate them out,
@@ -240,7 +242,7 @@ void cpc_compressor<A>::uncompress_hybrid_flavor(const compressed_state<A>& sour
pairs[next_true_pair++] = row_col; // move true pair down
}
}
- target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k);
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), next_true_pair, lg_k, pairs.get_allocator());
}
template<typename A>
@@ -264,21 +266,23 @@ void cpc_compressor<A>::compress_pinned_flavor(const cpc_sketch_alloc<A>& source
}
template<typename A>
-void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
+void cpc_compressor<A>::uncompress_pinned_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
+ uint8_t lg_k, uint32_t num_coupons) const {
if (source.window_data.size() == 0) throw std::logic_error("window is expected");
uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
const size_t num_pairs = source.table_num_entries;
if (num_pairs == 0) {
- target.table = u32_table<A>(2, 6 + lg_k);
+ target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
} else {
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs, lg_k);
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
+ lg_k, source.table_data.get_allocator());
// undo the compressor's 8-column shift
for (size_t i = 0; i < num_pairs; i++) {
if ((pairs[i] & 63) >= 56) throw std::logic_error("(pairs[i] & 63) >= 56");
pairs[i] += 8;
}
- target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
}
}
@@ -314,15 +318,17 @@ void cpc_compressor<A>::compress_sliding_flavor(const cpc_sketch_alloc<A>& sourc
}
template<typename A>
-void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target, uint8_t lg_k, uint32_t num_coupons) const {
+void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& source, uncompressed_state<A>& target,
+ uint8_t lg_k, uint32_t num_coupons) const {
if (source.window_data.size() == 0) throw std::logic_error("window is expected");
uncompress_sliding_window(source.window_data.data(), source.window_data_words, target.window, lg_k, num_coupons);
const size_t num_pairs = source.table_num_entries;
if (num_pairs == 0) {
- target.table = u32_table<A>(2, 6 + lg_k);
+ target.table = u32_table<A>(2, 6 + lg_k, source.table_data.get_allocator());
} else {
if (source.table_data.size() == 0) throw std::logic_error("table is expected");
- vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs, lg_k);
+ vector_u32<A> pairs = uncompress_surprising_values(source.table_data.data(), source.table_data_words, num_pairs,
+ lg_k, source.table_data.get_allocator());
const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
if (pseudo_phase >= 16) throw std::logic_error("pseudo phase >= 16");
@@ -342,7 +348,7 @@ void cpc_compressor<A>::uncompress_sliding_flavor(const compressed_state<A>& sou
pairs[i] = (row << 6) | col;
}
- target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k);
+ target.table = u32_table<A>::make_from_pairs(pairs.data(), num_pairs, lg_k, pairs.get_allocator());
}
}
@@ -364,9 +370,10 @@ void cpc_compressor<A>::compress_surprising_values(const vector_u32<A>& pairs, u
}
template<typename A>
-vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs, uint8_t lg_k) const {
+vector_u32<A> cpc_compressor<A>::uncompress_surprising_values(const uint32_t* data, size_t data_words, size_t num_pairs,
+ uint8_t lg_k, const A& allocator) const {
const size_t k = 1 << lg_k;
- vector_u32<A> pairs(num_pairs);
+ vector_u32<A> pairs(num_pairs, 0, allocator);
const uint8_t num_base_bits = golomb_choose_number_of_base_bits(k + num_pairs, num_pairs);
low_level_uncompress_pairs(pairs.data(), num_pairs, num_base_bits, data, data_words);
return pairs;
@@ -388,7 +395,8 @@ void cpc_compressor<A>::compress_sliding_window(const uint8_t* window, uint8_t l
}
template<typename A>
-void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window, uint8_t lg_k, uint32_t num_coupons) const {
+void cpc_compressor<A>::uncompress_sliding_window(const uint32_t* data, size_t data_words, vector_u8<A>& window,
+ uint8_t lg_k, uint32_t num_coupons) const {
const size_t k = 1 << lg_k;
window.resize(k); // zeroing not needed here (unlike the Hybrid Flavor)
const uint8_t pseudo_phase = determine_pseudo_phase(lg_k, num_coupons);
@@ -710,9 +718,10 @@ void write_unary(
// The empty space that this leaves at the beginning of the output array
// will be filled in later by the caller.
template<typename A>
-vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get, uint32_t empty_space) {
+vector_u32<A> cpc_compressor<A>::tricky_get_pairs_from_window(const uint8_t* window, uint32_t k, uint32_t num_pairs_to_get,
+ uint32_t empty_space, const A& allocator) {
const size_t output_length = empty_space + num_pairs_to_get;
- vector_u32<A> pairs(output_length);
+ vector_u32<A> pairs(output_length, 0, allocator);
size_t pair_index = empty_space;
for (unsigned row_index = 0; row_index < k; row_index++) {
uint8_t byte = window[row_index];
diff --git a/be/src/thirdparty/datasketches/cpc_sketch.hpp b/be/src/thirdparty/datasketches/cpc_sketch.hpp
index 9aba16f..a4bf8f6 100644
--- a/be/src/thirdparty/datasketches/cpc_sketch.hpp
+++ b/be/src/thirdparty/datasketches/cpc_sketch.hpp
@@ -49,7 +49,7 @@ template<typename A> class cpc_sketch_alloc;
template<typename A> class cpc_union_alloc;
// alias with default allocator for convenience
-typedef cpc_sketch_alloc<std::allocator<void>> cpc_sketch;
+using cpc_sketch = cpc_sketch_alloc<std::allocator<uint8_t>>;
// allocation and initialization of global decompression (decoding) tables
// call this before anything else if you want to control the initialization time
@@ -67,7 +67,10 @@ public:
* @param lg_k base 2 logarithm of the number of bins in the sketch
* @param seed for hash function
*/
- explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED);
+ explicit cpc_sketch_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
+
+ using allocator_type = A;
+ A get_allocator() const;
/**
* @return configured lg_k of this sketch
@@ -204,7 +207,7 @@ public:
// This is a convenience alias for users
// The type returned by the following serialize method
- typedef vector_u8<A> vector_bytes;
+ using vector_bytes = vector_u8<A>;
/**
* This method serializes the sketch as a vector of bytes.
@@ -221,7 +224,7 @@ public:
* @param seed the seed for the hash function that was used to create the sketch
* @return an instance of a sketch
*/
- static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
+ static cpc_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
/**
* This method deserializes a sketch from a given array of bytes.
@@ -230,7 +233,7 @@ public:
* @param seed the seed for the hash function that was used to create the sketch
* @return an instance of the sketch
*/
- static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
+ static cpc_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
// for internal use
uint32_t get_num_coupons() const;
diff --git a/be/src/thirdparty/datasketches/cpc_sketch_impl.hpp b/be/src/thirdparty/datasketches/cpc_sketch_impl.hpp
index e6bc010..a314de8 100644
--- a/be/src/thirdparty/datasketches/cpc_sketch_impl.hpp
+++ b/be/src/thirdparty/datasketches/cpc_sketch_impl.hpp
@@ -41,13 +41,13 @@ void cpc_init() {
}
template<typename A>
-cpc_sketch_alloc<A>::cpc_sketch_alloc(uint8_t lg_k, uint64_t seed):
+cpc_sketch_alloc<A>::cpc_sketch_alloc(uint8_t lg_k, uint64_t seed, const A& allocator):
lg_k(lg_k),
seed(seed),
was_merged(false),
num_coupons(0),
-surprising_value_table(2, 6 + lg_k),
-sliding_window(),
+surprising_value_table(2, 6 + lg_k, allocator),
+sliding_window(allocator),
window_offset(0),
first_interesting_column(0),
kxp(1 << lg_k),
@@ -59,6 +59,11 @@ hip_est_accum(0)
}
template<typename A>
+A cpc_sketch_alloc<A>::get_allocator() const {
+ return sliding_window.get_allocator();
+}
+
+template<typename A>
uint8_t cpc_sketch_alloc<A>::get_lg_k() const {
return lg_k;
}
@@ -277,7 +282,7 @@ void cpc_sketch_alloc<A>::promote_sparse_to_windowed() {
sliding_window.resize(k, 0); // zero the memory (because we will be OR'ing into it)
- u32_table<A> new_table(2, 6 + lg_k);
+ u32_table<A> new_table(2, 6 + lg_k, sliding_window.get_allocator());
const uint32_t* old_slots = surprising_value_table.get_slots();
const size_t old_num_slots = 1 << surprising_value_table.get_lg_size();
@@ -401,7 +406,7 @@ string<A> cpc_sketch_alloc<A>::to_string() const {
template<typename A>
void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
- compressed_state<A> compressed;
+ compressed_state<A> compressed(A(sliding_window.get_allocator()));
compressed.table_data_words = 0;
compressed.table_num_entries = 0;
compressed.window_data_words = 0;
@@ -454,7 +459,7 @@ void cpc_sketch_alloc<A>::serialize(std::ostream& os) const {
template<typename A>
vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
- compressed_state<A> compressed;
+ compressed_state<A> compressed(sliding_window.get_allocator());
compressed.table_data_words = 0;
compressed.table_num_entries = 0;
compressed.window_data_words = 0;
@@ -464,7 +469,7 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
const bool has_window = compressed.window_data.size() > 0;
const uint8_t preamble_ints = get_preamble_ints(num_coupons, has_hip, has_table, has_window);
const size_t size = header_size_bytes + (preamble_ints + compressed.table_data_words + compressed.window_data_words) * sizeof(uint32_t);
- vector_u8<A> bytes(size);
+ vector_u8<A> bytes(size, 0, sliding_window.get_allocator());
uint8_t* ptr = bytes.data() + header_size_bytes;
ptr += copy_to_mem(&preamble_ints, ptr, sizeof(preamble_ints));
const uint8_t serial_version = SERIAL_VERSION;
@@ -511,7 +516,7 @@ vector_u8<A> cpc_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
}
template<typename A>
-cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
+cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
uint8_t preamble_ints;
is.read((char*)&preamble_ints, sizeof(preamble_ints));
uint8_t serial_version;
@@ -529,7 +534,7 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
const bool has_hip = flags_byte & (1 << flags::HAS_HIP);
const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
- compressed_state<A> compressed;
+ compressed_state<A> compressed(allocator);
compressed.table_data_words = 0;
compressed.table_num_entries = 0;
compressed.window_data_words = 0;
@@ -583,7 +588,7 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
+ std::to_string(compute_seed_hash(seed)));
}
- uncompressed_state<A> uncompressed;
+ uncompressed_state<A> uncompressed(allocator);
get_compressor<A>().uncompress(compressed, uncompressed, lg_k, num_coupons);
if (!is.good())
throw std::runtime_error("error reading from std::istream");
@@ -592,7 +597,7 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(std::istream& is, uint64_t
}
template<typename A>
-cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
+cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
ensure_minimum_memory(size, 8);
const char* ptr = static_cast<const char*>(bytes);
const char* base = static_cast<const char*>(bytes);
@@ -614,7 +619,7 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
const bool has_table = flags_byte & (1 << flags::HAS_TABLE);
const bool has_window = flags_byte & (1 << flags::HAS_WINDOW);
ensure_minimum_memory(size, preamble_ints << 2);
- compressed_state<A> compressed;
+ compressed_state<A> compressed(allocator);
compressed.table_data_words = 0;
compressed.table_num_entries = 0;
compressed.window_data_words = 0;
@@ -677,7 +682,7 @@ cpc_sketch_alloc<A> cpc_sketch_alloc<A>::deserialize(const void* bytes, size_t s
throw std::invalid_argument("Incompatible seed hashes: " + std::to_string(seed_hash) + ", "
+ std::to_string(compute_seed_hash(seed)));
}
- uncompressed_state<A> uncompressed;
+ uncompressed_state<A> uncompressed(allocator);
get_compressor<A>().uncompress(compressed, uncompressed, lg_k, num_coupons);
return cpc_sketch_alloc(lg_k, num_coupons, first_interesting_column, std::move(uncompressed.table),
std::move(uncompressed.window), has_hip, kxp, hip_est_accum, seed);
@@ -766,7 +771,7 @@ vector_u64<A> cpc_sketch_alloc<A>::build_bit_matrix() const {
// Fill the matrix with default rows in which the "early zone" is filled with ones.
// This is essential for the routine's O(k) time cost (as opposed to O(C)).
const uint64_t default_row = (static_cast<uint64_t>(1) << window_offset) - 1;
- vector_u64<A> matrix(k, default_row);
+ vector_u64<A> matrix(k, default_row, sliding_window.get_allocator());
if (num_coupons == 0) return matrix;
diff --git a/be/src/thirdparty/datasketches/cpc_union.hpp b/be/src/thirdparty/datasketches/cpc_union.hpp
index e56aa72..dd59abc 100644
--- a/be/src/thirdparty/datasketches/cpc_union.hpp
+++ b/be/src/thirdparty/datasketches/cpc_union.hpp
@@ -35,7 +35,7 @@ namespace datasketches {
*/
// alias with default allocator for convenience
-typedef cpc_union_alloc<std::allocator<void>> cpc_union;
+using cpc_union = cpc_union_alloc<std::allocator<uint8_t>>;
template<typename A>
class cpc_union_alloc {
@@ -45,7 +45,7 @@ public:
* @param lg_k base 2 logarithm of the number of bins in the sketch
* @param seed for hash function
*/
- explicit cpc_union_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED);
+ explicit cpc_union_alloc(uint8_t lg_k = CPC_DEFAULT_LG_K, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
cpc_union_alloc(const cpc_union_alloc<A>& other);
cpc_union_alloc(cpc_union_alloc<A>&& other) noexcept;
diff --git a/be/src/thirdparty/datasketches/cpc_union_impl.hpp b/be/src/thirdparty/datasketches/cpc_union_impl.hpp
index 65d933c..5acfe5f 100644
--- a/be/src/thirdparty/datasketches/cpc_union_impl.hpp
+++ b/be/src/thirdparty/datasketches/cpc_union_impl.hpp
@@ -25,16 +25,16 @@
namespace datasketches {
template<typename A>
-cpc_union_alloc<A>::cpc_union_alloc(uint8_t lg_k, uint64_t seed):
+cpc_union_alloc<A>::cpc_union_alloc(uint8_t lg_k, uint64_t seed, const A& allocator):
lg_k(lg_k),
seed(seed),
accumulator(nullptr),
-bit_matrix()
+bit_matrix(allocator)
{
if (lg_k < CPC_MIN_LG_K || lg_k > CPC_MAX_LG_K) {
throw std::invalid_argument("lg_k must be >= " + std::to_string(CPC_MIN_LG_K) + " and <= " + std::to_string(CPC_MAX_LG_K) + ": " + std::to_string(lg_k));
}
- accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(lg_k, seed);
+ accumulator = new (AllocCpc().allocate(1)) cpc_sketch_alloc<A>(lg_k, seed, allocator);
}
template<typename A>
@@ -200,13 +200,13 @@ cpc_sketch_alloc<A> cpc_union_alloc<A>::get_result_from_bit_matrix() const {
const uint8_t offset = cpc_sketch_alloc<A>::determine_correct_offset(lg_k, num_coupons);
- vector_u8<A> sliding_window(k);
+ vector_u8<A> sliding_window(k, 0, bit_matrix.get_allocator());
// don't need to zero the window's memory
// dynamically growing caused snowplow effect
uint8_t table_lg_size = lg_k - 4; // K/16; in some cases this will end up being oversized
if (table_lg_size < 2) table_lg_size = 2;
- u32_table<A> table(table_lg_size, 6 + lg_k);
+ u32_table<A> table(table_lg_size, 6 + lg_k, bit_matrix.get_allocator());
// the following should work even when the offset is zero
const uint64_t mask_for_clearing_window = (static_cast<uint64_t>(0xff) << offset) ^ UINT64_MAX;
@@ -314,7 +314,7 @@ void cpc_union_alloc<A>::reduce_k(uint8_t new_lg_k) {
vector_u64<A> old_matrix = std::move(bit_matrix);
const uint8_t old_lg_k = lg_k;
const size_t new_k = 1 << new_lg_k;
- bit_matrix = vector_u64<A>(new_k, 0);
+ bit_matrix = vector_u64<A>(new_k, 0, old_matrix.get_allocator());
lg_k = new_lg_k;
or_matrix_into_matrix(old_matrix, old_lg_k);
return;
diff --git a/be/src/thirdparty/datasketches/cpc_util.hpp b/be/src/thirdparty/datasketches/cpc_util.hpp
index b63f26f..1a33b3a 100644
--- a/be/src/thirdparty/datasketches/cpc_util.hpp
+++ b/be/src/thirdparty/datasketches/cpc_util.hpp
@@ -24,12 +24,6 @@
namespace datasketches {
-static inline uint16_t compute_seed_hash(uint64_t seed) {
- HashState hashes;
- MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
- return hashes.h1 & 0xffff;
-}
-
static inline uint64_t divide_longs_rounding_up(uint64_t x, uint64_t y) {
if (y == 0) throw std::invalid_argument("divide_longs_rounding_up: bad argument");
const uint64_t quotient = x / y;
diff --git a/be/src/thirdparty/datasketches/hll.hpp b/be/src/thirdparty/datasketches/hll.hpp
index 3898dda..a65b945 100644
--- a/be/src/thirdparty/datasketches/hll.hpp
+++ b/be/src/thirdparty/datasketches/hll.hpp
@@ -108,7 +108,7 @@ class hll_union_alloc;
template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
-template<typename A = std::allocator<char> >
+template<typename A = std::allocator<uint8_t> >
class hll_sketch_alloc final {
public:
/**
@@ -119,7 +119,7 @@ class hll_sketch_alloc final {
* keeping memory use constant (if HLL_6 or HLL_8) at the cost of
* starting out using much more memory
*/
- explicit hll_sketch_alloc(int lg_config_k, target_hll_type tgt_type = HLL_4, bool start_full_size = false);
+ explicit hll_sketch_alloc(int lg_config_k, target_hll_type tgt_type = HLL_4, bool start_full_size = false, const A& allocator = A());
/**
* Copy constructor
@@ -140,14 +140,14 @@ class hll_sketch_alloc final {
* Reconstructs a sketch from a serialized image on a stream.
* @param is An input stream with a binary image of a sketch
*/
- static hll_sketch_alloc deserialize(std::istream& is);
+ static hll_sketch_alloc deserialize(std::istream& is, const A& allocator = A());
/**
* Reconstructs a sketch from a serialized image in a byte array.
* @param is bytes An input array with a binary image of a sketch
* @param len Length of the input array, in bytes
*/
- static hll_sketch_alloc deserialize(const void* bytes, size_t len);
+ static hll_sketch_alloc deserialize(const void* bytes, size_t len, const A& allocator = A());
//! Class destructor
virtual ~hll_sketch_alloc();
@@ -423,7 +423,7 @@ class hll_sketch_alloc final {
* author Kevin Lang
*/
-template<typename A = std::allocator<char> >
+template<typename A = std::allocator<uint8_t> >
class hll_union_alloc {
public:
/**
@@ -431,7 +431,7 @@ class hll_union_alloc {
* @param lg_max_k The maximum size, in log2, of k. The value must
* be between 7 and 21, inclusive.
*/
- explicit hll_union_alloc(int lg_max_k);
+ explicit hll_union_alloc(int lg_max_k, const A& allocator = A());
/**
* Returns the current cardinality estimate
@@ -469,18 +469,6 @@ class hll_union_alloc {
double get_upper_bound(int num_std_dev) const;
/**
- * Returns the size of the union serialized in compact form.
- * @return Size of the union serialized in compact form, in bytes.
- */
- int get_compact_serialization_bytes() const;
-
- /**
- * Returns the size of the union serialized without compaction.
- * @return Size of the union serialized without compaction, in bytes.
- */
- int get_updatable_serialization_bytes() const;
-
- /**
* Returns union's configured lg_k value.
* @return Configured lg_k value.
*/
@@ -493,12 +481,6 @@ class hll_union_alloc {
target_hll_type get_target_type() const;
/**
- * Indicates if the union is currently stored compacted.
- * @return True if the union is stored in compact form.
- */
- bool is_compact() const;
-
- /**
* Indicates if the union is currently empty.
* @return True if the union is empty.
*/
@@ -606,15 +588,6 @@ class hll_union_alloc {
void update(const void* data, size_t length_bytes);
/**
- * Returns the maximum size in bytes that this union operator can grow to given a lg_k.
- *
- * @param lg_k The maximum Log2 of k for this union operator. This value must be
- * between 4 and 21 inclusively.
- * @return the maximum size in bytes that this union operator can grow to.
- */
- static int get_max_serialization_bytes(int lg_k);
-
- /**
* Gets the current (approximate) Relative Error (RE) asymptotic values given several
* parameters. This is used primarily for testing.
* @param upper_bound return the RE for the Upper Bound, otherwise for the Lower Bound.
@@ -645,7 +618,6 @@ class hll_union_alloc {
void coupon_update(int coupon);
hll_mode get_current_mode() const;
- int get_serialization_version() const;
bool is_out_of_order_flag() const;
bool is_estimation_mode() const;
diff --git a/be/src/thirdparty/datasketches/icon_estimator.hpp b/be/src/thirdparty/datasketches/icon_estimator.hpp
index 27d76ca..4a9daea 100644
--- a/be/src/thirdparty/datasketches/icon_estimator.hpp
+++ b/be/src/thirdparty/datasketches/icon_estimator.hpp
@@ -231,7 +231,7 @@ static const double ICON_POLYNOMIAL_COEFFICIENTS[ICON_TABLE_SIZE] = {
#endif
};
-static double evaluate_polynomial(const double* coefficients, int start, int num, double x) {
+static inline double evaluate_polynomial(const double* coefficients, int start, int num, double x) {
const int final = start + num - 1;
double total = coefficients[final];
for (int j = final - 1; j >= start; j--) {
@@ -241,11 +241,11 @@ static double evaluate_polynomial(const double* coefficients, int start, int num
return total;
}
-static double icon_exponential_approximation(double k, double c) {
+static inline double icon_exponential_approximation(double k, double c) {
return (0.7940236163830469 * k * pow(2.0, c / k));
}
-static double compute_icon_estimate(uint8_t lg_k, uint64_t c) {
+static inline double compute_icon_estimate(uint8_t lg_k, uint64_t c) {
if (lg_k < ICON_MIN_LOG_K || lg_k > ICON_MAX_LOG_K) throw std::out_of_range("lg_k out of range");
if (c < 2) return ((c == 0) ? 0.0 : 1.0);
const size_t k = 1 << lg_k;
diff --git a/be/src/thirdparty/datasketches/kll_quantile_calculator.hpp b/be/src/thirdparty/datasketches/kll_quantile_calculator.hpp
index bc60f26..5114399 100644
--- a/be/src/thirdparty/datasketches/kll_quantile_calculator.hpp
+++ b/be/src/thirdparty/datasketches/kll_quantile_calculator.hpp
@@ -28,7 +28,7 @@ template <typename T, typename C, typename A>
class kll_quantile_calculator {
public:
// assumes that all levels are sorted including level 0
- kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n);
+ kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n, const A& allocator);
T get_quantile(double fraction) const;
private:
diff --git a/be/src/thirdparty/datasketches/kll_quantile_calculator_impl.hpp b/be/src/thirdparty/datasketches/kll_quantile_calculator_impl.hpp
index f580819..23efa4d 100644
--- a/be/src/thirdparty/datasketches/kll_quantile_calculator_impl.hpp
+++ b/be/src/thirdparty/datasketches/kll_quantile_calculator_impl.hpp
@@ -29,8 +29,8 @@
namespace datasketches {
template <typename T, typename C, typename A>
-kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n):
-n_(n), levels_(num_levels + 1)
+kll_quantile_calculator<T, C, A>::kll_quantile_calculator(const T* items, const uint32_t* levels, uint8_t num_levels, uint64_t n, const A& allocator):
+n_(n), levels_(num_levels + 1, 0, allocator), entries_(allocator)
{
const uint32_t num_items = levels[num_levels] - levels[0];
entries_.reserve(num_items);
@@ -116,7 +116,7 @@ uint32_t kll_quantile_calculator<T, C, A>::search_for_chunk_containing_pos(uint6
template <typename T, typename C, typename A>
void kll_quantile_calculator<T, C, A>::merge_sorted_blocks(Container& entries, const uint32_t* levels, uint8_t num_levels, uint32_t num_items) {
if (num_levels == 1) return;
- Container temporary;
+ Container temporary(entries.get_allocator());
temporary.reserve(num_items);
merge_sorted_blocks_direct(entries, temporary, levels, 0, num_levels);
}
diff --git a/be/src/thirdparty/datasketches/kll_sketch.hpp b/be/src/thirdparty/datasketches/kll_sketch.hpp
index a4530c9..bbca76f 100644
--- a/be/src/thirdparty/datasketches/kll_sketch.hpp
+++ b/be/src/thirdparty/datasketches/kll_sketch.hpp
@@ -161,7 +161,7 @@ class kll_sketch {
static const uint16_t MIN_K = DEFAULT_M;
static const uint16_t MAX_K = (1 << 16) - 1;
- explicit kll_sketch(uint16_t k = DEFAULT_K);
+ explicit kll_sketch(uint16_t k = DEFAULT_K, const A& allocator = A());
kll_sketch(const kll_sketch& other);
kll_sketch(kll_sketch&& other) noexcept;
~kll_sketch();
@@ -203,6 +203,12 @@ class kll_sketch {
bool is_empty() const;
/**
+ * Returns configured parameter k
+ * @return parameter k
+ */
+ uint16_t get_k() const;
+
+ /**
* Returns the length of the input stream.
* @return stream length
*/
@@ -401,7 +407,7 @@ class kll_sketch {
* @param is input stream
* @return an instance of a sketch
*/
- static kll_sketch deserialize(std::istream& is);
+ static kll_sketch<T, C, S, A> deserialize(std::istream& is, const A& allocator = A());
/**
* This method deserializes a sketch from a given array of bytes.
@@ -409,7 +415,7 @@ class kll_sketch {
* @param size the size of the array
* @return an instance of a sketch
*/
- static kll_sketch deserialize(const void* bytes, size_t size);
+ static kll_sketch<T, C, S, A> deserialize(const void* bytes, size_t size, const A& allocator = A());
/*
* Gets the normalized rank error given k and pmf.
@@ -461,6 +467,7 @@ class kll_sketch {
static const uint8_t PREAMBLE_INTS_SHORT = 2; // for empty and single item
static const uint8_t PREAMBLE_INTS_FULL = 5;
+ A allocator_;
uint16_t k_;
uint8_t m_; // minimum buffer "width"
uint16_t min_k_; // for error estimation after merging with different k
diff --git a/be/src/thirdparty/datasketches/kll_sketch_impl.hpp b/be/src/thirdparty/datasketches/kll_sketch_impl.hpp
index f0c5ff3..0e0ef87 100644
--- a/be/src/thirdparty/datasketches/kll_sketch_impl.hpp
+++ b/be/src/thirdparty/datasketches/kll_sketch_impl.hpp
@@ -30,13 +30,14 @@
namespace datasketches {
template<typename T, typename C, typename S, typename A>
-kll_sketch<T, C, S, A>::kll_sketch(uint16_t k):
+kll_sketch<T, C, S, A>::kll_sketch(uint16_t k, const A& allocator):
+allocator_(allocator),
k_(k),
m_(DEFAULT_M),
min_k_(k),
n_(0),
num_levels_(1),
-levels_(2),
+levels_(2, 0, allocator),
items_(nullptr),
items_size_(k_),
min_value_(nullptr),
@@ -47,11 +48,12 @@ is_level_zero_sorted_(false)
throw std::invalid_argument("K must be >= " + std::to_string(MIN_K) + " and <= " + std::to_string(MAX_K) + ": " + std::to_string(k));
}
levels_[0] = levels_[1] = k;
- items_ = A().allocate(items_size_);
+ items_ = allocator_.allocate(items_size_);
}
template<typename T, typename C, typename S, typename A>
kll_sketch<T, C, S, A>::kll_sketch(const kll_sketch& other):
+allocator_(other.allocator_),
k_(other.k_),
m_(other.m_),
min_k_(other.min_k_),
@@ -64,14 +66,15 @@ min_value_(nullptr),
max_value_(nullptr),
is_level_zero_sorted_(other.is_level_zero_sorted_)
{
- items_ = A().allocate(items_size_);
+ items_ = allocator_.allocate(items_size_);
std::copy(&other.items_[levels_[0]], &other.items_[levels_[num_levels_]], &items_[levels_[0]]);
- if (other.min_value_ != nullptr) min_value_ = new (A().allocate(1)) T(*other.min_value_);
- if (other.max_value_ != nullptr) max_value_ = new (A().allocate(1)) T(*other.max_value_);
+ if (other.min_value_ != nullptr) min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
+ if (other.max_value_ != nullptr) max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
}
template<typename T, typename C, typename S, typename A>
kll_sketch<T, C, S, A>::kll_sketch(kll_sketch&& other) noexcept:
+allocator_(std::move(other.allocator_)),
k_(other.k_),
m_(other.m_),
min_k_(other.min_k_),
@@ -91,7 +94,8 @@ is_level_zero_sorted_(other.is_level_zero_sorted_)
template<typename T, typename C, typename S, typename A>
kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(const kll_sketch& other) {
- kll_sketch copy(other);
+ kll_sketch<T, C, S, A> copy(other);
+ std::swap(allocator_, copy.allocator_);
std::swap(k_, copy.k_);
std::swap(m_, copy.m_);
std::swap(min_k_, copy.min_k_);
@@ -108,6 +112,7 @@ kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(const kll_sketch& othe
template<typename T, typename C, typename S, typename A>
kll_sketch<T, C, S, A>& kll_sketch<T, C, S, A>::operator=(kll_sketch&& other) {
+ std::swap(allocator_, other.allocator_);
std::swap(k_, other.k_);
std::swap(m_, other.m_);
std::swap(min_k_, other.min_k_);
@@ -128,15 +133,15 @@ kll_sketch<T, C, S, A>::~kll_sketch() {
const uint32_t begin = levels_[0];
const uint32_t end = levels_[num_levels_];
for (uint32_t i = begin; i < end; i++) items_[i].~T();
- A().deallocate(items_, items_size_);
+ allocator_.deallocate(items_, items_size_);
}
if (min_value_ != nullptr) {
min_value_->~T();
- A().deallocate(min_value_, 1);
+ allocator_.deallocate(min_value_, 1);
}
if (max_value_ != nullptr) {
max_value_->~T();
- A().deallocate(max_value_, 1);
+ allocator_.deallocate(max_value_, 1);
}
}
@@ -159,8 +164,8 @@ void kll_sketch<T, C, S, A>::update(T&& value) {
template<typename T, typename C, typename S, typename A>
void kll_sketch<T, C, S, A>::update_min_max(const T& value) {
if (is_empty()) {
- min_value_ = new (A().allocate(1)) T(value);
- max_value_ = new (A().allocate(1)) T(value);
+ min_value_ = new (allocator_.allocate(1)) T(value);
+ max_value_ = new (allocator_.allocate(1)) T(value);
} else {
if (C()(value, *min_value_)) *min_value_ = value;
if (C()(*max_value_, value)) *max_value_ = value;
@@ -182,8 +187,8 @@ void kll_sketch<T, C, S, A>::merge(const kll_sketch& other) {
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
}
if (is_empty()) {
- min_value_ = new (A().allocate(1)) T(*other.min_value_);
- max_value_ = new (A().allocate(1)) T(*other.max_value_);
+ min_value_ = new (allocator_.allocate(1)) T(*other.min_value_);
+ max_value_ = new (allocator_.allocate(1)) T(*other.max_value_);
} else {
if (C()(*other.min_value_, *min_value_)) *min_value_ = *other.min_value_;
if (C()(*max_value_, *other.max_value_)) *max_value_ = *other.max_value_;
@@ -206,8 +211,8 @@ void kll_sketch<T, C, S, A>::merge(kll_sketch&& other) {
throw std::invalid_argument("incompatible M: " + std::to_string(m_) + " and " + std::to_string(other.m_));
}
if (is_empty()) {
- min_value_ = new (A().allocate(1)) T(std::move(*other.min_value_));
- max_value_ = new (A().allocate(1)) T(std::move(*other.max_value_));
+ min_value_ = new (allocator_.allocate(1)) T(std::move(*other.min_value_));
+ max_value_ = new (allocator_.allocate(1)) T(std::move(*other.max_value_));
} else {
if (C()(*other.min_value_, *min_value_)) *min_value_ = std::move(*other.min_value_);
if (C()(*max_value_, *other.max_value_)) *max_value_ = std::move(*other.max_value_);
@@ -229,6 +234,11 @@ bool kll_sketch<T, C, S, A>::is_empty() const {
}
template<typename T, typename C, typename S, typename A>
+uint16_t kll_sketch<T, C, S, A>::get_k() const { // read-only accessor; does not modify the sketch
+ return k_; // the K parameter stored in the k_ member
+}
+
+template<typename T, typename C, typename S, typename A>
uint64_t kll_sketch<T, C, S, A>::get_n() const {
return n_;
}
@@ -270,8 +280,7 @@ T kll_sketch<T, C, S, A>::get_quantile(double fraction) const {
template<typename T, typename C, typename S, typename A>
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions, uint32_t size) const {
- std::vector<T, A> quantiles;
- quantiles.reserve(size);
+ std::vector<T, A> quantiles(allocator_);
if (is_empty()) return quantiles;
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator;
quantiles.reserve(size);
@@ -295,11 +304,11 @@ std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(const double* fractions,
template<typename T, typename C, typename S, typename A>
std::vector<T, A> kll_sketch<T, C, S, A>::get_quantiles(size_t num) const {
- if (is_empty()) return std::vector<T, A>();
+ if (is_empty()) return std::vector<T, A>(allocator_);
if (num == 0) {
throw std::invalid_argument("num must be > 0");
}
- std::vector<double> fractions(num);
+ vector_d<A> fractions(num, 0, allocator_);
fractions[0] = 0.0;
for (size_t i = 1; i < num; i++) {
fractions[i] = static_cast<double>(i) / (num - 1);
@@ -411,7 +420,7 @@ template<typename T, typename C, typename S, typename A>
vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const {
const bool is_single_item = n_ == 1;
const size_t size = header_size_bytes + get_serialized_size_bytes();
- vector_u8<A> bytes(size);
+ vector_u8<A> bytes(size, 0, allocator_);
uint8_t* ptr = bytes.data() + header_size_bytes;
const uint8_t* end_ptr = ptr + size;
const uint8_t preamble_ints(is_empty() || is_single_item ? PREAMBLE_INTS_SHORT : PREAMBLE_INTS_FULL);
@@ -449,7 +458,7 @@ vector_u8<A> kll_sketch<T, C, S, A>::serialize(unsigned header_size_bytes) const
}
template<typename T, typename C, typename S, typename A>
-kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is) {
+kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is, const A& allocator) {
uint8_t preamble_ints;
is.read((char*)&preamble_ints, sizeof(preamble_ints));
uint8_t serial_version;
@@ -472,7 +481,7 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is) {
if (!is.good()) throw std::runtime_error("error reading from std::istream");
const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
- if (is_empty) return kll_sketch(k);
+ if (is_empty) return kll_sketch(k, allocator);
uint64_t n;
uint16_t min_k;
@@ -488,7 +497,7 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is) {
is.read((char*)&num_levels, sizeof(num_levels));
is.read((char*)&unused, sizeof(unused));
}
- vector_u32<A> levels(num_levels + 1);
+ vector_u32<A> levels(num_levels + 1, 0, allocator);
const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
if (is_single_item) {
levels[0] = capacity - 1;
@@ -497,41 +506,43 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(std::istream& is) {
is.read((char*)levels.data(), sizeof(levels[0]) * num_levels);
}
levels[num_levels] = capacity;
- auto item_buffer_deleter = [](T* ptr) { A().deallocate(ptr, 1); };
- std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(A().allocate(1), item_buffer_deleter);
- std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(A().allocate(1), item_buffer_deleter);
- std::unique_ptr<T, item_deleter> min_value;
- std::unique_ptr<T, item_deleter> max_value;
+ A alloc(allocator);
+ auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
+ std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
+ std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
if (!is_single_item) {
S().deserialize(is, min_value_buffer.get(), 1);
// serde call did not throw, repackage with destrtuctor
- min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
S().deserialize(is, max_value_buffer.get(), 1);
// serde call did not throw, repackage with destrtuctor
- max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
}
- auto items_buffer_deleter = [capacity](T* ptr) { A().deallocate(ptr, capacity); };
- std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(A().allocate(capacity), items_buffer_deleter);
+ auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
+ std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
const auto num_items = levels[num_levels] - levels[0];
S().deserialize(is, &items_buffer.get()[levels[0]], num_items);
// serde call did not throw, repackage with destrtuctors
- std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity));
+ std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
if (is_single_item) {
new (min_value_buffer.get()) T(items.get()[levels[0]]);
// copy did not throw, repackage with destrtuctor
- min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
new (max_value_buffer.get()) T(items.get()[levels[0]]);
// copy did not throw, repackage with destrtuctor
- max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
}
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
+ if (!is.good())
+ throw std::runtime_error("error reading from std::istream");
return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
std::move(min_value), std::move(max_value), is_level_zero_sorted);
}
template<typename T, typename C, typename S, typename A>
-kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size) {
+kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, size_t size, const A& allocator) {
ensure_minimum_memory(size, 8);
const char* ptr = static_cast<const char*>(bytes);
uint8_t preamble_ints;
@@ -555,7 +566,7 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
ensure_minimum_memory(size, 1 << preamble_ints);
const bool is_empty(flags_byte & (1 << flags::IS_EMPTY));
- if (is_empty) return kll_sketch<T, C, S, A>(k);
+ if (is_empty) return kll_sketch<T, C, S, A>(k, allocator);
uint64_t n;
uint16_t min_k;
@@ -572,7 +583,7 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
ptr += copy_from_mem(ptr, &num_levels, sizeof(num_levels));
ptr++; // skip unused byte
}
- vector_u32<A> levels(num_levels + 1);
+ vector_u32<A> levels(num_levels + 1, 0, allocator);
const uint32_t capacity(kll_helper::compute_total_capacity(k, m, num_levels));
if (is_single_item) {
levels[0] = capacity - 1;
@@ -581,35 +592,36 @@ kll_sketch<T, C, S, A> kll_sketch<T, C, S, A>::deserialize(const void* bytes, si
ptr += copy_from_mem(ptr, levels.data(), sizeof(levels[0]) * num_levels);
}
levels[num_levels] = capacity;
- auto item_buffer_deleter = [](T* ptr) { A().deallocate(ptr, 1); };
- std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(A().allocate(1), item_buffer_deleter);
- std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(A().allocate(1), item_buffer_deleter);
- std::unique_ptr<T, item_deleter> min_value;
- std::unique_ptr<T, item_deleter> max_value;
+ A alloc(allocator);
+ auto item_buffer_deleter = [&alloc](T* ptr) { alloc.deallocate(ptr, 1); };
+ std::unique_ptr<T, decltype(item_buffer_deleter)> min_value_buffer(alloc.allocate(1), item_buffer_deleter);
+ std::unique_ptr<T, decltype(item_buffer_deleter)> max_value_buffer(alloc.allocate(1), item_buffer_deleter);
+ std::unique_ptr<T, item_deleter> min_value(nullptr, item_deleter(allocator));
+ std::unique_ptr<T, item_deleter> max_value(nullptr, item_deleter(allocator));
if (!is_single_item) {
ptr += S().deserialize(ptr, end_ptr - ptr, min_value_buffer.get(), 1);
// serde call did not throw, repackage with destrtuctor
- min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
ptr += S().deserialize(ptr, end_ptr - ptr, max_value_buffer.get(), 1);
// serde call did not throw, repackage with destrtuctor
- max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
}
- auto items_buffer_deleter = [capacity](T* ptr) { A().deallocate(ptr, capacity); };
- std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(A().allocate(capacity), items_buffer_deleter);
+ auto items_buffer_deleter = [capacity, &alloc](T* ptr) { alloc.deallocate(ptr, capacity); };
+ std::unique_ptr<T, decltype(items_buffer_deleter)> items_buffer(alloc.allocate(capacity), items_buffer_deleter);
const auto num_items = levels[num_levels] - levels[0];
ptr += S().deserialize(ptr, end_ptr - ptr, &items_buffer.get()[levels[0]], num_items);
// serde call did not throw, repackage with destrtuctors
- std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity));
+ std::unique_ptr<T, items_deleter> items(items_buffer.release(), items_deleter(levels[0], capacity, allocator));
const size_t delta = ptr - static_cast<const char*>(bytes);
if (delta != size) throw std::logic_error("deserialized size mismatch: " + std::to_string(delta) + " != " + std::to_string(size));
const bool is_level_zero_sorted = (flags_byte & (1 << flags::IS_LEVEL_ZERO_SORTED)) > 0;
if (is_single_item) {
new (min_value_buffer.get()) T(items.get()[levels[0]]);
// copy did not throw, repackage with destrtuctor
- min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter());
+ min_value = std::unique_ptr<T, item_deleter>(min_value_buffer.release(), item_deleter(allocator));
new (max_value_buffer.get()) T(items.get()[levels[0]]);
// copy did not throw, repackage with destrtuctor
- max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter());
+ max_value = std::unique_ptr<T, item_deleter>(max_value_buffer.release(), item_deleter(allocator));
}
return kll_sketch(k, min_k, n, num_levels, std::move(levels), std::move(items), capacity,
std::move(min_value), std::move(max_value), is_level_zero_sorted);
@@ -634,6 +646,7 @@ template<typename T, typename C, typename S, typename A>
kll_sketch<T, C, S, A>::kll_sketch(uint16_t k, uint16_t min_k, uint64_t n, uint8_t num_levels, vector_u32<A>&& levels,
std::unique_ptr<T, items_deleter> items, uint32_t items_size, std::unique_ptr<T, item_deleter> min_value,
std::unique_ptr<T, item_deleter> max_value, bool is_level_zero_sorted):
+allocator_(levels.get_allocator()),
k_(k),
m_(DEFAULT_M),
min_k_(min_k),
@@ -735,9 +748,9 @@ void kll_sketch<T, C, S, A>::add_empty_top_level_to_completely_full_sketch() {
const uint32_t new_total_cap = cur_total_cap + delta_cap;
// move (and shift) the current data into the new buffer
- T* new_buf = A().allocate(new_total_cap);
+ T* new_buf = allocator_.allocate(new_total_cap);
kll_helper::move_construct<T>(items_, 0, cur_total_cap, new_buf, delta_cap, true);
- A().deallocate(items_, items_size_);
+ allocator_.deallocate(items_, items_size_);
items_ = new_buf;
items_size_ = new_total_cap;
@@ -763,19 +776,20 @@ void kll_sketch<T, C, S, A>::sort_level_zero() {
template<typename T, typename C, typename S, typename A>
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> kll_sketch<T, C, S, A>::get_quantile_calculator() {
sort_level_zero();
- typedef typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>> AllocCalc;
+ using AllocCalc = typename std::allocator_traits<A>::template rebind_alloc<kll_quantile_calculator<T, C, A>>; // allocator type rebound to the calculator class
+ AllocCalc alloc(allocator_); // local rebound copy of the sketch's allocator; it does not outlive this function
std::unique_ptr<kll_quantile_calculator<T, C, A>, std::function<void(kll_quantile_calculator<T, C, A>*)>> quantile_calculator(
- new (AllocCalc().allocate(1)) kll_quantile_calculator<T, C, A>(items_, levels_.data(), num_levels_, n_),
- [](kll_quantile_calculator<T, C, A>* ptr){ ptr->~kll_quantile_calculator<T, C, A>(); AllocCalc().deallocate(ptr, 1); }
+ new (alloc.allocate(1)) kll_quantile_calculator<T, C, A>(items_, levels_.data(), num_levels_, n_, allocator_), // placement-new into storage obtained from alloc
+ [alloc](kll_quantile_calculator<T, C, A>* ptr) mutable { ptr->~kll_quantile_calculator<T, C, A>(); alloc.deallocate(ptr, 1); } // capture by value: the deleter is returned to the caller, so a reference to the local alloc would dangle; mutable because deallocate() is non-const
);
return quantile_calculator;
}
template<typename T, typename C, typename S, typename A>
vector_d<A> kll_sketch<T, C, S, A>::get_PMF_or_CDF(const T* split_points, uint32_t size, bool is_CDF) const {
- if (is_empty()) return vector_d<A>();
+ if (is_empty()) return vector_d<A>(allocator_);
kll_helper::validate_values<T, C>(split_points, size);
- vector_d<A> buckets(size + 1, 0);
+ vector_d<A> buckets(size + 1, 0, allocator_);
uint8_t level = 0;
uint64_t weight = 1;
while (level < num_levels_) {
@@ -845,12 +859,13 @@ template<typename T, typename C, typename S, typename A>
template<typename O>
void kll_sketch<T, C, S, A>::merge_higher_levels(O&& other, uint64_t final_n) {
const uint32_t tmp_num_items = get_num_retained() + other.get_num_retained_above_level_zero();
- auto tmp_items_deleter = [tmp_num_items](T* ptr) { A().deallocate(ptr, tmp_num_items); }; // no destructor needed
- const std::unique_ptr<T, decltype(tmp_items_deleter)> workbuf(A().allocate(tmp_num_items), tmp_items_deleter);
+ A alloc(allocator_);
+ auto tmp_items_deleter = [tmp_num_items, &alloc](T* ptr) { alloc.deallocate(ptr, tmp_num_items); }; // no destructor needed
+ const std::unique_ptr<T, decltype(tmp_items_deleter)> workbuf(allocator_.allocate(tmp_num_items), tmp_items_deleter);
const uint8_t ub = kll_helper::ub_on_num_levels(final_n);
const size_t work_levels_size = ub + 2; // ub+1 does not work
- vector_u32<A> worklevels(work_levels_size);
- vector_u32<A> outlevels(work_levels_size);
+ vector_u32<A> worklevels(work_levels_size, 0, allocator_);
+ vector_u32<A> outlevels(work_levels_size, 0, allocator_);
const uint8_t provisional_num_levels = std::max(num_levels_, other.num_levels_);
@@ -864,9 +879,9 @@ void kll_sketch<T, C, S, A>::merge_higher_levels(O&& other, uint64_t final_n) {
// now we need to transfer the results back into "this" sketch
if (result.final_capacity != items_size_) {
- A().deallocate(items_, items_size_);
+ allocator_.deallocate(items_, items_size_);
items_size_ = result.final_capacity;
- items_ = A().allocate(items_size_);
+ items_ = allocator_.allocate(items_size_);
}
const uint32_t free_space_at_bottom = result.final_capacity - result.final_num_items;
kll_helper::move_construct<T>(workbuf.get(), outlevels[0], outlevels[0] + result.final_num_items, items_, free_space_at_bottom, true);
@@ -1101,29 +1116,32 @@ const std::pair<const T&, const uint64_t> kll_sketch<T, C, S, A>::const_iterator
template<typename T, typename C, typename S, typename A>
class kll_sketch<T, C, S, A>::item_deleter {
public:
- void operator() (T* ptr) const {
+ item_deleter(const A& allocator): allocator_(allocator) {} // stores a copy of the allocator for use when the deleter runs
+ void operator() (T* ptr) { // non-const because it calls deallocate() on the stored allocator; a null ptr is ignored
if (ptr != nullptr) {
ptr->~T();
- A().deallocate(ptr, 1);
+ allocator_.deallocate(ptr, 1); // releases the single-element allocation after the destructor call above
}
}
+ private:
+ A allocator_; // copy of the allocator given to the constructor
};
template<typename T, typename C, typename S, typename A>
class kll_sketch<T, C, S, A>::items_deleter {
public:
- items_deleter(uint32_t start, uint32_t num): start(start), num(num) {}
- void operator() (T* ptr) const {
+ items_deleter(uint32_t start, uint32_t num, const A& allocator): // start: first index to destroy; num: buffer length in elements; allocator: copied for deallocation
+ allocator_(allocator), start_(start), num_(num) {}
+ void operator() (T* ptr) { // non-const because it calls deallocate() on the stored allocator; a null ptr is ignored
if (ptr != nullptr) {
- for (uint32_t i = start; i < num; ++i) {
- ptr[i].~T();
- }
- A().deallocate(ptr, num);
+ for (uint32_t i = start_; i < num_; ++i) ptr[i].~T(); // destroys only the elements in [start_, num_)
+ allocator_.deallocate(ptr, num_); // releases the whole buffer of num_ elements
}
}
private:
- uint32_t start;
- uint32_t num;
+ A allocator_; // copy of the allocator given to the constructor
+ uint32_t start_; // index of the first element whose destructor is run
+ uint32_t num_; // one past the last destroyed index, and the element count passed to deallocate()
};
} /* namespace datasketches */
diff --git a/be/src/thirdparty/datasketches/memory_operations.hpp b/be/src/thirdparty/datasketches/memory_operations.hpp
index 80dc3a3..986b2b0 100644
--- a/be/src/thirdparty/datasketches/memory_operations.hpp
+++ b/be/src/thirdparty/datasketches/memory_operations.hpp
@@ -52,6 +52,18 @@ static inline size_t copy_to_mem(const void* src, void* dst, size_t size) {
return size;
}
+template<typename T>
+static inline size_t copy_to_mem(const T& item, void* dst) { // typed overload: copies sizeof(T) bytes of item into dst
+ memcpy(dst, &item, sizeof(T)); // NOTE(review): raw byte copy; presumably intended only for trivially copyable T, and dst must have room for sizeof(T) bytes - confirm at call sites
+ return sizeof(T); // number of bytes copied
+}
+
+template<typename T>
+static inline size_t copy_from_mem(const void* src, T& item) { // typed overload: overwrites item with sizeof(T) bytes read from src
+ memcpy(&item, src, sizeof(T)); // NOTE(review): raw byte copy; presumably intended only for trivially copyable T, and src must hold at least sizeof(T) bytes - confirm at call sites
+ return sizeof(T); // number of bytes copied
+}
+
} // namespace
#endif // _MEMORY_OPERATIONS_HPP_
diff --git a/be/src/thirdparty/datasketches/theta_a_not_b.hpp b/be/src/thirdparty/datasketches/theta_a_not_b.hpp
index db66ac7..4beef60 100644
--- a/be/src/thirdparty/datasketches/theta_a_not_b.hpp
+++ b/be/src/thirdparty/datasketches/theta_a_not_b.hpp
@@ -20,51 +20,34 @@
#ifndef THETA_A_NOT_B_HPP_
#define THETA_A_NOT_B_HPP_
-#include <memory>
-#include <functional>
-#include <climits>
-
#include "theta_sketch.hpp"
-#include "common_defs.hpp"
+#include "theta_set_difference_base.hpp"
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
-template<typename A>
+template<typename Allocator = std::allocator<uint64_t>>
class theta_a_not_b_alloc {
public:
- /**
- * Creates an instance of the a-not-b operation (set difference) with a given has seed.
- * @param seed hash seed
- */
- explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED);
+ using Entry = uint64_t; // entry type handed to State
+ using ExtractKey = trivial_extract_key; // key extractor handed to State
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>; // result type of compute()
+ using State = theta_set_difference_base<Entry, ExtractKey, CompactSketch, Allocator>; // type of the state_ member that compute() delegates to
+
+ explicit theta_a_not_b_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator()); // seed: hash seed; allocator: passed on to the State member
/**
* Computes the a-not-b set operation given two sketches.
* @return the result of a-not-b
*/
- compact_theta_sketch_alloc<A> compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered = true) const;
+ template<typename FwdSketch, typename Sketch>
+ CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered = true) const; // a is a forwarding reference; b is only read
private:
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
- uint16_t seed_hash_;
-
- class less_than {
- public:
- explicit less_than(uint64_t value): value(value) {}
- bool operator()(uint64_t value) const { return value < this->value; }
- private:
- uint64_t value;
- };
+ State state_; // constructed from the seed and allocator passed to the constructor
};
// alias with default allocator for convenience
-typedef theta_a_not_b_alloc<std::allocator<void>> theta_a_not_b;
+using theta_a_not_b = theta_a_not_b_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
diff --git a/be/src/thirdparty/datasketches/theta_a_not_b_impl.hpp b/be/src/thirdparty/datasketches/theta_a_not_b_impl.hpp
index 4343ee3..4c17bbf 100644
--- a/be/src/thirdparty/datasketches/theta_a_not_b_impl.hpp
+++ b/be/src/thirdparty/datasketches/theta_a_not_b_impl.hpp
@@ -26,56 +26,15 @@
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
template<typename A>
-theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed):
-seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
+theta_a_not_b_alloc<A>::theta_a_not_b_alloc(uint64_t seed, const A& allocator): // both arguments are handed straight to the state_ member
+state_(seed, allocator) // the class keeps no other members of its own
{}
template<typename A>
-compact_theta_sketch_alloc<A> theta_a_not_b_alloc<A>::compute(const theta_sketch_alloc<A>& a, const theta_sketch_alloc<A>& b, bool ordered) const {
- if (a.is_empty() || a.get_num_retained() == 0 || b.is_empty()) return compact_theta_sketch_alloc<A>(a, ordered);
- if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
- if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
-
- const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
- vector_u64<A> keys;
- bool is_empty = a.is_empty();
-
- if (b.get_num_retained() == 0) {
- std::copy_if(a.begin(), a.end(), std::back_inserter(keys), less_than(theta));
- } else {
- if (a.is_ordered() && b.is_ordered()) { // sort-based
- std::set_difference(a.begin(), a.end(), b.begin(), b.end(), conditional_back_inserter(keys, less_than(theta)));
- } else { // hash-based
- const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
- vector_u64<A> b_hash_table(1 << lg_size, 0);
- for (auto key: b) {
- if (key < theta) {
- update_theta_sketch_alloc<A>::hash_search_or_insert(key, b_hash_table.data(), lg_size);
- } else if (b.is_ordered()) {
- break; // early stop
- }
- }
-
- // scan A lookup B
- for (auto key: a) {
- if (key < theta) {
- if (!update_theta_sketch_alloc<A>::hash_search(key, b_hash_table.data(), lg_size)) keys.push_back(key);
- } else if (a.is_ordered()) {
- break; // early stop
- }
- }
- }
- }
- if (keys.empty() && theta == theta_sketch_alloc<A>::MAX_THETA) is_empty = true;
- if (ordered && !a.is_ordered()) std::sort(keys.begin(), keys.end());
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash_, a.is_ordered() || ordered);
+template<typename FwdSketch, typename Sketch>
+auto theta_a_not_b_alloc<A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const -> CompactSketch { // thin wrapper: the set-difference logic now lives in State
+ return state_.compute(std::forward<FwdSketch>(a), b, ordered); // a is forwarded with its original value category; b and ordered are passed through unchanged
}
} /* namespace datasketches */
diff --git a/be/src/thirdparty/datasketches/CubicInterpolation.hpp b/be/src/thirdparty/datasketches/theta_comparators.hpp
similarity index 57%
copy from be/src/thirdparty/datasketches/CubicInterpolation.hpp
copy to be/src/thirdparty/datasketches/theta_comparators.hpp
index b9cdfe7..e8a39b7 100644
--- a/be/src/thirdparty/datasketches/CubicInterpolation.hpp
+++ b/be/src/thirdparty/datasketches/theta_comparators.hpp
@@ -17,27 +17,32 @@
* under the License.
*/
-#ifndef _CUBICINTERPOLATION_HPP_
-#define _CUBICINTERPOLATION_HPP_
-
-#include <memory>
+#ifndef THETA_COMPARATORS_HPP_
+#define THETA_COMPARATORS_HPP_
namespace datasketches {
-template<typename A = std::allocator<char>>
-class CubicInterpolation {
- public:
- static double usingXAndYTables(const double xArr[], const double yArr[],
- int len, double x);
-
- static double usingXAndYTables(double x);
-
- static double usingXArrAndYStride(const double xArr[], const int xArrLen,
- double yStride, double x);
+template<typename ExtractKey>
+struct compare_by_key { // binary less-than functor that compares two entries by the key ExtractKey returns for each
+ template<typename Entry1, typename Entry2>
+ bool operator()(Entry1&& a, Entry2&& b) const { // NOTE(review): uses std::forward, but this header adds no #include of its own - confirm <utility> arrives via the including file
+ return ExtractKey()(std::forward<Entry1>(a)) < ExtractKey()(std::forward<Entry2>(b)); // a default-constructed ExtractKey is applied to each side
+ }
};
-}
+// less than
+
+template<typename Key, typename Entry, typename ExtractKey>
+class key_less_than { // unary predicate: true when an entry's extracted key is below a fixed key
+public:
+ explicit key_less_than(const Key& key): key(key) {} // stores a copy of the key to compare against
+ bool operator()(const Entry& entry) const {
+ return ExtractKey()(entry) < this->key; // this-> selects the member, which shares its name with the constructor parameter
+ }
+private:
+ Key key; // the fixed key every entry is compared to
+};
-#include "CubicInterpolation-internal.hpp"
+} /* namespace datasketches */
-#endif /* _CUBICINTERPOLATION_HPP_ */
\ No newline at end of file
+#endif
diff --git a/be/src/thirdparty/datasketches/CompositeInterpolationXTable.hpp b/be/src/thirdparty/datasketches/theta_constants.hpp
similarity index 66%
copy from be/src/thirdparty/datasketches/CompositeInterpolationXTable.hpp
copy to be/src/thirdparty/datasketches/theta_constants.hpp
index 8baecbe..d5d6fd9 100644
--- a/be/src/thirdparty/datasketches/CompositeInterpolationXTable.hpp
+++ b/be/src/thirdparty/datasketches/theta_constants.hpp
@@ -17,24 +17,20 @@
* under the License.
*/
-#ifndef _COMPOSITEINTERPOLATIONXTABLE_HPP_
-#define _COMPOSITEINTERPOLATIONXTABLE_HPP_
+#ifndef THETA_CONSTANTS_HPP_
+#define THETA_CONSTANTS_HPP_
-#include <memory>
+#include <climits>
namespace datasketches {
-template<typename A = std::allocator<char>>
-class CompositeInterpolationXTable {
- public:
- static int get_y_stride(int logK);
-
- static const double* get_x_arr(int logK);
- static int get_x_arr_length();
-};
-
+namespace theta_constants { // NOTE(review): uint64_t/uint8_t are used below but only <climits> is included here - confirm <cstdint> arrives via the including file
+ enum resize_factor { X1, X2, X4, X8 }; // presumably growth multipliers 1x/2x/4x/8x - confirm against the code that consumes them
+ static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
+ static const uint8_t MIN_LG_K = 5; // presumably the smallest accepted log2(k) - confirm against the validation code
+ static const uint8_t MAX_LG_K = 26; // presumably the largest accepted log2(k) - confirm against the validation code
}
-#include "CompositeInterpolationXTable-internal.hpp"
+} /* namespace datasketches */
-#endif /* _COMPOSITEINTERPOLATIONXTABLE_HPP_ */
\ No newline at end of file
+#endif
diff --git a/be/src/thirdparty/datasketches/theta_helpers.hpp b/be/src/thirdparty/datasketches/theta_helpers.hpp
new file mode 100644
index 0000000..6852590
--- /dev/null
+++ b/be/src/thirdparty/datasketches/theta_helpers.hpp
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef THETA_HELPERS_HPP_
+#define THETA_HELPERS_HPP_
+
+#include <string>
+#include <stdexcept>
+
+namespace datasketches {
+
+template<typename T>
+static void check_value(T actual, T expected, const char* description) { // throws std::invalid_argument when actual != expected; otherwise does nothing
+ if (actual != expected) {
+ throw std::invalid_argument(std::string(description) + " mismatch: expected " + std::to_string(expected) + ", actual " + std::to_string(actual)); // message form: "<description> mismatch: expected <e>, actual <a>"; T must be accepted by std::to_string
+ }
+}
+
+template<bool dummy>
+class checker { // static-only validation helpers; the template parameter is not used by any member shown here
+public:
+ static void check_serial_version(uint8_t actual, uint8_t expected) { // NOTE(review): uint8_t/uint16_t are used but this header includes only <string> and <stdexcept> - confirm <cstdint> arrives via the including file
+ check_value(actual, expected, "serial version"); // throws std::invalid_argument on mismatch
+ }
+ static void check_sketch_family(uint8_t actual, uint8_t expected) {
+ check_value(actual, expected, "sketch family"); // throws std::invalid_argument on mismatch
+ }
+ static void check_sketch_type(uint8_t actual, uint8_t expected) {
+ check_value(actual, expected, "sketch type"); // throws std::invalid_argument on mismatch
+ }
+ static void check_seed_hash(uint16_t actual, uint16_t expected) {
+ check_value(actual, expected, "seed hash"); // throws std::invalid_argument on mismatch
+ }
+};
+
+} /* namespace datasketches */
+
+#endif
diff --git a/be/src/thirdparty/datasketches/theta_intersection.hpp b/be/src/thirdparty/datasketches/theta_intersection.hpp
index 5945c52..98a8bf1 100644
--- a/be/src/thirdparty/datasketches/theta_intersection.hpp
+++ b/be/src/thirdparty/datasketches/theta_intersection.hpp
@@ -20,29 +20,28 @@
#ifndef THETA_INTERSECTION_HPP_
#define THETA_INTERSECTION_HPP_
-#include <memory>
-#include <functional>
-#include <climits>
-
#include "theta_sketch.hpp"
-#include "common_defs.hpp"
+#include "theta_intersection_base.hpp"
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
-template<typename A>
+template<typename Allocator = std::allocator<uint64_t>>
class theta_intersection_alloc {
public:
- /**
- * Creates an instance of the intersection with a given hash seed.
- * @param seed hash seed
- */
- explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED);
+ using Entry = uint64_t;
+ using ExtractKey = trivial_extract_key;
+ using Sketch = theta_sketch_alloc<Allocator>;
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
+
+ struct pass_through_policy { // policy type supplied to theta_intersection_base through the State alias
+ uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const { // returns internal_entry unchanged
+ unused(incoming_entry); // the incoming entry is deliberately ignored
+ return internal_entry;
+ }
+ };
+ using State = theta_intersection_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
+
+ explicit theta_intersection_alloc(uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
/**
* Updates the intersection with a given sketch.
@@ -50,7 +49,8 @@ public:
* can reduce the current set to leave the overlapping subset only.
* @param sketch represents input set for the intersection
*/
- void update(const theta_sketch_alloc<A>& sketch);
+ template<typename FwdSketch>
+ void update(FwdSketch&& sketch);
/**
* Produces a copy of the current state of the intersection.
@@ -59,7 +59,7 @@ public:
* @param ordered optional flag to specify if ordered sketch should be produced
* @return the result of the intersection
*/
- compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
+ CompactSketch get_result(bool ordered = true) const;
/**
* Returns true if the state of the intersection is defined (not infinite "universe").
@@ -68,21 +68,14 @@ public:
bool has_result() const;
private:
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
- bool is_valid_;
- bool is_empty_;
- uint64_t theta_;
- uint8_t lg_size_;
- vector_u64<A> keys_;
- uint32_t num_keys_;
- uint16_t seed_hash_;
+ State state_;
};
// alias with default allocator for convenience
-typedef theta_intersection_alloc<std::allocator<void>> theta_intersection;
+using theta_intersection = theta_intersection_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
#include "theta_intersection_impl.hpp"
-# endif
+#endif
diff --git a/be/src/thirdparty/datasketches/theta_intersection_base.hpp b/be/src/thirdparty/datasketches/theta_intersection_base.hpp
new file mode 100644
index 0000000..c034590
--- /dev/null
+++ b/be/src/thirdparty/datasketches/theta_intersection_base.hpp
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef THETA_INTERSECTION_BASE_HPP_
+#define THETA_INTERSECTION_BASE_HPP_
+
+namespace datasketches {
+
+// State and logic of a set intersection over theta-style sketches.
+// Entry: element type kept in the hash table.
+// ExtractKey: functor returning the hash key of an entry.
+// Policy: callable invoked as policy(internal_entry, incoming_entry) for every matching key.
+// CompactSketch: type returned by get_result().
+// Allocator: allocator handed to the hash table and to temporary vectors.
+// NOTE(review): Sketch is not referenced by the declarations below.
+template<
+ typename Entry,
+ typename ExtractKey,
+ typename Policy,
+ typename Sketch,
+ typename CompactSketch,
+ typename Allocator
+>
+class theta_intersection_base {
+public:
+ using hash_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
+ using resize_factor = typename hash_table::resize_factor;
+ using comparator = compare_by_key<ExtractKey>;
+ // Creates an intersection that has no result until update() is called.
+ theta_intersection_base(uint64_t seed, const Policy& policy, const Allocator& allocator);
+
+ // Intersects the current state with the given sketch.
+ template<typename FwdSketch>
+ void update(FwdSketch&& sketch);
+
+ // Returns the current state as a compact sketch, sorted by key when ordered is true.
+ CompactSketch get_result(bool ordered = true) const;
+
+ // Returns true once update() has been called at least once.
+ bool has_result() const;
+
+ // Returns the policy object this intersection was constructed with.
+ const Policy& get_policy() const;
+
+private:
+ Policy policy_; // applied to matching entries during update()
+ bool is_valid_; // false until the first update()
+ hash_table table_; // retained entries together with theta, seed and the is_empty flag
+};
+
+} /* namespace datasketches */
+
+#include "theta_intersection_base_impl.hpp"
+
+#endif
diff --git a/be/src/thirdparty/datasketches/theta_intersection_base_impl.hpp b/be/src/thirdparty/datasketches/theta_intersection_base_impl.hpp
new file mode 100644
index 0000000..286f0ca
--- /dev/null
+++ b/be/src/thirdparty/datasketches/theta_intersection_base_impl.hpp
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+
+#include "conditional_forward.hpp"
+
+namespace datasketches {
+
+// Starts in the "no result yet" state: is_valid_ is false and the table is created with
+// lg sizes of 0, theta at theta_constants::MAX_THETA, the given seed and allocator, and a
+// trailing flag of false (the argument position that update() later fills with table_.is_empty_).
+template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
+theta_intersection_base<EN, EK, P, S, CS, A>::theta_intersection_base(uint64_t seed, const P& policy, const A& allocator):
+policy_(policy),
+is_valid_(false),
+table_(0, 0, resize_factor::X1, theta_constants::MAX_THETA, seed, allocator, false)
+{}
+
+// Intersects the current state with the given sketch.
+// The first call takes over the entries of the sketch; later calls keep only the entries
+// whose keys are also present in the incoming sketch and are below the current theta.
+// Throws std::invalid_argument on a seed hash mismatch or when the entry counts of the
+// incoming sketch are inconsistent (reported as a possibly corrupted input sketch).
+template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
+template<typename SS>
+void theta_intersection_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
+ // once the intersection is empty no further input can change it
+ if (table_.is_empty_) return;
+ // the seed hash is only compared for non-empty input
+ if (!sketch.is_empty() && sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
+ table_.is_empty_ |= sketch.is_empty();
+ table_.theta_ = std::min(table_.theta_, sketch.get_theta64());
+ // already has a result with no entries: only theta and is_empty needed updating
+ if (is_valid_ && table_.num_entries_ == 0) return;
+ if (sketch.get_num_retained() == 0) {
+ // nothing can match: replace the table with a minimal one that keeps theta, seed, allocator and is_empty
+ is_valid_ = true;
+ table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
+ return;
+ }
+ if (!is_valid_) { // first update, copy or move incoming sketch
+ is_valid_ = true;
+ const uint8_t lg_size = lg_size_from_count(sketch.get_num_retained(), theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
+ table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
+ for (auto& entry: sketch) {
+ auto result = table_.find(EK()(entry));
+ // result.second set means the key is already in the table
+ if (result.second) {
+ throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
+ }
+ table_.insert(result.first, conditional_forward<SS>(entry));
+ }
+ if (table_.num_entries_ != sketch.get_num_retained()) throw std::invalid_argument("num entries mismatch, possibly corrupted input sketch");
+ } else { // intersection
+ // there cannot be more matches than entries on either side
+ const uint32_t max_matches = std::min(table_.num_entries_, sketch.get_num_retained());
+ std::vector<EN, A> matched_entries(table_.allocator_);
+ matched_entries.reserve(max_matches);
+ uint32_t match_count = 0;
+ uint32_t count = 0; // entries of the incoming sketch visited so far
+ for (auto& entry: sketch) {
+ if (EK()(entry) < table_.theta_) {
+ auto result = table_.find(EK()(entry));
+ if (result.second) {
+ if (match_count == max_matches) throw std::invalid_argument("max matches exceeded, possibly corrupted input sketch");
+ // the policy is called on the stored entry and the incoming one; its return value is not used here
+ policy_(*result.first, conditional_forward<SS>(entry));
+ matched_entries.push_back(std::move(*result.first));
+ ++match_count;
+ }
+ } else if (sketch.is_ordered()) {
+ break; // early stop
+ }
+ ++count;
+ }
+ // an ordered sketch may legitimately be cut short by the early stop, so only the upper limit applies to it
+ if (count > sketch.get_num_retained()) {
+ throw std::invalid_argument(" more keys than expected, possibly corrupted input sketch");
+ } else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
+ throw std::invalid_argument(" fewer keys than expected, possibly corrupted input sketch");
+ }
+ if (match_count == 0) {
+ table_ = hash_table(0, 0, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
+ // no entries and theta still at its maximum: the state is marked empty
+ if (table_.theta_ == theta_constants::MAX_THETA) table_.is_empty_ = true;
+ } else {
+ // rebuild the table sized for the matches and move the matched entries into it
+ const uint8_t lg_size = lg_size_from_count(match_count, theta_update_sketch_base<EN, EK, A>::REBUILD_THRESHOLD);
+ table_ = hash_table(lg_size, lg_size, resize_factor::X1, table_.theta_, table_.seed_, table_.allocator_, table_.is_empty_);
+ for (uint32_t i = 0; i < match_count; i++) {
+ auto result = table_.find(EK()(matched_entries[i]));
+ table_.insert(result.first, std::move(matched_entries[i]));
+ }
+ }
+ }
+}
+
+// Builds a compact sketch from the current state of the intersection.
+// The entries are sorted with comparator() when ordered is true.
+// Throws std::invalid_argument if update() has not been called yet.
+template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
+CS theta_intersection_base<EN, EK, P, S, CS, A>::get_result(bool ordered) const {
+  if (!is_valid_) {
+    throw std::invalid_argument("calling get_result() before calling update() is undefined");
+  }
+  std::vector<EN, A> retained(table_.allocator_);
+  const bool has_entries = table_.num_entries_ > 0;
+  if (has_entries) {
+    retained.reserve(table_.num_entries_);
+    // key_not_zero drops the table slots whose key is zero (presumably the unoccupied ones)
+    std::copy_if(table_.begin(), table_.end(), std::back_inserter(retained), key_not_zero<EN, EK>());
+  }
+  if (has_entries && ordered) {
+    std::sort(retained.begin(), retained.end(), comparator());
+  }
+  const auto seed_hash = compute_seed_hash(table_.seed_);
+  return CS(table_.is_empty_, ordered, seed_hash, table_.theta_, std::move(retained));
+}
+
+// Returns true once update() has been called at least once (is_valid_ is set there).
+template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
+bool theta_intersection_base<EN, EK, P, S, CS, A>::has_result() const {
+ return is_valid_;
+}
+
+// Returns a reference to the policy copied in by the constructor.
+template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
+const P& theta_intersection_base<EN, EK, P, S, CS, A>::get_policy() const {
+ return policy_;
+}
+
+} /* namespace datasketches */
diff --git a/be/src/thirdparty/datasketches/theta_intersection_impl.hpp b/be/src/thirdparty/datasketches/theta_intersection_impl.hpp
index d090b3a..a0c4291 100644
--- a/be/src/thirdparty/datasketches/theta_intersection_impl.hpp
+++ b/be/src/thirdparty/datasketches/theta_intersection_impl.hpp
@@ -20,109 +20,27 @@
#ifndef THETA_INTERSECTION_IMPL_HPP_
#define THETA_INTERSECTION_IMPL_HPP_
-#include <algorithm>
-
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
template<typename A>
-theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed):
-is_valid_(false),
-is_empty_(false),
-theta_(theta_sketch_alloc<A>::MAX_THETA),
-lg_size_(0),
-keys_(),
-num_keys_(0),
-seed_hash_(theta_sketch_alloc<A>::get_seed_hash(seed))
+theta_intersection_alloc<A>::theta_intersection_alloc(uint64_t seed, const A& allocator):
+state_(seed, pass_through_policy(), allocator)
{}
template<typename A>
-void theta_intersection_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
- if (is_empty_) return;
- if (!sketch.is_empty() && sketch.get_seed_hash() != seed_hash_) throw std::invalid_argument("seed hash mismatch");
- is_empty_ |= sketch.is_empty();
- theta_ = std::min(theta_, sketch.get_theta64());
- if (is_valid_ && num_keys_ == 0) return;
- if (sketch.get_num_retained() == 0) {
- is_valid_ = true;
- if (keys_.size() > 0) {
- keys_.resize(0);
- lg_size_ = 0;
- num_keys_ = 0;
- }
- return;
- }
- if (!is_valid_) { // first update, clone incoming sketch
- is_valid_ = true;
- lg_size_ = lg_size_from_count(sketch.get_num_retained(), update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
- keys_.resize(1 << lg_size_, 0);
- for (auto key: sketch) {
- if (!update_theta_sketch_alloc<A>::hash_search_or_insert(key, keys_.data(), lg_size_)) {
- throw std::invalid_argument("duplicate key, possibly corrupted input sketch");
- }
- ++num_keys_;
- }
- if (num_keys_ != sketch.get_num_retained()) throw std::invalid_argument("num keys mismatch, possibly corrupted input sketch");
- } else { // intersection
- const uint32_t max_matches = std::min(num_keys_, sketch.get_num_retained());
- vector_u64<A> matched_keys(max_matches);
- uint32_t match_count = 0;
- uint32_t count = 0;
- for (auto key: sketch) {
- if (key < theta_) {
- if (update_theta_sketch_alloc<A>::hash_search(key, keys_.data(), lg_size_)) {
- if (match_count == max_matches) throw std::invalid_argument("max matches exceeded, possibly corrupted input sketch");
- matched_keys[match_count++] = key;
- }
- } else if (sketch.is_ordered()) {
- break; // early stop
- }
- ++count;
- }
- if (count > sketch.get_num_retained()) {
- throw std::invalid_argument(" more keys then expected, possibly corrupted input sketch");
- } else if (!sketch.is_ordered() && count < sketch.get_num_retained()) {
- throw std::invalid_argument(" fewer keys then expected, possibly corrupted input sketch");
- }
- if (match_count == 0) {
- keys_.resize(0);
- lg_size_ = 0;
- num_keys_ = 0;
- if (theta_ == theta_sketch_alloc<A>::MAX_THETA) is_empty_ = true;
- } else {
- const uint8_t lg_size = lg_size_from_count(match_count, update_theta_sketch_alloc<A>::REBUILD_THRESHOLD);
- if (lg_size != lg_size_) {
- lg_size_ = lg_size;
- keys_.resize(1 << lg_size_);
- }
- std::fill(keys_.begin(), keys_.end(), 0);
- for (uint32_t i = 0; i < match_count; i++) {
- update_theta_sketch_alloc<A>::hash_search_or_insert(matched_keys[i], keys_.data(), lg_size_);
- }
- num_keys_ = match_count;
- }
- }
+template<typename SS>
+void theta_intersection_alloc<A>::update(SS&& sketch) {
+ state_.update(std::forward<SS>(sketch));
}
template<typename A>
-compact_theta_sketch_alloc<A> theta_intersection_alloc<A>::get_result(bool ordered) const {
- if (!is_valid_) throw std::invalid_argument("calling get_result() before calling update() is undefined");
- vector_u64<A> keys(num_keys_);
- if (num_keys_ > 0) {
- std::copy_if(keys_.begin(), keys_.end(), keys.begin(), [](uint64_t key) { return key != 0; });
- if (ordered) std::sort(keys.begin(), keys.end());
- }
- return compact_theta_sketch_alloc<A>(is_empty_, theta_, std::move(keys), seed_hash_, ordered);
+auto theta_intersection_alloc<A>::get_result(bool ordered) const -> CompactSketch {
+ return state_.get_result(ordered);
}
template<typename A>
bool theta_intersection_alloc<A>::has_result() const {
- return is_valid_;
+ return state_.has_result();
}
} /* namespace datasketches */
diff --git a/be/src/thirdparty/datasketches/CompositeInterpolationXTable.hpp b/be/src/thirdparty/datasketches/theta_jaccard_similarity.hpp
similarity index 59%
copy from be/src/thirdparty/datasketches/CompositeInterpolationXTable.hpp
copy to be/src/thirdparty/datasketches/theta_jaccard_similarity.hpp
index 8baecbe..417ed54 100644
--- a/be/src/thirdparty/datasketches/CompositeInterpolationXTable.hpp
+++ b/be/src/thirdparty/datasketches/theta_jaccard_similarity.hpp
@@ -17,24 +17,21 @@
* under the License.
*/
-#ifndef _COMPOSITEINTERPOLATIONXTABLE_HPP_
-#define _COMPOSITEINTERPOLATIONXTABLE_HPP_
+#ifndef THETA_JACCARD_SIMILARITY_HPP_
+#define THETA_JACCARD_SIMILARITY_HPP_
-#include <memory>
+#include "theta_jaccard_similarity_base.hpp"
+#include "theta_union.hpp"
+#include "theta_intersection.hpp"
namespace datasketches {
-template<typename A = std::allocator<char>>
-class CompositeInterpolationXTable {
- public:
- static int get_y_stride(int logK);
+template<typename Allocator = std::allocator<uint64_t>>
+using theta_jaccard_similarity_alloc = jaccard_similarity_base<theta_union_alloc<Allocator>, theta_intersection_alloc<Allocator>, trivial_extract_key>;
- static const double* get_x_arr(int logK);
- static int get_x_arr_length();
-};
+// alias with default allocator for convenience
+using theta_jaccard_similarity = theta_jaccard_similarity_alloc<std::allocator<uint64_t>>;
-}
+} /* namespace datasketches */
-#include "CompositeInterpolationXTable-internal.hpp"
-
-#endif /* _COMPOSITEINTERPOLATIONXTABLE_HPP_ */
\ No newline at end of file
+# endif
diff --git a/be/src/thirdparty/datasketches/theta_jaccard_similarity_base.hpp b/be/src/thirdparty/datasketches/theta_jaccard_similarity_base.hpp
new file mode 100644
index 0000000..cb18601
--- /dev/null
+++ b/be/src/thirdparty/datasketches/theta_jaccard_similarity_base.hpp
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef THETA_JACCARD_SIMILARITY_BASE_HPP_
+#define THETA_JACCARD_SIMILARITY_BASE_HPP_
+
+#include <memory>
+#include <array>
+
+#include "theta_constants.hpp"
+#include "bounds_on_ratios_in_theta_sketched_sets.hpp"
+#include "ceiling_power_of_2.hpp"
+#include "common_defs.hpp"
+
+namespace datasketches {
+
+template<typename Union, typename Intersection, typename ExtractKey>
+class jaccard_similarity_base {
+public:
+
+  /**
+   * Computes the Jaccard similarity index with upper and lower bounds. The Jaccard similarity index
+   * <i>J(A,B) = (A ^ B)/(A U B)</i> is used to measure how similar the two sketches are to each
+   * other. If J = 1.0, the sketches are considered equal. If J = 0, the two sketches are
+   * disjoint. A Jaccard of .95 means the overlap between the two
+   * sets is 95% of the union of the two sets.
+   *
+   * <p>Note: For very large pairs of sketches, where the configured nominal entries of the sketches
+   * are 2^25 or 2^26, this method may produce unpredictable results.
+   *
+   * @param sketch_a given sketch A
+   * @param sketch_b given sketch B
+   * @param seed hash seed that was used to build both sketches; the internal union and
+   * intersection are created with it so that their seed hash check passes
+   * @return a double array {LowerBound, Estimate, UpperBound} of the Jaccard index.
+   * The Upper and Lower bounds are for a confidence interval of 95.4% or +/- 2 standard deviations.
+   */
+  template<typename SketchA, typename SketchB>
+  static std::array<double, 3> jaccard(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
+    // the very same object is trivially identical to itself
+    if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return {1, 1, 1};
+    if (sketch_a.is_empty() && sketch_b.is_empty()) return {1, 1, 1};
+    if (sketch_a.is_empty() || sketch_b.is_empty()) return {0, 0, 0};
+
+    auto union_ab = compute_union(sketch_a, sketch_b, seed);
+    if (identical_sets(sketch_a, sketch_b, union_ab)) return {1, 1, 1};
+
+    // intersection
+    Intersection i(seed);
+    i.update(sketch_a);
+    i.update(sketch_b);
+    i.update(union_ab); // ensures that intersection is a subset of the union
+    auto inter_abu = i.get_result(false);
+
+    return {
+      bounds_on_ratios_in_theta_sketched_sets<ExtractKey>::lower_bound_for_b_over_a(union_ab, inter_abu),
+      bounds_on_ratios_in_theta_sketched_sets<ExtractKey>::estimate_of_b_over_a(union_ab, inter_abu),
+      bounds_on_ratios_in_theta_sketched_sets<ExtractKey>::upper_bound_for_b_over_a(union_ab, inter_abu)
+    };
+  }
+
+  /**
+   * Returns true if the two given sketches are equivalent.
+   * @param sketch_a the given sketch A
+   * @param sketch_b the given sketch B
+   * @param seed hash seed that was used to build both sketches
+   * @return true if the two given sketches are exactly equal
+   */
+  template<typename SketchA, typename SketchB>
+  static bool exactly_equal(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed = DEFAULT_SEED) {
+    if (reinterpret_cast<const void*>(&sketch_a) == reinterpret_cast<const void*>(&sketch_b)) return true;
+    if (sketch_a.is_empty() && sketch_b.is_empty()) return true;
+    if (sketch_a.is_empty() || sketch_b.is_empty()) return false;
+
+    auto union_ab = compute_union(sketch_a, sketch_b, seed);
+    return identical_sets(sketch_a, sketch_b, union_ab);
+  }
+
+  /**
+   * Tests similarity of an actual Sketch against an expected Sketch.
+   * Computes the lower bound of the Jaccard index <i>J<sub>LB</sub></i> of the actual and
+   * expected sketches.
+   * If <i>J<sub>LB</sub> ≥ threshold</i>, then the sketches are considered to be
+   * similar with a confidence of 97.7%.
+   *
+   * @param actual the sketch to be tested
+   * @param expected the reference sketch that is considered to be correct
+   * @param threshold a real value between zero and one
+   * @param seed hash seed that was used to build both sketches
+   * @return true if the lower bound of the Jaccard index is greater than or equal to the
+   * given threshold, i.e. the sketches are similar with at least 97.7% confidence
+   */
+  template<typename SketchA, typename SketchB>
+  static bool similarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
+    auto jc = jaccard(actual, expected, seed);
+    return jc[0] >= threshold;
+  }
+
+  /**
+   * Tests dissimilarity of an actual Sketch against an expected Sketch.
+   * Computes the upper bound of the Jaccard index <i>J<sub>UB</sub></i> of the actual and
+   * expected sketches.
+   * If <i>J<sub>UB</sub> ≤ threshold</i>, then the sketches are considered to be
+   * dissimilar with a confidence of 97.7%.
+   *
+   * @param actual the sketch to be tested
+   * @param expected the reference sketch that is considered to be correct
+   * @param threshold a real value between zero and one
+   * @param seed hash seed that was used to build both sketches
+   * @return true if the upper bound of the Jaccard index is less than or equal to the
+   * given threshold, i.e. the sketches are dissimilar with at least 97.7% confidence
+   */
+  template<typename SketchA, typename SketchB>
+  static bool dissimilarity_test(const SketchA& actual, const SketchB& expected, double threshold, uint64_t seed = DEFAULT_SEED) {
+    auto jc = jaccard(actual, expected, seed);
+    return jc[2] <= threshold;
+  }
+
+private:
+
+  // Unions both sketches into an unordered compact sketch. lg_k is derived from the combined
+  // number of retained entries and clamped to [MIN_LG_K, MAX_LG_K]; the union is built with
+  // the given seed so that sketches hashed with a non-default seed are accepted.
+  template<typename SketchA, typename SketchB>
+  static typename Union::CompactSketch compute_union(const SketchA& sketch_a, const SketchB& sketch_b, uint64_t seed) {
+    const unsigned count_a = sketch_a.get_num_retained();
+    const unsigned count_b = sketch_b.get_num_retained();
+    const unsigned lg_k = std::min(std::max(log2(ceiling_power_of_2(count_a + count_b)), theta_constants::MIN_LG_K), theta_constants::MAX_LG_K);
+    auto u = typename Union::builder().set_lg_k(lg_k).set_seed(seed).build();
+    u.update(sketch_a);
+    u.update(sketch_b);
+    return u.get_result(false);
+  }
+
+  // The sets are taken as identical when the union has the same number of retained entries
+  // and the same theta as each of the two inputs.
+  template<typename SketchA, typename SketchB, typename UnionAB>
+  static bool identical_sets(const SketchA& sketch_a, const SketchB& sketch_b, const UnionAB& union_ab) {
+    return union_ab.get_num_retained() == sketch_a.get_num_retained() &&
+        union_ab.get_num_retained() == sketch_b.get_num_retained() &&
+        union_ab.get_theta64() == sketch_a.get_theta64() &&
+        union_ab.get_theta64() == sketch_b.get_theta64();
+  }
+
+};
+
+} /* namespace datasketches */
+
+# endif
diff --git a/be/src/thirdparty/datasketches/theta_set_difference_base.hpp b/be/src/thirdparty/datasketches/theta_set_difference_base.hpp
new file mode 100644
index 0000000..5cc601f
--- /dev/null
+++ b/be/src/thirdparty/datasketches/theta_set_difference_base.hpp
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef THETA_SET_DIFFERENCE_BASE_HPP_
+#define THETA_SET_DIFFERENCE_BASE_HPP_
+
+#include "theta_comparators.hpp"
+#include "theta_update_sketch_base.hpp"
+
+namespace datasketches {
+
+// Computes the set difference of two theta-style sketches: the entries of A whose keys do not occur in B.
+// Entry: element type of sketch A and of the result.
+// ExtractKey: functor returning the hash key of an entry.
+// CompactSketch: type of the result.
+// Allocator: allocator for the result entries, rebound to uint64_t for the lookup table.
+template<
+ typename Entry,
+ typename ExtractKey,
+ typename CompactSketch,
+ typename Allocator
+>
+class theta_set_difference_base {
+public:
+ using comparator = compare_by_key<ExtractKey>;
+ using AllocU64 = typename std::allocator_traits<Allocator>::template rebind_alloc<uint64_t>;
+ // lookup table of plain 64-bit keys, independent of Entry
+ using hash_table = theta_update_sketch_base<uint64_t, trivial_extract_key, AllocU64>;
+
+ // seed is the hash seed the input sketches are expected to have been built with
+ theta_set_difference_base(uint64_t seed, const Allocator& allocator = Allocator());
+
+ // Returns a compact sketch with the entries of a that are not in b, sorted when ordered is true.
+ // Throws std::invalid_argument when the seed hash of an input does not match.
+ template<typename FwdSketch, typename Sketch>
+ CompactSketch compute(FwdSketch&& a, const Sketch& b, bool ordered) const;
+
+private:
+ Allocator allocator_;
+ uint16_t seed_hash_; // compute_seed_hash(seed), compared against the inputs of compute()
+};
+
+} /* namespace datasketches */
+
+#include "theta_set_difference_base_impl.hpp"
+
+#endif
diff --git a/be/src/thirdparty/datasketches/theta_set_difference_base_impl.hpp b/be/src/thirdparty/datasketches/theta_set_difference_base_impl.hpp
new file mode 100644
index 0000000..4ab98a8
--- /dev/null
+++ b/be/src/thirdparty/datasketches/theta_set_difference_base_impl.hpp
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
+#define THETA_A_SET_DIFFERENCE_BASE_IMPL_HPP_
+
+#include <algorithm>
+
+#include "conditional_back_inserter.hpp"
+#include "conditional_forward.hpp"
+
+namespace datasketches {
+
+// Stores the allocator and the hash of the seed; the seed itself is not kept.
+template<typename EN, typename EK, typename CS, typename A>
+theta_set_difference_base<EN, EK, CS, A>::theta_set_difference_base(uint64_t seed, const A& allocator):
+allocator_(allocator),
+seed_hash_(compute_seed_hash(seed))
+{}
+
+// Computes the set difference "A and not B": the entries of a whose keys are below
+// min(theta of a, theta of b) and do not occur in b.
+// a is a forwarding reference, so entries are handed to conditional_forward and may be moved
+// from when a is passed as an rvalue; b is only read.
+// The result is ordered when a is ordered or when ordered is true.
+// Throws std::invalid_argument if the seed hash of a or b differs from the one of this object.
+template<typename EN, typename EK, typename CS, typename A>
+template<typename FwdSketch, typename Sketch>
+CS theta_set_difference_base<EN, EK, CS, A>::compute(FwdSketch&& a, const Sketch& b, bool ordered) const {
+  // an empty A has nothing to remove from and an empty B removes nothing: the result is A itself
+  if (a.is_empty() || b.is_empty()) return CS(a, ordered);
+  if (a.get_seed_hash() != seed_hash_) throw std::invalid_argument("A seed hash mismatch");
+  if (b.get_seed_hash() != seed_hash_) throw std::invalid_argument("B seed hash mismatch");
+
+  const uint64_t theta = std::min(a.get_theta64(), b.get_theta64());
+  std::vector<EN, A> entries(allocator_);
+
+  // A non-empty A without retained entries is not returned as is: there is nothing to scan,
+  // but the result still has to carry min(theta_a, theta_b) rather than the theta of A alone.
+  if (a.get_num_retained() > 0) {
+    if (b.get_num_retained() == 0) {
+      // nothing to subtract, only cut A at the new theta
+      std::copy_if(forward_begin(std::forward<FwdSketch>(a)), forward_end(std::forward<FwdSketch>(a)), std::back_inserter(entries),
+          key_less_than<uint64_t, EN, EK>(theta));
+    } else if (a.is_ordered() && b.is_ordered()) { // sort-based
+      std::set_difference(forward_begin(std::forward<FwdSketch>(a)), forward_end(std::forward<FwdSketch>(a)), b.begin(), b.end(),
+          conditional_back_inserter(entries, key_less_than<uint64_t, EN, EK>(theta)), comparator());
+    } else { // hash-based
+      const uint8_t lg_size = lg_size_from_count(b.get_num_retained(), hash_table::REBUILD_THRESHOLD);
+      hash_table table(lg_size, lg_size, hash_table::resize_factor::X1, 0, 0, allocator_); // theta and seed are not used here
+      // load the keys of B below theta into the lookup table
+      for (const auto& entry: b) {
+        const uint64_t hash = EK()(entry);
+        if (hash < theta) {
+          table.insert(table.find(hash).first, hash);
+        } else if (b.is_ordered()) {
+          break; // early stop
+        }
+      }
+
+      // scan A lookup B
+      for (auto& entry: a) {
+        const uint64_t hash = EK()(entry);
+        if (hash < theta) {
+          auto result = table.find(hash);
+          if (!result.second) entries.push_back(conditional_forward<FwdSketch>(entry));
+        } else if (a.is_ordered()) {
+          break; // early stop
+        }
+      }
+    }
+  }
+  // A is known to be non-empty here, so the result is empty only if nothing is left and theta was never lowered
+  const bool is_empty = entries.empty() && theta == theta_constants::MAX_THETA;
+  if (ordered && !a.is_ordered()) std::sort(entries.begin(), entries.end(), comparator());
+  return CS(is_empty, a.is_ordered() || ordered, seed_hash_, theta, std::move(entries));
+}
+
+} /* namespace datasketches */
+
+#endif
diff --git a/be/src/thirdparty/datasketches/theta_sketch.hpp b/be/src/thirdparty/datasketches/theta_sketch.hpp
index b809f71..2e24168 100644
--- a/be/src/thirdparty/datasketches/theta_sketch.hpp
+++ b/be/src/thirdparty/datasketches/theta_sketch.hpp
@@ -20,45 +20,29 @@
#ifndef THETA_SKETCH_HPP_
#define THETA_SKETCH_HPP_
-#include <memory>
-#include <functional>
-#include <climits>
-#include <vector>
-
-#include "common_defs.hpp"
+#include "theta_update_sketch_base.hpp"
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
-// forward-declarations
-template<typename A> class theta_sketch_alloc;
-template<typename A> class update_theta_sketch_alloc;
-template<typename A> class compact_theta_sketch_alloc;
-template<typename A> class theta_union_alloc;
-template<typename A> class theta_intersection_alloc;
-template<typename A> class theta_a_not_b_alloc;
-
-// for serialization as raw bytes
-template<typename A> using AllocU8 = typename std::allocator_traits<A>::template rebind_alloc<uint8_t>;
-template<typename A> using vector_u8 = std::vector<uint8_t, AllocU8<A>>;
-
-template<typename A>
+template<typename Allocator = std::allocator<uint64_t>>
class theta_sketch_alloc {
public:
- static const uint64_t MAX_THETA = LLONG_MAX; // signed max for compatibility with Java
- static const uint8_t SERIAL_VERSION = 3;
+ using Entry = uint64_t;
+ using ExtractKey = trivial_extract_key;
+ using iterator = theta_iterator<Entry, ExtractKey>;
+ using const_iterator = theta_const_iterator<Entry, ExtractKey>;
virtual ~theta_sketch_alloc() = default;
/**
+ * @return allocator
+ */
+ virtual Allocator get_allocator() const = 0;
+
+ /**
* @return true if this sketch represents an empty set (not the same as no retained entries!)
*/
- bool is_empty() const;
+ virtual bool is_empty() const = 0;
/**
* @return estimate of the distinct count of the input stream
@@ -96,13 +80,16 @@ public:
/**
* @return theta as a positive integer between 0 and LLONG_MAX
*/
- uint64_t get_theta64() const;
+ virtual uint64_t get_theta64() const = 0;
/**
* @return the number of retained entries in the sketch
*/
virtual uint32_t get_num_retained() const = 0;
+ /**
+ * @return hash of the seed that was used to hash the input
+ */
virtual uint16_t get_seed_hash() const = 0;
/**
@@ -111,109 +98,82 @@ public:
virtual bool is_ordered() const = 0;
/**
- * Writes a human-readable summary of this sketch to a given stream
+ * Provides a human-readable summary of this sketch as a string
* @param print_items if true include the list of items retained by the sketch
+ * @return sketch summary as a string
*/
- virtual string<A> to_string(bool print_items = false) const = 0;
-
- /**
- * This method serializes the sketch into a given stream in a binary form
- * @param os output stream
- */
- virtual void serialize(std::ostream& os) const = 0;
-
- // This is a convenience alias for users
- // The type returned by the following serialize method
- typedef vector_u8<A> vector_bytes;
+ virtual string<Allocator> to_string(bool print_items = false) const;
/**
- * This method serializes the sketch as a vector of bytes.
- * An optional header can be reserved in front of the sketch.
- * It is an uninitialized space of a given size.
- * This header is used in Datasketches PostgreSQL extension.
- * @param header_size_bytes space to reserve in front of the sketch
- */
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const = 0;
-
- // This is a convenience alias for users
- // The type returned by the following deserialize methods
- // It is not possible to return instances of an abstract type, so this has to be a pointer
- typedef std::unique_ptr<theta_sketch_alloc<A>, std::function<void(theta_sketch_alloc<A>*)>> unique_ptr;
-
- /**
- * This method deserializes a sketch from a given stream.
- * @param is input stream
- * @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of a sketch as a unique_ptr
+ * Iterator over hash values in this sketch.
+ * @return begin iterator
*/
- static unique_ptr deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
+ virtual iterator begin() = 0;
/**
- * This method deserializes a sketch from a given array of bytes.
- * @param bytes pointer to the array of bytes
- * @param size the size of the array
- * @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of the sketch
+ * Iterator pointing past the valid range.
+ * Not to be incremented or dereferenced.
+ * @return end iterator
*/
- static unique_ptr deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
-
- class const_iterator;
+ virtual iterator end() = 0;
/**
- * Iterator over hash values in this sketch.
+ * Const iterator over hash values in this sketch.
* @return begin iterator
*/
virtual const_iterator begin() const = 0;
/**
- * Iterator pointing past the valid range.
+ * Const iterator pointing past the valid range.
* Not to be incremented or dereferenced.
* @return end iterator
*/
virtual const_iterator end() const = 0;
protected:
- enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
-
- bool is_empty_;
- uint64_t theta_;
-
- theta_sketch_alloc(bool is_empty, uint64_t theta);
-
- static uint16_t get_seed_hash(uint64_t seed);
-
- static void check_sketch_type(uint8_t actual, uint8_t expected);
- static void check_serial_version(uint8_t actual, uint8_t expected);
- static void check_seed_hash(uint16_t actual, uint16_t expected);
-
- friend theta_intersection_alloc<A>;
- friend theta_a_not_b_alloc<A>;
+ using ostrstream = std::basic_ostringstream<char, std::char_traits<char>, AllocChar<Allocator>>;
+ virtual void print_specifics(ostrstream& os) const = 0;
};
-// update sketch
-
-template<typename A> using AllocU64 = typename std::allocator_traits<A>::template rebind_alloc<uint64_t>;
-template<typename A> using vector_u64 = std::vector<uint64_t, AllocU64<A>>;
+// forward declaration
+template<typename A> class compact_theta_sketch_alloc;
-template<typename A>
-class update_theta_sketch_alloc: public theta_sketch_alloc<A> {
+template<typename Allocator = std::allocator<uint64_t>>
+class update_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
public:
- class builder;
- enum resize_factor { X1, X2, X4, X8 };
- static const uint8_t SKETCH_TYPE = 2;
+ using Base = theta_sketch_alloc<Allocator>;
+ using Entry = typename Base::Entry;
+ using ExtractKey = typename Base::ExtractKey;
+ using iterator = typename Base::iterator;
+ using const_iterator = typename Base::const_iterator;
+ using theta_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
+ using resize_factor = typename theta_table::resize_factor;
// No constructor here. Use builder instead.
+ class builder;
+ update_theta_sketch_alloc(const update_theta_sketch_alloc&) = default;
+ update_theta_sketch_alloc(update_theta_sketch_alloc&&) noexcept = default;
virtual ~update_theta_sketch_alloc() = default;
+ update_theta_sketch_alloc& operator=(const update_theta_sketch_alloc&) = default;
+ update_theta_sketch_alloc& operator=(update_theta_sketch_alloc&&) = default;
- virtual uint32_t get_num_retained() const;
- virtual uint16_t get_seed_hash() const;
+ virtual Allocator get_allocator() const;
+ virtual bool is_empty() const;
virtual bool is_ordered() const;
- virtual string<A> to_string(bool print_items = false) const;
- virtual void serialize(std::ostream& os) const;
- typedef vector_u8<A> vector_bytes; // alias for users
- // header space is reserved, but not initialized
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
+ virtual uint16_t get_seed_hash() const;
+ virtual uint64_t get_theta64() const;
+ virtual uint32_t get_num_retained() const;
+
+ /**
+ * @return configured base-2 logarithm (lg_k) of the nominal number of entries in the sketch
+ */
+ uint8_t get_lg_k() const;
+
+ /**
+ * @return configured resize factor of the sketch
+ */
+ resize_factor get_rf() const;
/**
* Update this sketch with a given string.
@@ -302,7 +262,7 @@ public:
* @param data pointer to the data
* @param length of the data in bytes
*/
- void update(const void* data, unsigned length);
+ void update(const void* data, size_t length);
/**
* Remove retained entries in excess of the nominal size k (if any)
@@ -314,105 +274,85 @@ public:
* @param ordered optional flag to specify if ordered sketch should be produced
* @return compact sketch
*/
- compact_theta_sketch_alloc<A> compact(bool ordered = true) const;
-
- virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
- virtual typename theta_sketch_alloc<A>::const_iterator end() const;
-
- /**
- * This method deserializes a sketch from a given stream.
- * @param is input stream
- * @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of a sketch
- */
- static update_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
+ compact_theta_sketch_alloc<Allocator> compact(bool ordered = true) const;
- /**
- * This method deserializes a sketch from a given array of bytes.
- * @param bytes pointer to the array of bytes
- * @param size the size of the array
- * @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of the sketch
- */
- static update_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
+ virtual iterator begin();
+ virtual iterator end();
+ virtual const_iterator begin() const;
+ virtual const_iterator end() const;
private:
- // resize threshold = 0.5 tuned for speed
- static constexpr double RESIZE_THRESHOLD = 0.5;
- // hash table rebuild threshold = 15/16
- static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
-
- static constexpr uint8_t STRIDE_HASH_BITS = 7;
- static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
-
- uint8_t lg_cur_size_;
- uint8_t lg_nom_size_;
- vector_u64<A> keys_;
- uint32_t num_keys_;
- resize_factor rf_;
- float p_;
- uint64_t seed_;
- uint32_t capacity_;
+ theta_table table_;
// for builder
- update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed);
-
- // for deserialize
- update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed);
+ update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
+ uint64_t seed, const Allocator& allocator);
- void resize();
- void rebuild();
-
- friend theta_union_alloc<A>;
- void internal_update(uint64_t hash);
-
- friend theta_intersection_alloc<A>;
- friend theta_a_not_b_alloc<A>;
- static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
- static inline uint32_t get_stride(uint64_t hash, uint8_t lg_size);
- static bool hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size);
- static bool hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size);
-
- friend theta_sketch_alloc<A>;
- static update_theta_sketch_alloc<A> internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
- static update_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed);
+ using ostrstream = typename Base::ostrstream;
+ virtual void print_specifics(ostrstream& os) const;
};
// compact sketch
-template<typename A>
-class compact_theta_sketch_alloc: public theta_sketch_alloc<A> {
+template<typename Allocator = std::allocator<uint64_t>>
+class compact_theta_sketch_alloc: public theta_sketch_alloc<Allocator> {
public:
+ using Base = theta_sketch_alloc<Allocator>;
+ using iterator = typename Base::iterator;
+ using const_iterator = typename Base::const_iterator;
+ using AllocBytes = typename std::allocator_traits<Allocator>::template rebind_alloc<uint8_t>;
+ using vector_bytes = std::vector<uint8_t, AllocBytes>;
+
+ static const uint8_t SERIAL_VERSION = 3;
static const uint8_t SKETCH_TYPE = 3;
- // No constructor here.
// Instances of this type can be obtained:
- // - by compacting an update_theta_sketch
+ // - by compacting an update_theta_sketch_alloc
// - as a result of a set operation
// - by deserializing a previously serialized compact sketch
- compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered);
+ compact_theta_sketch_alloc(const Base& other, bool ordered);
+ compact_theta_sketch_alloc(const compact_theta_sketch_alloc&) = default;
+ compact_theta_sketch_alloc(compact_theta_sketch_alloc&&) noexcept = default;
virtual ~compact_theta_sketch_alloc() = default;
+ compact_theta_sketch_alloc& operator=(const compact_theta_sketch_alloc&) = default;
+ compact_theta_sketch_alloc& operator=(compact_theta_sketch_alloc&&) = default;
+ virtual Allocator get_allocator() const;
+ virtual bool is_empty() const;
+ virtual bool is_ordered() const;
+ virtual uint64_t get_theta64() const;
virtual uint32_t get_num_retained() const;
virtual uint16_t get_seed_hash() const;
- virtual bool is_ordered() const;
- virtual string<A> to_string(bool print_items = false) const;
- virtual void serialize(std::ostream& os) const;
- typedef vector_u8<A> vector_bytes; // alias for users
- // header space is reserved, but not initialized
- virtual vector_bytes serialize(unsigned header_size_bytes = 0) const;
- virtual typename theta_sketch_alloc<A>::const_iterator begin() const;
- virtual typename theta_sketch_alloc<A>::const_iterator end() const;
+ /**
+ * This method serializes the sketch into a given stream in a binary form
+ * @param os output stream
+ */
+ void serialize(std::ostream& os) const;
+
+ /**
+ * This method serializes the sketch as a vector of bytes.
+ * An optional header can be reserved in front of the sketch.
+ * It is an uninitialized space of a given size.
+ * This header is used in Datasketches PostgreSQL extension.
+ * @param header_size_bytes space to reserve in front of the sketch
+ */
+ vector_bytes serialize(unsigned header_size_bytes = 0) const;
+
+ virtual iterator begin();
+ virtual iterator end();
+ virtual const_iterator begin() const;
+ virtual const_iterator end() const;
/**
* This method deserializes a sketch from a given stream.
* @param is input stream
* @param seed the seed for the hash function that was used to create the sketch
- * @return an instance of a sketch
+ * @return an instance of the sketch
*/
- static compact_theta_sketch_alloc<A> deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED);
+ static compact_theta_sketch_alloc deserialize(std::istream& is,
+ uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
/**
* This method deserializes a sketch from a given array of bytes.
@@ -421,110 +361,36 @@ public:
* @param seed the seed for the hash function that was used to create the sketch
* @return an instance of the sketch
*/
- static compact_theta_sketch_alloc<A> deserialize(const void* bytes, size_t size, uint64_t seed = DEFAULT_SEED);
+ static compact_theta_sketch_alloc deserialize(const void* bytes, size_t size,
+ uint64_t seed = DEFAULT_SEED, const Allocator& allocator = Allocator());
+
+ // for internal use
+ compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<uint64_t, Allocator>&& entries);
private:
- typedef typename std::allocator_traits<A>::template rebind_alloc<uint64_t> AllocU64;
+ enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
- vector_u64<A> keys_;
- uint16_t seed_hash_;
+ bool is_empty_;
bool is_ordered_;
+ uint16_t seed_hash_;
+ uint64_t theta_;
+ std::vector<uint64_t, Allocator> entries_;
- friend theta_sketch_alloc<A>;
- friend update_theta_sketch_alloc<A>;
- friend theta_union_alloc<A>;
- friend theta_intersection_alloc<A>;
- friend theta_a_not_b_alloc<A>;
- compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered);
- static compact_theta_sketch_alloc<A> internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
- static compact_theta_sketch_alloc<A> internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash);
-};
-
-// builder
-
-template<typename A>
-class update_theta_sketch_alloc<A>::builder {
-public:
- static const uint8_t MIN_LG_K = 5;
- static const uint8_t DEFAULT_LG_K = 12;
- static const resize_factor DEFAULT_RESIZE_FACTOR = X8;
-
- /**
- * Creates and instance of the builder with default parameters.
- */
- builder();
-
- /**
- * Set log2(k), where k is a nominal number of entries in the sketch
- * @param lg_k base 2 logarithm of nominal number of entries
- * @return this builder
- */
- builder& set_lg_k(uint8_t lg_k);
-
- /**
- * Set resize factor for the internal hash table (defaults to 8)
- * @param rf resize factor
- * @return this builder
- */
- builder& set_resize_factor(resize_factor rf);
-
- /**
- * Set sampling probability (initial theta). The default is 1, so the sketch retains
- * all entries until it reaches the limit, at which point it goes into the estimation mode
- * and reduces the effective sampling probability (theta) as necessary.
- * @param p sampling probability
- * @return this builder
- */
- builder& set_p(float p);
-
- /**
- * Set the seed for the hash function. Should be used carefully if needed.
- * Sketches produced with different seed are not compatible
- * and cannot be mixed in set operations.
- * @param seed hash seed
- * @return this builder
- */
- builder& set_seed(uint64_t seed);
-
- /**
- * This is to create an instance of the sketch with predefined parameters.
- * @return and instance of the sketch
- */
- update_theta_sketch_alloc<A> build() const;
-
-private:
- uint8_t lg_k_;
- resize_factor rf_;
- float p_;
- uint64_t seed_;
-
- static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
+ using ostrstream = typename Base::ostrstream;
+ virtual void print_specifics(ostrstream& os) const;
};
-// iterator
-template<typename A>
-class theta_sketch_alloc<A>::const_iterator: public std::iterator<std::input_iterator_tag, uint64_t> {
+template<typename Allocator>
+class update_theta_sketch_alloc<Allocator>::builder: public theta_base_builder<builder, Allocator> {
public:
- const_iterator& operator++();
- const_iterator operator++(int);
- bool operator==(const const_iterator& other) const;
- bool operator!=(const const_iterator& other) const;
- uint64_t operator*() const;
-
-private:
- const uint64_t* keys_;
- uint32_t size_;
- uint32_t index_;
- const_iterator(const uint64_t* keys, uint32_t size, uint32_t index);
- friend class update_theta_sketch_alloc<A>;
- friend class compact_theta_sketch_alloc<A>;
+ builder(const Allocator& allocator = Allocator());
+ update_theta_sketch_alloc build() const;
};
-
// aliases with default allocator for convenience
-typedef theta_sketch_alloc<std::allocator<void>> theta_sketch;
-typedef update_theta_sketch_alloc<std::allocator<void>> update_theta_sketch;
-typedef compact_theta_sketch_alloc<std::allocator<void>> compact_theta_sketch;
+using theta_sketch = theta_sketch_alloc<std::allocator<uint64_t>>;
+using update_theta_sketch = update_theta_sketch_alloc<std::allocator<uint64_t>>;
+using compact_theta_sketch = compact_theta_sketch_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
diff --git a/be/src/thirdparty/datasketches/theta_sketch_impl.hpp b/be/src/thirdparty/datasketches/theta_sketch_impl.hpp
index 579a675..1335e59 100644
--- a/be/src/thirdparty/datasketches/theta_sketch_impl.hpp
+++ b/be/src/thirdparty/datasketches/theta_sketch_impl.hpp
@@ -20,35 +20,23 @@
#ifndef THETA_SKETCH_IMPL_HPP_
#define THETA_SKETCH_IMPL_HPP_
-#include <algorithm>
-#include <cmath>
-#include <memory>
-#include <functional>
-#include <istream>
-#include <ostream>
#include <sstream>
+#include <vector>
-#include "MurmurHash3.h"
#include "serde.hpp"
#include "binomial_bounds.hpp"
-#include "memory_operations.hpp"
+#include "theta_helpers.hpp"
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
template<typename A>
-theta_sketch_alloc<A>::theta_sketch_alloc(bool is_empty, uint64_t theta):
-is_empty_(is_empty), theta_(theta)
-{}
+bool theta_sketch_alloc<A>::is_estimation_mode() const {
+ return get_theta64() < theta_constants::MAX_THETA && !is_empty();
+}
template<typename A>
-bool theta_sketch_alloc<A>::is_empty() const {
- return is_empty_;
+double theta_sketch_alloc<A>::get_theta() const {
+ return static_cast<double>(get_theta64()) / theta_constants::MAX_THETA;
}
template<typename A>
@@ -69,182 +57,47 @@ double theta_sketch_alloc<A>::get_upper_bound(uint8_t num_std_devs) const {
}
template<typename A>
-bool theta_sketch_alloc<A>::is_estimation_mode() const {
- return theta_ < MAX_THETA && !is_empty_;
-}
-
-template<typename A>
-double theta_sketch_alloc<A>::get_theta() const {
- return (double) theta_ / MAX_THETA;
-}
-
-template<typename A>
-uint64_t theta_sketch_alloc<A>::get_theta64() const {
- return theta_;
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
- uint8_t preamble_longs;
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
- uint8_t serial_version;
- is.read((char*)&serial_version, sizeof(serial_version));
- uint8_t type;
- is.read((char*)&type, sizeof(type));
- uint8_t lg_nom_size;
- is.read((char*)&lg_nom_size, sizeof(lg_nom_size));
- uint8_t lg_cur_size;
- is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
- uint8_t flags_byte;
- is.read((char*)&flags_byte, sizeof(flags_byte));
- uint16_t seed_hash;
- is.read((char*)&seed_hash, sizeof(seed_hash));
-
- check_serial_version(serial_version, SERIAL_VERSION);
-
- if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
- check_seed_hash(seed_hash, get_seed_hash(seed));
- typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
- typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
- return unique_ptr(
- static_cast<theta_sketch_alloc<A>*>(new (AU().allocate(1)) update_theta_sketch_alloc<A>(update_theta_sketch_alloc<A>::internal_deserialize(is, rf, lg_cur_size, lg_nom_size, flags_byte, seed))),
- [](theta_sketch_alloc<A>* ptr) {
- ptr->~theta_sketch_alloc();
- AU().deallocate(static_cast<update_theta_sketch_alloc<A>*>(ptr), 1);
- }
- );
- } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
- typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
- return unique_ptr(
- static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(compact_theta_sketch_alloc<A>::internal_deserialize(is, preamble_longs, flags_byte, seed_hash))),
- [](theta_sketch_alloc<A>* ptr) {
- ptr->~theta_sketch_alloc();
- AC().deallocate(static_cast<compact_theta_sketch_alloc<A>*>(ptr), 1);
- }
- );
- }
- throw std::invalid_argument("unsupported sketch type " + std::to_string((int) type));
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::unique_ptr theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
- ensure_minimum_memory(size, static_cast<size_t>(8));
- const char* ptr = static_cast<const char*>(bytes);
- uint8_t preamble_longs;
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
- uint8_t serial_version;
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
- uint8_t type;
- ptr += copy_from_mem(ptr, &type, sizeof(type));
- uint8_t lg_nom_size;
- ptr += copy_from_mem(ptr, &lg_nom_size, sizeof(lg_nom_size));
- uint8_t lg_cur_size;
- ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(lg_cur_size));
- uint8_t flags_byte;
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
- uint16_t seed_hash;
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
-
- check_serial_version(serial_version, SERIAL_VERSION);
-
- if (type == update_theta_sketch_alloc<A>::SKETCH_TYPE) {
- check_seed_hash(seed_hash, get_seed_hash(seed));
- typename update_theta_sketch_alloc<A>::resize_factor rf = static_cast<typename update_theta_sketch_alloc<A>::resize_factor>(preamble_longs >> 6);
- typedef typename std::allocator_traits<A>::template rebind_alloc<update_theta_sketch_alloc<A>> AU;
- return unique_ptr(
- static_cast<theta_sketch_alloc<A>*>(new (AU().allocate(1)) update_theta_sketch_alloc<A>(
- update_theta_sketch_alloc<A>::internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), rf, lg_cur_size, lg_nom_size, flags_byte, seed))
- ),
- [](theta_sketch_alloc<A>* ptr) {
- ptr->~theta_sketch_alloc();
- AU().deallocate(static_cast<update_theta_sketch_alloc<A>*>(ptr), 1);
- }
- );
- } else if (type == compact_theta_sketch_alloc<A>::SKETCH_TYPE) {
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- if (!is_empty) check_seed_hash(seed_hash, get_seed_hash(seed));
- typedef typename std::allocator_traits<A>::template rebind_alloc<compact_theta_sketch_alloc<A>> AC;
- return unique_ptr(
- static_cast<theta_sketch_alloc<A>*>(new (AC().allocate(1)) compact_theta_sketch_alloc<A>(
- compact_theta_sketch_alloc<A>::internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash))
- ),
- [](theta_sketch_alloc<A>* ptr) {
- ptr->~theta_sketch_alloc();
- AC().deallocate(static_cast<compact_theta_sketch_alloc<A>*>(ptr), 1);
- }
- );
- }
- throw std::invalid_argument("unsupported sketch type " + std::to_string((int) type));
-}
-
-template<typename A>
-uint16_t theta_sketch_alloc<A>::get_seed_hash(uint64_t seed) {
- HashState hashes;
- MurmurHash3_x64_128(&seed, sizeof(seed), 0, hashes);
- return hashes.h1;
-}
-
-template<typename A>
-void theta_sketch_alloc<A>::check_sketch_type(uint8_t actual, uint8_t expected) {
- if (actual != expected) {
- throw std::invalid_argument("Sketch type mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
- }
-}
-
-template<typename A>
-void theta_sketch_alloc<A>::check_serial_version(uint8_t actual, uint8_t expected) {
- if (actual != expected) {
- throw std::invalid_argument("Sketch serial version mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
- }
-}
-
-template<typename A>
-void theta_sketch_alloc<A>::check_seed_hash(uint16_t actual, uint16_t expected) {
- if (actual != expected) {
- throw std::invalid_argument("Sketch seed hash mismatch: expected " + std::to_string(expected) + ", actual " + std::to_string(actual));
+string<A> theta_sketch_alloc<A>::to_string(bool detail) const {
+ ostrstream os;
+ os << "### Theta sketch summary:" << std::endl;
+ os << " num retained entries : " << get_num_retained() << std::endl;
+ os << " seed hash : " << get_seed_hash() << std::endl;
+ os << " empty? : " << (is_empty() ? "true" : "false") << std::endl;
+ os << " ordered? : " << (is_ordered() ? "true" : "false") << std::endl;
+ os << " estimation mode? : " << (is_estimation_mode() ? "true" : "false") << std::endl;
+ os << " theta (fraction) : " << get_theta() << std::endl;
+ os << " theta (raw 64-bit) : " << get_theta64() << std::endl;
+ os << " estimate : " << this->get_estimate() << std::endl;
+ os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
+ os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
+ print_specifics(os);
+ os << "### End sketch summary" << std::endl;
+ if (detail) {
+ os << "### Retained entries" << std::endl;
+ for (const auto& hash: *this) {
+ os << hash << std::endl;
+ }
+ os << "### End retained entries" << std::endl;
}
+ return os.str();
}
// update sketch
template<typename A>
-update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, float p, uint64_t seed):
-theta_sketch_alloc<A>(true, theta_sketch_alloc<A>::MAX_THETA),
-lg_cur_size_(lg_cur_size),
-lg_nom_size_(lg_nom_size),
-keys_(1 << lg_cur_size_, 0),
-num_keys_(0),
-rf_(rf),
-p_(p),
-seed_(seed),
-capacity_(get_capacity(lg_cur_size, lg_nom_size))
-{
- if (p < 1) this->theta_ *= p;
-}
-
-template<typename A>
-update_theta_sketch_alloc<A>::update_theta_sketch_alloc(bool is_empty, uint64_t theta, uint8_t lg_cur_size, uint8_t lg_nom_size, vector_u64<A>&& keys, uint32_t num_keys, resize_factor rf, float p, uint64_t seed):
-theta_sketch_alloc<A>(is_empty, theta),
-lg_cur_size_(lg_cur_size),
-lg_nom_size_(lg_nom_size),
-keys_(std::move(keys)),
-num_keys_(num_keys),
-rf_(rf),
-p_(p),
-seed_(seed),
-capacity_(get_capacity(lg_cur_size, lg_nom_size))
+update_theta_sketch_alloc<A>::update_theta_sketch_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
+ uint64_t theta, uint64_t seed, const A& allocator):
+table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator)
{}
template<typename A>
-uint32_t update_theta_sketch_alloc<A>::get_num_retained() const {
- return num_keys_;
+A update_theta_sketch_alloc<A>::get_allocator() const {
+ return table_.allocator_;
}
template<typename A>
-uint16_t update_theta_sketch_alloc<A>::get_seed_hash() const {
- return theta_sketch_alloc<A>::get_seed_hash(seed_);
+bool update_theta_sketch_alloc<A>::is_empty() const {
+ return table_.is_empty_;
}
template<typename A>
@@ -253,169 +106,28 @@ bool update_theta_sketch_alloc<A>::is_ordered() const {
}
template<typename A>
-string<A> update_theta_sketch_alloc<A>::to_string(bool print_items) const {
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
- os << "### Update Theta sketch summary:" << std::endl;
- os << " lg nominal size : " << (int) lg_nom_size_ << std::endl;
- os << " lg current size : " << (int) lg_cur_size_ << std::endl;
- os << " num retained keys : " << num_keys_ << std::endl;
- os << " resize factor : " << (1 << rf_) << std::endl;
- os << " sampling probability : " << p_ << std::endl;
- os << " seed hash : " << this->get_seed_hash() << std::endl;
- os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
- os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
- os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
- os << " theta (fraction) : " << this->get_theta() << std::endl;
- os << " theta (raw 64-bit) : " << this->theta_ << std::endl;
- os << " estimate : " << this->get_estimate() << std::endl;
- os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
- os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
- os << "### End sketch summary" << std::endl;
- if (print_items) {
- os << "### Retained keys" << std::endl;
- for (auto key: *this) os << " " << key << std::endl;
- os << "### End retained keys" << std::endl;
- }
- return os.str();
-}
-
-template<typename A>
-void update_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
- const uint8_t preamble_longs_and_rf = 3 | (rf_ << 6);
- os.write((char*)&preamble_longs_and_rf, sizeof(preamble_longs_and_rf));
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
- os.write((char*)&serial_version, sizeof(serial_version));
- const uint8_t type = SKETCH_TYPE;
- os.write((char*)&type, sizeof(type));
- os.write((char*)&lg_nom_size_, sizeof(lg_nom_size_));
- os.write((char*)&lg_cur_size_, sizeof(lg_cur_size_));
- const uint8_t flags_byte(
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0)
- );
- os.write((char*)&flags_byte, sizeof(flags_byte));
- const uint16_t seed_hash = get_seed_hash();
- os.write((char*)&seed_hash, sizeof(seed_hash));
- os.write((char*)&num_keys_, sizeof(num_keys_));
- os.write((char*)&p_, sizeof(p_));
- os.write((char*)&(this->theta_), sizeof(uint64_t));
- os.write((char*)keys_.data(), sizeof(uint64_t) * keys_.size());
-}
-
-template<typename A>
-vector_u8<A> update_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
- const uint8_t preamble_longs = 3;
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + sizeof(uint64_t) * keys_.size();
- vector_u8<A> bytes(size);
- uint8_t* ptr = bytes.data() + header_size_bytes;
-
- const uint8_t preamble_longs_and_rf = preamble_longs | (rf_ << 6);
- ptr += copy_to_mem(&preamble_longs_and_rf, ptr, sizeof(preamble_longs_and_rf));
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
- ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
- const uint8_t type = SKETCH_TYPE;
- ptr += copy_to_mem(&type, ptr, sizeof(type));
- ptr += copy_to_mem(&lg_nom_size_, ptr, sizeof(lg_nom_size_));
- ptr += copy_to_mem(&lg_cur_size_, ptr, sizeof(lg_cur_size_));
- const uint8_t flags_byte(
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0)
- );
- ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
- const uint16_t seed_hash = get_seed_hash();
- ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
- ptr += copy_to_mem(&num_keys_, ptr, sizeof(num_keys_));
- ptr += copy_to_mem(&p_, ptr, sizeof(p_));
- ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
- ptr += copy_to_mem(keys_.data(), ptr, sizeof(uint64_t) * keys_.size());
-
- return bytes;
-}
-
-template<typename A>
-update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
- uint8_t preamble_longs;
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
- resize_factor rf = static_cast<resize_factor>(preamble_longs >> 6);
- preamble_longs &= 0x3f; // remove resize factor
- uint8_t serial_version;
- is.read((char*)&serial_version, sizeof(serial_version));
- uint8_t type;
- is.read((char*)&type, sizeof(type));
- uint8_t lg_nom_size;
- is.read((char*)&lg_nom_size, sizeof(lg_nom_size));
- uint8_t lg_cur_size;
- is.read((char*)&lg_cur_size, sizeof(lg_cur_size));
- uint8_t flags_byte;
- is.read((char*)&flags_byte, sizeof(flags_byte));
- uint16_t seed_hash;
- is.read((char*)&seed_hash, sizeof(seed_hash));
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
- return internal_deserialize(is, rf, lg_cur_size, lg_nom_size, flags_byte, seed);
+uint64_t update_theta_sketch_alloc<A>::get_theta64() const {
+ return table_.theta_;
}
template<typename A>
-update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::internal_deserialize(std::istream& is, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed) {
- uint32_t num_keys;
- is.read((char*)&num_keys, sizeof(num_keys));
- float p;
- is.read((char*)&p, sizeof(p));
- uint64_t theta;
- is.read((char*)&theta, sizeof(theta));
- vector_u64<A> keys(1 << lg_cur_size);
- is.read((char*)keys.data(), sizeof(uint64_t) * keys.size());
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
- return update_theta_sketch_alloc<A>(is_empty, theta, lg_cur_size, lg_nom_size, std::move(keys), num_keys, rf, p, seed);
+uint32_t update_theta_sketch_alloc<A>::get_num_retained() const {
+ return table_.num_entries_;
}
template<typename A>
-update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
- ensure_minimum_memory(size, 8);
- const char* ptr = static_cast<const char*>(bytes);
- uint8_t preamble_longs;
- ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
- resize_factor rf = static_cast<resize_factor>(preamble_longs >> 6);
- preamble_longs &= 0x3f; // remove resize factor
- uint8_t serial_version;
- ptr += copy_from_mem(ptr, &serial_version, sizeof(serial_version));
- uint8_t type;
- ptr += copy_from_mem(ptr, &type, sizeof(type));
- uint8_t lg_nom_size;
- ptr += copy_from_mem(ptr, &lg_nom_size, sizeof(lg_nom_size));
- uint8_t lg_cur_size;
- ptr += copy_from_mem(ptr, &lg_cur_size, sizeof(lg_cur_size));
- uint8_t flags_byte;
- ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
- uint16_t seed_hash;
- ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
- return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), rf, lg_cur_size, lg_nom_size, flags_byte, seed);
+uint16_t update_theta_sketch_alloc<A>::get_seed_hash() const {
+ return compute_seed_hash(table_.seed_);
}
template<typename A>
-update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::internal_deserialize(const void* bytes, size_t size, resize_factor rf, uint8_t lg_cur_size, uint8_t lg_nom_size, uint8_t flags_byte, uint64_t seed) {
- const uint32_t table_size = 1 << lg_cur_size;
- ensure_minimum_memory(size, 16 + sizeof(uint64_t) * table_size);
- const char* ptr = static_cast<const char*>(bytes);
- uint32_t num_keys;
- ptr += copy_from_mem(ptr, &num_keys, sizeof(num_keys));
- float p;
- ptr += copy_from_mem(ptr, &p, sizeof(p));
- uint64_t theta;
- ptr += copy_from_mem(ptr, &theta, sizeof(theta));
- vector_u64<A> keys(table_size);
- ptr += copy_from_mem(ptr, keys.data(), sizeof(uint64_t) * table_size);
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- return update_theta_sketch_alloc<A>(is_empty, theta, lg_cur_size, lg_nom_size, std::move(keys), num_keys, rf, p, seed);
+uint8_t update_theta_sketch_alloc<A>::get_lg_k() const {
+ return table_.lg_nom_size_;
}
template<typename A>
-void update_theta_sketch_alloc<A>::update(const std::string& value) {
- if (value.empty()) return;
- update(value.c_str(), value.length());
+auto update_theta_sketch_alloc<A>::get_rf() const -> resize_factor {
+ return table_.rf_;
}
template<typename A>
@@ -460,19 +172,7 @@ void update_theta_sketch_alloc<A>::update(int8_t value) {
template<typename A>
void update_theta_sketch_alloc<A>::update(double value) {
- union {
- int64_t long_value;
- double double_value;
- } long_double_union;
-
- if (value == 0.0) {
- long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0
- } else if (std::isnan(value)) {
- long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
- } else {
- long_double_union.double_value = value;
- }
- update(&long_double_union, sizeof(long_double_union));
+ update(canonical_double(value));
}
template<typename A>
@@ -481,157 +181,116 @@ void update_theta_sketch_alloc<A>::update(float value) {
}
template<typename A>
-void update_theta_sketch_alloc<A>::update(const void* data, unsigned length) {
- HashState hashes;
- MurmurHash3_x64_128(data, length, seed_, hashes);
- const uint64_t hash = hashes.h1 >> 1; // Java implementation does logical shift >>> to make values positive
- internal_update(hash);
-}
-
-template<typename A>
-compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered) const {
- return compact_theta_sketch_alloc<A>(*this, ordered);
+void update_theta_sketch_alloc<A>::update(const std::string& value) {
+ if (value.empty()) return;
+ update(value.c_str(), value.length());
}
template<typename A>
-void update_theta_sketch_alloc<A>::internal_update(uint64_t hash) {
- this->is_empty_ = false;
- if (hash >= this->theta_ || hash == 0) return; // hash == 0 is reserved to mark empty slots in the table
- if (hash_search_or_insert(hash, keys_.data(), lg_cur_size_)) {
- num_keys_++;
- if (num_keys_ > capacity_) {
- if (lg_cur_size_ <= lg_nom_size_) {
- resize();
- } else {
- rebuild();
- }
- }
+void update_theta_sketch_alloc<A>::update(const void* data, size_t length) {
+ const uint64_t hash = table_.hash_and_screen(data, length);
+ if (hash == 0) return;
+ auto result = table_.find(hash);
+ if (!result.second) {
+ table_.insert(result.first, hash);
}
}
template<typename A>
void update_theta_sketch_alloc<A>::trim() {
- if (num_keys_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
+ table_.trim();
}
template<typename A>
-void update_theta_sketch_alloc<A>::resize() {
- const uint8_t lg_tgt_size = lg_nom_size_ + 1;
- const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
- const uint8_t lg_new_size = lg_cur_size_ + factor;
- const uint32_t new_size = 1 << lg_new_size;
- vector_u64<A> new_keys(new_size, 0);
- for (uint32_t i = 0; i < keys_.size(); i++) {
- if (keys_[i] != 0) {
- hash_search_or_insert(keys_[i], new_keys.data(), lg_new_size); // TODO hash_insert
- }
- }
- keys_ = std::move(new_keys);
- lg_cur_size_ += factor;
- capacity_ = get_capacity(lg_cur_size_, lg_nom_size_);
-}
-
-template<typename A>
-void update_theta_sketch_alloc<A>::rebuild() {
- const uint32_t pivot = (1 << lg_nom_size_) + keys_.size() - num_keys_;
- std::nth_element(keys_.begin(), keys_.begin() + pivot, keys_.end());
- this->theta_ = keys_[pivot];
- vector_u64<A> new_keys(keys_.size(), 0);
- num_keys_ = 0;
- for (uint32_t i = 0; i < keys_.size(); i++) {
- if (keys_[i] != 0 && keys_[i] < this->theta_) {
- hash_search_or_insert(keys_[i], new_keys.data(), lg_cur_size_); // TODO hash_insert
- num_keys_++;
- }
- }
- keys_ = std::move(new_keys);
+auto update_theta_sketch_alloc<A>::begin() -> iterator {
+ return iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
}
template<typename A>
-uint32_t update_theta_sketch_alloc<A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
- const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
- return std::floor(fraction * (1 << lg_cur_size));
+auto update_theta_sketch_alloc<A>::end() -> iterator {
+ return iterator(nullptr, 0, 1 << table_.lg_cur_size_);
}
template<typename A>
-uint32_t update_theta_sketch_alloc<A>::get_stride(uint64_t hash, uint8_t lg_size) {
- // odd and independent of index assuming lg_size lowest bits of the hash were used for the index
- return (2 * static_cast<uint32_t>((hash >> lg_size) & STRIDE_MASK)) + 1;
+auto update_theta_sketch_alloc<A>::begin() const -> const_iterator {
+ return const_iterator(table_.entries_, 1 << table_.lg_cur_size_, 0);
}
template<typename A>
-bool update_theta_sketch_alloc<A>::hash_search_or_insert(uint64_t hash, uint64_t* table, uint8_t lg_size) {
- const uint32_t mask = (1 << lg_size) - 1;
- const uint32_t stride = get_stride(hash, lg_size);
- uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
+auto update_theta_sketch_alloc<A>::end() const -> const_iterator {
+ return const_iterator(nullptr, 0, 1 << table_.lg_cur_size_);
+}
- // search for duplicate or zero
- const uint32_t loop_index = cur_probe;
- do {
- const uint64_t value = table[cur_probe];
- if (value == 0) {
- table[cur_probe] = hash; // insert value
- return true;
- } else if (value == hash) {
- return false; // found a duplicate
- }
- cur_probe = (cur_probe + stride) & mask;
- } while (cur_probe != loop_index);
- throw std::logic_error("key not found and no empty slots!");
-}
-
-template<typename A>
-bool update_theta_sketch_alloc<A>::hash_search(uint64_t hash, const uint64_t* table, uint8_t lg_size) {
- const uint32_t mask = (1 << lg_size) - 1;
- const uint32_t stride = update_theta_sketch_alloc<A>::get_stride(hash, lg_size);
- uint32_t cur_probe = static_cast<uint32_t>(hash) & mask;
- const uint32_t loop_index = cur_probe;
- do {
- const uint64_t value = table[cur_probe];
- if (value == 0) {
- return false;
- } else if (value == hash) {
- return true;
- }
- cur_probe = (cur_probe + stride) & mask;
- } while (cur_probe != loop_index);
- throw std::logic_error("key not found and search wrapped");
+template<typename A>
+compact_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::compact(bool ordered) const {
+ return compact_theta_sketch_alloc<A>(*this, ordered);
}
template<typename A>
-typename theta_sketch_alloc<A>::const_iterator update_theta_sketch_alloc<A>::begin() const {
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), 0);
+void update_theta_sketch_alloc<A>::print_specifics(ostrstream& os) const {
+ os << " lg nominal size : " << static_cast<int>(table_.lg_nom_size_) << std::endl;
+ os << " lg current size : " << static_cast<int>(table_.lg_cur_size_) << std::endl;
+ os << " resize factor : " << (1 << table_.rf_) << std::endl;
}
+// builder
+
template<typename A>
-typename theta_sketch_alloc<A>::const_iterator update_theta_sketch_alloc<A>::end() const {
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), keys_.size());
+update_theta_sketch_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
+
+template<typename A>
+update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
+ return update_theta_sketch_alloc(this->starting_lg_size(), this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
}
// compact sketch
template<typename A>
-compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, uint64_t theta, vector_u64<A>&& keys, uint16_t seed_hash, bool is_ordered):
-theta_sketch_alloc<A>(is_empty, theta),
-keys_(std::move(keys)),
+compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const Base& other, bool ordered):
+is_empty_(other.is_empty()),
+is_ordered_(other.is_ordered() || ordered),
+seed_hash_(other.get_seed_hash()),
+theta_(other.get_theta64()),
+entries_(other.get_allocator())
+{
+ entries_.reserve(other.get_num_retained());
+ std::copy(other.begin(), other.end(), std::back_inserter(entries_));
+ if (ordered && !other.is_ordered()) std::sort(entries_.begin(), entries_.end());
+}
+
+template<typename A>
+compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta,
+ std::vector<uint64_t, A>&& entries):
+is_empty_(is_empty),
+is_ordered_(is_ordered),
seed_hash_(seed_hash),
-is_ordered_(is_ordered)
+theta_(theta),
+entries_(std::move(entries))
{}
template<typename A>
-compact_theta_sketch_alloc<A>::compact_theta_sketch_alloc(const theta_sketch_alloc<A>& other, bool ordered):
-theta_sketch_alloc<A>(other),
-keys_(other.get_num_retained()),
-seed_hash_(other.get_seed_hash()),
-is_ordered_(other.is_ordered() || ordered)
-{
- std::copy(other.begin(), other.end(), keys_.begin());
- if (ordered && !other.is_ordered()) std::sort(keys_.begin(), keys_.end());
+A compact_theta_sketch_alloc<A>::get_allocator() const {
+ return entries_.get_allocator();
+}
+
+template<typename A>
+bool compact_theta_sketch_alloc<A>::is_empty() const {
+ return is_empty_;
+}
+
+template<typename A>
+bool compact_theta_sketch_alloc<A>::is_ordered() const {
+ return is_ordered_;
+}
+
+template<typename A>
+uint64_t compact_theta_sketch_alloc<A>::get_theta64() const {
+ return theta_;
}
template<typename A>
uint32_t compact_theta_sketch_alloc<A>::get_num_retained() const {
- return keys_.size();
+ return entries_.size();
}
template<typename A>
@@ -640,158 +299,148 @@ uint16_t compact_theta_sketch_alloc<A>::get_seed_hash() const {
}
template<typename A>
-bool compact_theta_sketch_alloc<A>::is_ordered() const {
- return is_ordered_;
+auto compact_theta_sketch_alloc<A>::begin() -> iterator {
+ return iterator(entries_.data(), entries_.size(), 0);
}
template<typename A>
-string<A> compact_theta_sketch_alloc<A>::to_string(bool print_items) const {
- std::basic_ostringstream<char, std::char_traits<char>, AllocChar<A>> os;
- os << "### Compact Theta sketch summary:" << std::endl;
- os << " num retained keys : " << keys_.size() << std::endl;
- os << " seed hash : " << this->get_seed_hash() << std::endl;
- os << " empty? : " << (this->is_empty() ? "true" : "false") << std::endl;
- os << " ordered? : " << (this->is_ordered() ? "true" : "false") << std::endl;
- os << " estimation mode? : " << (this->is_estimation_mode() ? "true" : "false") << std::endl;
- os << " theta (fraction) : " << this->get_theta() << std::endl;
- os << " theta (raw 64-bit) : " << this->theta_ << std::endl;
- os << " estimate : " << this->get_estimate() << std::endl;
- os << " lower bound 95% conf : " << this->get_lower_bound(2) << std::endl;
- os << " upper bound 95% conf : " << this->get_upper_bound(2) << std::endl;
- os << "### End sketch summary" << std::endl;
- if (print_items) {
- os << "### Retained keys" << std::endl;
- for (auto key: *this) os << " " << key << std::endl;
- os << "### End retained keys" << std::endl;
- }
- return os.str();
+auto compact_theta_sketch_alloc<A>::end() -> iterator {
+ return iterator(nullptr, 0, entries_.size());
}
template<typename A>
+auto compact_theta_sketch_alloc<A>::begin() const -> const_iterator {
+ return const_iterator(entries_.data(), entries_.size(), 0);
+}
+
+template<typename A>
+auto compact_theta_sketch_alloc<A>::end() const -> const_iterator {
+ return const_iterator(nullptr, 0, entries_.size());
+}
+
+template<typename A>
+void compact_theta_sketch_alloc<A>::print_specifics(ostrstream&) const {}
+
+template<typename A>
void compact_theta_sketch_alloc<A>::serialize(std::ostream& os) const {
- const bool is_single_item = keys_.size() == 1 && !this->is_estimation_mode();
+ const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
os.write(reinterpret_cast<const char*>(&preamble_longs), sizeof(preamble_longs));
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
+ const uint8_t serial_version = SERIAL_VERSION;
os.write(reinterpret_cast<const char*>(&serial_version), sizeof(serial_version));
const uint8_t type = SKETCH_TYPE;
os.write(reinterpret_cast<const char*>(&type), sizeof(type));
const uint16_t unused16 = 0;
os.write(reinterpret_cast<const char*>(&unused16), sizeof(unused16));
const uint8_t flags_byte(
- (1 << theta_sketch_alloc<A>::flags::IS_COMPACT) |
- (1 << theta_sketch_alloc<A>::flags::IS_READ_ONLY) |
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0) |
- (this->is_ordered() ? 1 << theta_sketch_alloc<A>::flags::IS_ORDERED : 0)
+ (1 << flags::IS_COMPACT) |
+ (1 << flags::IS_READ_ONLY) |
+ (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
+ (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
);
os.write(reinterpret_cast<const char*>(&flags_byte), sizeof(flags_byte));
const uint16_t seed_hash = get_seed_hash();
- os.write((char*)&seed_hash, sizeof(seed_hash));
+ os.write(reinterpret_cast<const char*>(&seed_hash), sizeof(seed_hash));
if (!this->is_empty()) {
if (!is_single_item) {
- const uint32_t num_keys = keys_.size();
- os.write((char*)&num_keys, sizeof(num_keys));
+ const uint32_t num_entries = entries_.size();
+ os.write(reinterpret_cast<const char*>(&num_entries), sizeof(num_entries));
const uint32_t unused32 = 0;
- os.write((char*)&unused32, sizeof(unused32));
+ os.write(reinterpret_cast<const char*>(&unused32), sizeof(unused32));
if (this->is_estimation_mode()) {
- os.write((char*)&(this->theta_), sizeof(uint64_t));
+ os.write(reinterpret_cast<const char*>(&(this->theta_)), sizeof(uint64_t));
}
}
- os.write((char*)keys_.data(), sizeof(uint64_t) * keys_.size());
+ os.write(reinterpret_cast<const char*>(entries_.data()), entries_.size() * sizeof(uint64_t));
}
}
template<typename A>
-vector_u8<A> compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const {
- const bool is_single_item = keys_.size() == 1 && !this->is_estimation_mode();
+auto compact_theta_sketch_alloc<A>::serialize(unsigned header_size_bytes) const -> vector_bytes {
+ const bool is_single_item = entries_.size() == 1 && !this->is_estimation_mode();
const uint8_t preamble_longs = this->is_empty() || is_single_item ? 1 : this->is_estimation_mode() ? 3 : 2;
- const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs + sizeof(uint64_t) * keys_.size();
- vector_u8<A> bytes(size);
+ const size_t size = header_size_bytes + sizeof(uint64_t) * preamble_longs
+ + sizeof(uint64_t) * entries_.size();
+ vector_bytes bytes(size, 0, entries_.get_allocator());
uint8_t* ptr = bytes.data() + header_size_bytes;
ptr += copy_to_mem(&preamble_longs, ptr, sizeof(preamble_longs));
- const uint8_t serial_version = theta_sketch_alloc<A>::SERIAL_VERSION;
+ const uint8_t serial_version = SERIAL_VERSION;
ptr += copy_to_mem(&serial_version, ptr, sizeof(serial_version));
const uint8_t type = SKETCH_TYPE;
ptr += copy_to_mem(&type, ptr, sizeof(type));
const uint16_t unused16 = 0;
ptr += copy_to_mem(&unused16, ptr, sizeof(unused16));
const uint8_t flags_byte(
- (1 << theta_sketch_alloc<A>::flags::IS_COMPACT) |
- (1 << theta_sketch_alloc<A>::flags::IS_READ_ONLY) |
- (this->is_empty() ? 1 << theta_sketch_alloc<A>::flags::IS_EMPTY : 0) |
- (this->is_ordered() ? 1 << theta_sketch_alloc<A>::flags::IS_ORDERED : 0)
+ (1 << flags::IS_COMPACT) |
+ (1 << flags::IS_READ_ONLY) |
+ (this->is_empty() ? 1 << flags::IS_EMPTY : 0) |
+ (this->is_ordered() ? 1 << flags::IS_ORDERED : 0)
);
ptr += copy_to_mem(&flags_byte, ptr, sizeof(flags_byte));
const uint16_t seed_hash = get_seed_hash();
ptr += copy_to_mem(&seed_hash, ptr, sizeof(seed_hash));
if (!this->is_empty()) {
if (!is_single_item) {
- const uint32_t num_keys = keys_.size();
- ptr += copy_to_mem(&num_keys, ptr, sizeof(num_keys));
+ const uint32_t num_entries = entries_.size();
+ ptr += copy_to_mem(&num_entries, ptr, sizeof(num_entries));
const uint32_t unused32 = 0;
ptr += copy_to_mem(&unused32, ptr, sizeof(unused32));
if (this->is_estimation_mode()) {
- ptr += copy_to_mem(&(this->theta_), ptr, sizeof(uint64_t));
+ ptr += copy_to_mem(&theta_, ptr, sizeof(uint64_t));
}
}
- ptr += copy_to_mem(keys_.data(), ptr, sizeof(uint64_t) * keys_.size());
+ ptr += copy_to_mem(entries_.data(), ptr, entries_.size() * sizeof(uint64_t));
}
-
return bytes;
}
template<typename A>
-compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed) {
+compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
uint8_t preamble_longs;
- is.read((char*)&preamble_longs, sizeof(preamble_longs));
+ is.read(reinterpret_cast<char*>(&preamble_longs), sizeof(preamble_longs));
uint8_t serial_version;
- is.read((char*)&serial_version, sizeof(serial_version));
+ is.read(reinterpret_cast<char*>(&serial_version), sizeof(serial_version));
uint8_t type;
- is.read((char*)&type, sizeof(type));
+ is.read(reinterpret_cast<char*>(&type), sizeof(type));
uint16_t unused16;
- is.read((char*)&unused16, sizeof(unused16));
+ is.read(reinterpret_cast<char*>(&unused16), sizeof(unused16));
uint8_t flags_byte;
- is.read((char*)&flags_byte, sizeof(flags_byte));
+ is.read(reinterpret_cast<char*>(&flags_byte), sizeof(flags_byte));
uint16_t seed_hash;
- is.read((char*)&seed_hash, sizeof(seed_hash));
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
- return internal_deserialize(is, preamble_longs, flags_byte, seed_hash);
-}
-
-template<typename A>
-compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserialize(std::istream& is, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash) {
- uint64_t theta = theta_sketch_alloc<A>::MAX_THETA;
- uint32_t num_keys = 0;
-
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ is.read(reinterpret_cast<char*>(&seed_hash), sizeof(seed_hash));
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+
+ uint64_t theta = theta_constants::MAX_THETA;
+ uint32_t num_entries = 0;
if (!is_empty) {
if (preamble_longs == 1) {
- num_keys = 1;
+ num_entries = 1;
} else {
- is.read((char*)&num_keys, sizeof(num_keys));
+ is.read(reinterpret_cast<char*>(&num_entries), sizeof(num_entries));
uint32_t unused32;
- is.read((char*)&unused32, sizeof(unused32));
+ is.read(reinterpret_cast<char*>(&unused32), sizeof(unused32));
if (preamble_longs > 2) {
- is.read((char*)&theta, sizeof(theta));
+ is.read(reinterpret_cast<char*>(&theta), sizeof(theta));
}
}
}
- vector_u64<A> keys(num_keys);
- if (!is_empty) is.read((char*)keys.data(), sizeof(uint64_t) * keys.size());
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
+ if (!is_empty) is.read(reinterpret_cast<char*>(entries.data()), sizeof(uint64_t) * entries.size());
- const bool is_ordered = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_ORDERED);
- if (!is.good()) throw std::runtime_error("error reading from std::istream");
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash, is_ordered);
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
+ if (!is.good()) throw std::runtime_error("error reading from std::istream");
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
}
template<typename A>
-compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed) {
+compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const void* bytes, size_t size, uint64_t seed, const A& allocator) {
ensure_minimum_memory(size, 8);
const char* ptr = static_cast<const char*>(bytes);
+ const char* base = ptr;
uint8_t preamble_longs;
ptr += copy_from_mem(ptr, &preamble_longs, sizeof(preamble_longs));
uint8_t serial_version;
@@ -804,28 +453,19 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::deserialize(const v
ptr += copy_from_mem(ptr, &flags_byte, sizeof(flags_byte));
uint16_t seed_hash;
ptr += copy_from_mem(ptr, &seed_hash, sizeof(seed_hash));
- theta_sketch_alloc<A>::check_sketch_type(type, SKETCH_TYPE);
- theta_sketch_alloc<A>::check_serial_version(serial_version, theta_sketch_alloc<A>::SERIAL_VERSION);
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
- if (!is_empty) theta_sketch_alloc<A>::check_seed_hash(seed_hash, theta_sketch_alloc<A>::get_seed_hash(seed));
- return internal_deserialize(ptr, size - (ptr - static_cast<const char*>(bytes)), preamble_longs, flags_byte, seed_hash);
-}
-
-template<typename A>
-compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserialize(const void* bytes, size_t size, uint8_t preamble_longs, uint8_t flags_byte, uint16_t seed_hash) {
- const char* ptr = static_cast<const char*>(bytes);
- const char* base = ptr;
-
- uint64_t theta = theta_sketch_alloc<A>::MAX_THETA;
- uint32_t num_keys = 0;
+ checker<true>::check_sketch_type(type, SKETCH_TYPE);
+ checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
+ const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
+ if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
- const bool is_empty = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_EMPTY);
+ uint64_t theta = theta_constants::MAX_THETA;
+ uint32_t num_entries = 0;
if (!is_empty) {
if (preamble_longs == 1) {
- num_keys = 1;
+ num_entries = 1;
} else {
ensure_minimum_memory(size, 8); // read the first prelong before this method
- ptr += copy_from_mem(ptr, &num_keys, sizeof(num_keys));
+ ptr += copy_from_mem(ptr, &num_entries, sizeof(num_entries));
uint32_t unused32;
ptr += copy_from_mem(ptr, &unused32, sizeof(unused32));
if (preamble_longs > 2) {
@@ -834,106 +474,16 @@ compact_theta_sketch_alloc<A> compact_theta_sketch_alloc<A>::internal_deserializ
}
}
}
- const size_t keys_size_bytes = sizeof(uint64_t) * num_keys;
- check_memory_size(ptr - base + keys_size_bytes, size);
- vector_u64<A> keys(num_keys);
- if (!is_empty) ptr += copy_from_mem(ptr, keys.data(), keys_size_bytes);
-
- const bool is_ordered = flags_byte & (1 << theta_sketch_alloc<A>::flags::IS_ORDERED);
- return compact_theta_sketch_alloc<A>(is_empty, theta, std::move(keys), seed_hash, is_ordered);
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::const_iterator compact_theta_sketch_alloc<A>::begin() const {
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), 0);
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::const_iterator compact_theta_sketch_alloc<A>::end() const {
- return typename theta_sketch_alloc<A>::const_iterator(keys_.data(), keys_.size(), keys_.size());
-}
-
-// builder
-
-template<typename A>
-update_theta_sketch_alloc<A>::builder::builder():
-lg_k_(DEFAULT_LG_K), rf_(DEFAULT_RESIZE_FACTOR), p_(1), seed_(DEFAULT_SEED) {}
-
-template<typename A>
-typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
- if (lg_k < MIN_LG_K) {
- throw std::invalid_argument("lg_k must not be less than " + std::to_string(MIN_LG_K) + ": " + std::to_string(lg_k));
- }
- lg_k_ = lg_k;
- return *this;
-}
-
-template<typename A>
-typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_resize_factor(resize_factor rf) {
- rf_ = rf;
- return *this;
-}
+ const size_t entries_size_bytes = sizeof(uint64_t) * num_entries;
+ check_memory_size(ptr - base + entries_size_bytes, size);
+ std::vector<uint64_t, A> entries(num_entries, 0, allocator);
+ if (!is_empty) ptr += copy_from_mem(ptr, entries.data(), entries_size_bytes);
-template<typename A>
-typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_p(float p) {
- p_ = p;
- return *this;
-}
-
-template<typename A>
-typename update_theta_sketch_alloc<A>::builder& update_theta_sketch_alloc<A>::builder::set_seed(uint64_t seed) {
- seed_ = seed;
- return *this;
-}
-
-template<typename A>
-uint8_t update_theta_sketch_alloc<A>::builder::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
- return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
-}
-
-template<typename A>
-update_theta_sketch_alloc<A> update_theta_sketch_alloc<A>::builder::build() const {
- return update_theta_sketch_alloc<A>(starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_)), lg_k_, rf_, p_, seed_);
-}
-
-// iterator
-
-template<typename A>
-theta_sketch_alloc<A>::const_iterator::const_iterator(const uint64_t* keys, uint32_t size, uint32_t index):
-keys_(keys), size_(size), index_(index) {
- while (index_ < size_ && keys_[index_] == 0) ++index_;
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::const_iterator& theta_sketch_alloc<A>::const_iterator::operator++() {
- do {
- ++index_;
- } while (index_ < size_ && keys_[index_] == 0);
- return *this;
-}
-
-template<typename A>
-typename theta_sketch_alloc<A>::const_iterator theta_sketch_alloc<A>::const_iterator::operator++(int) {
- const_iterator tmp(*this);
- operator++();
- return tmp;
-}
-
-template<typename A>
-bool theta_sketch_alloc<A>::const_iterator::operator==(const const_iterator& other) const {
- return index_ == other.index_;
-}
-
-template<typename A>
-bool theta_sketch_alloc<A>::const_iterator::operator!=(const const_iterator& other) const {
- return index_ != other.index_;
-}
-
-template<typename A>
-uint64_t theta_sketch_alloc<A>::const_iterator::operator*() const {
- return keys_[index_];
+ const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
+ return compact_theta_sketch_alloc(is_empty, is_ordered, seed_hash, theta, std::move(entries));
}
} /* namespace datasketches */
#endif
+
diff --git a/be/src/thirdparty/datasketches/theta_union.hpp b/be/src/thirdparty/datasketches/theta_union.hpp
index 6cf8ccc..44f9b52 100644
--- a/be/src/thirdparty/datasketches/theta_union.hpp
+++ b/be/src/thirdparty/datasketches/theta_union.hpp
@@ -20,103 +20,70 @@
#ifndef THETA_UNION_HPP_
#define THETA_UNION_HPP_
-#include <memory>
-#include <functional>
-#include <climits>
-
+#include "serde.hpp"
#include "theta_sketch.hpp"
+#include "theta_union_base.hpp"
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
-template<typename A>
+template<typename Allocator = std::allocator<uint64_t>>
class theta_union_alloc {
public:
- class builder;
+ using Entry = uint64_t;
+ using ExtractKey = trivial_extract_key;
+ using Sketch = theta_sketch_alloc<Allocator>;
+ using CompactSketch = compact_theta_sketch_alloc<Allocator>;
+ using resize_factor = theta_constants::resize_factor;
+
+ struct pass_through_policy {
+ uint64_t operator()(uint64_t internal_entry, uint64_t incoming_entry) const {
+ unused(incoming_entry);
+ return internal_entry;
+ }
+ };
+ using State = theta_union_base<Entry, ExtractKey, pass_through_policy, Sketch, CompactSketch, Allocator>;
// No constructor here. Use builder instead.
+ class builder;
/**
* This method is to update the union with a given sketch
* @param sketch to update the union with
*/
- void update(const theta_sketch_alloc<A>& sketch);
+ template<typename FwdSketch>
+ void update(FwdSketch&& sketch);
/**
* This method produces a copy of the current state of the union as a compact sketch.
* @param ordered optional flag to specify if ordered sketch should be produced
* @return the result of the union
*/
- compact_theta_sketch_alloc<A> get_result(bool ordered = true) const;
+ CompactSketch get_result(bool ordered = true) const;
private:
- bool is_empty_;
- uint64_t theta_;
- update_theta_sketch_alloc<A> state_;
+ State state_;
// for builder
- theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state);
+ theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Allocator& allocator);
};
-// builder
-
template<typename A>
-class theta_union_alloc<A>::builder {
+class theta_union_alloc<A>::builder: public theta_base_builder<builder, A> {
public:
- typedef typename update_theta_sketch_alloc<A>::resize_factor resize_factor;
-
- /**
- * Set log2(k), where k is a nominal number of entries in the sketch
- * @param lg_k base 2 logarithm of nominal number of entries
- * @return this builder
- */
- builder& set_lg_k(uint8_t lg_k);
-
- /**
- * Set resize factor for the internal hash table (defaults to 8)
- * @param rf resize factor
- * @return this builder
- */
- builder& set_resize_factor(resize_factor rf);
-
- /**
- * Set sampling probability (initial theta). The default is 1, so the sketch retains
- * all entries until it reaches the limit, at which point it goes into the estimation mode
- * and reduces the effective sampling probability (theta) as necessary.
- * @param p sampling probability
- * @return this builder
- */
- builder& set_p(float p);
-
- /**
- * Set the seed for the hash function. Should be used carefully if needed.
- * Sketches produced with different seed are not compatible
- * and cannot be mixed in set operations.
- * @param seed hash seed
- * @return this builder
- */
- builder& set_seed(uint64_t seed);
+ builder(const A& allocator = A());
/**
* This is to create an instance of the union with predefined parameters.
- * @return and instance of the union
+ * @return an instance of the union
*/
theta_union_alloc<A> build() const;
-
-private:
- typename update_theta_sketch_alloc<A>::builder sketch_builder;
};
// alias with default allocator for convenience
-typedef theta_union_alloc<std::allocator<void>> theta_union;
+using theta_union = theta_union_alloc<std::allocator<uint64_t>>;
} /* namespace datasketches */
#include "theta_union_impl.hpp"
-# endif
+#endif
diff --git a/be/src/thirdparty/datasketches/theta_union_base.hpp b/be/src/thirdparty/datasketches/theta_union_base.hpp
new file mode 100644
index 0000000..d41f5bd
--- /dev/null
+++ b/be/src/thirdparty/datasketches/theta_union_base.hpp
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef THETA_UNION_BASE_HPP_
+#define THETA_UNION_BASE_HPP_
+
+#include "theta_update_sketch_base.hpp"
+
+namespace datasketches {
+
+template<
+ typename Entry,
+ typename ExtractKey,
+ typename Policy,
+ typename Sketch,
+ typename CompactSketch,
+ typename Allocator
+>
+class theta_union_base {
+public:
+ using hash_table = theta_update_sketch_base<Entry, ExtractKey, Allocator>;
+ using resize_factor = typename hash_table::resize_factor;
+ using comparator = compare_by_key<ExtractKey>;
+
+ theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const Policy& policy, const Allocator& allocator);
+
+ template<typename FwdSketch>
+ void update(FwdSketch&& sketch);
+
+ CompactSketch get_result(bool ordered = true) const;
+
+ const Policy& get_policy() const;
+
+private:
+ Policy policy_;
+ hash_table table_;
+ uint64_t union_theta_;
+};
+
+} /* namespace datasketches */
+
+#include "theta_union_base_impl.hpp"
+
+#endif
diff --git a/be/src/thirdparty/datasketches/theta_union_base_impl.hpp b/be/src/thirdparty/datasketches/theta_union_base_impl.hpp
new file mode 100644
index 0000000..ec8ce56
--- /dev/null
+++ b/be/src/thirdparty/datasketches/theta_union_base_impl.hpp
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef THETA_UNION_BASE_IMPL_HPP_
+#define THETA_UNION_BASE_IMPL_HPP_
+
+#include <algorithm>
+
+#include "conditional_forward.hpp"
+
+namespace datasketches {
+
+template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
+theta_union_base<EN, EK, P, S, CS, A>::theta_union_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf,
+ uint64_t theta, uint64_t seed, const P& policy, const A& allocator):
+policy_(policy),
+table_(lg_cur_size, lg_nom_size, rf, theta, seed, allocator),
+union_theta_(table_.theta_)
+{}
+
+template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
+template<typename SS>
+void theta_union_base<EN, EK, P, S, CS, A>::update(SS&& sketch) {
+ if (sketch.is_empty()) return;
+ if (sketch.get_seed_hash() != compute_seed_hash(table_.seed_)) throw std::invalid_argument("seed hash mismatch");
+ table_.is_empty_ = false;
+ if (sketch.get_theta64() < union_theta_) union_theta_ = sketch.get_theta64();
+ for (auto& entry: sketch) {
+ const uint64_t hash = EK()(entry);
+ if (hash < union_theta_) {
+ auto result = table_.find(hash);
+ if (!result.second) {
+ table_.insert(result.first, conditional_forward<SS>(entry));
+ } else {
+ policy_(*result.first, conditional_forward<SS>(entry));
+ }
+ } else {
+ if (sketch.is_ordered()) break; // early stop
+ }
+ }
+ if (table_.theta_ < union_theta_) union_theta_ = table_.theta_;
+}
+
+template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
+CS theta_union_base<EN, EK, P, S, CS, A>::get_result(bool ordered) const { // builds a compact sketch from the union state; `ordered` sorts entries by key
+  std::vector<EN, A> entries(table_.allocator_);
+  if (table_.is_empty_) return CS(true, true, compute_seed_hash(table_.seed_), union_theta_, std::move(entries)); // nothing was merged: empty result
+  entries.reserve(table_.num_entries_);
+  uint64_t theta = std::min(union_theta_, table_.theta_);
+  const uint32_t nominal_num = 1 << table_.lg_nom_size_;
+  if (union_theta_ >= table_.theta_ && table_.num_entries_ <= nominal_num) { // compare with the table's theta: `union_theta_ >= theta` is always true since theta is the min, which skipped the filter below
+    std::copy_if(table_.begin(), table_.end(), std::back_inserter(entries), key_not_zero<EN, EK>());
+  } else {
+    std::copy_if(table_.begin(), table_.end(), std::back_inserter(entries), key_not_zero_less_than<uint64_t, EN, EK>(theta)); // drop keys at or above the effective theta
+    if (entries.size() > nominal_num) {
+      std::nth_element(entries.begin(), entries.begin() + nominal_num, entries.end(), comparator()); // partition so the nominal_num smallest keys come first
+      theta = EK()(entries[nominal_num]); // the first excluded key becomes the new theta
+      entries.erase(entries.begin() + nominal_num, entries.end());
+      entries.shrink_to_fit();
+    }
+  }
+  if (ordered) std::sort(entries.begin(), entries.end(), comparator());
+  return CS(table_.is_empty_, ordered, compute_seed_hash(table_.seed_), theta, std::move(entries));
+}
+
+template<typename EN, typename EK, typename P, typename S, typename CS, typename A>
+const P& theta_union_base<EN, EK, P, S, CS, A>::get_policy() const {
+ return policy_;
+}
+
+} /* namespace datasketches */
+
+#endif
diff --git a/be/src/thirdparty/datasketches/theta_union_impl.hpp b/be/src/thirdparty/datasketches/theta_union_impl.hpp
index 4d8ebaa..4708d70 100644
--- a/be/src/thirdparty/datasketches/theta_union_impl.hpp
+++ b/be/src/thirdparty/datasketches/theta_union_impl.hpp
@@ -22,86 +22,30 @@
namespace datasketches {
-/*
- * author Alexander Saydakov
- * author Lee Rhodes
- * author Kevin Lang
- */
-
-template<typename A>
-theta_union_alloc<A>::theta_union_alloc(uint64_t theta, update_theta_sketch_alloc<A>&& state):
-is_empty_(true), theta_(theta), state_(std::move(state)) {}
-
-template<typename A>
-void theta_union_alloc<A>::update(const theta_sketch_alloc<A>& sketch) {
- if (sketch.is_empty()) return;
- if (sketch.get_seed_hash() != state_.get_seed_hash()) throw std::invalid_argument("seed hash mismatch");
- is_empty_ = false;
- if (sketch.get_theta64() < theta_) theta_ = sketch.get_theta64();
- if (sketch.is_ordered()) {
- for (auto hash: sketch) {
- if (hash >= theta_) break; // early stop
- state_.internal_update(hash);
- }
- } else {
- for (auto hash: sketch) if (hash < theta_) state_.internal_update(hash);
- }
- if (state_.get_theta64() < theta_) theta_ = state_.get_theta64();
-}
-
template<typename A>
-compact_theta_sketch_alloc<A> theta_union_alloc<A>::get_result(bool ordered) const {
- if (is_empty_) return state_.compact(ordered);
- const uint32_t nom_num_keys = 1 << state_.lg_nom_size_;
- if (theta_ >= state_.theta_ && state_.get_num_retained() <= nom_num_keys) return state_.compact(ordered);
- uint64_t theta = std::min(theta_, state_.get_theta64());
- vector_u64<A> keys(state_.get_num_retained());
- uint32_t num_keys = 0;
- for (auto key: state_) {
- if (key < theta) keys[num_keys++] = key;
- }
- if (num_keys > nom_num_keys) {
- std::nth_element(keys.begin(), keys.begin() + nom_num_keys, keys.begin() + num_keys);
- theta = keys[nom_num_keys];
- num_keys = nom_num_keys;
- }
- if (num_keys != state_.get_num_retained()) {
- keys.resize(num_keys);
- }
- if (ordered) std::sort(keys.begin(), keys.end());
- return compact_theta_sketch_alloc<A>(false, theta, std::move(keys), state_.get_seed_hash(), ordered);
-}
-
-// builder
+theta_union_alloc<A>::theta_union_alloc(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator):
+state_(lg_cur_size, lg_nom_size, rf, theta, seed, pass_through_policy(), allocator)
+{}
template<typename A>
-typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_lg_k(uint8_t lg_k) {
- sketch_builder.set_lg_k(lg_k);
- return *this;
+template<typename SS>
+void theta_union_alloc<A>::update(SS&& sketch) {
+ state_.update(std::forward<SS>(sketch));
}
template<typename A>
-typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_resize_factor(resize_factor rf) {
- sketch_builder.set_resize_factor(rf);
- return *this;
+auto theta_union_alloc<A>::get_result(bool ordered) const -> CompactSketch {
+ return state_.get_result(ordered);
}
template<typename A>
-typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_p(float p) {
- sketch_builder.set_p(p);
- return *this;
-}
-
-template<typename A>
-typename theta_union_alloc<A>::builder& theta_union_alloc<A>::builder::set_seed(uint64_t seed) {
- sketch_builder.set_seed(seed);
- return *this;
-}
+theta_union_alloc<A>::builder::builder(const A& allocator): theta_base_builder<builder, A>(allocator) {}
template<typename A>
-theta_union_alloc<A> theta_union_alloc<A>::builder::build() const {
- update_theta_sketch_alloc<A> sketch = sketch_builder.build();
- return theta_union_alloc(sketch.get_theta64(), std::move(sketch));
+auto theta_union_alloc<A>::builder::build() const -> theta_union_alloc {
+ return theta_union_alloc(
+ this->starting_sub_multiple(this->lg_k_ + 1, this->MIN_LG_K, static_cast<uint8_t>(this->rf_)),
+ this->lg_k_, this->rf_, this->starting_theta(), this->seed_, this->allocator_);
}
} /* namespace datasketches */
diff --git a/be/src/thirdparty/datasketches/theta_update_sketch_base.hpp b/be/src/thirdparty/datasketches/theta_update_sketch_base.hpp
new file mode 100644
index 0000000..eae7984
--- /dev/null
+++ b/be/src/thirdparty/datasketches/theta_update_sketch_base.hpp
@@ -0,0 +1,243 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef THETA_UPDATE_SKETCH_BASE_HPP_
+#define THETA_UPDATE_SKETCH_BASE_HPP_
+
+#include <vector>
+#include <climits>
+#include <cmath>
+
+#include "common_defs.hpp"
+#include "MurmurHash3.h"
+#include "theta_comparators.hpp"
+#include "theta_constants.hpp"
+
+namespace datasketches {
+
+template<
+ typename Entry,
+ typename ExtractKey,
+ typename Allocator
+>
+struct theta_update_sketch_base {
+ using resize_factor = theta_constants::resize_factor;
+ using comparator = compare_by_key<ExtractKey>;
+
+ theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta,
+ uint64_t seed, const Allocator& allocator, bool is_empty = true);
+ theta_update_sketch_base(const theta_update_sketch_base& other);
+ theta_update_sketch_base(theta_update_sketch_base&& other) noexcept;
+ ~theta_update_sketch_base();
+ theta_update_sketch_base& operator=(const theta_update_sketch_base& other);
+ theta_update_sketch_base& operator=(theta_update_sketch_base&& other);
+
+ using iterator = Entry*;
+
+ inline uint64_t hash_and_screen(const void* data, size_t length);
+
+ inline std::pair<iterator, bool> find(uint64_t key) const;
+
+ template<typename FwdEntry>
+ inline void insert(iterator it, FwdEntry&& entry);
+
+ iterator begin() const;
+ iterator end() const;
+
+ // resize threshold = 0.5 tuned for speed
+ static constexpr double RESIZE_THRESHOLD = 0.5;
+ // hash table rebuild threshold = 15/16
+ static constexpr double REBUILD_THRESHOLD = 15.0 / 16.0;
+
+ static constexpr uint8_t STRIDE_HASH_BITS = 7;
+ static constexpr uint32_t STRIDE_MASK = (1 << STRIDE_HASH_BITS) - 1;
+
+ Allocator allocator_;
+ bool is_empty_;
+ uint8_t lg_cur_size_;
+ uint8_t lg_nom_size_;
+ resize_factor rf_;
+ uint32_t num_entries_;
+ uint64_t theta_;
+ uint64_t seed_;
+ Entry* entries_;
+
+ void resize();
+ void rebuild();
+ void trim();
+
+ static inline uint32_t get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size);
+ static inline uint32_t get_stride(uint64_t key, uint8_t lg_size);
+ static void consolidate_non_empty(Entry* entries, size_t size, size_t num);
+};
+
+// builder
+
+template<typename Derived, typename Allocator>
+class theta_base_builder {
+public:
+ using resize_factor = theta_constants::resize_factor;
+ static const uint8_t MIN_LG_K = theta_constants::MIN_LG_K;
+ static const uint8_t MAX_LG_K = theta_constants::MAX_LG_K;
+ static const uint8_t DEFAULT_LG_K = 12;
+ static const resize_factor DEFAULT_RESIZE_FACTOR = resize_factor::X8;
+
+ /**
+   * Creates an instance of the builder with default parameters.
+ */
+ theta_base_builder(const Allocator& allocator);
+
+ /**
+ * Set log2(k), where k is a nominal number of entries in the sketch
+ * @param lg_k base 2 logarithm of nominal number of entries
+ * @return this builder
+ */
+ Derived& set_lg_k(uint8_t lg_k);
+
+ /**
+ * Set resize factor for the internal hash table (defaults to 8)
+ * @param rf resize factor
+ * @return this builder
+ */
+ Derived& set_resize_factor(resize_factor rf);
+
+ /**
+ * Set sampling probability (initial theta). The default is 1, so the sketch retains
+ * all entries until it reaches the limit, at which point it goes into the estimation mode
+ * and reduces the effective sampling probability (theta) as necessary.
+ * @param p sampling probability
+ * @return this builder
+ */
+ Derived& set_p(float p);
+
+ /**
+ * Set the seed for the hash function. Should be used carefully if needed.
+ * Sketches produced with different seed are not compatible
+ * and cannot be mixed in set operations.
+ * @param seed hash seed
+ * @return this builder
+ */
+ Derived& set_seed(uint64_t seed);
+
+protected:
+ Allocator allocator_;
+ uint8_t lg_k_;
+ resize_factor rf_;
+ float p_;
+ uint64_t seed_;
+
+ uint64_t starting_theta() const;
+ uint8_t starting_lg_size() const;
+ static uint8_t starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf);
+};
+
+// key extractor
+
+struct trivial_extract_key {
+ template<typename T>
+ auto operator()(T&& entry) const -> decltype(std::forward<T>(entry)) {
+ return std::forward<T>(entry);
+ }
+};
+
+// key not zero
+
+template<typename Entry, typename ExtractKey>
+class key_not_zero {
+public:
+ bool operator()(const Entry& entry) const {
+ return ExtractKey()(entry) != 0;
+ }
+};
+
+template<typename Key, typename Entry, typename ExtractKey>
+class key_not_zero_less_than {
+public:
+ explicit key_not_zero_less_than(const Key& key): key(key) {}
+ bool operator()(const Entry& entry) const {
+ return ExtractKey()(entry) != 0 && ExtractKey()(entry) < this->key;
+ }
+private:
+ Key key;
+};
+
+// MurmurHash3 hash function
+
+static inline uint64_t compute_hash(const void* data, size_t length, uint64_t seed) {
+ HashState hashes;
+ MurmurHash3_x64_128(data, length, seed, hashes);
+ return (hashes.h1 >> 1); // Java implementation does unsigned shift >>> to make values positive
+}
+
+// iterators
+
+template<typename Entry, typename ExtractKey>
+class theta_iterator: public std::iterator<std::input_iterator_tag, Entry> {
+public:
+ theta_iterator(Entry* entries, uint32_t size, uint32_t index);
+ theta_iterator& operator++();
+ theta_iterator operator++(int);
+ bool operator==(const theta_iterator& other) const;
+ bool operator!=(const theta_iterator& other) const;
+ Entry& operator*() const;
+
+private:
+ Entry* entries_;
+ uint32_t size_;
+ uint32_t index_;
+};
+
+template<typename Entry, typename ExtractKey>
+class theta_const_iterator: public std::iterator<std::input_iterator_tag, Entry> {
+public:
+ theta_const_iterator(const Entry* entries, uint32_t size, uint32_t index);
+ theta_const_iterator& operator++();
+ theta_const_iterator operator++(int);
+ bool operator==(const theta_const_iterator& other) const;
+ bool operator!=(const theta_const_iterator& other) const;
+ const Entry& operator*() const;
+
+private:
+ const Entry* entries_;
+ uint32_t size_;
+ uint32_t index_;
+};
+
+// double value canonicalization for compatibility with Java
+static inline int64_t canonical_double(double value) {
+ union {
+ int64_t long_value;
+ double double_value;
+ } long_double_union;
+
+ if (value == 0.0) {
+ long_double_union.double_value = 0.0; // canonicalize -0.0 to 0.0
+ } else if (std::isnan(value)) {
+ long_double_union.long_value = 0x7ff8000000000000L; // canonicalize NaN using value from Java's Double.doubleToLongBits()
+ } else {
+ long_double_union.double_value = value;
+ }
+ return long_double_union.long_value;
+}
+
+} /* namespace datasketches */
+
+#include "theta_update_sketch_base_impl.hpp"
+
+#endif
diff --git a/be/src/thirdparty/datasketches/theta_update_sketch_base_impl.hpp b/be/src/thirdparty/datasketches/theta_update_sketch_base_impl.hpp
new file mode 100644
index 0000000..a343c78
--- /dev/null
+++ b/be/src/thirdparty/datasketches/theta_update_sketch_base_impl.hpp
@@ -0,0 +1,394 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef THETA_UPDATE_SKETCH_BASE_IMPL_HPP_
+#define THETA_UPDATE_SKETCH_BASE_IMPL_HPP_
+
+#include <iostream>
+#include <sstream>
+#include <algorithm>
+
+namespace datasketches {
+
+template<typename EN, typename EK, typename A>
+theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(uint8_t lg_cur_size, uint8_t lg_nom_size, resize_factor rf, uint64_t theta, uint64_t seed, const A& allocator, bool is_empty):
+allocator_(allocator),
+is_empty_(is_empty),
+lg_cur_size_(lg_cur_size),
+lg_nom_size_(lg_nom_size),
+rf_(rf),
+num_entries_(0),
+theta_(theta),
+seed_(seed),
+entries_(nullptr)
+{
+ if (lg_cur_size > 0) {
+ const size_t size = 1 << lg_cur_size;
+ entries_ = allocator_.allocate(size);
+ for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
+ }
+}
+
+template<typename EN, typename EK, typename A>
+theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(const theta_update_sketch_base& other):
+allocator_(other.allocator_),
+is_empty_(other.is_empty_),
+lg_cur_size_(other.lg_cur_size_),
+lg_nom_size_(other.lg_nom_size_),
+rf_(other.rf_),
+num_entries_(other.num_entries_),
+theta_(other.theta_),
+seed_(other.seed_),
+entries_(nullptr)
+{
+ if (other.entries_ != nullptr) {
+ const size_t size = 1 << lg_cur_size_;
+ entries_ = allocator_.allocate(size);
+ for (size_t i = 0; i < size; ++i) {
+ if (EK()(other.entries_[i]) != 0) {
+ new (&entries_[i]) EN(other.entries_[i]);
+ } else {
+ EK()(entries_[i]) = 0;
+ }
+ }
+ }
+}
+
+template<typename EN, typename EK, typename A>
+theta_update_sketch_base<EN, EK, A>::theta_update_sketch_base(theta_update_sketch_base&& other) noexcept:
+allocator_(std::move(other.allocator_)),
+is_empty_(other.is_empty_),
+lg_cur_size_(other.lg_cur_size_),
+lg_nom_size_(other.lg_nom_size_),
+rf_(other.rf_),
+num_entries_(other.num_entries_),
+theta_(other.theta_),
+seed_(other.seed_),
+entries_(other.entries_)
+{
+ other.entries_ = nullptr;
+}
+
+template<typename EN, typename EK, typename A>
+theta_update_sketch_base<EN, EK, A>::~theta_update_sketch_base()
+{
+ if (entries_ != nullptr) {
+ const size_t size = 1 << lg_cur_size_;
+ for (size_t i = 0; i < size; ++i) {
+ if (EK()(entries_[i]) != 0) entries_[i].~EN();
+ }
+ allocator_.deallocate(entries_, size);
+ }
+}
+
+template<typename EN, typename EK, typename A>
+theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operator=(const theta_update_sketch_base& other) {
+ theta_update_sketch_base<EN, EK, A> copy(other);
+ std::swap(allocator_, copy.allocator_);
+ std::swap(is_empty_, copy.is_empty_);
+ std::swap(lg_cur_size_, copy.lg_cur_size_);
+ std::swap(lg_nom_size_, copy.lg_nom_size_);
+ std::swap(rf_, copy.rf_);
+ std::swap(num_entries_, copy.num_entries_);
+ std::swap(theta_, copy.theta_);
+ std::swap(seed_, copy.seed_);
+ std::swap(entries_, copy.entries_);
+ return *this;
+}
+
+template<typename EN, typename EK, typename A>
+theta_update_sketch_base<EN, EK, A>& theta_update_sketch_base<EN, EK, A>::operator=(theta_update_sketch_base&& other) {
+ std::swap(allocator_, other.allocator_);
+ std::swap(is_empty_, other.is_empty_);
+ std::swap(lg_cur_size_, other.lg_cur_size_);
+ std::swap(lg_nom_size_, other.lg_nom_size_);
+ std::swap(rf_, other.rf_);
+ std::swap(num_entries_, other.num_entries_);
+ std::swap(theta_, other.theta_);
+ std::swap(seed_, other.seed_);
+ std::swap(entries_, other.entries_);
+ return *this;
+}
+
+template<typename EN, typename EK, typename A>
+uint64_t theta_update_sketch_base<EN, EK, A>::hash_and_screen(const void* data, size_t length) {
+ is_empty_ = false;
+ const uint64_t hash = compute_hash(data, length, seed_);
+ if (hash >= theta_) return 0; // hash == 0 is reserved to mark empty slots in the table
+ return hash;
+}
+
+template<typename EN, typename EK, typename A>
+auto theta_update_sketch_base<EN, EK, A>::find(uint64_t key) const -> std::pair<iterator, bool> {
+ const size_t size = 1 << lg_cur_size_;
+ const size_t mask = size - 1;
+ const uint32_t stride = get_stride(key, lg_cur_size_);
+ uint32_t index = static_cast<uint32_t>(key) & mask;
+ // search for duplicate or zero
+ const uint32_t loop_index = index;
+ do {
+ const uint64_t probe = EK()(entries_[index]);
+ if (probe == 0) {
+ return std::pair<iterator, bool>(&entries_[index], false);
+ } else if (probe == key) {
+ return std::pair<iterator, bool>(&entries_[index], true);
+ }
+ index = (index + stride) & mask;
+ } while (index != loop_index);
+ throw std::logic_error("key not found and no empty slots!");
+}
+
+template<typename EN, typename EK, typename A>
+template<typename Fwd>
+void theta_update_sketch_base<EN, EK, A>::insert(iterator it, Fwd&& entry) {
+ new (it) EN(std::forward<Fwd>(entry));
+ ++num_entries_;
+ if (num_entries_ > get_capacity(lg_cur_size_, lg_nom_size_)) {
+ if (lg_cur_size_ <= lg_nom_size_) {
+ resize();
+ } else {
+ rebuild();
+ }
+ }
+}
+
+template<typename EN, typename EK, typename A>
+auto theta_update_sketch_base<EN, EK, A>::begin() const -> iterator {
+ return entries_;
+}
+
+template<typename EN, typename EK, typename A>
+auto theta_update_sketch_base<EN, EK, A>::end() const -> iterator {
+ return &entries_[1 << lg_cur_size_];
+}
+
+template<typename EN, typename EK, typename A>
+uint32_t theta_update_sketch_base<EN, EK, A>::get_capacity(uint8_t lg_cur_size, uint8_t lg_nom_size) {
+ const double fraction = (lg_cur_size <= lg_nom_size) ? RESIZE_THRESHOLD : REBUILD_THRESHOLD;
+ return std::floor(fraction * (1 << lg_cur_size));
+}
+
+template<typename EN, typename EK, typename A>
+uint32_t theta_update_sketch_base<EN, EK, A>::get_stride(uint64_t key, uint8_t lg_size) {
+ // odd and independent of index assuming lg_size lowest bits of the key were used for the index
+ return (2 * static_cast<uint32_t>((key >> lg_size) & STRIDE_MASK)) + 1;
+}
+
+template<typename EN, typename EK, typename A>
+void theta_update_sketch_base<EN, EK, A>::resize() {
+ const size_t old_size = 1 << lg_cur_size_;
+ const uint8_t lg_tgt_size = lg_nom_size_ + 1;
+ const uint8_t factor = std::max(1, std::min(static_cast<int>(rf_), lg_tgt_size - lg_cur_size_));
+ lg_cur_size_ += factor;
+ const size_t new_size = 1 << lg_cur_size_;
+ EN* old_entries = entries_;
+ entries_ = allocator_.allocate(new_size);
+ for (size_t i = 0; i < new_size; ++i) EK()(entries_[i]) = 0;
+ num_entries_ = 0;
+ for (size_t i = 0; i < old_size; ++i) {
+ const uint64_t key = EK()(old_entries[i]);
+ if (key != 0) {
+ insert(find(key).first, std::move(old_entries[i])); // consider a special insert with no comparison
+ old_entries[i].~EN();
+ }
+ }
+ allocator_.deallocate(old_entries, old_size);
+}
+
+// assumes number of entries > nominal size
+template<typename EN, typename EK, typename A>
+void theta_update_sketch_base<EN, EK, A>::rebuild() {
+ const size_t size = 1 << lg_cur_size_;
+ const uint32_t nominal_size = 1 << lg_nom_size_;
+
+ // empty entries have uninitialized payloads
+ // TODO: avoid this for empty or trivial payloads (arithmetic types)
+ consolidate_non_empty(entries_, size, num_entries_);
+
+ std::nth_element(entries_, entries_ + nominal_size, entries_ + num_entries_, comparator());
+ this->theta_ = EK()(entries_[nominal_size]);
+ EN* old_entries = entries_;
+ const size_t num_old_entries = num_entries_;
+ entries_ = allocator_.allocate(size);
+ for (size_t i = 0; i < size; ++i) EK()(entries_[i]) = 0;
+ num_entries_ = 0;
+ // relies on consolidating non-empty entries to the front
+ for (size_t i = 0; i < nominal_size; ++i) {
+ insert(find(EK()(old_entries[i])).first, std::move(old_entries[i])); // consider a special insert with no comparison
+ old_entries[i].~EN();
+ }
+ for (size_t i = nominal_size; i < num_old_entries; ++i) old_entries[i].~EN();
+ allocator_.deallocate(old_entries, size);
+}
+
+template<typename EN, typename EK, typename A>
+void theta_update_sketch_base<EN, EK, A>::trim() {
+ if (num_entries_ > static_cast<uint32_t>(1 << lg_nom_size_)) rebuild();
+}
+
+template<typename EN, typename EK, typename A>
+void theta_update_sketch_base<EN, EK, A>::consolidate_non_empty(EN* entries, size_t size, size_t num) {
+ // find the first empty slot
+ size_t i = 0;
+ while (i < size) {
+ if (EK()(entries[i]) == 0) break;
+ ++i;
+ }
+ // scan the rest and move non-empty entries to the front
+ for (size_t j = i + 1; j < size; ++j) {
+ if (EK()(entries[j]) != 0) {
+ new (&entries[i]) EN(std::move(entries[j]));
+ entries[j].~EN();
+ EK()(entries[j]) = 0;
+ ++i;
+ if (i == num) break;
+ }
+ }
+}
+
+// builder
+
+template<typename Derived, typename Allocator>
+theta_base_builder<Derived, Allocator>::theta_base_builder(const Allocator& allocator):
+allocator_(allocator), lg_k_(DEFAULT_LG_K), rf_(DEFAULT_RESIZE_FACTOR), p_(1), seed_(DEFAULT_SEED) {}
+
+template<typename Derived, typename Allocator>
+Derived& theta_base_builder<Derived, Allocator>::set_lg_k(uint8_t lg_k) {
+ if (lg_k < MIN_LG_K) {
+ throw std::invalid_argument("lg_k must not be less than " + std::to_string(MIN_LG_K) + ": " + std::to_string(lg_k));
+ }
+ if (lg_k > MAX_LG_K) {
+ throw std::invalid_argument("lg_k must not be greater than " + std::to_string(MAX_LG_K) + ": " + std::to_string(lg_k));
+ }
+ lg_k_ = lg_k;
+ return static_cast<Derived&>(*this);
+}
+
+template<typename Derived, typename Allocator>
+Derived& theta_base_builder<Derived, Allocator>::set_resize_factor(resize_factor rf) {
+ rf_ = rf;
+ return static_cast<Derived&>(*this);
+}
+
+template<typename Derived, typename Allocator>
+Derived& theta_base_builder<Derived, Allocator>::set_p(float p) {
+ if (p <= 0 || p > 1) throw std::invalid_argument("sampling probability must be between 0 and 1");
+ p_ = p;
+ return static_cast<Derived&>(*this);
+}
+
+template<typename Derived, typename Allocator>
+Derived& theta_base_builder<Derived, Allocator>::set_seed(uint64_t seed) {
+ seed_ = seed;
+ return static_cast<Derived&>(*this);
+}
+
+template<typename Derived, typename Allocator>
+uint64_t theta_base_builder<Derived, Allocator>::starting_theta() const {
+ if (p_ < 1) return theta_constants::MAX_THETA * p_;
+ return theta_constants::MAX_THETA;
+}
+
+template<typename Derived, typename Allocator>
+uint8_t theta_base_builder<Derived, Allocator>::starting_lg_size() const {
+ return starting_sub_multiple(lg_k_ + 1, MIN_LG_K, static_cast<uint8_t>(rf_));
+}
+
+template<typename Derived, typename Allocator>
+uint8_t theta_base_builder<Derived, Allocator>::starting_sub_multiple(uint8_t lg_tgt, uint8_t lg_min, uint8_t lg_rf) {
+ return (lg_tgt <= lg_min) ? lg_min : (lg_rf == 0) ? lg_tgt : ((lg_tgt - lg_min) % lg_rf) + lg_min;
+}
+
+// iterator
+
+template<typename Entry, typename ExtractKey>
+theta_iterator<Entry, ExtractKey>::theta_iterator(Entry* entries, uint32_t size, uint32_t index):
+entries_(entries), size_(size), index_(index) {
+ while (index_ < size_ && ExtractKey()(entries_[index_]) == 0) ++index_;
+}
+
+template<typename Entry, typename ExtractKey>
+auto theta_iterator<Entry, ExtractKey>::operator++() -> theta_iterator& {
+ ++index_;
+ while (index_ < size_ && ExtractKey()(entries_[index_]) == 0) ++index_;
+ return *this;
+}
+
+template<typename Entry, typename ExtractKey>
+auto theta_iterator<Entry, ExtractKey>::operator++(int) -> theta_iterator {
+ theta_iterator tmp(*this);
+ operator++();
+ return tmp;
+}
+
+template<typename Entry, typename ExtractKey>
+bool theta_iterator<Entry, ExtractKey>::operator!=(const theta_iterator& other) const {
+ return index_ != other.index_;
+}
+
+template<typename Entry, typename ExtractKey>
+bool theta_iterator<Entry, ExtractKey>::operator==(const theta_iterator& other) const {
+ return index_ == other.index_;
+}
+
+template<typename Entry, typename ExtractKey>
+auto theta_iterator<Entry, ExtractKey>::operator*() const -> Entry& {
+ return entries_[index_];
+}
+
+// const iterator
+
+template<typename Entry, typename ExtractKey>
+theta_const_iterator<Entry, ExtractKey>::theta_const_iterator(const Entry* entries, uint32_t size, uint32_t index):
+entries_(entries), size_(size), index_(index) {
+ while (index_ < size_ && ExtractKey()(entries_[index_]) == 0) ++index_;
+}
+
+template<typename Entry, typename ExtractKey>
+auto theta_const_iterator<Entry, ExtractKey>::operator++() -> theta_const_iterator& {
+ ++index_;
+ while (index_ < size_ && ExtractKey()(entries_[index_]) == 0) ++index_;
+ return *this;
+}
+
+template<typename Entry, typename ExtractKey>
+auto theta_const_iterator<Entry, ExtractKey>::operator++(int) -> theta_const_iterator {
+ theta_const_iterator tmp(*this);
+ operator++();
+ return tmp;
+}
+
+template<typename Entry, typename ExtractKey>
+bool theta_const_iterator<Entry, ExtractKey>::operator!=(const theta_const_iterator& other) const {
+ return index_ != other.index_;
+}
+
+template<typename Entry, typename ExtractKey>
+bool theta_const_iterator<Entry, ExtractKey>::operator==(const theta_const_iterator& other) const {
+ return index_ == other.index_;
+}
+
+template<typename Entry, typename ExtractKey>
+auto theta_const_iterator<Entry, ExtractKey>::operator*() const -> const Entry& {
+ return entries_[index_];
+}
+
+} /* namespace datasketches */
+
+#endif
diff --git a/be/src/thirdparty/datasketches/u32_table.hpp b/be/src/thirdparty/datasketches/u32_table.hpp
index 2316fc1..fe228a5 100644
--- a/be/src/thirdparty/datasketches/u32_table.hpp
+++ b/be/src/thirdparty/datasketches/u32_table.hpp
@@ -39,8 +39,8 @@ template<typename A>
class u32_table {
public:
- u32_table();
- u32_table(uint8_t lg_size, uint8_t num_valid_bits);
+ u32_table(const A& allocator);
+ u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& allocator);
inline size_t get_num_items() const;
inline const uint32_t* get_slots() const;
@@ -52,7 +52,7 @@ public:
// returns true iff the item was present and was therefore removed from the table
inline bool maybe_delete(uint32_t item);
- static u32_table make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k);
+ static u32_table make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k, const A& allocator);
vector_u32<A> unwrapping_get_items() const;
diff --git a/be/src/thirdparty/datasketches/u32_table_impl.hpp b/be/src/thirdparty/datasketches/u32_table_impl.hpp
index aa44ba2..bf8ece9 100644
--- a/be/src/thirdparty/datasketches/u32_table_impl.hpp
+++ b/be/src/thirdparty/datasketches/u32_table_impl.hpp
@@ -29,19 +29,19 @@
namespace datasketches {
template<typename A>
-u32_table<A>::u32_table():
+u32_table<A>::u32_table(const A& allocator):
lg_size(0),
num_valid_bits(0),
num_items(0),
-slots()
+slots(allocator)
{}
template<typename A>
-u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits):
+u32_table<A>::u32_table(uint8_t lg_size, uint8_t num_valid_bits, const A& allocator):
lg_size(lg_size),
num_valid_bits(num_valid_bits),
num_items(0),
-slots(1 << lg_size, UINT32_MAX)
+slots(1 << lg_size, UINT32_MAX, allocator)
{
if (lg_size < 2) throw std::invalid_argument("lg_size must be >= 2");
if (num_valid_bits < 1 || num_valid_bits > 32) throw std::invalid_argument("num_valid_bits must be between 1 and 32");
@@ -110,10 +110,10 @@ bool u32_table<A>::maybe_delete(uint32_t item) {
// this one is specifically tailored to be a part of fm85 decompression scheme
template<typename A>
-u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k) {
+u32_table<A> u32_table<A>::make_from_pairs(const uint32_t* pairs, size_t num_pairs, uint8_t lg_k, const A& allocator) {
uint8_t lg_num_slots = 2;
while (U32_TABLE_UPSIZE_DENOM * num_pairs > U32_TABLE_UPSIZE_NUMER * (1 << lg_num_slots)) lg_num_slots++;
- u32_table<A> table(lg_num_slots, 6 + lg_k);
+ u32_table<A> table(lg_num_slots, 6 + lg_k, allocator);
// Note: there is a possible "snowplow effect" here because the caller is passing in a sorted pairs array
// However, we are starting out with the correct final table size, so the problem might not occur
for (size_t i = 0; i < num_pairs; i++) {
@@ -152,7 +152,7 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
const size_t new_size = 1 << new_lg_size;
if (new_size <= num_items) throw std::logic_error("new_size <= num_items");
vector_u32<A> old_slots = std::move(slots);
- slots = vector_u32<A>(new_size, UINT32_MAX);
+ slots = vector_u32<A>(new_size, UINT32_MAX, old_slots.get_allocator());
lg_size = new_lg_size;
for (size_t i = 0; i < old_size; i++) {
if (old_slots[i] != UINT32_MAX) {
@@ -169,9 +169,9 @@ void u32_table<A>::rebuild(uint8_t new_lg_size) {
// The result is nearly sorted, so make sure to use an efficient sort for that case
template<typename A>
vector_u32<A> u32_table<A>::unwrapping_get_items() const {
- if (num_items == 0) return vector_u32<A>();
+ if (num_items == 0) return vector_u32<A>(slots.get_allocator());
const size_t table_size = 1 << lg_size;
- vector_u32<A> result(num_items);
+ vector_u32<A> result(num_items, 0, slots.get_allocator());
size_t i = 0;
size_t l = 0;
size_t r = num_items - 1;