You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datasketches.apache.org by al...@apache.org on 2020/08/04 01:33:44 UTC

[incubator-datasketches-cpp] branch tuple_sketch updated: array of doubles tuple sketch

This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch tuple_sketch
in repository https://gitbox.apache.org/repos/asf/incubator-datasketches-cpp.git


The following commit(s) were added to refs/heads/tuple_sketch by this push:
     new 3683317  array of doubles tuple sketch
3683317 is described below

commit 368331757a32d1fd0b6d03943b60b696453836cc
Author: AlexanderSaydakov <Al...@users.noreply.github.com>
AuthorDate: Mon Aug 3 18:33:27 2020 -0700

    array of doubles tuple sketch
---
 tuple/CMakeLists.txt                             |   3 +
 tuple/include/array_of_doubles_sketch.hpp        |  72 +++++++++++++++++++++
 tuple/include/array_of_doubles_sketch_impl.hpp   |  76 +++++++++++++++++++++++
 tuple/include/theta_helpers.hpp                  |   6 ++
 tuple/include/tuple_sketch.hpp                   |   5 +-
 tuple/test/CMakeLists.txt                        |   1 +
 tuple/test/aod_1_compact_estimation_from_java.sk | Bin 0 -> 69496 bytes
 tuple/test/array_of_doubles_sketch_test.cpp      |  67 ++++++++++++++++++++
 tuple/test/tuple_sketch_test.cpp                 |   2 +-
 9 files changed, 228 insertions(+), 4 deletions(-)

diff --git a/tuple/CMakeLists.txt b/tuple/CMakeLists.txt
index 3b0e891..fce73ab 100644
--- a/tuple/CMakeLists.txt
+++ b/tuple/CMakeLists.txt
@@ -43,6 +43,7 @@ list(APPEND tuple_HEADERS "include/theta_intersection_base.hpp;include/theta_int
 list(APPEND tuple_HEADERS "include/theta_set_difference_base.hpp;include/theta_set_difference_base_impl.hpp")
 list(APPEND tuple_HEADERS "include/theta_sketch_experimental.hpp;include/theta_sketch_experimental_impl.hpp")
 list(APPEND tuple_HEADERS "include/theta_union_experimental.hpp;include/theta_union_experimental_impl.hpp")
+list(APPEND tuple_HEADERS "include/array_of_doubles_sketch.hpp;include/array_of_doubles_sketch_impl.hpp")
 
 install(TARGETS tuple
   EXPORT ${PROJECT_NAME}
@@ -73,4 +74,6 @@ target_sources(tuple
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_sketch_experimental_impl.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_experimental.hpp
     ${CMAKE_CURRENT_SOURCE_DIR}/include/theta_union_experimental_impl.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_sketch.hpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/include/array_of_doubles_sketch_impl.hpp
 )
diff --git a/tuple/include/array_of_doubles_sketch.hpp b/tuple/include/array_of_doubles_sketch.hpp
new file mode 100644
index 0000000..1ca217d
--- /dev/null
+++ b/tuple/include/array_of_doubles_sketch.hpp
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef ARRAY_OF_DOUBLES_SKETCH_HPP_
+#define ARRAY_OF_DOUBLES_SKETCH_HPP_
+
+#include <array>
+
+#include "serde.hpp"
+#include "tuple_sketch.hpp"
+
+namespace datasketches {
+
+// equivalent of ArrayOfDoublesSketch in Java
+
+template<int num>
+struct array_of_doubles_update_policy {
+  std::array<double, num> create() const {
+    return std::array<double, num>();
+  }
+  void update(std::array<double, num>& summary, const std::array<double, num>& update) const {
+    for (int i = 0; i < num; ++i) summary[i] += update[i];
+  }
+};
+
+template<int num, typename A = std::allocator<std::array<double, num>>>
+using update_array_of_doubles_sketch = update_tuple_sketch<
+    std::array<double, num>, std::array<double, num>, array_of_doubles_update_policy<num>, A>;
+
+template<int num, typename A = std::allocator<std::array<double, num>>>
+class compact_array_of_doubles_sketch: public compact_tuple_sketch<std::array<double, num>, A> {
+public:
+  using Summary = std::array<double, num>;
+  using Base = compact_tuple_sketch<Summary, A>;
+  using Entry = typename Base::Entry;
+  using AllocEntry = typename Base::AllocEntry;
+  using AllocU64 = typename Base::AllocU64;
+  using flags = typename Base::flags;
+
+  static const uint8_t SERIAL_VERSION = 1;
+  static const uint8_t SKETCH_FAMILY = 9;
+  static const uint8_t SKETCH_TYPE = 3;
+
+  compact_array_of_doubles_sketch(const Base& other, bool ordered = true);
+
+  static compact_array_of_doubles_sketch deserialize(std::istream& is, uint64_t seed = DEFAULT_SEED, const A& allocator = A());
+
+  // for internal use
+  compact_array_of_doubles_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
+};
+
+} /* namespace datasketches */
+
+#include "array_of_doubles_sketch_impl.hpp"
+
+#endif
diff --git a/tuple/include/array_of_doubles_sketch_impl.hpp b/tuple/include/array_of_doubles_sketch_impl.hpp
new file mode 100644
index 0000000..e208ed1
--- /dev/null
+++ b/tuple/include/array_of_doubles_sketch_impl.hpp
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+namespace datasketches {
+
+template<int num, typename A>
+compact_array_of_doubles_sketch<num, A>::compact_array_of_doubles_sketch(const Base& other, bool ordered):
+Base(other, ordered) {}
+
+template<int num, typename A>
+compact_array_of_doubles_sketch<num, A>::compact_array_of_doubles_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries):
+Base(is_empty, is_ordered, seed_hash, theta, std::move(entries)) {}
+
+template<int num, typename A>
+compact_array_of_doubles_sketch<num, A> compact_array_of_doubles_sketch<num, A>::deserialize(std::istream& is, uint64_t seed, const A& allocator) {
+  uint8_t preamble_longs;
+  is.read((char*)&preamble_longs, sizeof(preamble_longs));
+  uint8_t serial_version;
+  is.read((char*)&serial_version, sizeof(serial_version));
+  uint8_t family;
+  is.read((char*)&family, sizeof(family));
+  uint8_t type;
+  is.read((char*)&type, sizeof(type));
+  uint8_t flags_byte;
+  is.read((char*)&flags_byte, sizeof(flags_byte));
+  uint8_t num_values;
+  is.read((char*)&num_values, sizeof(num_values));
+  uint16_t seed_hash;
+  is.read((char*)&seed_hash, sizeof(seed_hash));
+  checker<true>::check_sketch_family(family, SKETCH_FAMILY);
+  checker<true>::check_sketch_type(type, SKETCH_TYPE);
+  checker<true>::check_serial_version(serial_version, SERIAL_VERSION);
+  const bool is_empty = flags_byte & (1 << flags::IS_EMPTY);
+  if (!is_empty) checker<true>::check_seed_hash(seed_hash, compute_seed_hash(seed));
+
+  uint64_t theta;
+  is.read((char*)&theta, sizeof(theta));
+  uint32_t num_entries = 0;
+  if (!is_empty) {
+    is.read((char*)&num_entries, sizeof(num_entries));
+    uint32_t unused32;
+    is.read((char*)&unused32, sizeof(unused32));
+  }
+  std::vector<Entry, AllocEntry> entries(allocator);
+  if (!is_empty) {
+    entries.reserve(num_entries);
+    std::vector<uint64_t, AllocU64> keys(num_entries, 0, allocator);
+    is.read((char*)keys.data(), num_entries * sizeof(uint64_t));
+    std::vector<Summary, A> summaries(num_entries, Summary(), allocator);
+    is.read((char*)summaries.data(), num_entries * sizeof(Summary));
+    for (size_t i = 0; i < num_entries; ++i) {
+      entries.push_back(Entry(keys[i], summaries[i]));
+    }
+  }
+  if (!is.good()) throw std::runtime_error("error reading from std::istream");
+  const bool is_ordered = flags_byte & (1 << flags::IS_ORDERED);
+  return compact_array_of_doubles_sketch(is_empty, is_ordered, seed_hash, theta, std::move(entries));
+}
+
+} /* namespace datasketches */
diff --git a/tuple/include/theta_helpers.hpp b/tuple/include/theta_helpers.hpp
index 53ca01b..ca3c655 100644
--- a/tuple/include/theta_helpers.hpp
+++ b/tuple/include/theta_helpers.hpp
@@ -28,6 +28,12 @@ namespace datasketches {
 template<bool dummy>
 class checker {
 public:
+  static void check_sketch_family(uint8_t actual, uint8_t expected) {
+      if (actual != expected) {
+        throw std::invalid_argument("Sketch family mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
+      }
+  }
+
   static void check_sketch_type(uint8_t actual, uint8_t expected) {
       if (actual != expected) {
         throw std::invalid_argument("Sketch type mismatch: expected " + std::to_string((int)expected) + ", actual " + std::to_string((int)actual));
diff --git a/tuple/include/tuple_sketch.hpp b/tuple/include/tuple_sketch.hpp
index 9e61efb..9b960d0 100644
--- a/tuple/include/tuple_sketch.hpp
+++ b/tuple/include/tuple_sketch.hpp
@@ -354,8 +354,9 @@ public:
   using vector_bytes = std::vector<uint8_t, AllocBytes>;
   using comparator = compare_by_key<ExtractKey>;
 
-  static const uint8_t SERIAL_VERSION = 3;
+  static const uint8_t SERIAL_VERSION = 1;
   static const uint8_t SKETCH_TYPE = 3;
+  enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
 
   // Instances of this type can be obtained:
   // - by compacting an update_tuple_sketch
@@ -414,8 +415,6 @@ public:
   compact_tuple_sketch(bool is_empty, bool is_ordered, uint16_t seed_hash, uint64_t theta, std::vector<Entry, AllocEntry>&& entries);
 
 private:
-  enum flags { IS_BIG_ENDIAN, IS_READ_ONLY, IS_EMPTY, IS_COMPACT, IS_ORDERED };
-
   bool is_empty_;
   bool is_ordered_;
   uint16_t seed_hash_;
diff --git a/tuple/test/CMakeLists.txt b/tuple/test/CMakeLists.txt
index f604b70..53a3a7e 100644
--- a/tuple/test/CMakeLists.txt
+++ b/tuple/test/CMakeLists.txt
@@ -45,4 +45,5 @@ target_sources(tuple_test
     tuple_a_not_b_test.cpp
     theta_sketch_experimental_test.cpp
     theta_union_experimental_test.cpp
+    array_of_doubles_sketch_test.cpp
 )
diff --git a/tuple/test/aod_1_compact_estimation_from_java.sk b/tuple/test/aod_1_compact_estimation_from_java.sk
new file mode 100644
index 0000000..d086489
Binary files /dev/null and b/tuple/test/aod_1_compact_estimation_from_java.sk differ
diff --git a/tuple/test/array_of_doubles_sketch_test.cpp b/tuple/test/array_of_doubles_sketch_test.cpp
new file mode 100644
index 0000000..fa7bfe2
--- /dev/null
+++ b/tuple/test/array_of_doubles_sketch_test.cpp
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <array>
+
+#include <catch.hpp>
+#include <array_of_doubles_sketch.hpp>
+
+namespace datasketches {
+
+#ifdef TEST_BINARY_INPUT_PATH
+const std::string inputPath = TEST_BINARY_INPUT_PATH;
+#else
+const std::string inputPath = "test/";
+#endif
+
+TEST_CASE("tuple sketch: array of doubles serialization compatibility with java", "[tuple_sketch]") {
+  auto update_sketch = update_array_of_doubles_sketch<1>::builder().build();
+  std::array<double, 1> a = {1};
+  for (int i = 0; i < 8192; ++i) update_sketch.update(i, a);
+  auto compact_sketch = update_sketch.compact();
+
+  // read binary sketch from Java
+  std::ifstream is;
+  is.exceptions(std::ios::failbit | std::ios::badbit);
+  is.open(inputPath + "aod_1_compact_estimation_from_java.sk", std::ios::binary);
+  auto compact_sketch_from_java = compact_array_of_doubles_sketch<1>::deserialize(is);
+  REQUIRE(compact_sketch.get_num_retained() == compact_sketch_from_java.get_num_retained());
+  REQUIRE(compact_sketch.get_theta() == Approx(compact_sketch_from_java.get_theta()).margin(1e-10));
+  REQUIRE(compact_sketch.get_estimate() == Approx(compact_sketch_from_java.get_estimate()).margin(1e-10));
+  REQUIRE(compact_sketch.get_lower_bound(1) == Approx(compact_sketch_from_java.get_lower_bound(1)).margin(1e-10));
+  REQUIRE(compact_sketch.get_upper_bound(1) == Approx(compact_sketch_from_java.get_upper_bound(1)).margin(1e-10));
+  REQUIRE(compact_sketch.get_lower_bound(2) == Approx(compact_sketch_from_java.get_lower_bound(2)).margin(1e-10));
+  REQUIRE(compact_sketch.get_upper_bound(2) == Approx(compact_sketch_from_java.get_upper_bound(2)).margin(1e-10));
+  REQUIRE(compact_sketch.get_lower_bound(3) == Approx(compact_sketch_from_java.get_lower_bound(3)).margin(1e-10));
+  REQUIRE(compact_sketch.get_upper_bound(3) == Approx(compact_sketch.get_upper_bound(3)).margin(1e-10));
+
+  // sketch from Java is not ordered
+  // transform it to ordered so that iteration sequence would match exactly
+  compact_array_of_doubles_sketch<1> ordered_sketch_from_java(compact_sketch_from_java, true);
+  auto it = ordered_sketch_from_java.begin();
+  for (const auto& entry: compact_sketch) {
+    REQUIRE(entry.first == (*it).first);
+    REQUIRE(entry.second == (*it).second);
+    ++it;
+  }
+}
+
+} /* namespace datasketches */
diff --git a/tuple/test/tuple_sketch_test.cpp b/tuple/test/tuple_sketch_test.cpp
index 560fef3..b4773aa 100644
--- a/tuple/test/tuple_sketch_test.cpp
+++ b/tuple/test/tuple_sketch_test.cpp
@@ -217,7 +217,7 @@ struct three_doubles_update_policy {
   }
 };
 
-TEST_CASE("tuple sketch: array of doubles", "[tuple_sketch]") {
+TEST_CASE("tuple sketch: tuple of doubles", "[tuple_sketch]") {
   using three_doubles_update_tuple_sketch = update_tuple_sketch<three_doubles, three_doubles, three_doubles_update_policy>;
   auto update_sketch = three_doubles_update_tuple_sketch::builder().build();
   update_sketch.update(1, three_doubles(1, 2, 3));


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datasketches.apache.org
For additional commands, e-mail: commits-help@datasketches.apache.org