You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by aw...@apache.org on 2020/04/09 04:03:04 UTC
[kudu] 03/03: encoding-test: Clean up bitshuffle tests a little

This is an automated email from the ASF dual-hosted git repository.

awong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 972edb9332dde7cc6fba542343c7a7434ef6cbff
Author: Bankim Bhavsar <ba...@cloudera.com>
AuthorDate: Tue Mar 31 12:40:47 2020 -0700

    encoding-test: Clean up bitshuffle tests a little
    
    Also adds functions in random number generator to generate 128-bit integers.
    
    Change-Id: I8b7bee1f952c46c09df5102ac1916141c6935284
    Reviewed-on: http://gerrit.cloudera.org:8080/15043
    Reviewed-by: Andrew Wong <aw...@cloudera.com>
    Tested-by: Kudu Jenkins
---
 src/kudu/cfile/encoding-test.cc   | 58 ++++++++++++++++++++++-----------------
 src/kudu/util/random.h            | 53 +++++++++++++++++++++++++++++++----
 src/kudu/util/random_util-test.cc | 50 +++++++++++++++++++++++++++++++++
 src/kudu/util/random_util.h       | 31 +++++++++++++++++++--
 4 files changed, 158 insertions(+), 34 deletions(-)

diff --git a/src/kudu/cfile/encoding-test.cc b/src/kudu/cfile/encoding-test.cc
index cb25c1b..007f5d4 100644
--- a/src/kudu/cfile/encoding-test.cc
+++ b/src/kudu/cfile/encoding-test.cc
@@ -23,6 +23,7 @@
 #include <cstdio>
 #include <cstdlib>
 #include <functional>
+#include <initializer_list>
 #include <limits>
 #include <memory>
 #include <ostream>
@@ -436,7 +437,7 @@ class TestEncoding : public KuduTest {
   }
 
   template <DataType Type, class BlockBuilder, class BlockDecoder>
-  void TestEncodeDecodeTemplateBlockEncoder(typename TypeTraits<Type>::cpp_type* src,
+  void TestEncodeDecodeTemplateBlockEncoder(const typename TypeTraits<Type>::cpp_type* src,
                                             size_t size) {
     typedef typename TypeTraits<Type>::cpp_type CppType;
     const uint32_t kOrdinalPosBase = 12345;
@@ -719,39 +720,46 @@ TEST_F(TestEncoding, TestPlainBlockEncoder) {
 
 // Test for bitshuffle block, for INT32, INT64, INT128, FLOAT, DOUBLE
 TEST_F(TestEncoding, TestBShufInt32BlockEncoder) {
-  const uint32_t kSize = 10000;
-
-  unique_ptr<int32_t[]> ints(new int32_t[kSize]);
-  for (int i = 0; i < kSize; i++) {
-    ints.get()[i] = random();
+  using limits = std::numeric_limits<int32_t>;
+  Random rng(SeedRandom());
+  auto sequences = {
+      CreateRandomIntegersInRange<int32_t>(10000, 1000, 2000, &rng),
+      CreateRandomIntegersInRange<int32_t>(10000, 0, limits::max(), &rng),
+      CreateRandomIntegersInRange<int32_t>(10000, limits::min(), limits::max(), &rng)
+  };
+  for (const auto& ints : sequences) {
+    TestEncodeDecodeTemplateBlockEncoder<INT32, BShufBlockBuilder<INT32>,
+                                         BShufBlockDecoder<INT32> >(ints.data(), ints.size());
   }
-
-  TestEncodeDecodeTemplateBlockEncoder<INT32, BShufBlockBuilder<INT32>,
-      BShufBlockDecoder<INT32>>(ints.get(), kSize);
 }
 
 TEST_F(TestEncoding, TestBShufInt64BlockEncoder) {
-  const uint32_t kSize = 10000;
-
-  unique_ptr<int64_t[]> ints(new int64_t[kSize]);
-  for (int i = 0; i < kSize; i++) {
-    ints.get()[i] = random();
+  using limits = std::numeric_limits<int64_t>;
+  Random rng(SeedRandom());
+  auto sequences = {
+      CreateRandomIntegersInRange<int64_t>(10000, 1000, 2000, &rng),
+      CreateRandomIntegersInRange<int64_t>(10000, 0, limits::max(), &rng),
+      CreateRandomIntegersInRange<int64_t>(10000, limits::min(), limits::max(), &rng)
+  };
+  for (const auto& ints : sequences) {
+    TestEncodeDecodeTemplateBlockEncoder<INT64, BShufBlockBuilder<INT64>,
+                                         BShufBlockDecoder<INT64> >(ints.data(), ints.size());
   }
-
-  TestEncodeDecodeTemplateBlockEncoder<INT64, BShufBlockBuilder<INT64>,
-      BShufBlockDecoder<INT64>>(ints.get(), kSize);
 }
 
 TEST_F(TestEncoding, TestBShufInt128BlockEncoder) {
-  const uint32_t kSize = 10000;
-
-  unique_ptr<int128_t[]> ints(new int128_t[kSize]);
-  for (int i = 0; i < kSize; i++) {
-    ints.get()[i] = random();
+  Random rng(SeedRandom());
+  // As per the note in int128.h, numeric_limits<> on int128_t can give incorrect results.
+  // Hence using predefined min, max constants.
+  auto sequences = {
+      CreateRandomIntegersInRange<int128_t>(10000, 1000, 2000, &rng),
+      CreateRandomIntegersInRange<int128_t>(10000, 0, INT128_MAX, &rng),
+      CreateRandomIntegersInRange<int128_t>(10000, INT128_MIN, INT128_MAX, &rng)
+  };
+  for (const auto& ints : sequences) {
+    TestEncodeDecodeTemplateBlockEncoder<INT128, BShufBlockBuilder<INT128>,
+                                         BShufBlockDecoder<INT128> >(ints.data(), ints.size());
   }
-
-  TestEncodeDecodeTemplateBlockEncoder<INT128, BShufBlockBuilder<INT128>,
-      BShufBlockDecoder<INT128>>(ints.get(), kSize);
 }
 
 TEST_F(TestEncoding, TestBShufFloatBlockEncoder) {
diff --git a/src/kudu/util/random.h b/src/kudu/util/random.h
index 51e578b..9c2722d 100644
--- a/src/kudu/util/random.h
+++ b/src/kudu/util/random.h
@@ -12,6 +12,7 @@
 
 #include "kudu/gutil/casts.h"
 #include "kudu/gutil/map-util.h"
+#include "kudu/util/int128.h"
 #include "kudu/util/locks.h"
 
 namespace kudu {
@@ -82,6 +83,16 @@ class Random {
     return large;
   }
 
+  // Next pseudo-random 128-bit unsigned integer.
+  uint128_t Next128() {
+    uint128_t large = Next64();
+    large <<= 64U;
+    large |= Next64();
+    // Next64() provides entire range of numbers up to 64th bit.
+    // So unlike Next64(), no need to fill MSB bit(s).
+    return large;
+  }
+
   // Returns a uniformly distributed value in the range [0..n-1]
   // REQUIRES: n > 0
   uint32_t Uniform(uint32_t n) { return Next() % n; }
@@ -93,6 +104,9 @@ class Random {
   // REQUIRES: n > 0
   uint64_t Uniform64(uint64_t n) { return Next64() % n; }
 
+  // Returns a uniformly distributed 128-bit value in the range [0..n-1]
+  uint128_t Uniform128(uint128_t n) { return Next128() % n; }
+
   // Randomly returns true ~"1/n" of the time, and false otherwise.
   // REQUIRES: n > 0
   bool OneIn(int n) { return (Next() % n) == 0; }
@@ -140,6 +154,11 @@ class ThreadSafeRandom {
     return random_.Next64();
   }
 
+  uint128_t Next128() {
+    std::lock_guard<simple_spinlock> l(lock_);
+    return random_.Next128();
+  }
+
   uint32_t Uniform(uint32_t n) {
     std::lock_guard<simple_spinlock> l(lock_);
     return random_.Uniform(n);
@@ -155,6 +174,11 @@ class ThreadSafeRandom {
     return random_.Uniform64(n);
   }
 
+  uint128_t Uniform128(uint128_t n) {
+    std::lock_guard<simple_spinlock> l(lock_);
+    return random_.Uniform128(n);
+  }
+
   bool OneIn(int n) {
     std::lock_guard<simple_spinlock> l(lock_);
     return random_.OneIn(n);
@@ -207,18 +231,35 @@ inline double Random::Normal(double mean, double std_dev) {
 }
 
 // Generate next random integer with data-type specified by IntType template
-// parameter which must be a 32-bit or 64-bit integer. This constexpr function
+// parameter which must be a 32-bit, 64-bit, or 128-bit unsigned integer. This constexpr function
 // is useful when generating random numbers in a loop with template parameter
-// and avoids the run-time cost of determining Next32() v/s Next64() in RNG.
+// and avoids the run-time cost of determining Next32() v/s Next64() v/s Next128() in RNG.
 //
 // Note: This constexpr function can't be a member function of
 // Random/ThreadSafeRandom class because it invokes non-const member function.
 template<typename IntType, class RNG>
 constexpr IntType GetNextRandom(RNG* rand) {
-  static_assert(
-      std::is_integral<IntType>::value && (sizeof(IntType) == 4 || sizeof(IntType) == 8),
-      "Only 32-bit and 64-bit integers supported");
-  return sizeof(IntType) == 4 ? rand->Next32() : rand->Next64();
+  // type_traits not defined for 128-bit int data types, hence not checking for integral value.
+  static_assert(sizeof(IntType) == 4 || sizeof(IntType) == 8 || sizeof(IntType) == 16,
+                "Only 32-bit, 64-bit, or 128-bit integers supported");
+
+  // if/switch statement disallowed in constexpr function in C++11.
+  return sizeof(IntType) == 4 ? rand->Next32() :
+                                sizeof(IntType) == 8 ? rand->Next64() : rand->Next128();
+}
+
+// Generate next random integer in the [0, n-1] range with data-type specified by IntType template
+// parameter which must be a 32-bit, 64-bit, or 128-bit unsigned integer.
+// Note: Same comments apply to this constexpr function as GetNextRandom() function above.
+template<typename IntType, class RNG>
+constexpr IntType GetNextUniformRandom(IntType n, RNG* rand) {
+  // type_traits not defined for 128-bit int data types, hence not checking for integral value.
+  static_assert(sizeof(n) == 4 || sizeof(n) == 8 || sizeof(n) == 16,
+                "Only 32-bit, 64-bit, or 128-bit integers generated");
+
+  // if/switch statement disallowed in constexpr function in C++11.
+  return sizeof(n) == 4 ? rand->Uniform32(n) :
+                          sizeof(n) == 8 ? rand->Uniform64(n) : rand->Uniform128(n);
 }
 
 }  // namespace kudu
diff --git a/src/kudu/util/random_util-test.cc b/src/kudu/util/random_util-test.cc
index cf17225..9170aca 100644
--- a/src/kudu/util/random_util-test.cc
+++ b/src/kudu/util/random_util-test.cc
@@ -19,13 +19,16 @@
 
 #include <cstdint>
 #include <cstring>
+#include <limits>
 #include <ostream>
 #include <unordered_set>
+#include <vector>
 
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
 #include "kudu/gutil/map-util.h"
+#include "kudu/util/int128.h"
 #include "kudu/util/random.h"
 #include "kudu/util/test_util.h"
 
@@ -77,6 +80,26 @@ TEST_F(RandomUtilTest, TestRandomString) {
   CheckEmpty(start, 0, 0, kLenMax);
 }
 
+// Static helper function for testing generation of random numbers in range.
+// CreateRandomUniqueIntegers() doesn't support 128-bit integers yet. Hence a separate
+// template function instead of member of TemplateRandomUtilTest class.
+// TODO(bankim): Once CreateRandomIntegersInRange() supports 128-bit integers make it a
+//               member function of TemplateRandomUtilTest class.
+template <typename IntType>
+static void RunCreateRandomIntegersInRangerHelper(int num_trials, IntType min_val, IntType max_val,
+                                                  Random* rng) {
+  static constexpr int kMaxNumVals = 1000;
+  for (int i = 0; i < num_trials; ++i) {
+    int num_vals = rng->Uniform(kMaxNumVals);
+    auto vals = CreateRandomIntegersInRange<IntType>(num_vals, min_val, max_val, rng);
+    ASSERT_EQ(num_vals, vals.size());
+    for (const auto& v : vals) {
+      ASSERT_GE(v, min_val);
+      ASSERT_LT(v, max_val);
+    }
+  }
+}
+
 template<typename IntType>
 class TemplateRandomUtilTest : public RandomUtilTest {
  public:
@@ -95,6 +118,15 @@ class TemplateRandomUtilTest : public RandomUtilTest {
       }
     }
   }
+
+  void RunCreateRandomIntegersInRange() {
+    static constexpr IntType min_val = std::numeric_limits<IntType>::min();
+    static constexpr IntType max_val = std::numeric_limits<IntType>::max();
+    // Exercise entire range.
+    RunCreateRandomIntegersInRangerHelper(kNumTrials, min_val, max_val, &rng_);
+    // Exercise partial range.
+    RunCreateRandomIntegersInRangerHelper(kNumTrials, min_val / 2, max_val / 2, &rng_);
+  }
 };
 
 // Testing with char, short data-types will result in compile-time error, as expected.
@@ -106,4 +138,22 @@ TYPED_TEST(TemplateRandomUtilTest, RunCreateRandomUniqueIntegers) {
   this->RunCreateRandomUniqueIntegers();
 }
 
+TYPED_TEST(TemplateRandomUtilTest, RunCreateRandomIntegersInRange) {
+  this->RunCreateRandomIntegersInRange();
+}
+
+// TODO(bankim): CreateRandomUniqueIntegers() doesn't support 128-bit integers yet.
+//               Once it does add int128_t and uint128_t types to IntTypes and use
+//               templatized TemplateRandomUtilTest instead.
+TEST_F(RandomUtilTest, RunCreateRandom128BitIntegersInRange) {
+  // Exercise entire range.
+  RunCreateRandomIntegersInRangerHelper<int128_t>(kNumTrials, INT128_MIN, INT128_MAX, &rng_);
+  RunCreateRandomIntegersInRangerHelper<uint128_t>(kNumTrials, 0, UINT128_MAX, &rng_);
+
+  // Exercise partial range.
+  RunCreateRandomIntegersInRangerHelper<int128_t>(kNumTrials, INT128_MIN / 2, INT128_MAX / 2,
+                                                  &rng_);
+  RunCreateRandomIntegersInRangerHelper<uint128_t>(kNumTrials, 0, UINT128_MAX / 2, &rng_);
+}
+
 } // namespace kudu
diff --git a/src/kudu/util/random_util.h b/src/kudu/util/random_util.h
index b89ea3c..4448023 100644
--- a/src/kudu/util/random_util.h
+++ b/src/kudu/util/random_util.h
@@ -27,6 +27,7 @@
 #include <glog/logging.h>
 
 #include "kudu/gutil/map-util.h"
+#include "kudu/util/int128_util.h"
 #include "kudu/util/random.h"
 
 namespace kudu {
@@ -103,13 +104,13 @@ void ReservoirSample(const Collection& c, int k, const Set& avoid,
 
 // Generates random and unique 32-bit or 64-bit integers that are not present in
 // the "avoid" set.
+// TODO(bankim): Add support for 128-bit integers by providing hash value for 128-bit data types.
 template<typename IntType, class Set, class RNG>
 std::unordered_set<IntType> CreateRandomUniqueIntegers(const int num_values,
                                                        const Set& avoid,
                                                        RNG* rand) {
-  static_assert(
-      std::is_integral<IntType>::value && (sizeof(IntType) == 4 || sizeof(IntType) == 8),
-      "Only 32-bit and 64-bit integers generated");
+  static_assert(std::is_integral<IntType>::value && (sizeof(IntType) == 4 || sizeof(IntType) == 8),
+                "Only 32-bit or 64-bit integers generated");
 
   std::unordered_set<IntType> result;
   while (result.size() < num_values) {
@@ -124,5 +125,29 @@ std::unordered_set<IntType> CreateRandomUniqueIntegers(const int num_values,
   return result;
 }
 
+// Generates random 32-bit, 64-bit, or 128-bit integers in the [min_val, max_val) range.
+template<typename IntType, class RNG>
+// When entire range is specified for signed IntType, result of max_val - min_val is -1
+// which when converted to unsigned type correctly represents the max value for the
+// same unsigned IntType. Variants of Uniform() in RNG work with unsigned types.
+// Hence the ASAN suppression.
+ATTRIBUTE_NO_SANITIZE_INTEGER
+std::vector<IntType> CreateRandomIntegersInRange(const int num_values,
+                                                 IntType min_val,
+                                                 IntType max_val,
+                                                 RNG* rand) {
+  // type_traits not defined for 128-bit int data types, hence not checking for integral value.
+  static_assert(sizeof(IntType) == 4 || sizeof(IntType) == 8 || sizeof(IntType) == 16,
+                "Only 32-bit, 64-bit, or 128-bit integers generated");
+  CHECK_LT(min_val, max_val);
+
+  std::vector<IntType> result(num_values);
+  for (auto& v : result) {
+    v = min_val + GetNextUniformRandom(max_val - min_val, rand);
+  }
+  return result;
+}
+
+
 } // namespace kudu