You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by wd...@apache.org on 2018/10/09 17:07:01 UTC

[3/4] kudu git commit: [compaction] Small improvements to compaction policy tests

[compaction] Small improvements to compaction policy tests

To learn more about compaction policy, I spent some time reading over
compaction_policy-test.cc, and in doing so I made a bunch of small
improvements. I also added one test covering a facet of the policy
that's pretty important but isn't obviously covered in the design docs
or the existing tests.

There are no functional non-test changes in this patch. I added
scaled-down versions of a couple of slow tests that can run all the
time, and made one slow test only run when slow tests are allowed.

Change-Id: I8f7a0e2c69f301d7fc7ca2fac657569ea240d4e3
Reviewed-on: http://gerrit.cloudera.org:8080/11605
Tested-by: Kudu Jenkins
Reviewed-by: Attila Bukor <ab...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/0c91532e
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/0c91532e
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/0c91532e

Branch: refs/heads/master
Commit: 0c91532e742df3b8f09d94ede5d7cde701b3857c
Parents: 77ead35
Author: Will Berkeley <wd...@gmail.org>
Authored: Fri Oct 5 20:41:19 2018 -0700
Committer: Will Berkeley <wd...@gmail.com>
Committed: Tue Oct 9 16:44:00 2018 +0000

----------------------------------------------------------------------
 src/kudu/tablet/compaction_policy-test.cc | 196 +++++++++++++++++--------
 1 file changed, 131 insertions(+), 65 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/0c91532e/src/kudu/tablet/compaction_policy-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tablet/compaction_policy-test.cc b/src/kudu/tablet/compaction_policy-test.cc
index ddf917f..4277f9d 100644
--- a/src/kudu/tablet/compaction_policy-test.cc
+++ b/src/kudu/tablet/compaction_policy-test.cc
@@ -60,124 +60,188 @@ class TestCompactionPolicy : public KuduTest {
 
     BudgetedCompactionPolicy policy(size_budget_mb);
 
-    ASSERT_OK(policy.PickRowSets(tree, picked, quality, nullptr));
+    ASSERT_OK(policy.PickRowSets(tree, picked, quality, /*log=*/nullptr));
   }
 };
 
 
 // Simple test for budgeted compaction: with three rowsets which
-// mostly overlap, and an high budget, they should all be selected.
+// mostly overlap, and a high budget, they should all be selected.
 TEST_F(TestCompactionPolicy, TestBudgetedSelection) {
-  RowSetVector vec = {
+  /*
+   *   [C ------ c]
+   *  [B ----- a]
+   * [A ------- b]
+   */
+  const RowSetVector rowsets = {
     std::make_shared<MockDiskRowSet>("C", "c"),
     std::make_shared<MockDiskRowSet>("B", "a"),
     std::make_shared<MockDiskRowSet>("A", "b")
   };
 
-  const int kBudgetMb = 1000; // enough to select all
+  constexpr auto kBudgetMb = 1000; // Enough to select all rowsets.
   unordered_set<RowSet*> picked;
-  double quality = 0;
-  NO_FATALS(RunTestCase(vec, kBudgetMb, &picked, &quality));
-  ASSERT_EQ(3, picked.size());
+  double quality = 0.0;
+  NO_FATALS(RunTestCase(rowsets, kBudgetMb, &picked, &quality));
+  ASSERT_EQ(rowsets.size(), picked.size());
+  // Since all three rowsets are picked and each covers almost the entire key
+  // range, the sum of widths is about 3 while the union width is 1, so the
+  // quality score should be between 1 and 2.
   ASSERT_GE(quality, 1.0);
+  ASSERT_LE(quality, 2.0);
 }
 
 // Test for the case when we have many rowsets, but none of them
 // overlap at all. This is likely to occur in workloads where the
 // primary key is always increasing (such as a timestamp).
-TEST_F(TestCompactionPolicy, TestNonOverlappingRowsets) {
-  if (!AllowSlowTests()) {
-    LOG(INFO) << "Skipped";
-    return;
-  }
-  RowSetVector vec;
-  for (int i = 0; i < 10000;  i++) {
-    vec.emplace_back(new MockDiskRowSet(
+TEST_F(TestCompactionPolicy, TestNonOverlappingRowSets) {
+  /* NB: Zero-padding of string keys omitted to save space.
+   *
+   * [0 - 1] [2 - 3] ... [198 - 199]
+   */
+  const auto kNumRowSets = AllowSlowTests() ? 10000 : 100;
+  RowSetVector rowsets;
+  for (auto i = 0; i < kNumRowSets; i++) {
+    rowsets.emplace_back(new MockDiskRowSet(
         StringPrintf("%010d", i * 2),
         StringPrintf("%010d", i * 2 + 1)));
   }
+  constexpr auto kBudgetMb = 128;
   unordered_set<RowSet*> picked;
-  double quality = 0;
-  NO_FATALS(RunTestCase(vec, /*size_budget_mb=*/128, &picked, &quality));
-  ASSERT_EQ(quality, 0);
+  double quality = 0.0;
+  NO_FATALS(RunTestCase(rowsets, kBudgetMb, &picked, &quality));
+  ASSERT_EQ(quality, 0.0);
   ASSERT_TRUE(picked.empty());
 }
 
 // Similar to the above, but with some small overlap between adjacent
 // rowsets.
-TEST_F(TestCompactionPolicy, TestTinyOverlapRowsets) {
-  if (!AllowSlowTests()) {
-    LOG(INFO) << "Skipped";
-    return;
-  }
-
-  RowSetVector vec;
-  for (int i = 0; i < 10000;  i++) {
-    vec.emplace_back(new MockDiskRowSet(
-        StringPrintf("%010d", i * 10000),
-        StringPrintf("%010d", i * 10000 + 11000)));
+TEST_F(TestCompactionPolicy, TestTinyOverlapRowSets) {
+  /* NB: Zero-padding of string keys omitted to save space.
+   *
+   * [0 - 11000]
+   *    [10000 - 21000]
+   *           ...
+   *                    [990000 - 1001000]
+   */
+  const auto kNumRowSets = AllowSlowTests() ? 10000 : 100;
+  RowSetVector rowsets;
+  for (auto i = 0; i < kNumRowSets; i++) {
+    rowsets.emplace_back(new MockDiskRowSet(
+        StringPrintf("%09d", i * 10000),
+        StringPrintf("%09d", i * 10000 + 11000)));
   }
+  constexpr auto kBudgetMb = 128;
   unordered_set<RowSet*> picked;
   double quality = 0;
-  NO_FATALS(RunTestCase(vec, /*size_budget_mb=*/128, &picked, &quality));
-  ASSERT_EQ(quality, 0);
+  NO_FATALS(RunTestCase(rowsets, kBudgetMb, &picked, &quality));
+  // With such small overlaps, no compaction will be considered worthwhile.
+  ASSERT_EQ(quality, 0.0);
   ASSERT_TRUE(picked.empty());
 }
 
 // Test case with 100 rowsets, each of which overlaps with its two
 // neighbors to the right.
 TEST_F(TestCompactionPolicy, TestSignificantOverlap) {
-  RowSetVector vec;
-  for (int i = 0; i < 100;  i++) {
-    vec.emplace_back(new MockDiskRowSet(
+  /* NB: Zero-padding of string keys omitted to save space.
+   *
+   * [0 ------ 20000]
+   *    [10000 ------ 30000]
+   *           [20000 ------ 40000]
+   *                  ...
+   *                                [990000 - 1010000]
+   */
+  constexpr auto kNumRowSets = 100;
+  RowSetVector rowsets;
+  for (auto i = 0; i < kNumRowSets;  i++) {
+    rowsets.emplace_back(new MockDiskRowSet(
         StringPrintf("%010d", i * 10000),
         StringPrintf("%010d", (i + 2) * 10000)));
   }
+  constexpr auto kBudgetMb = 64;
   unordered_set<RowSet*> picked;
-  double quality = 0;
-  const int kBudgetMb = 64;
-  NO_FATALS(RunTestCase(vec, kBudgetMb, &picked, &quality));
-  ASSERT_GT(quality, 0.5);
-  // Each rowset is 1MB so we should exactly fill the budget.
+  double quality = 0.0;
+  NO_FATALS(RunTestCase(rowsets, kBudgetMb, &picked, &quality));
+  // Each rowset is 1MB so the number of rowsets picked should be the number of
+  // MB in the budget.
   ASSERT_EQ(picked.size(), kBudgetMb);
+
+  // In picking 64 of 100 equally-sized and -spaced rowsets, the union width is
+  // about 0.64. Each rowset intersects the next in half its width, which makes
+  // the sum of widths about 2 * 0.64 = 1.28, if we ignore the smaller overlap
+  // between the ith and (i + 2)th rowsets. So, the quality score should be
+  // around 0.64.
+  ASSERT_GT(quality, 0.5);
+}
+
+// Test the adjustment we make to the quality measure that penalizes wider
+// solutions that have almost the same quality score. For example, with no
+// adjustment and inputs
+//
+//  [ -- A -- ]
+//  [ -- B -- ]
+//            [ -- C -- ]
+//
+// compacting {A, B, C} results in the same quality score as {A, B}, 0.67, but
+// uses more budget. By penalizing the wider solution {A, B, C}, we ensure we
+// don't waste I/O.
+TEST_F(TestCompactionPolicy, TestSupportAdjust) {
+  const RowSetVector rowsets = {
+    std::make_shared<MockDiskRowSet>("A", "B"),
+    std::make_shared<MockDiskRowSet>("A", "B"),
+    std::make_shared<MockDiskRowSet>("B", "C"),
+  };
+  constexpr auto kBudgetMb = 1000; // Enough to select all rowsets.
+  unordered_set<RowSet*> picked;
+  double quality = 0.0;
+  NO_FATALS(RunTestCase(rowsets, kBudgetMb, &picked, &quality));
+  ASSERT_EQ(2, picked.size());
+  // The weights of A and B are both 0.67, so with the adjustment the quality
+  // score should be a little less than 0.67.
+  ASSERT_GE(quality, 0.6);
 }
 
 static RowSetVector LoadFile(const string& name) {
-  RowSetVector ret;
-  string path = JoinPathSegments(GetTestExecutableDirectory(), name);
+  RowSetVector rowsets;
+  const string path = JoinPathSegments(GetTestExecutableDirectory(), name);
   faststring data;
   CHECK_OK_PREPEND(ReadFileToString(Env::Default(), path, &data),
-                   strings::Substitute("Unable to load test data file $0", path));
-  vector<string> lines = strings::Split(data.ToString(), "\n");
+                   strings::Substitute("unable to load test data file $0", path));
+  const vector<string> lines = strings::Split(data.ToString(), "\n");
   for (const auto& line : lines) {
     if (line.empty() || line[0] == '#') continue;
-    vector<string> fields = strings::Split(line, "\t");
+    const vector<string> fields = strings::Split(line, "\t");
     CHECK_EQ(3, fields.size()) << "Expected 3 fields on line: " << line;
-    int size_mb = ParseLeadingInt32Value(fields[0], -1);
+    const int size_mb = ParseLeadingInt32Value(fields[0], -1);
     CHECK_GE(size_mb, 1) << "Expected size at least 1MB on line: " << line;
-    ret.emplace_back(new MockDiskRowSet(fields[1] /* min key */,
-                                        fields[2] /* max key */,
-                                        size_mb * 1024 * 1024));
+    rowsets.emplace_back(new MockDiskRowSet(fields[1] /* min key */,
+                                            fields[2] /* max key */,
+                                            size_mb * 1024 * 1024));
   }
 
-  return ret;
+  return rowsets;
 }
 
-// Realistic test using data scraped from a tablet containing 200+GB of YCSB data.
-// This test can be used as a benchmark for optimizing the compaction policy,
-// and also serves as a basic regression/stress test using real data.
+// Realistic test using data scraped from a tablet containing 200GB+ of YCSB
+// data. This test can be used as a benchmark for optimizing the compaction
+// policy, and also serves as a basic regression/stress test using real data.
 TEST_F(TestCompactionPolicy, TestYcsbCompaction) {
-  RowSetVector vec = LoadFile("ycsb-test-rowsets.tsv");
+  if (!AllowSlowTests()) {
+    LOG(WARNING) << "test is skipped; set KUDU_ALLOW_SLOW_TESTS=1 to run";
+    return;
+  }
+  const RowSetVector rowsets = LoadFile("ycsb-test-rowsets.tsv");
   RowSetTree tree;
-  ASSERT_OK(tree.Reset(vec));
+  ASSERT_OK(tree.Reset(rowsets));
   vector<double> qualities;
   for (int budget_mb : {128, 256, 512, 1024}) {
     BudgetedCompactionPolicy policy(budget_mb);
 
     unordered_set<RowSet*> picked;
-    double quality = 0;
-    LOG_TIMING(INFO, strings::Substitute("Computing compaction with $0MB budget", budget_mb)) {
-      ASSERT_OK(policy.PickRowSets(tree, &picked, &quality, nullptr));
+    double quality = 0.0;
+    LOG_TIMING(INFO, strings::Substitute("computing compaction with $0MB budget",
+                                         budget_mb)) {
+      ASSERT_OK(policy.PickRowSets(tree, &picked, &quality, /*log=*/nullptr));
     }
     LOG(INFO) << "quality=" << quality;
     int total_size = 0;
@@ -188,26 +252,28 @@ TEST_F(TestCompactionPolicy, TestYcsbCompaction) {
     qualities.push_back(quality);
   }
 
-  // Given increasing budgets, our solutions should also be higher quality.
-  ASSERT_TRUE(std::is_sorted(qualities.begin(), qualities.end()))
-      << qualities;
+  // Since the budget increased in each iteration, the qualities should be
+  // non-decreasing.
+  ASSERT_TRUE(std::is_sorted(qualities.begin(), qualities.end())) << qualities;
 }
 
 // Regression test for KUDU-2251 which ensures that large (> 2GiB) rowsets don't
 // cause integer overflow in compaction planning.
 TEST_F(TestCompactionPolicy, KUDU2251) {
-  RowSetVector vec = {
+  // Same arrangement as in TestBudgetedSelection.
+  const RowSetVector rowsets = {
     std::make_shared<MockDiskRowSet>("C", "c", 1L << 31),
     std::make_shared<MockDiskRowSet>("B", "a", 1L << 32),
     std::make_shared<MockDiskRowSet>("A", "b", 1L << 33)
   };
 
-  const int kBudgetMb = 1L << 30; // enough to select all
+  constexpr auto kBudgetMb = 1L << 30; // Enough to select all rowsets.
   unordered_set<RowSet*> picked;
-  double quality = 0;
-  NO_FATALS(RunTestCase(vec, kBudgetMb, &picked, &quality));
+  double quality = 0.0;
+  NO_FATALS(RunTestCase(rowsets, kBudgetMb, &picked, &quality));
   ASSERT_EQ(3, picked.size());
-  ASSERT_GE(quality, 1.0);
+  ASSERT_GT(quality, 1.0);
+  ASSERT_LT(quality, 2.0);
 }
 
 } // namespace tablet