You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by jd...@apache.org on 2016/07/22 22:02:26 UTC

[6/9] incubator-kudu git commit: ksck: fix a crash in checksum mode on tables with many tablets

ksck: fix a crash in checksum mode on tables with many tablets

In the case that the list of tablets had to be fetched in multiple
batches, we would improperly re-fetch the last tablet of the previous
batch as the first tablet of the next batch. This would then cause
a tablet to be inserted twice into the list, which would later cause
a CHECK failure when we tried to InsertOrDie() this tablet ID into
a map.

This fixes the issue by making sure that each subsequent batch request starts
at the immediate *successor* of the start partition key of the last tablet we
fetched, rather than at that key itself.
I also updated the integration test to use a table with more tablets
so that the batching code is exercised.

Change-Id: I4ca7ef75bd22ce27885e31ab20cf0e8e0ee2d355
Reviewed-on: http://gerrit.cloudera.org:8080/3714
Tested-by: Kudu Jenkins
Reviewed-by: Jean-Daniel Cryans <jd...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/bcf1adc1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/bcf1adc1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/bcf1adc1

Branch: refs/heads/master
Commit: bcf1adc1b88fe28bf89b4b8c1a4daac96c7f0242
Parents: 2e04bf5
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Jul 20 15:40:34 2016 -0700
Committer: Jean-Daniel Cryans <jd...@apache.org>
Committed: Fri Jul 22 20:34:58 2016 +0000

----------------------------------------------------------------------
 src/kudu/tools/ksck_remote-test.cc | 11 ++++++++---
 src/kudu/tools/ksck_remote.cc      | 16 +++++++++-------
 2 files changed, 17 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/bcf1adc1/src/kudu/tools/ksck_remote-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc
index c10fabb..f8d7a83 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -28,6 +28,7 @@
 #include "kudu/util/test_util.h"
 
 DECLARE_int32(heartbeat_interval_ms);
+DECLARE_int32(tablets_batch_size_max);
 
 namespace kudu {
 namespace tools {
@@ -62,6 +63,10 @@ class RemoteKsckTest : public KuduTest {
     // Speed up testing, saves about 700ms per TEST_F.
     FLAGS_heartbeat_interval_ms = 10;
 
+    // Fetch the tablets in smaller batches to regression test a bug
+    // previously seen in the batching code.
+    FLAGS_tablets_batch_size_max = 5;
+
     MiniClusterOptions opts;
     opts.num_tablet_servers = 3;
     mini_cluster_.reset(new MiniCluster(env_.get(), opts));
@@ -137,10 +142,10 @@ class RemoteKsckTest : public KuduTest {
   // Generate a set of split rows for tablets used in this test.
   vector<const KuduPartialRow*> GenerateSplitRows() {
     vector<const KuduPartialRow*> split_rows;
-    vector<int> split_nums = { 33, 66 };
-    for (int i : split_nums) {
+    int num_tablets = AllowSlowTests() ? 10 : 3;
+    for (int i = 1; i < num_tablets; i++) {
       KuduPartialRow* row = schema_.NewRow();
-      CHECK_OK(row->SetInt32(0, i));
+      CHECK_OK(row->SetInt32(0, i * 10));
       split_rows.push_back(row);
     }
     return split_rows;

http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/bcf1adc1/src/kudu/tools/ksck_remote.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc
index 66a495a..3ba95ee 100644
--- a/src/kudu/tools/ksck_remote.cc
+++ b/src/kudu/tools/ksck_remote.cc
@@ -21,12 +21,13 @@
 #include "kudu/common/wire_protocol.h"
 #include "kudu/gutil/map-util.h"
 #include "kudu/gutil/strings/substitute.h"
+#include "kudu/gutil/strings/util.h"
 #include "kudu/util/net/net_util.h"
 #include "kudu/util/net/sockaddr.h"
 
 DEFINE_bool(checksum_cache_blocks, false, "Should the checksum scanners cache the read blocks");
 DEFINE_int64(timeout_ms, 1000 * 60, "RPC timeout in milliseconds");
-DEFINE_int64(tablets_batch_size_max, 100, "How many tablets to get from the Master per RPC");
+DEFINE_int32(tablets_batch_size_max, 100, "How many tablets to get from the Master per RPC");
 
 namespace kudu {
 namespace tools {
@@ -307,10 +308,10 @@ Status RemoteKsckMaster::RetrieveTablesList(vector<shared_ptr<KsckTable>>* table
 Status RemoteKsckMaster::RetrieveTabletsList(const shared_ptr<KsckTable>& table) {
   vector<shared_ptr<KsckTablet>> tablets;
   bool more_tablets = true;
-  string last_key;
+  string next_key;
   int retries = 0;
   while (more_tablets) {
-    Status s = GetTabletsBatch(table, &last_key, tablets, &more_tablets);
+    Status s = GetTabletsBatch(table, &next_key, tablets, &more_tablets);
     if (s.IsServiceUnavailable() && retries++ < 25) {
       SleepFor(MonoDelta::FromMilliseconds(100 * retries));
     } else if (!s.ok()) {
@@ -323,7 +324,7 @@ Status RemoteKsckMaster::RetrieveTabletsList(const shared_ptr<KsckTable>& table)
 }
 
 Status RemoteKsckMaster::GetTabletsBatch(const shared_ptr<KsckTable>& table,
-                                         string* last_partition_key,
+                                         string* next_partition_key,
                                          vector<shared_ptr<KsckTablet>>& tablets,
                                          bool* more_tablets) {
   master::GetTableLocationsRequestPB req;
@@ -332,16 +333,17 @@ Status RemoteKsckMaster::GetTabletsBatch(const shared_ptr<KsckTable>& table,
 
   req.mutable_table()->set_table_name(table->name());
   req.set_max_returned_locations(FLAGS_tablets_batch_size_max);
-  req.set_partition_key_start(*last_partition_key);
+  req.set_partition_key_start(*next_partition_key);
 
   rpc.set_timeout(GetDefaultTimeout());
   RETURN_NOT_OK(proxy_->GetTableLocations(req, &resp, &rpc));
   for (const master::TabletLocationsPB& locations : resp.tablet_locations()) {
-    if (locations.partition().partition_key_start() < *last_partition_key) {
+    if (locations.partition().partition_key_start() < *next_partition_key) {
       // We've already seen this partition.
       continue;
     }
-    *last_partition_key = locations.partition().partition_key_start();
+
+    *next_partition_key = ImmediateSuccessor(locations.partition().partition_key_start());
 
     shared_ptr<KsckTablet> tablet(new KsckTablet(table.get(), locations.tablet_id()));
     vector<shared_ptr<KsckTabletReplica>> replicas;