You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by jd...@apache.org on 2016/07/22 22:02:26 UTC
[6/9] incubator-kudu git commit: ksck: fix a crash in checksum mode
on tables with many tablets
ksck: fix a crash in checksum mode on tables with many tablets
In the case that the list of tablets had to be fetched in multiple
batches, we would improperly re-fetch the last tablet of the previous
batch as the first tablet of the next batch. This would then cause
a tablet to be inserted twice into the list, which would later cause
a CHECK failure when we tried to InsertOrDie() this tablet ID into
a map.
This fixes the issue by making sure that each subsequent batch request starts at
the immediate *successor* of the start partition key of the last tablet we fetched.
I also updated the integration test to use a table with more tablets
so that the batching code was exercised.
Change-Id: I4ca7ef75bd22ce27885e31ab20cf0e8e0ee2d355
Reviewed-on: http://gerrit.cloudera.org:8080/3714
Tested-by: Kudu Jenkins
Reviewed-by: Jean-Daniel Cryans <jd...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/bcf1adc1
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/bcf1adc1
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/bcf1adc1
Branch: refs/heads/master
Commit: bcf1adc1b88fe28bf89b4b8c1a4daac96c7f0242
Parents: 2e04bf5
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Jul 20 15:40:34 2016 -0700
Committer: Jean-Daniel Cryans <jd...@apache.org>
Committed: Fri Jul 22 20:34:58 2016 +0000
----------------------------------------------------------------------
src/kudu/tools/ksck_remote-test.cc | 11 ++++++++---
src/kudu/tools/ksck_remote.cc | 16 +++++++++-------
2 files changed, 17 insertions(+), 10 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/bcf1adc1/src/kudu/tools/ksck_remote-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote-test.cc b/src/kudu/tools/ksck_remote-test.cc
index c10fabb..f8d7a83 100644
--- a/src/kudu/tools/ksck_remote-test.cc
+++ b/src/kudu/tools/ksck_remote-test.cc
@@ -28,6 +28,7 @@
#include "kudu/util/test_util.h"
DECLARE_int32(heartbeat_interval_ms);
+DECLARE_int32(tablets_batch_size_max);
namespace kudu {
namespace tools {
@@ -62,6 +63,10 @@ class RemoteKsckTest : public KuduTest {
// Speed up testing, saves about 700ms per TEST_F.
FLAGS_heartbeat_interval_ms = 10;
+ // Fetch the tablets in smaller batches to regression test a bug
+ // previously seen in the batching code.
+ FLAGS_tablets_batch_size_max = 5;
+
MiniClusterOptions opts;
opts.num_tablet_servers = 3;
mini_cluster_.reset(new MiniCluster(env_.get(), opts));
@@ -137,10 +142,10 @@ class RemoteKsckTest : public KuduTest {
// Generate a set of split rows for tablets used in this test.
vector<const KuduPartialRow*> GenerateSplitRows() {
vector<const KuduPartialRow*> split_rows;
- vector<int> split_nums = { 33, 66 };
- for (int i : split_nums) {
+ int num_tablets = AllowSlowTests() ? 10 : 3;
+ for (int i = 1; i < num_tablets; i++) {
KuduPartialRow* row = schema_.NewRow();
- CHECK_OK(row->SetInt32(0, i));
+ CHECK_OK(row->SetInt32(0, i * 10));
split_rows.push_back(row);
}
return split_rows;
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/bcf1adc1/src/kudu/tools/ksck_remote.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc
index 66a495a..3ba95ee 100644
--- a/src/kudu/tools/ksck_remote.cc
+++ b/src/kudu/tools/ksck_remote.cc
@@ -21,12 +21,13 @@
#include "kudu/common/wire_protocol.h"
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/strings/substitute.h"
+#include "kudu/gutil/strings/util.h"
#include "kudu/util/net/net_util.h"
#include "kudu/util/net/sockaddr.h"
DEFINE_bool(checksum_cache_blocks, false, "Should the checksum scanners cache the read blocks");
DEFINE_int64(timeout_ms, 1000 * 60, "RPC timeout in milliseconds");
-DEFINE_int64(tablets_batch_size_max, 100, "How many tablets to get from the Master per RPC");
+DEFINE_int32(tablets_batch_size_max, 100, "How many tablets to get from the Master per RPC");
namespace kudu {
namespace tools {
@@ -307,10 +308,10 @@ Status RemoteKsckMaster::RetrieveTablesList(vector<shared_ptr<KsckTable>>* table
Status RemoteKsckMaster::RetrieveTabletsList(const shared_ptr<KsckTable>& table) {
vector<shared_ptr<KsckTablet>> tablets;
bool more_tablets = true;
- string last_key;
+ string next_key;
int retries = 0;
while (more_tablets) {
- Status s = GetTabletsBatch(table, &last_key, tablets, &more_tablets);
+ Status s = GetTabletsBatch(table, &next_key, tablets, &more_tablets);
if (s.IsServiceUnavailable() && retries++ < 25) {
SleepFor(MonoDelta::FromMilliseconds(100 * retries));
} else if (!s.ok()) {
@@ -323,7 +324,7 @@ Status RemoteKsckMaster::RetrieveTabletsList(const shared_ptr<KsckTable>& table)
}
Status RemoteKsckMaster::GetTabletsBatch(const shared_ptr<KsckTable>& table,
- string* last_partition_key,
+ string* next_partition_key,
vector<shared_ptr<KsckTablet>>& tablets,
bool* more_tablets) {
master::GetTableLocationsRequestPB req;
@@ -332,16 +333,17 @@ Status RemoteKsckMaster::GetTabletsBatch(const shared_ptr<KsckTable>& table,
req.mutable_table()->set_table_name(table->name());
req.set_max_returned_locations(FLAGS_tablets_batch_size_max);
- req.set_partition_key_start(*last_partition_key);
+ req.set_partition_key_start(*next_partition_key);
rpc.set_timeout(GetDefaultTimeout());
RETURN_NOT_OK(proxy_->GetTableLocations(req, &resp, &rpc));
for (const master::TabletLocationsPB& locations : resp.tablet_locations()) {
- if (locations.partition().partition_key_start() < *last_partition_key) {
+ if (locations.partition().partition_key_start() < *next_partition_key) {
// We've already seen this partition.
continue;
}
- *last_partition_key = locations.partition().partition_key_start();
+
+ *next_partition_key = ImmediateSuccessor(locations.partition().partition_key_start());
shared_ptr<KsckTablet> tablet(new KsckTablet(table.get(), locations.tablet_id()));
vector<shared_ptr<KsckTabletReplica>> replicas;