You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2019/05/07 05:52:28 UTC
[kudu] 01/03: KUDU-2711 (part 3). Only send one TSInfoPB per server
in GetTableLocations
This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
commit 586e957f76a547340f2ab93a7eebc3f116ff825e
Author: Todd Lipcon <to...@apache.org>
AuthorDate: Tue Feb 26 11:55:39 2019 -0800
KUDU-2711 (part 3). Only send one TSInfoPB per server in GetTableLocations
This changes the response for GetTableLocations to send a top-level list
of TSInfoPB, and then each replica refers into that list by index. This
avoids having to copy a TSInfoPB per replica, and instead bounds the
number of serializations to the number of TS in the cluster. In the
worst case, this may slightly regress performance if the number of
replicas in the response is much fewer than the number of TS in the
cluster, since there is some extra overhead of building the index
mapping. In the best case (large number of replicas in the response)
there is a big improvement.
To handle wire compatibility, the server only uses the new response
format in the case that the client indicates it expects it by passing
a new boolean in the request.
The client handles old servers by handling a response in either format.
This patch only implements the new protocol in the C++ client. The Java client
will be updated in a follow-up patch.
Benchmarked a ~5x improvement with:
table_locations-itest --gtest_filter=\*Bench\* \
--benchmark_num_tablets 300 --benchmark_num_threads 30 \
--benchmark_runtime_secs=10
with rwlock (previous patch):
Count: 22179
Mean: 3901.19
Percentiles:
0% (min) = 52
25% = 3600
50% (med) = 3904
75% = 4192
95% = 4608
99% = 4896
99.9% = 5760
99.99% = 10048
100% (max) = 10661
with these improvements:
Count: 109590
Mean: 569.492
Percentiles:
0% (min) = 31
25% = 512
50% (med) = 556
75% = 604
95% = 716
99% = 912
99.9% = 1232
99.99% = 2528
100% (max) = 6336
Change-Id: Ief65d0825e919f681b7ade6a856c103201f8305c
Reviewed-on: http://gerrit.cloudera.org:8080/12615
Reviewed-by: Adar Dembo <ad...@cloudera.com>
Tested-by: Will Berkeley <wd...@gmail.com>
---
.../org/apache/kudu/client/AsyncKuduClient.java | 1 +
src/kudu/client/meta_cache.cc | 62 +++++++++++++++-----
src/kudu/client/meta_cache.h | 9 +--
src/kudu/consensus/quorum_util-test.cc | 6 ++
src/kudu/consensus/quorum_util.cc | 23 +++++++-
src/kudu/consensus/quorum_util.h | 8 +++
src/kudu/integration-tests/registration-test.cc | 6 +-
.../integration-tests/table_locations-itest.cc | 39 +++++++++++++
src/kudu/master/catalog_manager.cc | 68 +++++++++++++++-------
src/kudu/master/catalog_manager.h | 31 ++++++++--
src/kudu/master/master.proto | 25 ++++++++
src/kudu/master/master_service.cc | 4 +-
12 files changed, 233 insertions(+), 49 deletions(-)
diff --git a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
index 8f666ed..afc54d1 100644
--- a/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
+++ b/java/kudu-client/src/main/java/org/apache/kudu/client/AsyncKuduClient.java
@@ -2160,6 +2160,7 @@ public class AsyncKuduClient implements AutoCloseable {
int requestedBatchSize,
List<Master.TabletLocationsPB> locations,
long ttl) throws KuduException {
+ // TODO(todd): handle "interned" response here
String tableId = table.getTableId();
String tableName = table.getName();
diff --git a/src/kudu/client/meta_cache.cc b/src/kudu/client/meta_cache.cc
index 1ac130f..30c144b 100644
--- a/src/kudu/client/meta_cache.cc
+++ b/src/kudu/client/meta_cache.cc
@@ -71,6 +71,7 @@ using master::GetTableLocationsRequestPB;
using master::GetTableLocationsResponsePB;
using master::MasterServiceProxy;
using master::TabletLocationsPB;
+using master::TabletLocationsPB_InternedReplicaPB;
using master::TabletLocationsPB_ReplicaPB;
using master::TSInfoPB;
using rpc::BackoffType;
@@ -184,20 +185,40 @@ void RemoteTabletServer::GetHostPorts(vector<HostPort>* host_ports) const {
////////////////////////////////////////////////////////////
-void RemoteTablet::Refresh(const TabletServerMap& tservers,
- const google::protobuf::RepeatedPtrField
- <TabletLocationsPB_ReplicaPB>& replicas) {
+Status RemoteTablet::Refresh(
+ const TabletServerMap& tservers,
+ const TabletLocationsPB& locs_pb,
+ const google::protobuf::RepeatedPtrField<TSInfoPB>& ts_info_dict) {
+
+ vector<std::pair<string, consensus::RaftPeerPB::Role>> uuid_and_role;
+
+ for (const TabletLocationsPB_ReplicaPB& r : locs_pb.replicas()) {
+ uuid_and_role.emplace_back(r.ts_info().permanent_uuid(), r.role());
+ }
+ for (const TabletLocationsPB_InternedReplicaPB& r : locs_pb.interned_replicas()) {
+ if (r.ts_info_idx() >= ts_info_dict.size()) {
+ return Status::Corruption(Substitute(
+ "invalid response from master: referenced tablet idx $0 but only $1 present",
+ r.ts_info_idx(), ts_info_dict.size()));
+ }
+ const TSInfoPB& ts_info = ts_info_dict.Get(r.ts_info_idx());
+ uuid_and_role.emplace_back(ts_info.permanent_uuid(), r.role());
+ }
+
// Adopt the data from the successful response.
std::lock_guard<simple_spinlock> l(lock_);
replicas_.clear();
- for (const TabletLocationsPB_ReplicaPB& r : replicas) {
+
+ for (const auto& p : uuid_and_role) {
RemoteReplica rep;
- rep.ts = FindOrDie(tservers, r.ts_info().permanent_uuid());
- rep.role = r.role();
+ rep.ts = FindOrDie(tservers, p.first);
+ rep.role = p.second;
rep.failed = false;
- replicas_.push_back(rep);
+ replicas_.emplace_back(rep);
}
+
stale_ = false;
+ return Status::OK();
}
void RemoteTablet::MarkStale() {
@@ -683,6 +704,7 @@ void LookupRpc::SendRpcSlowPath() {
req_.mutable_table()->set_table_id(table_->id());
req_.set_partition_key_start(partition_key_);
req_.set_max_returned_locations(locations_to_fetch());
+ req_.set_intern_ts_infos_in_response(true);
if (replica_visibility_ == ReplicaController::Visibility::ALL) {
req_.set_replica_type_filter(master::ANY_REPLICA);
}
@@ -778,6 +800,18 @@ Status MetaCache::ProcessLookupResponse(const LookupRpc& rpc,
VLOG(3) << "Caching '" << rpc.table_name() << "' entry " << entry.DebugString(rpc.table());
InsertOrDie(&tablets_by_key, "", entry);
} else {
+ // First, update the tserver cache, needed for the Refresh calls below.
+ for (const TabletLocationsPB& tablet : tablet_locations) {
+ for (const TabletLocationsPB_ReplicaPB& replicas : tablet.replicas()) {
+ UpdateTabletServer(replicas.ts_info());
+ }
+ }
+ // In the case of "interned" replicas, the above 'replicas' lists will be empty
+ // and instead we'll need to update from the top-level list of tservers.
+ const auto& ts_infos = rpc.resp().ts_infos();
+ for (const TSInfoPB& ts_info : ts_infos) {
+ UpdateTabletServer(ts_info);
+ }
// The comments below will reference the following diagram:
//
@@ -832,11 +866,6 @@ Status MetaCache::ProcessLookupResponse(const LookupRpc& rpc,
// and the entry TTL. If the tablet is unknown, then we need to create a
// new RemoteTablet for it.
- // First, update the tserver cache, needed for the Refresh calls below.
- for (const TabletLocationsPB_ReplicaPB& replicas : tablet.replicas()) {
- UpdateTabletServer(replicas.ts_info());
- }
-
const string& tablet_id = tablet.tablet_id();
scoped_refptr<RemoteTablet> remote = FindPtrOrNull(tablets_by_id_, tablet_id);
if (remote.get() != nullptr) {
@@ -846,8 +875,9 @@ Status MetaCache::ProcessLookupResponse(const LookupRpc& rpc,
VLOG(3) << "Refreshing tablet " << tablet_id << ": "
<< pb_util::SecureShortDebugString(tablet);
- remote->Refresh(ts_cache_, tablet.replicas());
-
+ RETURN_NOT_OK_PREPEND(remote->Refresh(ts_cache_, tablet, ts_infos),
+ Substitute("failed to refresh locations for tablet $0",
+ tablet_id));
// Update the entry TTL.
auto& entry = FindOrDie(tablets_by_key, tablet_lower_bound);
DCHECK(!entry.is_non_covered_range() &&
@@ -864,7 +894,9 @@ Status MetaCache::ProcessLookupResponse(const LookupRpc& rpc,
Partition partition;
Partition::FromPB(tablet.partition(), &partition);
remote = new RemoteTablet(tablet_id, partition);
- remote->Refresh(ts_cache_, tablet.replicas());
+ RETURN_NOT_OK_PREPEND(remote->Refresh(ts_cache_, tablet, ts_infos),
+ Substitute("failed to refresh locations for tablet $0",
+ tablet_id));
MetaCacheEntry entry(expiration_time, remote);
VLOG(3) << "Caching '" << rpc.table_name() << "' entry " << entry.DebugString(rpc.table());
diff --git a/src/kudu/client/meta_cache.h b/src/kudu/client/meta_cache.h
index 83c104c..c645f54 100644
--- a/src/kudu/client/meta_cache.h
+++ b/src/kudu/client/meta_cache.h
@@ -53,8 +53,8 @@ class TabletServerServiceProxy;
} // namespace tserver
namespace master {
-class TabletLocationsPB_ReplicaPB;
class TSInfoPB;
+class TabletLocationsPB;
} // namespace master
namespace client {
@@ -201,9 +201,10 @@ class RemoteTablet : public RefCountedThreadSafe<RemoteTablet> {
}
// Updates this tablet's replica locations.
- void Refresh(const TabletServerMap& tservers,
- const google::protobuf::RepeatedPtrField
- <master::TabletLocationsPB_ReplicaPB>& replicas);
+ Status Refresh(
+ const TabletServerMap& tservers,
+ const master::TabletLocationsPB& locs_pb,
+ const google::protobuf::RepeatedPtrField<master::TSInfoPB>& ts_info_dict);
// Mark this tablet as stale, indicating that the cached tablet metadata is
// out of date. Staleness is checked by the MetaCache when
diff --git a/src/kudu/consensus/quorum_util-test.cc b/src/kudu/consensus/quorum_util-test.cc
index a32e368..20d6393 100644
--- a/src/kudu/consensus/quorum_util-test.cc
+++ b/src/kudu/consensus/quorum_util-test.cc
@@ -307,6 +307,12 @@ TEST(QuorumUtilTest, TestGetConsensusRole) {
ASSERT_EQ(NON_PARTICIPANT, GetConsensusRole("D", cstate));
cstate.set_leader_uuid("D");
ASSERT_EQ(NON_PARTICIPANT, GetConsensusRole("D", cstate)); // Illegal.
+
+ // Test GetParticipantRole() on the participants in the config.
+ for (const auto& peer : config2.peers()) {
+ ASSERT_EQ(GetParticipantRole(peer, cstate),
+ GetConsensusRole(peer.permanent_uuid(), cstate));
+ }
}
TEST(QuorumUtilTest, TestIsRaftConfigVoter) {
diff --git a/src/kudu/consensus/quorum_util.cc b/src/kudu/consensus/quorum_util.cc
index 88a231c..6a2745b 100644
--- a/src/kudu/consensus/quorum_util.cc
+++ b/src/kudu/consensus/quorum_util.cc
@@ -154,11 +154,30 @@ RaftPeerPB::Role GetConsensusRole(const std::string& peer_uuid,
const ConsensusStatePB& cstate) {
// The active config is the pending config if there is one, else it's the committed config.
const RaftConfigPB& config = cstate.has_pending_config() ?
- cstate.pending_config() :
- cstate.committed_config();
+ cstate.pending_config() :
+ cstate.committed_config();
return GetConsensusRole(peer_uuid, cstate.leader_uuid(), config);
}
+
+RaftPeerPB::Role GetParticipantRole(const RaftPeerPB& peer,
+ const ConsensusStatePB& cstate) {
+ const auto& peer_uuid = peer.permanent_uuid();
+ DCHECK_NE(RaftPeerPB::NON_PARTICIPANT,
+ GetConsensusRole(peer_uuid, cstate))
+ << "Peer " << peer_uuid << " << not a participant in " << cstate.ShortDebugString();
+
+ switch (peer.member_type()) {
+ case RaftPeerPB::VOTER:
+ if (peer_uuid == cstate.leader_uuid()) {
+ return RaftPeerPB::LEADER;
+ }
+ return RaftPeerPB::FOLLOWER;
+ default:
+ return RaftPeerPB::LEARNER;
+ }
+}
+
Status VerifyRaftConfig(const RaftConfigPB& config) {
std::set<string> uuids;
if (config.peers().empty()) {
diff --git a/src/kudu/consensus/quorum_util.h b/src/kudu/consensus/quorum_util.h
index a4cc095..4cd30ba 100644
--- a/src/kudu/consensus/quorum_util.h
+++ b/src/kudu/consensus/quorum_util.h
@@ -95,6 +95,14 @@ RaftPeerPB::Role GetConsensusRole(const std::string& peer_uuid,
RaftPeerPB::Role GetConsensusRole(const std::string& peer_uuid,
const ConsensusStatePB& cstate);
+
+// Same as above, but requires that the given 'peer' is a participant
+// in the active configuration in specified consensus state.
+// If not, it will return incorrect results.
+RaftPeerPB::Role GetParticipantRole(const RaftPeerPB& peer,
+ const ConsensusStatePB& cstate);
+
+
// Verifies that the provided configuration is well formed.
Status VerifyRaftConfig(const RaftConfigPB& config);
diff --git a/src/kudu/integration-tests/registration-test.cc b/src/kudu/integration-tests/registration-test.cc
index 74fcf36..6e6eb36 100644
--- a/src/kudu/integration-tests/registration-test.cc
+++ b/src/kudu/integration-tests/registration-test.cc
@@ -200,7 +200,11 @@ class RegistrationTest : public KuduTest {
break; // exiting out of the 'do {...} while (false)' scope
}
RETURN_NOT_OK(ls);
- s = catalog->GetTabletLocations(tablet_id, master::VOTER_REPLICA, &loc, /*user=*/none);
+ s = catalog->GetTabletLocations(tablet_id,
+ master::VOTER_REPLICA,
+ &loc,
+ /*ts_infos_dict=*/nullptr,
+ /*user=*/none);
} while (false);
if (s.ok() && loc.replicas_size() == expected_count) {
if (locations) {
diff --git a/src/kudu/integration-tests/table_locations-itest.cc b/src/kudu/integration-tests/table_locations-itest.cc
index 1ec6126..efa5169 100644
--- a/src/kudu/integration-tests/table_locations-itest.cc
+++ b/src/kudu/integration-tests/table_locations-itest.cc
@@ -246,6 +246,44 @@ TEST_F(TableLocationsTest, TestGetTableLocations) {
EXPECT_EQ("a", resp.tablet_locations(0).partition().partition_key_start());
EXPECT_EQ("aa", resp.tablet_locations(1).partition().partition_key_start());
EXPECT_EQ("ab", resp.tablet_locations(2).partition().partition_key_start());
+
+ // Check that a UUID was returned for every replica
+ for (const auto& loc : resp.tablet_locations()) {
+ ASSERT_GT(loc.replicas_size(), 0);
+ ASSERT_EQ(0, loc.interned_replicas_size());
+ for (const auto& replica : loc.replicas()) {
+ ASSERT_NE("", replica.ts_info().permanent_uuid());
+ }
+ }
+ }
+
+ { // from "", with TSInfo interning enabled.
+ GetTableLocationsRequestPB req;
+ GetTableLocationsResponsePB resp;
+ RpcController controller;
+ req.mutable_table()->set_table_name(table_name);
+ req.set_partition_key_start("");
+ req.set_max_returned_locations(3);
+ req.set_intern_ts_infos_in_response(true);
+ ASSERT_OK(proxy_->GetTableLocations(req, &resp, &controller));
+ SCOPED_TRACE(SecureDebugString(resp));
+
+ ASSERT_TRUE(!resp.has_error());
+ ASSERT_EQ(3, resp.tablet_locations().size());
+ EXPECT_EQ("a", resp.tablet_locations(0).partition().partition_key_start());
+ EXPECT_EQ("aa", resp.tablet_locations(1).partition().partition_key_start());
+ EXPECT_EQ("ab", resp.tablet_locations(2).partition().partition_key_start());
+ // Check that a UUID was returned for every replica
+ for (const auto& loc : resp.tablet_locations()) {
+ ASSERT_EQ(loc.replicas_size(), 0);
+ ASSERT_GT(loc.interned_replicas_size(), 0);
+ for (const auto& replica : loc.interned_replicas()) {
+ int idx = replica.ts_info_idx();
+ ASSERT_GE(idx, 0);
+ ASSERT_LE(idx, resp.ts_infos_size());
+ ASSERT_NE("", resp.ts_infos(idx).permanent_uuid());
+ }
+ }
}
{ // from "b"
@@ -367,6 +405,7 @@ TEST_F(TableLocationsTest, GetTableLocationsBenchmark) {
RpcController controller;
req.mutable_table()->set_table_name(table_name);
req.set_max_returned_locations(1000);
+ req.set_intern_ts_infos_in_response(true);
CHECK_OK(proxies[i]->GetTableLocations(req, &resp, &controller));
CHECK_EQ(resp.tablet_locations_size(), kNumSplits + 1);
}
diff --git a/src/kudu/master/catalog_manager.cc b/src/kudu/master/catalog_manager.cc
index 6d0e27d..88d0d16 100644
--- a/src/kudu/master/catalog_manager.cc
+++ b/src/kudu/master/catalog_manager.cc
@@ -4616,7 +4616,8 @@ void CatalogManager::SendCreateTabletRequest(const scoped_refptr<TabletInfo>& ta
Status CatalogManager::BuildLocationsForTablet(
const scoped_refptr<TabletInfo>& tablet,
ReplicaTypeFilter filter,
- TabletLocationsPB* locs_pb) {
+ TabletLocationsPB* locs_pb,
+ TSInfosDict* ts_infos_dict) {
TabletMetadataLock l_tablet(tablet.get(), LockMode::READ);
if (PREDICT_FALSE(l_tablet.data().is_deleted())) {
return Status::NotFound("Tablet deleted", l_tablet.data().pb.state_msg());
@@ -4632,8 +4633,6 @@ Status CatalogManager::BuildLocationsForTablet(
const ConsensusStatePB& cstate = l_tablet.data().pb.consensus_state();
for (const consensus::RaftPeerPB& peer : cstate.committed_config().peers()) {
DCHECK(!peer.has_health_report()); // Health report shouldn't be persisted.
- // TODO(adar): GetConsensusRole() iterates over all of the peers, making this an
- // O(n^2) loop. If replication counts get high, it should be optimized.
switch (filter) {
case VOTER_REPLICA:
if (!peer.has_member_type() ||
@@ -4655,24 +4654,44 @@ Status CatalogManager::BuildLocationsForTablet(
}
}
- TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas();
- replica_pb->set_role(GetConsensusRole(peer.permanent_uuid(), cstate));
-
- TSInfoPB* tsinfo_pb = replica_pb->mutable_ts_info();
- tsinfo_pb->set_permanent_uuid(peer.permanent_uuid());
+ // Helper function to create a TSInfoPB.
+ auto make_tsinfo_pb = [this, &peer]() {
+ unique_ptr<TSInfoPB> tsinfo_pb(new TSInfoPB());
+ tsinfo_pb->set_permanent_uuid(peer.permanent_uuid());
+ shared_ptr<TSDescriptor> ts_desc;
+ if (master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc)) {
+ ServerRegistrationPB reg;
+ ts_desc->GetRegistration(®);
+ tsinfo_pb->mutable_rpc_addresses()->Swap(reg.mutable_rpc_addresses());
+ if (ts_desc->location()) tsinfo_pb->set_location(*(ts_desc->location()));
+ } else {
+ // If we've never received a heartbeat from the tserver, we'll fall back
+ // to the last known RPC address in the RaftPeerPB.
+ //
+ // TODO(wdberkeley): We should track these RPC addresses in the master table itself.
+ tsinfo_pb->add_rpc_addresses()->CopyFrom(peer.last_known_addr());
+ }
+ return tsinfo_pb;
+ };
- shared_ptr<TSDescriptor> ts_desc;
- if (master_->ts_manager()->LookupTSByUUID(peer.permanent_uuid(), &ts_desc)) {
- ServerRegistrationPB reg;
- ts_desc->GetRegistration(®);
- tsinfo_pb->mutable_rpc_addresses()->Swap(reg.mutable_rpc_addresses());
- if (ts_desc->location()) tsinfo_pb->set_location(*(ts_desc->location()));
+ auto role = GetParticipantRole(peer, cstate);
+ if (ts_infos_dict) {
+ int idx = *ComputeIfAbsent(
+ &ts_infos_dict->uuid_to_idx, peer.permanent_uuid(),
+ [&]() -> int {
+ auto pb = make_tsinfo_pb();
+ int idx = ts_infos_dict->ts_info_pbs.size();
+ ts_infos_dict->ts_info_pbs.emplace_back(pb.release());
+ return idx;
+ });
+
+ auto* interned_replica_pb = locs_pb->add_interned_replicas();
+ interned_replica_pb->set_ts_info_idx(idx);
+ interned_replica_pb->set_role(role);
} else {
- // If we've never received a heartbeat from the tserver, we'll fall back
- // to the last known RPC address in the RaftPeerPB.
- //
- // TODO: We should track these RPC addresses in the master table itself.
- tsinfo_pb->add_rpc_addresses()->CopyFrom(peer.last_known_addr());
+ TabletLocationsPB_ReplicaPB* replica_pb = locs_pb->add_replicas();
+ replica_pb->set_allocated_ts_info(make_tsinfo_pb().release());
+ replica_pb->set_role(role);
}
}
@@ -4688,6 +4707,7 @@ Status CatalogManager::BuildLocationsForTablet(
Status CatalogManager::GetTabletLocations(const string& tablet_id,
ReplicaTypeFilter filter,
TabletLocationsPB* locs_pb,
+ TSInfosDict* ts_infos_dict,
optional<const string&> user) {
leader_lock_.AssertAcquiredForReading();
RETURN_NOT_OK(CheckOnline());
@@ -4710,7 +4730,7 @@ Status CatalogManager::GetTabletLocations(const string& tablet_id,
NormalizeTableName(table_lock.data().name()), *user));
}
- return BuildLocationsForTablet(tablet_info, filter, locs_pb);
+ return BuildLocationsForTablet(tablet_info, filter, locs_pb, ts_infos_dict);
}
Status CatalogManager::ReplaceTablet(const string& tablet_id, ReplaceTabletResponsePB* resp) {
@@ -4828,9 +4848,12 @@ Status CatalogManager::GetTableLocations(const GetTableLocationsRequestPB* req,
vector<scoped_refptr<TabletInfo>> tablets_in_range;
table->GetTabletsInRange(req, &tablets_in_range);
+ TSInfosDict infos_dict;
+
for (const auto& tablet : tablets_in_range) {
Status s = BuildLocationsForTablet(
- tablet, req->replica_type_filter(), resp->add_tablet_locations());
+ tablet, req->replica_type_filter(), resp->add_tablet_locations(),
+ req->intern_ts_infos_in_response() ? &infos_dict : nullptr);
if (s.ok()) {
continue;
}
@@ -4854,6 +4877,9 @@ Status CatalogManager::GetTableLocations(const GetTableLocationsRequestPB* req,
<< s.ToString();
}
}
+ for (auto& pb : infos_dict.ts_info_pbs) {
+ resp->mutable_ts_infos()->AddAllocated(pb.release());
+ }
resp->set_ttl_millis(FLAGS_table_locations_ttl_ms);
return Status::OK();
}
diff --git a/src/kudu/master/catalog_manager.h b/src/kudu/master/catalog_manager.h
index b7c8290..e78cbda 100644
--- a/src/kudu/master/catalog_manager.h
+++ b/src/kudu/master/catalog_manager.h
@@ -34,12 +34,14 @@
#include <boost/optional/optional.hpp>
#include <glog/logging.h>
#include <gtest/gtest_prod.h>
+#include <sparsehash/dense_hash_map>
#include "kudu/consensus/metadata.pb.h"
#include "kudu/gutil/gscoped_ptr.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/ref_counted.h"
+#include "kudu/gutil/strings/stringpiece.h"
#include "kudu/master/master.pb.h"
#include "kudu/tserver/tablet_replica_lookup.h"
#include "kudu/tserver/tserver.pb.h"
@@ -51,6 +53,8 @@
#include "kudu/util/rw_mutex.h"
#include "kudu/util/status.h"
+template <class X> struct GoodFastHash;
+
namespace kudu {
class AuthzTokenTest_TestSingleMasterUnavailable_Test;
@@ -102,7 +106,6 @@ class PlacementPolicy;
class SysCatalogTable;
class TSDescriptor;
class TableInfo;
-
struct DeferredAssignmentActions;
// The data related to a tablet which is persisted on disk.
@@ -600,11 +603,22 @@ class CatalogManager : public tserver::TabletReplicaLookupIf {
GetTableLocationsResponsePB* resp,
boost::optional<const std::string&> user);
+ struct TSInfosDict {
+ std::vector<std::unique_ptr<TSInfoPB>> ts_info_pbs;
+ google::dense_hash_map<StringPiece, int, GoodFastHash<StringPiece>> uuid_to_idx;
+ TSInfosDict() {
+ uuid_to_idx.set_empty_key(StringPiece());
+ }
+ };
+
// Look up the locations of the given tablet. If 'user' is provided, checks
// that the user is authorized to get such information. Adds only information
// on replicas which satisfy the 'filter'. The locations vector is overwritten
// (not appended to).
//
+ // If 'ts_infos_dict' is not null, the returned locations use it as a dictionary
+ // to reference TSInfoPBs instead of making many copies.
+ //
// If the tablet is not found, returns Status::NotFound.
// If the tablet is not running, returns Status::ServiceUnavailable.
// Otherwise, returns Status::OK and puts the result in 'locs_pb'.
@@ -612,6 +626,7 @@ class CatalogManager : public tserver::TabletReplicaLookupIf {
Status GetTabletLocations(const std::string& tablet_id,
master::ReplicaTypeFilter filter,
TabletLocationsPB* locs_pb,
+ TSInfosDict* ts_infos_dict,
boost::optional<const std::string&> user);
// Replace the given tablet with a new, empty one. The replaced tablet is
@@ -847,13 +862,19 @@ class CatalogManager : public tserver::TabletReplicaLookupIf {
scoped_refptr<TabletInfo> CreateTabletInfo(const scoped_refptr<TableInfo>& table,
const PartitionPB& partition);
+
// Builds the TabletLocationsPB for a tablet based on the provided TabletInfo
- // and the replica type filter specified. Populates locs_pb and returns
- // Status::OK on success. Returns Status::ServiceUnavailable if tablet is
- // not running.
+ // and the replica type fiter specified. Populates locs_pb and returns
+ // Status::OK on success.
+ //
+ // If 'ts_infos_dict' is not null, the returned locations use it as a dictionary
+ // to reference TSInfoPBs.
+ //
+ // Returns Status::ServiceUnavailable if tablet is not running.
Status BuildLocationsForTablet(const scoped_refptr<TabletInfo>& tablet,
master::ReplicaTypeFilter filter,
- TabletLocationsPB* locs_pb);
+ TabletLocationsPB* locs_pb,
+ TSInfosDict* ts_infos_dict);
// Looks up the table, locks it with the provided lock mode, and, if 'user' is
// provided, checks that the user is authorized to operate on the table.
diff --git a/src/kudu/master/master.proto b/src/kudu/master/master.proto
index 89afecd..8bbced0 100644
--- a/src/kudu/master/master.proto
+++ b/src/kudu/master/master.proto
@@ -357,11 +357,18 @@ message TSHeartbeatResponsePB {
//////////////////////////////
message TabletLocationsPB {
+ // DEPRECATED: new clients should prefer the 'Interned' type below.
message ReplicaPB {
required TSInfoPB ts_info = 1;
required consensus.RaftPeerPB.Role role = 2;
}
+ message InternedReplicaPB {
+ // Index into the 'ts_infos' list in the top-level RPC response.
+ required uint32 ts_info_idx = 1;
+ required consensus.RaftPeerPB.Role role = 2;
+ }
+
required bytes tablet_id = 1;
// DEPRECATED.
@@ -370,8 +377,16 @@ message TabletLocationsPB {
optional PartitionPB partition = 6;
+ // Used only if interned replicas are not supported by client.
repeated ReplicaPB replicas = 4;
+ // More efficient representation of replicas: instead of duplicating the TSInfoPB
+ // in each tablet location, instead we just encode indexes into a list of TSInfoPB
+ // which is serialized in the top-level response.
+ //
+ // Used when supported by client.
+ repeated InternedReplicaPB interned_replicas = 7;
+
// DEPRECATED. Still set by servers, but should be ignored by clients.
optional bool DEPRECATED_stale = 5;
}
@@ -404,6 +419,9 @@ message GetTabletLocationsRequestPB {
// What type of tablet replicas to include in the response.
optional ReplicaTypeFilter replica_type_filter = 2 [ default = VOTER_REPLICA ];
+
+ // NOTE: we don't support the "interned" TSInfoPB response in this request
+ // since this RPC is currently called with a small number of tablets.
}
message GetTabletLocationsResponsePB {
@@ -497,6 +515,9 @@ message GetTableLocationsRequestPB {
// What type of tablet replicas to include in the
// 'GetTableLocationsResponsePB::tablet_locations' response field.
optional ReplicaTypeFilter replica_type_filter = 6 [ default = VOTER_REPLICA ];
+
+ // Whether the response should use the 'interned_replicas' field.
+ optional bool intern_ts_infos_in_response = 7 [ default = false ];
}
// The response to a GetTableLocations RPC. The master guarantees that:
@@ -518,6 +539,10 @@ message GetTableLocationsResponsePB {
repeated TabletLocationsPB tablet_locations = 2;
+ // Used if 'intern_ts_infos_in_response' was requested.
+ // See InternedReplicaPB above.
+ repeated TSInfoPB ts_infos = 4;
+
// If the client caches table locations, the entries should not live longer
// than this timeout. Defaults to one hour.
optional uint32 ttl_millis = 3 [default = 36000000];
diff --git a/src/kudu/master/master_service.cc b/src/kudu/master/master_service.cc
index eaab0b9..fadfae4 100644
--- a/src/kudu/master/master_service.cc
+++ b/src/kudu/master/master_service.cc
@@ -309,7 +309,9 @@ void MasterServiceImpl::GetTabletLocations(const GetTabletLocationsRequestPB* re
// TODO(todd): once we have catalog data. ACL checks would also go here, probably.
TabletLocationsPB* locs_pb = resp->add_tablet_locations();
Status s = server_->catalog_manager()->GetTabletLocations(
- tablet_id, req->replica_type_filter(), locs_pb,
+ tablet_id, req->replica_type_filter(),
+ locs_pb,
+ /*ts_infos_dict=*/nullptr,
make_optional<const string&>(rpc->remote_user().username()));
if (!s.ok()) {
resp->mutable_tablet_locations()->RemoveLast();