You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by jd...@apache.org on 2016/07/22 22:02:24 UTC
[4/9] incubator-kudu git commit: ksck: multi-thread the fetching of
replica info from tablet servers
ksck: multi-thread the fetching of replica info from tablet servers
In clusters with a lot of tablets, fetching the data from each tablet server
can take a while (e.g., ~100ms per server from my laptop to a test cluster in our
datacenter). For a large cluster (~70 nodes), this makes ksck rather slow.
Multi-threading the fetching makes this much faster (5 seconds vs. the original
31 seconds for the above test cluster).
Change-Id: Ib7784697fb227743dccaa98922fb958cd6a3270e
Reviewed-on: http://gerrit.cloudera.org:8080/3705
Tested-by: Kudu Jenkins
Reviewed-by: Jean-Daniel Cryans <jd...@apache.org>
Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/9cc2e041
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/9cc2e041
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/9cc2e041
Branch: refs/heads/master
Commit: 9cc2e041f3440fe9a731644bbc269b2fdace8fab
Parents: 7ba9d99
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Jul 20 14:39:04 2016 -0700
Committer: Jean-Daniel Cryans <jd...@apache.org>
Committed: Fri Jul 22 20:24:56 2016 +0000
----------------------------------------------------------------------
src/kudu/tools/ksck.cc | 29 ++++++++++++++++++++++-------
1 file changed, 22 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/9cc2e041/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index cb95f3a..67f9ff2 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -27,9 +27,11 @@
#include "kudu/gutil/ref_counted.h"
#include "kudu/gutil/strings/join.h"
#include "kudu/gutil/strings/substitute.h"
+#include "kudu/util/atomic.h"
#include "kudu/util/blocking_queue.h"
#include "kudu/util/locks.h"
#include "kudu/util/monotime.h"
+#include "kudu/util/threadpool.h"
namespace kudu {
namespace tools {
@@ -53,6 +55,9 @@ DEFINE_uint64(checksum_snapshot_timestamp, ChecksumOptions::kCurrentTimestamp,
"timestamp to use for snapshot checksum scans, defaults to 0, which "
"uses the current timestamp of a tablet server involved in the scan");
+DEFINE_int32(fetch_replica_info_concurrency, 20,
+ "Number of concurrent tablet servers to fetch replica info from.");
+
// The stream to write output to. If this is NULL, defaults to cerr.
// This is used by tests to capture output.
ostream* g_err_stream = NULL;
@@ -148,20 +153,30 @@ Status Ksck::FetchInfoFromTabletServers() {
return Status::NotFound("No tablet servers found");
}
- int bad_servers = 0;
+
+ gscoped_ptr<ThreadPool> pool;
+ RETURN_NOT_OK(ThreadPoolBuilder("ksck-fetch")
+ .set_max_threads(FLAGS_fetch_replica_info_concurrency)
+ .Build(&pool));
+
+ AtomicInt<int32_t> bad_servers(0);
VLOG(1) << "Fetching info from all the Tablet Servers";
for (const KsckMaster::TSMap::value_type& entry : cluster_->tablet_servers()) {
- Status s = ConnectToTabletServer(entry.second);
- if (!s.ok()) {
- bad_servers++;
- }
+ CHECK_OK(pool->SubmitFunc([&]() {
+ Status s = ConnectToTabletServer(entry.second);
+ if (!s.ok()) {
+ bad_servers.Increment();
+ }
+ }));
}
- if (bad_servers == 0) {
+ pool->Wait();
+
+ if (bad_servers.Load() == 0) {
Info() << Substitute("Fetched info from all $0 Tablet Servers", servers_count) << endl;
return Status::OK();
} else {
Warn() << Substitute("Fetched info from $0 Tablet Servers, $1 weren't reachable",
- servers_count - bad_servers, bad_servers) << endl;
+ servers_count - bad_servers.Load(), bad_servers.Load()) << endl;
return Status::NetworkError("Not all Tablet Servers are reachable");
}
}