You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by jd...@apache.org on 2016/07/22 22:02:24 UTC

[4/9] incubator-kudu git commit: ksck: multi-thread the fetching of replica info from tablet servers

ksck: multi-thread the fetching of replica info from tablet servers

In clusters with a lot of tablets, fetching the data from each tablet server
can take a while (e.g. ~100ms per server from my laptop to a test cluster in our
datacenter). Because ksck contacted the tablet servers one at a time, this made
it rather slow on a large cluster (~70 nodes).

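Multi-threading the fetching makes this much faster (5 seconds vs. the original
31 seconds for the above test cluster). The number of tablet servers contacted
concurrently is controlled by the new --fetch_replica_info_concurrency flag
(default 20).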

Change-Id: Ib7784697fb227743dccaa98922fb958cd6a3270e
Reviewed-on: http://gerrit.cloudera.org:8080/3705
Tested-by: Kudu Jenkins
Reviewed-by: Jean-Daniel Cryans <jd...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/9cc2e041
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/9cc2e041
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/9cc2e041

Branch: refs/heads/master
Commit: 9cc2e041f3440fe9a731644bbc269b2fdace8fab
Parents: 7ba9d99
Author: Todd Lipcon <to...@apache.org>
Authored: Wed Jul 20 14:39:04 2016 -0700
Committer: Jean-Daniel Cryans <jd...@apache.org>
Committed: Fri Jul 22 20:24:56 2016 +0000

----------------------------------------------------------------------
 src/kudu/tools/ksck.cc | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/9cc2e041/src/kudu/tools/ksck.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index cb95f3a..67f9ff2 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -27,9 +27,11 @@
 #include "kudu/gutil/ref_counted.h"
 #include "kudu/gutil/strings/join.h"
 #include "kudu/gutil/strings/substitute.h"
+#include "kudu/util/atomic.h"
 #include "kudu/util/blocking_queue.h"
 #include "kudu/util/locks.h"
 #include "kudu/util/monotime.h"
+#include "kudu/util/threadpool.h"
 
 namespace kudu {
 namespace tools {
@@ -53,6 +55,9 @@ DEFINE_uint64(checksum_snapshot_timestamp, ChecksumOptions::kCurrentTimestamp,
               "timestamp to use for snapshot checksum scans, defaults to 0, which "
               "uses the current timestamp of a tablet server involved in the scan");
 
+DEFINE_int32(fetch_replica_info_concurrency, 20,
+             "Number of concurrent tablet servers to fetch replica info from.");
+
 // The stream to write output to. If this is NULL, defaults to cerr.
 // This is used by tests to capture output.
 ostream* g_err_stream = NULL;
@@ -148,20 +153,30 @@ Status Ksck::FetchInfoFromTabletServers() {
     return Status::NotFound("No tablet servers found");
   }
 
-  int bad_servers = 0;
+
+  gscoped_ptr<ThreadPool> pool;
+  RETURN_NOT_OK(ThreadPoolBuilder("ksck-fetch")
+                .set_max_threads(FLAGS_fetch_replica_info_concurrency)
+                .Build(&pool));
+
+  AtomicInt<int32_t> bad_servers(0);
   VLOG(1) << "Fetching info from all the Tablet Servers";
   for (const KsckMaster::TSMap::value_type& entry : cluster_->tablet_servers()) {
-    Status s = ConnectToTabletServer(entry.second);
-    if (!s.ok()) {
-      bad_servers++;
-    }
+    CHECK_OK(pool->SubmitFunc([&]() {
+          Status s = ConnectToTabletServer(entry.second);
+          if (!s.ok()) {
+            bad_servers.Increment();
+          }
+        }));
   }
-  if (bad_servers == 0) {
+  pool->Wait();
+
+  if (bad_servers.Load() == 0) {
     Info() << Substitute("Fetched info from all $0 Tablet Servers", servers_count) << endl;
     return Status::OK();
   } else {
     Warn() << Substitute("Fetched info from $0 Tablet Servers, $1 weren't reachable",
-                         servers_count - bad_servers, bad_servers) << endl;
+                         servers_count - bad_servers.Load(), bad_servers.Load()) << endl;
     return Status::NetworkError("Not all Tablet Servers are reachable");
   }
 }