You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2017/09/20 18:37:25 UTC

[2/3] kudu git commit: Fix tablet state metrics race

Fix tablet state metrics race

6fef9ca5c9d1dcf571795c514cb9fecc493d0714 added tablet state summary
metrics by adding a struct with cached counts that was periodically
refreshed by walking the tablet manager, but also added a race
between reading the cached counts and refreshing them. This fixes the
race by centralizing reading and writing the counts in the tablet
manager and protecting them with the tablet manager's lock.

Change-Id: I88585c04692a6dc168b3750be9888769240cf3f6
Reviewed-on: http://gerrit.cloudera.org:8080/8082
Tested-by: Kudu Jenkins
Reviewed-by: Todd Lipcon <to...@apache.org>
Reviewed-by: David Ribeiro Alves <da...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/fb74d345
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/fb74d345
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/fb74d345

Branch: refs/heads/master
Commit: fb74d34597d80e91bb6188d51b04e5a6707a12c2
Parents: adabde5
Author: Will Berkeley <wd...@apache.org>
Authored: Fri Sep 15 10:09:06 2017 -0700
Committer: Will Berkeley <wd...@gmail.com>
Committed: Wed Sep 20 17:28:22 2017 +0000

----------------------------------------------------------------------
 src/kudu/tserver/CMakeLists.txt               |   1 -
 src/kudu/tserver/ts_tablet_manager.cc         | 128 ++++++++++++-----
 src/kudu/tserver/ts_tablet_manager.h          |  29 ++--
 src/kudu/tserver/ts_tablet_manager_metrics.cc | 158 ---------------------
 src/kudu/tserver/ts_tablet_manager_metrics.h  |  63 --------
 5 files changed, 106 insertions(+), 273 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/fb74d345/src/kudu/tserver/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/CMakeLists.txt b/src/kudu/tserver/CMakeLists.txt
index 048a237..84daec8 100644
--- a/src/kudu/tserver/CMakeLists.txt
+++ b/src/kudu/tserver/CMakeLists.txt
@@ -114,7 +114,6 @@ set(TSERVER_SRCS
   tablet_server_options.cc
   tablet_service.cc
   ts_tablet_manager.cc
-  ts_tablet_manager_metrics.cc
   tserver-path-handlers.cc
 )
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/fb74d345/src/kudu/tserver/ts_tablet_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index 7cab0b4..5b6f9e7 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -116,6 +116,46 @@ DEFINE_int32(tablet_state_walk_min_period_ms, 1000,
              "tablet map to update tablet state counts.");
 TAG_FLAG(tablet_state_walk_min_period_ms, advanced);
 
+METRIC_DEFINE_gauge_int32(server, tablets_num_not_initialized,
+                          "Number of Not Initialized Tablets",
+                          kudu::MetricUnit::kTablets,
+                          "Number of tablets currently not initialized");
+
+METRIC_DEFINE_gauge_int32(server, tablets_num_initialized,
+                          "Number of Initialized Tablets",
+                          kudu::MetricUnit::kTablets,
+                          "Number of tablets currently initialized");
+
+METRIC_DEFINE_gauge_int32(server, tablets_num_bootstrapping,
+                          "Number of Bootstrapping Tablets",
+                          kudu::MetricUnit::kTablets,
+                          "Number of tablets currently bootstrapping");
+
+METRIC_DEFINE_gauge_int32(server, tablets_num_running,
+                          "Number of Running Tablets",
+                          kudu::MetricUnit::kTablets,
+                          "Number of tablets currently running");
+
+METRIC_DEFINE_gauge_int32(server, tablets_num_failed,
+                          "Number of Failed Tablets",
+                          kudu::MetricUnit::kTablets,
+                          "Number of failed tablets");
+
+METRIC_DEFINE_gauge_int32(server, tablets_num_stopping,
+                          "Number of Stopping Tablets",
+                          kudu::MetricUnit::kTablets,
+                          "Number of tablets currently stopping");
+
+METRIC_DEFINE_gauge_int32(server, tablets_num_stopped,
+                          "Number of Stopped Tablets",
+                          kudu::MetricUnit::kTablets,
+                          "Number of tablets currently stopped");
+
+METRIC_DEFINE_gauge_int32(server, tablets_num_shutdown,
+                          "Number of Shut Down Tablets",
+                          kudu::MetricUnit::kTablets,
+                          "Number of tablets currently shut down");
+
 using std::shared_ptr;
 using std::string;
 using std::vector;
@@ -159,8 +199,48 @@ TSTabletManager::TSTabletManager(TabletServer* server)
     server_(server),
     metric_registry_(server->metric_registry()),
     tablet_copy_metrics_(server->metric_entity()),
-    tablet_manager_metrics_(server->metric_entity(), this),
-    state_(MANAGER_INITIALIZING) {}
+    state_(MANAGER_INITIALIZING) {
+  METRIC_tablets_num_not_initialized.InstantiateFunctionGauge(
+          server->metric_entity(),
+          Bind(&TSTabletManager::RefreshTabletStateCacheAndReturnCount,
+               Unretained(this), tablet::NOT_INITIALIZED))
+      ->AutoDetach(&metric_detacher_);
+  METRIC_tablets_num_initialized.InstantiateFunctionGauge(
+          server->metric_entity(),
+          Bind(&TSTabletManager::RefreshTabletStateCacheAndReturnCount,
+               Unretained(this), tablet::INITIALIZED))
+      ->AutoDetach(&metric_detacher_);
+  METRIC_tablets_num_bootstrapping.InstantiateFunctionGauge(
+          server->metric_entity(),
+          Bind(&TSTabletManager::RefreshTabletStateCacheAndReturnCount,
+               Unretained(this), tablet::BOOTSTRAPPING))
+      ->AutoDetach(&metric_detacher_);
+  METRIC_tablets_num_running.InstantiateFunctionGauge(
+          server->metric_entity(),
+          Bind(&TSTabletManager::RefreshTabletStateCacheAndReturnCount,
+               Unretained(this), tablet::RUNNING))
+      ->AutoDetach(&metric_detacher_);
+  METRIC_tablets_num_failed.InstantiateFunctionGauge(
+          server->metric_entity(),
+          Bind(&TSTabletManager::RefreshTabletStateCacheAndReturnCount,
+               Unretained(this), tablet::FAILED))
+      ->AutoDetach(&metric_detacher_);
+  METRIC_tablets_num_stopping.InstantiateFunctionGauge(
+          server->metric_entity(),
+          Bind(&TSTabletManager::RefreshTabletStateCacheAndReturnCount,
+               Unretained(this), tablet::STOPPING))
+      ->AutoDetach(&metric_detacher_);
+  METRIC_tablets_num_stopped.InstantiateFunctionGauge(
+          server->metric_entity(),
+          Bind(&TSTabletManager::RefreshTabletStateCacheAndReturnCount,
+               Unretained(this), tablet::STOPPED))
+      ->AutoDetach(&metric_detacher_);
+  METRIC_tablets_num_shutdown.InstantiateFunctionGauge(
+          server->metric_entity(),
+          Bind(&TSTabletManager::RefreshTabletStateCacheAndReturnCount,
+               Unretained(this), tablet::SHUTDOWN))
+      ->AutoDetach(&metric_detacher_);
+}
 
 TSTabletManager::~TSTabletManager() {
 }
@@ -1170,46 +1250,18 @@ void TSTabletManager::FailTabletsInDataDir(const string& uuid) {
                            dd_manager->FindDataDirByUuidIndex(uuid_idx)->dir());
 }
 
-void TSTabletManager::RefreshTabletStateCache() {
+int TSTabletManager::RefreshTabletStateCacheAndReturnCount(tablet::TabletStatePB st) {
   MonoDelta period = MonoDelta::FromMilliseconds(FLAGS_tablet_state_walk_min_period_ms);
   std::lock_guard<rw_spinlock> lock(lock_);
-  // Cache is fresh enough; don't regenerate counts.
-  if (last_walked_ + period > MonoTime::Now()) {
-    return;
-  }
-  tablet_manager_metrics_.ResetTabletStateCounts();
-  for (const auto& entry : tablet_map_) {
-    switch (entry.second->state()) {
-      case tablet::NOT_INITIALIZED:
-        tablet_manager_metrics_.num_not_initialized++;
-        break;
-      case tablet::INITIALIZED:
-        tablet_manager_metrics_.num_initialized++;
-        break;
-      case tablet::BOOTSTRAPPING:
-        tablet_manager_metrics_.num_bootstrapping++;
-        break;
-      case tablet::RUNNING:
-        tablet_manager_metrics_.num_running++;
-        break;
-      case tablet::FAILED:
-        tablet_manager_metrics_.num_failed++;
-        break;
-      case tablet::STOPPING:
-        tablet_manager_metrics_.num_stopping++;
-        break;
-      case tablet::STOPPED:
-        tablet_manager_metrics_.num_stopped++;
-        break;
-      case tablet::SHUTDOWN:
-        tablet_manager_metrics_.num_shutdown++;
-        break;
-      case tablet::UNKNOWN:
-        // pass
-        break;
+  if (last_walked_ + period < MonoTime::Now()) {
+    // Old cache: regenerate counts.
+    tablet_state_counts_.clear();
+    for (const auto& entry : tablet_map_) {
+      tablet_state_counts_[entry.second->state()]++;
     }
+    last_walked_ = MonoTime::Now();
   }
-  last_walked_ = MonoTime::Now();
+  return FindWithDefault(tablet_state_counts_, st, 0);
 }
 
 TransitionInProgressDeleter::TransitionInProgressDeleter(

http://git-wip-us.apache.org/repos/asf/kudu/blob/fb74d345/src/kudu/tserver/ts_tablet_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager.h b/src/kudu/tserver/ts_tablet_manager.h
index e7ce2cf..88bebff 100644
--- a/src/kudu/tserver/ts_tablet_manager.h
+++ b/src/kudu/tserver/ts_tablet_manager.h
@@ -17,8 +17,9 @@
 
 #pragma once
 
-#include <functional>
 #include <cstdint>
+#include <functional>
+#include <map>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -33,10 +34,10 @@
 #include "kudu/tablet/metadata.pb.h"
 #include "kudu/tserver/tablet_copy_client.h"
 #include "kudu/tserver/tablet_replica_lookup.h"
-#include "kudu/tserver/ts_tablet_manager_metrics.h"
 #include "kudu/tserver/tserver.pb.h"
 #include "kudu/tserver/tserver_admin.pb.h"
 #include "kudu/util/locks.h"
+#include "kudu/util/metrics.h"
 #include "kudu/util/monotime.h"
 #include "kudu/util/status.h"
 
@@ -48,7 +49,6 @@ class optional;
 namespace kudu {
 
 class FsManager;
-class MetricRegistry;
 class NodeInstancePB;
 class Partition;
 class PartitionSchema;
@@ -201,8 +201,9 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
   // Forces shutdown of the tablet replicas in the data dir corresponding to 'uuid'.
   void FailTabletsInDataDir(const std::string& uuid);
 
-  // Refresh the cached counts of tablet states, if the cache is old enough.
-  void RefreshTabletStateCache();
+  // Refresh the cached counts of tablet states, if the cache is old enough,
+  // and return the count for tablet state 'st'.
+  int RefreshTabletStateCacheAndReturnCount(tablet::TabletStatePB st);
 
  private:
   FRIEND_TEST(TsTabletManagerTest, TestPersistBlocks);
@@ -313,19 +314,12 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
 
   // Lock protecting tablet_map_, dirty_tablets_, state_,
   // transition_in_progress_, perm_deleted_tablet_ids_,
-  // tablet_manager_metrics_, and last_walked_.
+  // tablet_state_counts_, and last_walked_.
   mutable rw_spinlock lock_;
 
   // Map from tablet ID to tablet
   TabletMap tablet_map_;
 
-  // Timestamp indicating the last time tablet_map_ was walked to count
-  // tablet states.
-  MonoTime last_walked_ = MonoTime::Min();
-
-  // Metrics container for the tablet manager; holds cached tablet states from tablet_map_.
-  TSTabletManagerMetrics tablet_manager_metrics_;
-
   // Permanently deleted tablet ids. If a tablet is removed with status
   // TABLET_DATA_DELETED then it is added to this map (until the next process
   // restart).
@@ -339,6 +333,13 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
 
   TabletCopyClientMetrics tablet_copy_metrics_;
 
+  // Timestamp indicating the last time tablet_map_ was walked to count
+  // tablet states.
+  MonoTime last_walked_ = MonoTime::Min();
+
+  // Holds cached tablet states from tablet_map_.
+  std::map<tablet::TabletStatePB, int> tablet_state_counts_;
+
   TSTabletManagerStatePB state_;
 
   // Thread pool used to run tablet copy operations.
@@ -347,6 +348,8 @@ class TSTabletManager : public tserver::TabletReplicaLookupIf {
   // Thread pool used to open the tablets async, whether bootstrap is required or not.
   gscoped_ptr<ThreadPool> open_tablet_pool_;
 
+  FunctionGaugeDetacher metric_detacher_;
+
   DISALLOW_COPY_AND_ASSIGN(TSTabletManager);
 };
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/fb74d345/src/kudu/tserver/ts_tablet_manager_metrics.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager_metrics.cc b/src/kudu/tserver/ts_tablet_manager_metrics.cc
deleted file mode 100644
index 74df6ca..0000000
--- a/src/kudu/tserver/ts_tablet_manager_metrics.cc
+++ /dev/null
@@ -1,158 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "kudu/tserver/ts_tablet_manager_metrics.h"
-
-#include "kudu/gutil/bind.h"
-#include "kudu/gutil/bind_helpers.h"
-#include "kudu/tserver/ts_tablet_manager.h"
-
-METRIC_DEFINE_gauge_int32(server, tablets_num_not_initialized,
-                          "Number of Not Initialized Tablets",
-                          kudu::MetricUnit::kTablets,
-                          "Number of tablets currently not initialized");
-
-METRIC_DEFINE_gauge_int32(server, tablets_num_initialized,
-                          "Number of Initialized Tablets",
-                          kudu::MetricUnit::kTablets,
-                          "Number of tablets currently initialized");
-
-METRIC_DEFINE_gauge_int32(server, tablets_num_bootstrapping,
-                          "Number of Bootstrapping Tablets",
-                          kudu::MetricUnit::kTablets,
-                          "Number of tablets currently bootstrapping");
-
-METRIC_DEFINE_gauge_int32(server, tablets_num_running,
-                          "Number of Running Tablets",
-                          kudu::MetricUnit::kTablets,
-                          "Number of tablets currently running");
-
-METRIC_DEFINE_gauge_int32(server, tablets_num_failed,
-                          "Number of Failed Tablets",
-                          kudu::MetricUnit::kTablets,
-                          "Number of failed tablets");
-
-METRIC_DEFINE_gauge_int32(server, tablets_num_stopping,
-                          "Number of Stopping Tablets",
-                          kudu::MetricUnit::kTablets,
-                          "Number of tablets currently stopping");
-
-METRIC_DEFINE_gauge_int32(server, tablets_num_stopped,
-                          "Number of Stopped Tablets",
-                          kudu::MetricUnit::kTablets,
-                          "Number of tablets currently stopped");
-
-METRIC_DEFINE_gauge_int32(server, tablets_num_shutdown,
-                          "Number of Shut Down Tablets",
-                          kudu::MetricUnit::kTablets,
-                          "Number of tablets currently shut down");
-
-namespace kudu {
-namespace tserver {
-
-TSTabletManagerMetrics::TSTabletManagerMetrics(const scoped_refptr<MetricEntity>& metric_entity,
-                                               TSTabletManager* ts_tablet_manager) :
-    ts_tablet_manager_(ts_tablet_manager) {
-  METRIC_tablets_num_not_initialized.InstantiateFunctionGauge(
-          metric_entity,
-          Bind(&TSTabletManagerMetrics::NumNotInitializedTablets, Unretained(this)))
-      ->AutoDetach(&metric_detacher_);
-  METRIC_tablets_num_initialized.InstantiateFunctionGauge(
-          metric_entity,
-          Bind(&TSTabletManagerMetrics::NumInitializedTablets, Unretained(this)))
-      ->AutoDetach(&metric_detacher_);
-  METRIC_tablets_num_bootstrapping.InstantiateFunctionGauge(
-          metric_entity,
-          Bind(&TSTabletManagerMetrics::NumBootstrappingTablets, Unretained(this)))
-      ->AutoDetach(&metric_detacher_);
-  METRIC_tablets_num_running.InstantiateFunctionGauge(
-          metric_entity,
-          Bind(&TSTabletManagerMetrics::NumRunningTablets, Unretained(this)))
-      ->AutoDetach(&metric_detacher_);
-  METRIC_tablets_num_failed.InstantiateFunctionGauge(
-          metric_entity,
-          Bind(&TSTabletManagerMetrics::NumFailedTablets, Unretained(this)))
-      ->AutoDetach(&metric_detacher_);
-  METRIC_tablets_num_stopping.InstantiateFunctionGauge(
-          metric_entity,
-          Bind(&TSTabletManagerMetrics::NumStoppingTablets, Unretained(this)))
-      ->AutoDetach(&metric_detacher_);
-  METRIC_tablets_num_stopped.InstantiateFunctionGauge(
-          metric_entity,
-          Bind(&TSTabletManagerMetrics::NumStoppedTablets, Unretained(this)))
-      ->AutoDetach(&metric_detacher_);
-  METRIC_tablets_num_shutdown.InstantiateFunctionGauge(
-          metric_entity,
-          Bind(&TSTabletManagerMetrics::NumShutdownTablets, Unretained(this)))
-      ->AutoDetach(&metric_detacher_);
-
-  ResetTabletStateCounts();
-}
-
-int TSTabletManagerMetrics::NumNotInitializedTablets() {
-  ts_tablet_manager_->RefreshTabletStateCache();
-  return num_not_initialized;
-}
-
-int TSTabletManagerMetrics::NumInitializedTablets() {
-  ts_tablet_manager_->RefreshTabletStateCache();
-  return num_initialized;
-}
-
-int TSTabletManagerMetrics::NumBootstrappingTablets() {
-  ts_tablet_manager_->RefreshTabletStateCache();
-  return num_bootstrapping;
-}
-
-int TSTabletManagerMetrics::NumRunningTablets() {
-  ts_tablet_manager_->RefreshTabletStateCache();
-  return num_running;
-}
-
-int TSTabletManagerMetrics::NumFailedTablets() {
-  ts_tablet_manager_->RefreshTabletStateCache();
-  return num_failed;
-}
-
-int TSTabletManagerMetrics::NumStoppingTablets() {
-  ts_tablet_manager_->RefreshTabletStateCache();
-  return num_stopping;
-}
-
-int TSTabletManagerMetrics::NumStoppedTablets() {
-  ts_tablet_manager_->RefreshTabletStateCache();
-  return num_stopped;
-}
-
-int TSTabletManagerMetrics::NumShutdownTablets() {
-  ts_tablet_manager_->RefreshTabletStateCache();
-  return num_shutdown;
-}
-
-void TSTabletManagerMetrics::ResetTabletStateCounts() {
-  num_not_initialized = 0;
-  num_initialized = 0;
-  num_bootstrapping = 0;
-  num_running = 0;
-  num_failed = 0;
-  num_stopping = 0;
-  num_stopped = 0;
-  num_shutdown = 0;
-}
-
-} // namespace tserver
-} // namespace kudu

http://git-wip-us.apache.org/repos/asf/kudu/blob/fb74d345/src/kudu/tserver/ts_tablet_manager_metrics.h
----------------------------------------------------------------------
diff --git a/src/kudu/tserver/ts_tablet_manager_metrics.h b/src/kudu/tserver/ts_tablet_manager_metrics.h
deleted file mode 100644
index 47218b9..0000000
--- a/src/kudu/tserver/ts_tablet_manager_metrics.h
+++ /dev/null
@@ -1,63 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include "kudu/gutil/ref_counted.h"
-#include "kudu/util/metrics.h"
-
-namespace kudu {
-namespace tserver {
-
-class TSTabletManager;
-
-// A container for tablet manager metrics.
-struct TSTabletManagerMetrics {
-  TSTabletManagerMetrics(const scoped_refptr<MetricEntity>& metric_entity,
-                         TSTabletManager* ts_tablet_manager);
-
-  // Functions that return their respective tablet state count and potentially
-  // refresh the counts from ts_tablet_manager_.
-  int NumNotInitializedTablets();
-  int NumInitializedTablets();
-  int NumBootstrappingTablets();
-  int NumRunningTablets();
-  int NumFailedTablets();
-  int NumStoppingTablets();
-  int NumStoppedTablets();
-  int NumShutdownTablets();
-
-  void ResetTabletStateCounts();
-
-  // Cached tablet state counts computed from ts_tablet_manager_.
-  int num_not_initialized;
-  int num_initialized;
-  int num_bootstrapping;
-  int num_running;
-  int num_failed;
-  int num_stopping;
-  int num_stopped;
-  int num_shutdown;
-
- private:
-  TSTabletManager* ts_tablet_manager_;
-
-  FunctionGaugeDetacher metric_detacher_;
-};
-
-} // namespace tserver
-} // namespace kudu