You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by al...@apache.org on 2022/03/09 22:02:44 UTC
[kudu] branch master updated: [tablet] output more info for long row lock waits
This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 8e704f2 [tablet] output more info for long row lock waits
8e704f2 is described below
commit 8e704f28b172e211477bd06409a429b20a1e55a1
Author: Alexey Serbin <al...@apache.org>
AuthorDate: Mon Mar 7 19:47:36 2022 -0800
[tablet] output more info for long row lock waits
While troubleshooting one performance issue, I found that the messages
on long-wait row locks didn't provide a lot of useful information
especially when log redaction was on (it's on by default).
This patch address that, so now it's easier to tell:
* what's the tablet where the congestion happens
* whether recurrent messages are about the same primary key
as for prior messages
Before:
Waited 2 seconds to obtain row lock on key <redacted> cur holder: 0x2f2e85180
After:
Waited 2 seconds to obtain row lock on key '<redacted>' (key hash 8560710080558783806) tablet 18b47e40a6864ff39d1d82accf7202ec cur holder 0x2f2e85180
Change-Id: I521976ff6986676ad9a33a810c56eaa8e7deeab8
Reviewed-on: http://gerrit.cloudera.org:8080/18300
Tested-by: Kudu Jenkins
Reviewed-by: Andrew Wong <aw...@cloudera.com>
---
src/kudu/tablet/lock_manager.cc | 18 ++++++++---
src/kudu/tablet/tablet_replica-test.cc | 58 ++++++++++++++++++++++++++++++++++
2 files changed, 71 insertions(+), 5 deletions(-)
diff --git a/src/kudu/tablet/lock_manager.cc b/src/kudu/tablet/lock_manager.cc
index bf9d57f..2981b5d 100644
--- a/src/kudu/tablet/lock_manager.cc
+++ b/src/kudu/tablet/lock_manager.cc
@@ -34,6 +34,8 @@
#include "kudu/gutil/port.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/gutil/walltime.h"
+#include "kudu/tablet/ops/op.h"
+#include "kudu/tablet/tablet_replica.h"
#include "kudu/tserver/tserver.pb.h"
#include "kudu/util/array_view.h"
#include "kudu/util/faststring.h"
@@ -53,8 +55,6 @@ using strings::Substitute;
namespace kudu {
namespace tablet {
-class OpState;
-
// ============================================================================
// LockTable
// ============================================================================
@@ -79,6 +79,10 @@ class LockEntry {
return KUDU_REDACT(key_.ToDebugString());
}
+ uint64_t key_hash() const {
+ return key_hash_;
+ }
+
void Unlock();
// Mutex used by the LockManager
@@ -579,8 +583,11 @@ void LockManager::AcquireLockOnEntry(LockEntry* entry, const OpState* op) {
int waited_seconds = 0;
while (!entry->sem.TimedAcquire(MonoDelta::FromSeconds(1))) {
const OpState* cur_holder = ANNOTATE_UNPROTECTED_READ(entry->holder_);
- LOG(WARNING) << "Waited " << (++waited_seconds) << " seconds to obtain row lock on key "
- << entry->ToString() << " cur holder: " << cur_holder;
+ LOG(WARNING) << Substitute(
+ "Waited $0 seconds to obtain row lock on key '$1' (key hash $2) "
+ "tablet $3 cur holder $4",
+ ++waited_seconds, entry->ToString(), entry->key_hash(),
+ op->tablet_replica()->tablet_id(), cur_holder);
// TODO(unknown): would be nice to also include some info about the blocking op,
// but it's a bit tricky to do in a non-racy fashion (the other op may
// complete at any point)
@@ -588,7 +595,8 @@ void LockManager::AcquireLockOnEntry(LockEntry* entry, const OpState* op) {
MicrosecondsInt64 wait_us = GetMonoTimeMicros() - start_wait_us;
TRACE_COUNTER_INCREMENT("row_lock_wait_us", wait_us);
if (wait_us > 100 * 1000) {
- TRACE("Waited $0us for lock on $1", wait_us, KUDU_REDACT(entry->ToString()));
+ TRACE("Waited $0us for lock on $1 (key hash $2)",
+ wait_us, entry->ToString(), entry->key_hash());
}
}
diff --git a/src/kudu/tablet/tablet_replica-test.cc b/src/kudu/tablet/tablet_replica-test.cc
index e6980a9..0e7734c 100644
--- a/src/kudu/tablet/tablet_replica-test.cc
+++ b/src/kudu/tablet/tablet_replica-test.cc
@@ -22,6 +22,7 @@
#include <memory>
#include <ostream>
#include <string>
+#include <thread>
#include <utility>
#include <boost/optional/optional.hpp>
@@ -46,6 +47,7 @@
#include "kudu/fs/fs_manager.h"
#include "kudu/gutil/macros.h"
#include "kudu/gutil/ref_counted.h"
+#include "kudu/tablet/lock_manager.h"
#include "kudu/tablet/ops/alter_schema_op.h"
#include "kudu/tablet/ops/op.h"
#include "kudu/tablet/ops/op_driver.h"
@@ -57,12 +59,14 @@
#include "kudu/tablet/tablet_replica_mm_ops.h"
#include "kudu/tserver/tserver.pb.h"
#include "kudu/tserver/tserver_admin.pb.h"
+#include "kudu/util/array_view.h"
#include "kudu/util/countdown_latch.h"
#include "kudu/util/maintenance_manager.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/pb_util.h"
#include "kudu/util/random.h"
+#include "kudu/util/slice.h"
#include "kudu/util/status.h"
#include "kudu/util/test_macros.h"
#include "kudu/util/test_util.h"
@@ -89,6 +93,7 @@ using kudu::tserver::WriteRequestPB;
using kudu::tserver::WriteResponsePB;
using std::shared_ptr;
using std::string;
+using std::thread;
using std::unique_ptr;
namespace kudu {
@@ -689,5 +694,58 @@ TEST_F(TabletReplicaTest, TestRestartAfterGCDeletedRowsets) {
ASSERT_EQ(1, tablet->num_rowsets());
}
+// This is a trivial test scenario to check how row locking works in case of
+// concurrent attempts to lock the same row with relatively long waiting times.
+// The thread attempting to acquire the row lock for long times should be able
+// to acquire the lock eventually and log about its attempts to acquire the log.
+// The logging part isn't covered by any special assertions, though.
+// An alternative place to add this scenario could be lock_manager-test.cc, but
+// for proper logging a real WriteOpState backed by a tablet is necessary.
+TEST_F(TabletReplicaTest, RowLocksLongWaitAndLogging) {
+ SKIP_IF_SLOW_NOT_ALLOWED();
+
+ constexpr const char* const kKey = "key";
+ constexpr int32_t kValue = 0;
+
+ const Schema schema(GetTestSchema());
+
+ Slice key[]{kKey};
+ unique_ptr<WriteRequestPB> req(new WriteRequestPB);
+ req->set_tablet_id(tablet()->tablet_id());
+ CHECK_OK(SchemaToPB(schema, req->mutable_schema()));
+ KuduPartialRow row(&schema);
+ CHECK_OK(row.SetInt32(kKey, kValue));
+ {
+ RowOperationsPBEncoder enc(req->mutable_row_operations());
+ enc.Add(RowOperationsPB::DELETE, row);
+ }
+ unique_ptr<WriteResponsePB> resp(new WriteResponsePB);
+ LockManager lock_manager;
+
+ thread t0([&]{
+ unique_ptr<WriteOpState> op_state(new WriteOpState(
+ tablet_replica_.get(), req.get(), nullptr, resp.get()));
+ ScopedRowLock row_lock(
+ &lock_manager, op_state.get(), key, LockManager::LOCK_EXCLUSIVE);
+ CHECK(row_lock.acquired());
+ // Pause for a while when the other thread tries to acquire the lock,
+ // so the other thread logs about its attempts to acquire the row lock.
+ SleepFor(MonoDelta::FromMilliseconds(3000));
+ });
+
+ thread t1([&]{
+ // Let the other thread acquire the lock first.
+ SleepFor(MonoDelta::FromMilliseconds(500));
+ unique_ptr<WriteOpState> op_state(new WriteOpState(
+ tablet_replica_.get(), req.get(), nullptr, resp.get()));
+ ScopedRowLock row_lock(
+ &lock_manager, op_state.get(), key, LockManager::LOCK_EXCLUSIVE);
+ CHECK(row_lock.acquired());
+ });
+
+ t0.join();
+ t1.join();
+}
+
} // namespace tablet
} // namespace kudu