You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@kudu.apache.org by ad...@apache.org on 2019/02/02 18:49:22 UTC

[kudu] 01/03: Add tracing to StartTabletCopy

This is an automated email from the ASF dual-hosted git repository.

adar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git

commit 530b852c8d8e13e0559a4cfa847e55baae31cac8
Author: Will Berkeley <wd...@gmail.org>
AuthorDate: Thu Jan 31 11:17:25 2019 -0800

    Add tracing to StartTabletCopy
    
    This patch adds tracing information to the StartTabletCopy RPC. The
    sequence of events is the following:
    
    1. A remote peer initiates a tablet copy on the local peer via
       StartTabletCopy.
    2. The local peer starts processing the StartTabletCopy. If the copy is
       copying over an existing, running replica, the replica is shutdown and
       tombstoned (so its data is deleted).
    3. The local peer creates a tablet copy client.
    4. The local peer opens a tablet copy session between the local and
       remote peer by calling BeginTabletCopySession on the remote peer. The
       local peer downloads the tablet superblock as part of this.
    5. The local peer either
         a. replaces tombstoned metadata with new metadata, or
         b. writes new tablet metadata.
    6. The local peer writes the new consensus metadata for the tablet.
    7. The local peer responds to the StartTabletCopy. Downloading data and
       WALs occurs asynchronously, managed by the tablet copy session.
    
    Steps 2 and 4-6 may be slow due to disk or network I/O, but there was no
    tracing to help diagnose the source of slowness in cases where
    StartTabletCopy was slow. This patch adds simple tracing for the
    completion of steps 2, 4, 5a, 5b, and 6, plus a trace message for when
    the replica is registered with the tablet manager at the end of the
    processing of the RPC.
    
    Change-Id: I3f32871f3ac1c0d4513a9ba2c955be42dd6e478e
    Reviewed-on: http://gerrit.cloudera.org:8080/12326
    Tested-by: Kudu Jenkins
    Reviewed-by: Attila Bukor <ab...@apache.org>
---
 src/kudu/tserver/tablet_copy_client.cc | 13 +++++++++++--
 src/kudu/tserver/ts_tablet_manager.cc  |  2 ++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/kudu/tserver/tablet_copy_client.cc b/src/kudu/tserver/tablet_copy_client.cc
index d8bbc55..6c3669c 100644
--- a/src/kudu/tserver/tablet_copy_client.cc
+++ b/src/kudu/tserver/tablet_copy_client.cc
@@ -66,6 +66,7 @@
 #include "kudu/util/random_util.h"
 #include "kudu/util/scoped_cleanup.h"
 #include "kudu/util/status.h"
+#include "kudu/util/trace.h"
 
 DEFINE_int32(tablet_copy_begin_session_timeout_ms, 30000,
              "Tablet server RPC client timeout for BeginTabletCopySession calls. "
@@ -236,8 +237,9 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
                                    Substitute("$0 (resolved to $1)",
                                               copy_source_addr.host(), addr.host()));
   }
-  LOG_WITH_PREFIX(INFO) << "Beginning tablet copy session"
-                        << " from remote peer at address " << copy_source_addr.ToString();
+  LOG_WITH_PREFIX(INFO) <<
+      Substitute("Beginning tablet copy session from remote peer at address $0",
+                 copy_source_addr.ToString());
 
   // Set up an RPC proxy for the TabletCopyService.
   proxy_.reset(new TabletCopyServiceProxy(messenger_, addr, copy_source_addr.host()));
@@ -254,6 +256,8 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
     return proxy_->BeginTabletCopySession(req, &resp, &controller);
   }), "unable to begin tablet copy session");
 
+  TRACE("Tablet copy session begun");
+
   string copy_peer_uuid = resp.has_responder_uuid()
       ? resp.responder_uuid() : "(unknown uuid)";
   if (resp.superblock().tablet_data_state() != tablet::TABLET_DATA_READY) {
@@ -331,6 +335,8 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
                                           tablet::TABLET_DATA_COPYING,
                                           /*last_logged_opid=*/ boost::none),
         "Could not replace superblock with COPYING data state");
+    TRACE("Replaced tombstoned tablet metadata.");
+
     RETURN_NOT_OK_PREPEND(fs_manager_->dd_manager()->CreateDataDirGroup(tablet_id_),
         "Could not create a new directory group for tablet copy");
   } else {
@@ -355,6 +361,8 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
                                             superblock_->tablet_data_state(),
                                             superblock_->tombstone_last_logged_opid(),
                                             &meta_));
+    TRACE("Wrote new tablet metadata");
+
     // We have begun persisting things to disk. Update the tablet copy state
     // machine so we know there is state to clean up in case we fail.
     state_ = kStarting;
@@ -365,6 +373,7 @@ Status TabletCopyClient::Start(const HostPort& copy_source_addr,
   // Create the ConsensusMetadata before returning from Start() so that it's
   // possible to vote while we are copying the replica for the first time.
   RETURN_NOT_OK(WriteConsensusMetadata());
+  TRACE("Wrote new consensus metadata");
 
   state_ = kStarted;
   if (meta) {
diff --git a/src/kudu/tserver/ts_tablet_manager.cc b/src/kudu/tserver/ts_tablet_manager.cc
index fc5d8f6..b365ba4 100644
--- a/src/kudu/tserver/ts_tablet_manager.cc
+++ b/src/kudu/tserver/ts_tablet_manager.cc
@@ -653,6 +653,7 @@ void TSTabletManager::RunTabletCopy(
               s.CloneAndPrepend(Substitute("Unable to delete on-disk data from tablet $0",
                                            tablet_id)));
         }
+        TRACE("Shutdown and deleted data from running replica");
         break;
       }
       default:
@@ -722,6 +723,7 @@ void TSTabletManager::RunTabletCopy(
   RegisterTabletReplicaMode mode = replacing_tablet ? REPLACEMENT_REPLICA : NEW_REPLICA;
   scoped_refptr<TabletReplica> replica;
   CALLBACK_RETURN_NOT_OK(CreateAndRegisterTabletReplica(meta, mode, &replica));
+  TRACE("Replica registered.");
 
   // Now we invoke the StartTabletCopy callback and respond success to the
   // remote caller, since StartTabletCopy() is an asynchronous RPC call. Then