You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by jo...@apache.org on 2020/10/14 22:48:37 UTC

[impala] 06/06: IMPALA-10220: Fix negative value bug in RpcNetworkTime counter.

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit c7581b5d8a9e00006a34cb82e88fa0ede26cfb3f
Author: Riza Suminto <ri...@cloudera.com>
AuthorDate: Mon Oct 5 18:32:07 2020 -0700

    IMPALA-10220: Fix negative value bug in RpcNetworkTime counter.
    
    Total RPC time was incorrectly computed using
    resp_.receiver_latency_ns() in function EndDataStreamCompleteCb(). This
    patch fix the bug by replacing it with eos_rsp_.receiver_latency_ns().
    This patch also fix logging mistakes in LogSlowRpc() to use its 'resp'
    parameter instead of 'resp_' field member.
    
    Testing:
    - Manually run data loading query that exhibit the bug for several times
      and verify that the Min value of RpcNetworkTime counter is always
      positive after the patch. The query used in testing is insert query to
      TPC-DS fact table store_sales of scale 10GB in single machine mini
      cluster.
    - Add DCHECK to verify that total rpc time is greater than or equal to
      receiver_latency_ns.
    - Run and pass core tests.
    
    Change-Id: I2a4d65a3e0f88349bd4ee1b01290bd2c386acc69
    Reviewed-on: http://gerrit.cloudera.org:8080/16552
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/runtime/krpc-data-stream-sender.cc | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/be/src/runtime/krpc-data-stream-sender.cc b/be/src/runtime/krpc-data-stream-sender.cc
index 9a0f28e..42bcec9 100644
--- a/be/src/runtime/krpc-data-stream-sender.cc
+++ b/be/src/runtime/krpc-data-stream-sender.cc
@@ -390,12 +390,12 @@ void KrpcDataStreamSender::Channel::MarkDone(const Status& status) {
 template <typename ResponsePBType>
 void KrpcDataStreamSender::Channel::LogSlowRpc(
     const char* rpc_name, int64_t total_time_ns, const ResponsePBType& resp) {
-  int64_t network_time_ns = total_time_ns - resp_.receiver_latency_ns();
+  int64_t network_time_ns = total_time_ns - resp.receiver_latency_ns();
   LOG(INFO) << "Slow " << rpc_name << " RPC to " << address_
             << " (fragment_instance_id=" << PrintId(fragment_instance_id_) << "): "
             << "took " << PrettyPrinter::Print(total_time_ns, TUnit::TIME_NS) << ". "
             << "Receiver time: "
-            << PrettyPrinter::Print(resp_.receiver_latency_ns(), TUnit::TIME_NS)
+            << PrettyPrinter::Print(resp.receiver_latency_ns(), TUnit::TIME_NS)
             << " Network time: " << PrettyPrinter::Print(network_time_ns, TUnit::TIME_NS);
 }
 
@@ -489,15 +489,16 @@ void KrpcDataStreamSender::Channel::HandleFailedRPC(const DoRpcFn& rpc_fn,
 }
 
 void KrpcDataStreamSender::Channel::TransmitDataCompleteCb() {
-  DCHECK_NE(rpc_start_time_ns_, 0);
-  int64_t total_time = MonotonicNanos() - rpc_start_time_ns_;
   std::unique_lock<SpinLock> l(lock_);
   DCHECK(rpc_in_flight_);
+  DCHECK_NE(rpc_start_time_ns_, 0);
+  int64_t total_time = MonotonicNanos() - rpc_start_time_ns_;
   const kudu::Status controller_status = rpc_controller_.status();
   if (LIKELY(controller_status.ok())) {
     DCHECK(rpc_in_flight_batch_ != nullptr);
     // 'receiver_latency_ns' is calculated with MonoTime, so it must be non-negative.
     DCHECK_GE(resp_.receiver_latency_ns(), 0);
+    DCHECK_GE(total_time, resp_.receiver_latency_ns());
     int64_t row_batch_size = RowBatch::GetSerializedSize(*rpc_in_flight_batch_);
     int64_t network_time = total_time - resp_.receiver_latency_ns();
     COUNTER_ADD(parent_->bytes_sent_counter_, row_batch_size);
@@ -631,8 +632,9 @@ void KrpcDataStreamSender::Channel::EndDataStreamCompleteCb() {
   const kudu::Status controller_status = rpc_controller_.status();
   if (LIKELY(controller_status.ok())) {
     // 'receiver_latency_ns' is calculated with MonoTime, so it must be non-negative.
-    DCHECK_GE(resp_.receiver_latency_ns(), 0);
-    int64_t network_time_ns = total_time_ns - resp_.receiver_latency_ns();
+    DCHECK_GE(eos_resp_.receiver_latency_ns(), 0);
+    DCHECK_GE(total_time_ns, eos_resp_.receiver_latency_ns());
+    int64_t network_time_ns = total_time_ns - eos_resp_.receiver_latency_ns();
     parent_->network_time_stats_->UpdateCounter(network_time_ns);
     parent_->recvr_time_stats_->UpdateCounter(eos_resp_.receiver_latency_ns());
     if (IsSlowRpc(total_time_ns)) LogSlowRpc("EndDataStream", total_time_ns, eos_resp_);