You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2020/08/28 16:54:03 UTC

[impala] 02/02: IMPALA-10050: Fixed DCHECK error for backend in terminal state.

This is an automated email from the ASF dual-hosted git repository.

stakiar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 3733c4cc2cfb78d7f13463fb1ee9e1c4560d4a3d
Author: wzhou-code <wz...@cloudera.com>
AuthorDate: Thu Aug 6 15:52:47 2020 -0700

    IMPALA-10050: Fixed DCHECK error for backend in terminal state.
    
    A recent patch for IMPALA-6788 makes the coordinator cancel inflight
    query fragment instances when it receives a failure report from one
    backend. It's possible that BackendState::Cancel() is called for
    one fragment instance before the first execution status report
    from its backend is received and processed by the coordinator.
    Since the status of the BackendState is set to Cancelled after
    Cancel() is called, the execution of the fragment instance is
    treated as Done in that case, so the status report will NOT be
    processed. Hence the backend receives an OK response from the
    coordinator even though it sent a report with an execution error.
    This makes the backend hit a DCHECK error if the backend is in a
    terminal state with an error.
    This patch fixes the issue by making the coordinator send a
    CANCELLED status in the response to a status report if the backend
    status is not OK and the execution status report was not applied.
    
    Testing:
     - The issue could be reproduced by running test_failpoints for about
       20 iterations. Verified the fix by running test_failpoints over
       200 iterations without DCHECK failure.
     - Passed TestProcessFailures::test_kill_coordinator.
     - Passed TestRPCException::test_state_report_error.
     - Passed exhaustive tests.
    
    Change-Id: Iba6a72f98c0f9299c22c58830ec5a643335b966a
    Reviewed-on: http://gerrit.cloudera.org:8080/16303
    Reviewed-by: Thomas Tauber-Marshall <tm...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/runtime/coordinator.cc    |  7 ++++++-
 be/src/runtime/query-exec-mgr.cc | 20 +++++++++++---------
 be/src/runtime/query-state.cc    |  3 +++
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/be/src/runtime/coordinator.cc b/be/src/runtime/coordinator.cc
index b4f5178..119e40b 100644
--- a/be/src/runtime/coordinator.cc
+++ b/be/src/runtime/coordinator.cc
@@ -1005,7 +1005,12 @@ Status Coordinator::UpdateBackendExecStatus(const ReportExecStatusRequestPB& req
 
   // If query execution has terminated, return a cancelled status to force the fragment
   // instance to stop executing.
-  return IsExecuting() ? Status::OK() : Status::CANCELLED;
+  // After cancelling backend_state, it's possible that current exec_state is still
+  // EXECUTING but the backend status is not OK since execution status report is not
+  // applied to update the overall status. In such case, we should return a cancelled
+  // status to backend.
+  return (IsExecuting() && backend_state->GetStatus().ok()) ? Status::OK() :
+                                                              Status::CANCELLED;
 }
 
 Status Coordinator::UpdateBlacklistWithAuxErrorInfo(
diff --git a/be/src/runtime/query-exec-mgr.cc b/be/src/runtime/query-exec-mgr.cc
index ac07085..6874ca4 100644
--- a/be/src/runtime/query-exec-mgr.cc
+++ b/be/src/runtime/query-exec-mgr.cc
@@ -226,15 +226,17 @@ void QueryExecMgr::CancelQueriesForFailedCoordinators(
   // Build a list of queries that are scheduled by failed coordinators (as
   // evidenced by their absence from the cluster membership list).
   std::vector<QueryCancellationTask> to_cancel;
-  qs_map_.DoFuncForAllEntries([&](QueryState* qs) {
-    if (qs != nullptr && !qs->IsCancelled()) {
-      if (current_membership.find(qs->coord_backend_id()) == current_membership.end()) {
-        // decremented by ReleaseQueryState()
-        AcquireQueryStateLocked(qs);
-        to_cancel.push_back(QueryCancellationTask(qs));
-      }
-    }
-  });
+  ExecEnv::GetInstance()->query_exec_mgr()->qs_map_.DoFuncForAllEntries(
+      [&](QueryState* qs) {
+        if (qs != nullptr && !qs->IsCancelled()) {
+          if (current_membership.find(qs->coord_backend_id())
+              == current_membership.end()) {
+            // decremented by ReleaseQueryState()
+            AcquireQueryStateLocked(qs);
+            to_cancel.push_back(QueryCancellationTask(qs));
+          }
+        }
+      });
 
   // Since we are the only producer for the cancellation thread pool, we can find the
   // remaining capacity of the pool and submit the new cancellation requests without
diff --git a/be/src/runtime/query-state.cc b/be/src/runtime/query-state.cc
index ce61f99..8e071f9 100644
--- a/be/src/runtime/query-state.cc
+++ b/be/src/runtime/query-state.cc
@@ -806,6 +806,9 @@ done:
       DCHECK(entry.second->IsDone());
     }
   } else {
+    // If the query execution hit an error, when the final status report is sent, the
+    // coordinator's response will instruct the QueryState to cancel itself, so Cancel()
+    // should have always been called by this point.
     DCHECK_EQ(is_cancelled_.Load(), 1);
   }
 }