You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/05/24 19:26:48 UTC

[2/2] impala git commit: IMPALA-7055: fix race with DML errors

IMPALA-7055: fix race with DML errors

Error statuses could be lost because backend_exec_complete_barrier_
went to 0 before the query was transitioned to an error state.
Reordering the UpdateExecState() and backend_exec_complete_barrier_
calls prevents this race.

Change-Id: Idafd0b342e77a065be7cc28fa8c8a9df445622c2
Reviewed-on: http://gerrit.cloudera.org:8080/10491
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/2362b672
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/2362b672
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/2362b672

Branch: refs/heads/master
Commit: 2362b672ccd94ed97331fe9c84ac1603ecb3772f
Parents: a22ee64
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Wed May 23 14:03:12 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Thu May 24 19:09:49 2018 +0000

----------------------------------------------------------------------
 be/src/runtime/coordinator.cc | 24 ++++++++++++++++--------
 be/src/runtime/coordinator.h  |  3 +++
 2 files changed, 19 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/2362b672/be/src/runtime/coordinator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.cc b/be/src/runtime/coordinator.cc
index 998fee2..90e2390 100644
--- a/be/src/runtime/coordinator.cc
+++ b/be/src/runtime/coordinator.cc
@@ -669,25 +669,33 @@ Status Coordinator::UpdateBackendExecStatus(const TReportExecStatusParams& param
 
   if (backend_state->ApplyExecStatusReport(params, &exec_summary_, &progress_)) {
     // This backend execution has completed.
+    if (VLOG_QUERY_IS_ON) {
+      // Don't log backend completion if the query has already been cancelled.
+      int pending_backends = backend_exec_complete_barrier_->pending();
+      if (pending_backends >= 1) {
+        VLOG_QUERY << "Backend completed:"
+                   << " host=" << TNetworkAddressToString(backend_state->impalad_address())
+                   << " remaining=" << pending_backends
+                   << " query_id=" << PrintId(query_id());
+        BackendState::LogFirstInProgress(backend_states_);
+      }
+    }
     bool is_fragment_failure;
     TUniqueId failed_instance_id;
     Status status = backend_state->GetStatus(&is_fragment_failure, &failed_instance_id);
-    int pending_backends = backend_exec_complete_barrier_->Notify();
-    if (VLOG_QUERY_IS_ON && pending_backends >= 0) {
-      VLOG_QUERY << "Backend completed:"
-                 << " host=" << TNetworkAddressToString(backend_state->impalad_address())
-                 << " remaining=" << pending_backends
-                 << " query_id=" << PrintId(query_id());
-      BackendState::LogFirstInProgress(backend_states_);
-    }
     if (!status.ok()) {
       // We may start receiving status reports before all exec rpcs are complete.
       // Can't apply state transition until no more exec rpcs will be sent.
       exec_rpcs_complete_barrier_->Wait();
+      // Transition the status if we're not already in a terminal state. This won't block
+      // because either this transitions to an ERROR state or the query is already in
+      // a terminal state.
       discard_result(UpdateExecState(status,
               is_fragment_failure ? &failed_instance_id : nullptr,
               TNetworkAddressToString(backend_state->impalad_address())));
     }
+    // We've applied all changes from the final status report - notify waiting threads.
+    backend_exec_complete_barrier_->Notify();
   }
   // If all results have been returned, return a cancelled status to force the fragment
   // instance to stop executing.

http://git-wip-us.apache.org/repos/asf/impala/blob/2362b672/be/src/runtime/coordinator.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.h b/be/src/runtime/coordinator.h
index ae85bcd..5bb399f 100644
--- a/be/src/runtime/coordinator.h
+++ b/be/src/runtime/coordinator.h
@@ -350,6 +350,9 @@ class Coordinator { // NOLINT: The member variables could be re-ordered to save
   /// the Coordinator object), then finalizes execution (cancels remaining backends if
   /// transitioning to CANCELLED; in all cases releases resources and calls
   /// ComputeQuerySummary()). Must not be called if exec RPCs are pending.
+  /// Will block waiting for backends to completed if transitioning to the
+  /// RETURNED_RESULTS terminal state. Does not block if already in terminal state or
+  /// transitioning to ERROR or CANCELLED.
   void HandleExecStateTransition(const ExecState old_state, const ExecState new_state);
 
   /// Return true if 'exec_state_' is RETURNED_RESULTS.