You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2018/05/24 19:26:48 UTC
[2/2] impala git commit: IMPALA-7055: fix race with DML errors
IMPALA-7055: fix race with DML errors
Error statuses could be lost because backend_exec_complete_barrier_
went to 0 before the query was transitioned to an error state.
Reordering the UpdateExecState() and backend_exec_complete_barrier_
calls prevents this race.
Change-Id: Idafd0b342e77a065be7cc28fa8c8a9df445622c2
Reviewed-on: http://gerrit.cloudera.org:8080/10491
Reviewed-by: Tim Armstrong <ta...@cloudera.com>
Tested-by: Impala Public Jenkins <im...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/2362b672
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/2362b672
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/2362b672
Branch: refs/heads/master
Commit: 2362b672ccd94ed97331fe9c84ac1603ecb3772f
Parents: a22ee64
Author: Tim Armstrong <ta...@cloudera.com>
Authored: Wed May 23 14:03:12 2018 -0700
Committer: Impala Public Jenkins <im...@cloudera.com>
Committed: Thu May 24 19:09:49 2018 +0000
----------------------------------------------------------------------
be/src/runtime/coordinator.cc | 24 ++++++++++++++++--------
be/src/runtime/coordinator.h | 3 +++
2 files changed, 19 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/2362b672/be/src/runtime/coordinator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.cc b/be/src/runtime/coordinator.cc
index 998fee2..90e2390 100644
--- a/be/src/runtime/coordinator.cc
+++ b/be/src/runtime/coordinator.cc
@@ -669,25 +669,33 @@ Status Coordinator::UpdateBackendExecStatus(const TReportExecStatusParams& param
if (backend_state->ApplyExecStatusReport(params, &exec_summary_, &progress_)) {
// This backend execution has completed.
+ if (VLOG_QUERY_IS_ON) {
+ // Don't log backend completion if the query has already been cancelled.
+ int pending_backends = backend_exec_complete_barrier_->pending();
+ if (pending_backends >= 1) {
+ VLOG_QUERY << "Backend completed:"
+ << " host=" << TNetworkAddressToString(backend_state->impalad_address())
+ << " remaining=" << pending_backends
+ << " query_id=" << PrintId(query_id());
+ BackendState::LogFirstInProgress(backend_states_);
+ }
+ }
bool is_fragment_failure;
TUniqueId failed_instance_id;
Status status = backend_state->GetStatus(&is_fragment_failure, &failed_instance_id);
- int pending_backends = backend_exec_complete_barrier_->Notify();
- if (VLOG_QUERY_IS_ON && pending_backends >= 0) {
- VLOG_QUERY << "Backend completed:"
- << " host=" << TNetworkAddressToString(backend_state->impalad_address())
- << " remaining=" << pending_backends
- << " query_id=" << PrintId(query_id());
- BackendState::LogFirstInProgress(backend_states_);
- }
if (!status.ok()) {
// We may start receiving status reports before all exec rpcs are complete.
// Can't apply state transition until no more exec rpcs will be sent.
exec_rpcs_complete_barrier_->Wait();
+ // Transition the status if we're not already in a terminal state. This won't block
+ // because either this transitions to an ERROR state or the query is already in
+ // a terminal state.
discard_result(UpdateExecState(status,
is_fragment_failure ? &failed_instance_id : nullptr,
TNetworkAddressToString(backend_state->impalad_address())));
}
+ // We've applied all changes from the final status report - notify waiting threads.
+ backend_exec_complete_barrier_->Notify();
}
// If all results have been returned, return a cancelled status to force the fragment
// instance to stop executing.
http://git-wip-us.apache.org/repos/asf/impala/blob/2362b672/be/src/runtime/coordinator.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.h b/be/src/runtime/coordinator.h
index ae85bcd..5bb399f 100644
--- a/be/src/runtime/coordinator.h
+++ b/be/src/runtime/coordinator.h
@@ -350,6 +350,9 @@ class Coordinator { // NOLINT: The member variables could be re-ordered to save
/// the Coordinator object), then finalizes execution (cancels remaining backends if
/// transitioning to CANCELLED; in all cases releases resources and calls
/// ComputeQuerySummary()). Must not be called if exec RPCs are pending.
+ /// Will block waiting for backends to completed if transitioning to the
+ /// RETURNED_RESULTS terminal state. Does not block if already in terminal state or
+ /// transitioning to ERROR or CANCELLED.
void HandleExecStateTransition(const ExecState old_state, const ExecState new_state);
/// Return true if 'exec_state_' is RETURNED_RESULTS.