You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by cs...@apache.org on 2020/01/15 14:29:32 UTC

[impala] 03/04: IMPALA-9295: Read aux error info regardless of overall query status

This is an automated email from the ASF dual-hosted git repository.

csringhofer pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 47053ccbc40e83eefc2f8e2b9ec59edc26d77e44
Author: Sahil Takiar <ta...@gmail.com>
AuthorDate: Mon Jan 13 14:30:01 2020 -0800

    IMPALA-9295: Read aux error info regardless of overall query status
    
    IMPALA-9137 added AuxErrorInfoPB to FragmentInstanceExecStatusPB
    in order to propagate additional error information from Impala executors
    to coordinators. The extra error information helps the coordinator decide
    if it should blacklist any nodes.
    
    Prior to this patch, the aux error info was only read if the
    overall_status of the ReportExecStatusRequestPB was an error. However,
    Impala executors don't synchronize the setting of the the aux error info
    and the overall_status, so it is possible the aux error info is dropped.
    
    This patch fixes this issue by always looking for the aux error info
    from each fragment, regardless of the overall_status from the report.
    
    Testing:
    * Core tests pass
    
    Change-Id: I7dfb5efd7a70626dfc5227f28288ff52ebb4861a
    Reviewed-on: http://gerrit.cloudera.org:8080/15027
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/runtime/coordinator-backend-state.cc |  7 ++++++-
 be/src/runtime/coordinator-backend-state.h  |  7 +++++--
 be/src/runtime/coordinator.cc               | 22 +++++++++++-----------
 be/src/runtime/coordinator.h                | 11 ++++++-----
 4 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/be/src/runtime/coordinator-backend-state.cc b/be/src/runtime/coordinator-backend-state.cc
index 79dde0a..f4b0536 100644
--- a/be/src/runtime/coordinator-backend-state.cc
+++ b/be/src/runtime/coordinator-backend-state.cc
@@ -357,7 +357,8 @@ inline bool Coordinator::BackendState::IsDoneLocked(
 bool Coordinator::BackendState::ApplyExecStatusReport(
     const ReportExecStatusRequestPB& backend_exec_status,
     const TRuntimeProfileForest& thrift_profiles, ExecSummary* exec_summary,
-    ProgressUpdater* scan_range_progress, DmlExecState* dml_exec_state) {
+    ProgressUpdater* scan_range_progress, DmlExecState* dml_exec_state,
+    vector<AuxErrorInfoPB>* aux_error_info) {
   DCHECK(!IsEmptyBackend());
   // Hold the exec_summary's lock to avoid exposing it half-way through
   // the update loop below.
@@ -443,6 +444,10 @@ bool Coordinator::BackendState::ApplyExecStatusReport(
       // TODO: We're losing this profile information. Call ReportQuerySummary only after
       // all backends have completed.
     }
+
+    if (instance_exec_status.has_aux_error_info()) {
+      aux_error_info->push_back(instance_exec_status.aux_error_info());
+    }
   }
 
   // status_ has incorporated the status from all fragment instances. If the overall
diff --git a/be/src/runtime/coordinator-backend-state.h b/be/src/runtime/coordinator-backend-state.h
index 383e70c..9d390e2 100644
--- a/be/src/runtime/coordinator-backend-state.h
+++ b/be/src/runtime/coordinator-backend-state.h
@@ -90,10 +90,13 @@ class Coordinator::BackendState {
   /// progress_update. If any instance reports an error, the overall execution status
   /// becomes the first reported error status. Returns true iff this update changed
   /// IsDone() from false to true, either because it was the last fragment to complete or
-  /// because it was the first error received.
+  /// because it was the first error received. Adds the AuxErrorInfoPB from each
+  /// FragmentInstanceExecStatusPB in backend_exec_status to the vector
+  /// aux_error_info.
   bool ApplyExecStatusReport(const ReportExecStatusRequestPB& backend_exec_status,
       const TRuntimeProfileForest& thrift_profiles, ExecSummary* exec_summary,
-      ProgressUpdater* scan_range_progress, DmlExecState* dml_exec_state);
+      ProgressUpdater* scan_range_progress, DmlExecState* dml_exec_state,
+      std::vector<AuxErrorInfoPB>* aux_error_info);
 
   /// Merges the incoming 'thrift_profile' into this backend state's host profile.
   void UpdateHostProfile(const TRuntimeProfileTree& thrift_profile);
diff --git a/be/src/runtime/coordinator.cc b/be/src/runtime/coordinator.cc
index 7bebdd0..9ac5ee6 100644
--- a/be/src/runtime/coordinator.cc
+++ b/be/src/runtime/coordinator.cc
@@ -815,9 +815,10 @@ Status Coordinator::UpdateBackendExecStatus(const ReportExecStatusRequestPB& req
   if (thrift_profiles.__isset.host_profile) {
     backend_state->UpdateHostProfile(thrift_profiles.host_profile);
   }
+  vector<AuxErrorInfoPB> aux_error_info;
 
   if (backend_state->ApplyExecStatusReport(request, thrift_profiles, &exec_summary_,
-          &progress_, &dml_exec_state_)) {
+          &progress_, &dml_exec_state_, &aux_error_info)) {
     // This backend execution has completed.
     if (VLOG_QUERY_IS_ON) {
       // Don't log backend completion if the query has already been cancelled.
@@ -838,10 +839,6 @@ Status Coordinator::UpdateBackendExecStatus(const ReportExecStatusRequestPB& req
       // Can't apply state transition until no more exec rpcs will be sent.
       exec_rpcs_complete_barrier_.Wait();
 
-      // Iterate through all instance exec statuses, and use each fragment's AuxErrorInfo
-      // to possibly blacklist any "faulty" nodes.
-      UpdateBlacklistWithAuxErrorInfo(request);
-
       // Transition the status if we're not already in a terminal state. This won't block
       // because either this transitions to an ERROR state or the query is already in
       // a terminal state.
@@ -865,22 +862,25 @@ Status Coordinator::UpdateBackendExecStatus(const ReportExecStatusRequestPB& req
     }
     num_completed_backends_->Add(1);
   }
+
+  // Iterate through all AuxErrorInfoPB objects, and use each one to possibly blacklist
+  // any "faulty" nodes.
+  UpdateBlacklistWithAuxErrorInfo(&aux_error_info);
+
   // If query execution has terminated, return a cancelled status to force the fragment
   // instance to stop executing.
   return IsExecuting() ? Status::OK() : Status::CANCELLED;
 }
 
 void Coordinator::UpdateBlacklistWithAuxErrorInfo(
-    const ReportExecStatusRequestPB& request) {
+    vector<AuxErrorInfoPB>* aux_error_info) {
   // If the Backend failed due to a RPC failure, blacklist the destination node of
   // the failed RPC. Only blacklist one node per ReportExecStatusRequestPB to avoid
   // blacklisting nodes too aggressively. Currently, only blacklist the first node
   // that contains a valid RPCErrorInfoPB object.
-  for (auto instance_exec_status : request.instance_exec_status()) {
-    if (instance_exec_status.has_aux_error_info()
-        && instance_exec_status.aux_error_info().has_rpc_error_info()) {
-      RPCErrorInfoPB rpc_error_info =
-          instance_exec_status.aux_error_info().rpc_error_info();
+  for (auto aux_error : *aux_error_info) {
+    if (aux_error.has_rpc_error_info()) {
+      RPCErrorInfoPB rpc_error_info = aux_error.rpc_error_info();
       DCHECK(rpc_error_info.has_dest_node());
       DCHECK(rpc_error_info.has_posix_error_code());
       const NetworkAddressPB& dest_node = rpc_error_info.dest_node();
diff --git a/be/src/runtime/coordinator.h b/be/src/runtime/coordinator.h
index 816018a..9ed3324 100644
--- a/be/src/runtime/coordinator.h
+++ b/be/src/runtime/coordinator.h
@@ -39,6 +39,7 @@
 
 namespace impala {
 
+class AuxErrorInfoPB;
 class ClientRequestState;
 class FragmentInstanceState;
 class MemTracker;
@@ -548,15 +549,15 @@ class Coordinator { // NOLINT: The member variables could be re-ordered to save
   /// Checks the exec_state_ of the query and returns true if the query is executing.
   bool IsExecuting();
 
-  /// Helper function for UpdateBackendExecStatus that iterates through the
-  /// FragmentInstanceExecStatusPB for each fragment and uses AuxErrorInfoPB to check if
-  /// any nodes should be blacklisted. AuxErrorInfoPB contains additional error
-  /// information about why the fragment failed, beyond what is available in the
+  /// Helper function for UpdateBackendExecStatus that iterates through the given vector
+  /// of AuxErrorInfoPB objects and uses each one to check if any nodes should be
+  /// blacklisted. AuxErrorInfoPB contains additional error information about why the
+  /// fragment failed, beyond what is available in the
   /// ReportExecStatusRequestPB::overall_status field. This method uses information in
   /// AuxErrorInfoPB to classify specific nodes as "faulty" and then blacklists them. A
   /// node might be considered "faulty" if, for example, a RPC to that node failed, or a
   /// fragment on that node failed due to a disk IO error.
-  void UpdateBlacklistWithAuxErrorInfo(const ReportExecStatusRequestPB& request);
+  void UpdateBlacklistWithAuxErrorInfo(std::vector<AuxErrorInfoPB>* aux_error_info);
 
   /// BackendState and BackendResourceState are private to the Coordinator class, so mark
   /// all tests in CoordinatorBackendStateTest as friends.