You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by sa...@apache.org on 2018/02/07 21:23:14 UTC

[3/4] impala git commit: Revert 'IMPALA-6338: Fix flaky test_profile_fragment_instances'

Revert 'IMPALA-6338: Fix flaky test_profile_fragment_instances'

There have been several crashes observed in testing recently, and its
not clear what's going on, so for now revert this moderately risky
change.

Change-Id: I48c11f0817c5190a3a94f8260f3e8ef653357ab3
Reviewed-on: http://gerrit.cloudera.org:8080/9243
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Thomas Tauber-Marshall <tm...@cloudera.com>


Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/0a9f0369
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/0a9f0369
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/0a9f0369

Branch: refs/heads/master
Commit: 0a9f0369659b0996ac9db3a2e9ef877941d8b9eb
Parents: 2890f30
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
Authored: Wed Feb 7 11:12:47 2018 -0800
Committer: Thomas Tauber-Marshall <tm...@cloudera.com>
Committed: Wed Feb 7 20:06:52 2018 +0000

----------------------------------------------------------------------
 be/src/common/status.h                      |  6 ------
 be/src/runtime/coordinator-backend-state.cc | 12 ++++++------
 be/src/runtime/coordinator-backend-state.h  |  6 ++----
 be/src/runtime/coordinator.cc               | 10 ++++------
 be/src/runtime/coordinator.h                |  4 +---
 tests/query_test/test_observability.py      | 10 +++++-----
 6 files changed, 18 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/be/src/common/status.h
----------------------------------------------------------------------
diff --git a/be/src/common/status.h b/be/src/common/status.h
index f0f91f7..24dba8b 100644
--- a/be/src/common/status.h
+++ b/be/src/common/status.h
@@ -95,13 +95,7 @@ class NODISCARD Status {
   static Status MemLimitExceeded();
   static Status MemLimitExceeded(const std::string& details);
 
-  /// Indicates a 'cancelled' status. CANCELLED should not be reported by a fragment
-  /// instance that encounters a problem - instances should return a specific error,
-  /// and then the coordinator will initiate cancellation.
-  /// TODO: we use this in some places to indicate things other than query cancellation,
-  /// which can be confusing.
   static const Status CANCELLED;
-
   static const Status DEPRECATED_RPC;
 
   /// Copy c'tor makes copy of error detail so Status can be returned by value.

http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/be/src/runtime/coordinator-backend-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator-backend-state.cc b/be/src/runtime/coordinator-backend-state.cc
index b238cad..914a3e4 100644
--- a/be/src/runtime/coordinator-backend-state.cc
+++ b/be/src/runtime/coordinator-backend-state.cc
@@ -234,7 +234,7 @@ void Coordinator::BackendState::LogFirstInProgress(
 }
 
 inline bool Coordinator::BackendState::IsDone() const {
-  return num_remaining_instances_ == 0 || (!status_.ok() && !status_.IsCancelled());
+  return num_remaining_instances_ == 0 || !status_.ok();
 }
 
 bool Coordinator::BackendState::ApplyExecStatusReport(
@@ -338,8 +338,8 @@ bool Coordinator::BackendState::Cancel() {
   // Nothing to cancel if the exec rpc was not sent
   if (!rpc_sent_) return false;
 
-  // don't cancel if it already finished (for any reason) or cancelled
-  if (IsDone() || status_.IsCancelled()) return false;
+  // don't cancel if it already finished (for any reason)
+  if (IsDone()) return false;
 
   /// If the status is not OK, we still try to cancel - !OK status might mean
   /// communication failure between backend and coordinator, but fragment
@@ -391,10 +391,10 @@ bool Coordinator::BackendState::Cancel() {
 void Coordinator::BackendState::PublishFilter(const TPublishFilterParams& rpc_params) {
   DCHECK_EQ(rpc_params.dst_query_id, query_id_);
   {
-    // If the backend is already done or cancelled, it's not waiting for this filter, so
-    // we skip sending it in this case.
+    // If the backend is already done, it's not waiting for this filter, so we skip
+    // sending it in this case.
     lock_guard<mutex> l(lock_);
-    if (IsDone() || status_.IsCancelled()) return;
+    if (IsDone()) return;
   }
 
   if (fragments_.count(rpc_params.dst_fragment_idx) == 0) return;

http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/be/src/runtime/coordinator-backend-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator-backend-state.h b/be/src/runtime/coordinator-backend-state.h
index 860b968..0973ca3 100644
--- a/be/src/runtime/coordinator-backend-state.h
+++ b/be/src/runtime/coordinator-backend-state.h
@@ -219,7 +219,7 @@ class Coordinator::BackendState {
 
   /// If the status indicates an error status, execution has either been aborted by the
   /// executing impalad (which then reported the error) or cancellation has been
-  /// initiated by the coordinator.
+  /// initiated; either way, execution must not be cancelled.
   Status status_;
 
   /// Used to distinguish between errors reported by a specific fragment instance,
@@ -254,9 +254,7 @@ class Coordinator::BackendState {
       const FilterRoutingTable& filter_routing_table,
       TExecQueryFInstancesParams* rpc_params);
 
-  /// Return true if execution at this backend is done. The backend is considered done if
-  /// either all instances have completed, or an error (other than cancel) is encountered.
-  /// Caller must hold lock_.
+  /// Return true if execution at this backend is done. Caller must hold lock_.
   bool IsDone() const;
 };
 

http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/be/src/runtime/coordinator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.cc b/be/src/runtime/coordinator.cc
index 05ecf9f..7973775 100644
--- a/be/src/runtime/coordinator.cc
+++ b/be/src/runtime/coordinator.cc
@@ -867,9 +867,6 @@ Status Coordinator::GetNext(QueryResultSet* results, int max_rows, bool* eos) {
     ReleaseExecResources();
     // wait for all backends to complete before computing the summary
     // TODO: relocate this so GetNext() won't have to wait for backends to complete?
-    // Note that doing this here allows us to ensure that a query that completes
-    // successfully will have a full runtime profile by the time that Fetch() indicates
-    // all of the results have been returned.
     RETURN_IF_ERROR(WaitForBackendCompletion());
     // Release admission control resources after backends are finished.
     ReleaseAdmissionControlResources();
@@ -923,8 +920,10 @@ Status Coordinator::UpdateBackendExecStatus(const TReportExecStatusParams& param
         Substitute("Unknown backend index $0 (max known: $1)",
             params.coord_state_idx, backend_states_.size() - 1));
   }
-  // If the query was cancelled, don't process the update.
-  if (query_status_.IsCancelled()) return Status::OK();
+  BackendState* backend_state = backend_states_[params.coord_state_idx];
+  // TODO: return here if returned_all_results_?
+  // TODO: return CANCELLED in that case? Although that makes the cancellation propagation
+  // path more irregular.
 
   // TODO: only do this when the sink is done; probably missing a done field
   // in TReportExecStatus for that
@@ -932,7 +931,6 @@ Status Coordinator::UpdateBackendExecStatus(const TReportExecStatusParams& param
     UpdateInsertExecStatus(params.insert_exec_status);
   }
 
-  BackendState* backend_state = backend_states_[params.coord_state_idx];
   if (backend_state->ApplyExecStatusReport(params, &exec_summary_, &progress_)) {
     // This report made this backend done, so update the status and
     // num_remaining_backends_.

http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/be/src/runtime/coordinator.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.h b/be/src/runtime/coordinator.h
index fbbdfa9..d630b9a 100644
--- a/be/src/runtime/coordinator.h
+++ b/be/src/runtime/coordinator.h
@@ -290,9 +290,7 @@ class Coordinator { // NOLINT: The member variables could be re-ordered to save
   boost::mutex lock_;
 
   /// Overall status of the entire query; set to the first reported fragment error
-  /// status or to CANCELLED, if Cancel() is called. Note that some fragments may have
-  /// status CANCELLED even if this is not CANCELLED if cancellation is initiated because
-  /// returned_all_results_ is true or an error is encountered.
+  /// status or to CANCELLED, if Cancel() is called.
   Status query_status_;
 
   /// If true, the query is done returning all results.  It is possible that the

http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/tests/query_test/test_observability.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_observability.py b/tests/query_test/test_observability.py
index 960a6f4..e838081 100644
--- a/tests/query_test/test_observability.py
+++ b/tests/query_test/test_observability.py
@@ -124,14 +124,14 @@ class TestObservability(ImpalaTestSuite):
         join (select * from l LIMIT 2000000) b on a.l_orderkey = -b.l_orderkey;""")
     # There are 3 scan nodes and each appears in the profile 4 times (for 3 fragment
     # instances + the averaged fragment).
-    assert results.runtime_profile.count("HDFS_SCAN_NODE") == 12, results.runtime_profile
+    assert results.runtime_profile.count("HDFS_SCAN_NODE") == 12
     # There are 3 exchange nodes and each appears in the profile 2 times (for 1 fragment
     # instance + the averaged fragment).
-    assert results.runtime_profile.count("EXCHANGE_NODE") == 6, results.runtime_profile
+    assert results.runtime_profile.count("EXCHANGE_NODE") == 6
     # The following appear only in the root fragment which has 1 instance.
-    assert results.runtime_profile.count("HASH_JOIN_NODE") == 2, results.runtime_profile
-    assert results.runtime_profile.count("AGGREGATION_NODE") == 2, results.runtime_profile
-    assert results.runtime_profile.count("PLAN_ROOT_SINK") == 2, results.runtime_profile
+    assert results.runtime_profile.count("HASH_JOIN_NODE") == 2
+    assert results.runtime_profile.count("AGGREGATION_NODE") == 2
+    assert results.runtime_profile.count("PLAN_ROOT_SINK") == 2
 
   # IMPALA-6399: Run this test serially to avoid a delay over the wait time in fetching
   # the profile.