You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by sa...@apache.org on 2018/02/07 21:23:14 UTC
[3/4] impala git commit: Revert 'IMPALA-6338: Fix flaky
test_profile_fragment_instances'
Revert 'IMPALA-6338: Fix flaky test_profile_fragment_instances'
There have been several crashes observed in testing recently, and its
not clear what's going on, so for now revert this moderately risky
change.
Change-Id: I48c11f0817c5190a3a94f8260f3e8ef653357ab3
Reviewed-on: http://gerrit.cloudera.org:8080/9243
Reviewed-by: Alex Behm <al...@cloudera.com>
Tested-by: Thomas Tauber-Marshall <tm...@cloudera.com>
Project: http://git-wip-us.apache.org/repos/asf/impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/0a9f0369
Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/0a9f0369
Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/0a9f0369
Branch: refs/heads/master
Commit: 0a9f0369659b0996ac9db3a2e9ef877941d8b9eb
Parents: 2890f30
Author: Thomas Tauber-Marshall <tm...@cloudera.com>
Authored: Wed Feb 7 11:12:47 2018 -0800
Committer: Thomas Tauber-Marshall <tm...@cloudera.com>
Committed: Wed Feb 7 20:06:52 2018 +0000
----------------------------------------------------------------------
be/src/common/status.h | 6 ------
be/src/runtime/coordinator-backend-state.cc | 12 ++++++------
be/src/runtime/coordinator-backend-state.h | 6 ++----
be/src/runtime/coordinator.cc | 10 ++++------
be/src/runtime/coordinator.h | 4 +---
tests/query_test/test_observability.py | 10 +++++-----
6 files changed, 18 insertions(+), 30 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/be/src/common/status.h
----------------------------------------------------------------------
diff --git a/be/src/common/status.h b/be/src/common/status.h
index f0f91f7..24dba8b 100644
--- a/be/src/common/status.h
+++ b/be/src/common/status.h
@@ -95,13 +95,7 @@ class NODISCARD Status {
static Status MemLimitExceeded();
static Status MemLimitExceeded(const std::string& details);
- /// Indicates a 'cancelled' status. CANCELLED should not be reported by a fragment
- /// instance that encounters a problem - instances should return a specific error,
- /// and then the coordinator will initiate cancellation.
- /// TODO: we use this in some places to indicate things other than query cancellation,
- /// which can be confusing.
static const Status CANCELLED;
-
static const Status DEPRECATED_RPC;
/// Copy c'tor makes copy of error detail so Status can be returned by value.
http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/be/src/runtime/coordinator-backend-state.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator-backend-state.cc b/be/src/runtime/coordinator-backend-state.cc
index b238cad..914a3e4 100644
--- a/be/src/runtime/coordinator-backend-state.cc
+++ b/be/src/runtime/coordinator-backend-state.cc
@@ -234,7 +234,7 @@ void Coordinator::BackendState::LogFirstInProgress(
}
inline bool Coordinator::BackendState::IsDone() const {
- return num_remaining_instances_ == 0 || (!status_.ok() && !status_.IsCancelled());
+ return num_remaining_instances_ == 0 || !status_.ok();
}
bool Coordinator::BackendState::ApplyExecStatusReport(
@@ -338,8 +338,8 @@ bool Coordinator::BackendState::Cancel() {
// Nothing to cancel if the exec rpc was not sent
if (!rpc_sent_) return false;
- // don't cancel if it already finished (for any reason) or cancelled
- if (IsDone() || status_.IsCancelled()) return false;
+ // don't cancel if it already finished (for any reason)
+ if (IsDone()) return false;
/// If the status is not OK, we still try to cancel - !OK status might mean
/// communication failure between backend and coordinator, but fragment
@@ -391,10 +391,10 @@ bool Coordinator::BackendState::Cancel() {
void Coordinator::BackendState::PublishFilter(const TPublishFilterParams& rpc_params) {
DCHECK_EQ(rpc_params.dst_query_id, query_id_);
{
- // If the backend is already done or cancelled, it's not waiting for this filter, so
- // we skip sending it in this case.
+ // If the backend is already done, it's not waiting for this filter, so we skip
+ // sending it in this case.
lock_guard<mutex> l(lock_);
- if (IsDone() || status_.IsCancelled()) return;
+ if (IsDone()) return;
}
if (fragments_.count(rpc_params.dst_fragment_idx) == 0) return;
http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/be/src/runtime/coordinator-backend-state.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator-backend-state.h b/be/src/runtime/coordinator-backend-state.h
index 860b968..0973ca3 100644
--- a/be/src/runtime/coordinator-backend-state.h
+++ b/be/src/runtime/coordinator-backend-state.h
@@ -219,7 +219,7 @@ class Coordinator::BackendState {
/// If the status indicates an error status, execution has either been aborted by the
/// executing impalad (which then reported the error) or cancellation has been
- /// initiated by the coordinator.
+ /// initiated; either way, execution must not be cancelled.
Status status_;
/// Used to distinguish between errors reported by a specific fragment instance,
@@ -254,9 +254,7 @@ class Coordinator::BackendState {
const FilterRoutingTable& filter_routing_table,
TExecQueryFInstancesParams* rpc_params);
- /// Return true if execution at this backend is done. The backend is considered done if
- /// either all instances have completed, or an error (other than cancel) is encountered.
- /// Caller must hold lock_.
+ /// Return true if execution at this backend is done. Caller must hold lock_.
bool IsDone() const;
};
http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/be/src/runtime/coordinator.cc
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.cc b/be/src/runtime/coordinator.cc
index 05ecf9f..7973775 100644
--- a/be/src/runtime/coordinator.cc
+++ b/be/src/runtime/coordinator.cc
@@ -867,9 +867,6 @@ Status Coordinator::GetNext(QueryResultSet* results, int max_rows, bool* eos) {
ReleaseExecResources();
// wait for all backends to complete before computing the summary
// TODO: relocate this so GetNext() won't have to wait for backends to complete?
- // Note that doing this here allows us to ensure that a query that completes
- // successfully will have a full runtime profile by the time that Fetch() indicates
- // all of the results have been returned.
RETURN_IF_ERROR(WaitForBackendCompletion());
// Release admission control resources after backends are finished.
ReleaseAdmissionControlResources();
@@ -923,8 +920,10 @@ Status Coordinator::UpdateBackendExecStatus(const TReportExecStatusParams& param
Substitute("Unknown backend index $0 (max known: $1)",
params.coord_state_idx, backend_states_.size() - 1));
}
- // If the query was cancelled, don't process the update.
- if (query_status_.IsCancelled()) return Status::OK();
+ BackendState* backend_state = backend_states_[params.coord_state_idx];
+ // TODO: return here if returned_all_results_?
+ // TODO: return CANCELLED in that case? Although that makes the cancellation propagation
+ // path more irregular.
// TODO: only do this when the sink is done; probably missing a done field
// in TReportExecStatus for that
@@ -932,7 +931,6 @@ Status Coordinator::UpdateBackendExecStatus(const TReportExecStatusParams& param
UpdateInsertExecStatus(params.insert_exec_status);
}
- BackendState* backend_state = backend_states_[params.coord_state_idx];
if (backend_state->ApplyExecStatusReport(params, &exec_summary_, &progress_)) {
// This report made this backend done, so update the status and
// num_remaining_backends_.
http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/be/src/runtime/coordinator.h
----------------------------------------------------------------------
diff --git a/be/src/runtime/coordinator.h b/be/src/runtime/coordinator.h
index fbbdfa9..d630b9a 100644
--- a/be/src/runtime/coordinator.h
+++ b/be/src/runtime/coordinator.h
@@ -290,9 +290,7 @@ class Coordinator { // NOLINT: The member variables could be re-ordered to save
boost::mutex lock_;
/// Overall status of the entire query; set to the first reported fragment error
- /// status or to CANCELLED, if Cancel() is called. Note that some fragments may have
- /// status CANCELLED even if this is not CANCELLED if cancellation is initiated because
- /// returned_all_results_ is true or an error is encountered.
+ /// status or to CANCELLED, if Cancel() is called.
Status query_status_;
/// If true, the query is done returning all results. It is possible that the
http://git-wip-us.apache.org/repos/asf/impala/blob/0a9f0369/tests/query_test/test_observability.py
----------------------------------------------------------------------
diff --git a/tests/query_test/test_observability.py b/tests/query_test/test_observability.py
index 960a6f4..e838081 100644
--- a/tests/query_test/test_observability.py
+++ b/tests/query_test/test_observability.py
@@ -124,14 +124,14 @@ class TestObservability(ImpalaTestSuite):
join (select * from l LIMIT 2000000) b on a.l_orderkey = -b.l_orderkey;""")
# There are 3 scan nodes and each appears in the profile 4 times (for 3 fragment
# instances + the averaged fragment).
- assert results.runtime_profile.count("HDFS_SCAN_NODE") == 12, results.runtime_profile
+ assert results.runtime_profile.count("HDFS_SCAN_NODE") == 12
# There are 3 exchange nodes and each appears in the profile 2 times (for 1 fragment
# instance + the averaged fragment).
- assert results.runtime_profile.count("EXCHANGE_NODE") == 6, results.runtime_profile
+ assert results.runtime_profile.count("EXCHANGE_NODE") == 6
# The following appear only in the root fragment which has 1 instance.
- assert results.runtime_profile.count("HASH_JOIN_NODE") == 2, results.runtime_profile
- assert results.runtime_profile.count("AGGREGATION_NODE") == 2, results.runtime_profile
- assert results.runtime_profile.count("PLAN_ROOT_SINK") == 2, results.runtime_profile
+ assert results.runtime_profile.count("HASH_JOIN_NODE") == 2
+ assert results.runtime_profile.count("AGGREGATION_NODE") == 2
+ assert results.runtime_profile.count("PLAN_ROOT_SINK") == 2
# IMPALA-6399: Run this test serially to avoid a delay over the wait time in fetching
# the profile.