Posted to commits@impala.apache.org by st...@apache.org on 2020/07/10 17:21:52 UTC

[impala] 02/02: IMPALA-9834: De-flake TestQueryRetries on EC builds

This is an automated email from the ASF dual-hosted git repository.

stakiar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 70c2073d02675ffc64b09335e6c3a2744bc6d961
Author: Sahil Takiar <ta...@gmail.com>
AuthorDate: Tue Jun 23 17:22:11 2020 -0700

    IMPALA-9834: De-flake TestQueryRetries on EC builds
    
    This patch skips all tests in TestQueryRetries on EC builds.
    
    The tests in TestQueryRetries run queries that are scheduled on three
    impalad instances during regular builds (HDFS, S3, etc.), but only on
    two instances on EC builds. This causes some non-determinism during
    the tests because killing a random impalad in the mini-cluster won't
    necessarily trigger a query retry.
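
    The skip uses the SkipIfEC marker from tests/common/skip.py. As a rough
    sketch, such a marker can be built on top of pytest.mark.skipif; the
    IS_EC check below is an assumption for illustration, not the marker's
    actual implementation:

        import os
        import pytest

        # Assumed way of detecting an erasure-coded build; the real marker may
        # derive this from build flags rather than an environment variable.
        IS_EC = os.environ.get("ERASURE_CODING", "false") == "true"

        class SkipIfEC:
          # Applied as @SkipIfEC.fix_later to tests that need EC-specific fixes.
          fix_later = pytest.mark.skipif(IS_EC, reason="Fix this test to work with EC")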
    
    It also bumps up the timeout used when waiting for a query to be
    retried, from 60 to 300 seconds.
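
    As a sketch of what that wait looks like: the helper polls the runtime
    profile until its 'Retry Status' field reports RETRIED. The helper and
    parameter names below are illustrative stand-ins; only the 300 second
    default comes from this patch:

        import re
        import time

        def wait_until_retried(get_runtime_profile, handle, timeout=300):
          """Poll the query's runtime profile until 'Retry Status' reports RETRIED,
          or give up after 'timeout' seconds. 'get_runtime_profile' stands in for
          the test framework's profile lookup."""
          deadline = time.time() + timeout
          while time.time() < deadline:
            profile = get_runtime_profile(handle)
            if re.search(r"Retry Status:\s*RETRIED", profile):
              return profile
            time.sleep(0.5)
          raise RuntimeError("Query was not retried within {0} seconds".format(timeout))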
    
    It also improves the assertion in __get_query_id_from_profile so that
    the full profile is dumped when the assertion fails, which should make
    any test failures that hit this assertion easier to debug.
    
    Testing:
    * Ran TestQueryRetries locally
    
    Change-Id: Id5c73c2cbd0ef369175856c41f36d4b0de4b8d71
    Reviewed-on: http://gerrit.cloudera.org:8080/16149
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/custom_cluster/test_query_retries.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/tests/custom_cluster/test_query_retries.py b/tests/custom_cluster/test_query_retries.py
index e17b807..54f2334 100644
--- a/tests/custom_cluster/test_query_retries.py
+++ b/tests/custom_cluster/test_query_retries.py
@@ -32,8 +32,16 @@ from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.errors import Timeout
+from tests.common.skip import SkipIfEC
 
 
+# All tests in this class are marked SkipIfEC because every test runs a query and
+# expects it to be retried after a random impalad is killed. On EC builds this does not
+# always work: queries that run on three impalads for HDFS / S3 builds may only run on
+# two instances on EC builds, because the EC build creates smaller tables than the same
+# data stored on HDFS / S3. If a query runs on only two instances, randomly killing one
+# impalad won't necessarily trigger a retry of the query.
+@SkipIfEC.fix_later
 class TestQueryRetries(CustomClusterTestSuite):
 
   # A query that shuffles a lot of data. Useful when testing query retries since it
@@ -499,7 +507,7 @@ class TestQueryRetries(CustomClusterTestSuite):
     if not retried_query_id_search: return None
     return retried_query_id_search.group(1)
 
-  def __wait_until_retried(self, handle, timeout=60):
+  def __wait_until_retried(self, handle, timeout=300):
     """Wait until the given query handle has been retried. This is achieved by polling the
     runtime profile of the query and checking the 'Retry Status' field."""
     retried_state = "RETRIED"
@@ -529,7 +537,8 @@ class TestQueryRetries(CustomClusterTestSuite):
   def __get_query_id_from_profile(self, profile):
     """Extracts and returns the query id of the given profile."""
     query_id_search = re.search("Query \(id=(.*)\)", profile)
-    assert query_id_search, "Invalid query profile, has no query id"
+    assert query_id_search, "Invalid query profile, has no query id:\n{0}".format(
+        profile)
     return query_id_search.group(1)
 
   def __get_original_query_profile(self, original_query_id):
@@ -603,6 +612,11 @@ class TestQueryRetries(CustomClusterTestSuite):
     assert "Retried Query Id: {0}".format(retried_query_id) \
         in original_runtime_profile, original_runtime_profile
 
+    # Assert that the original query ran on all three nodes. All queries scan tables
+    # large enough such that scan fragments are scheduled on all impalads.
+    assert re.search("PLAN FRAGMENT.*instances=3", original_runtime_profile), \
+        original_runtime_profile
+
   def __validate_web_ui_state(self):
     """Validate the state of the web ui after a query (or queries) have been retried.
     The web ui should list 0 queries as in flight, running, or queued."""
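
The __validate_web_ui_state helper above checks the impalad debug web UI once the
retried queries have finished. Below is a rough sketch of that kind of check against
the /queries?json debug page; the helper name, the use of the requests library, and
the JSON field names are assumptions, not necessarily what the test suite inspects:

    import requests

    def validate_no_inflight_queries(impalad_webui="http://localhost:25000"):
      """Fetch the /queries debug page as JSON and assert that no queries are
      listed as in flight, executing, or waiting."""
      state = requests.get(impalad_webui + "/queries?json").json()
      assert state.get("num_in_flight_queries", 0) == 0, state
      assert state.get("num_executing_queries", 0) == 0, state
      assert state.get("num_waiting_queries", 0) == 0, state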