You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by ta...@apache.org on 2019/08/16 16:03:25 UTC
[impala] 01/04: IMPALA-8867: Deflake test_auto_scaling, improve logging

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit cebed8b88a3ce928dff694e79f04db8c83ba0970
Author: Lars Volker <lv...@cloudera.com>
AuthorDate: Thu Aug 15 11:02:13 2019 -0700

    IMPALA-8867: Deflake test_auto_scaling, improve logging
    
    test_auto_scaling sometimes failed to reach the required query rate
    within the expected time. This change doubles the time we wait for the
    query rate to reach the required threshold. It also logs the current
    query rate on every poll and reports the maximum query rate observed in
    the assertion message, to help with debugging in case the issue still
    occurs.
    
    Testing: I looped this for a day on my desktop box and did not observe
    any issues.
    
    Change-Id: I22c43618a40ff197784add69223359e23fa1bdec
    Reviewed-on: http://gerrit.cloudera.org:8080/14074
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/custom_cluster/test_auto_scaling.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/tests/custom_cluster/test_auto_scaling.py b/tests/custom_cluster/test_auto_scaling.py
index 65149bd..5662575 100644
--- a/tests/custom_cluster/test_auto_scaling.py
+++ b/tests/custom_cluster/test_auto_scaling.py
@@ -99,12 +99,23 @@ class TestAutoScaling(CustomClusterTestSuite):
       assert self.impalad_test_service.get_metric_value(
         "cluster-membership.executor-groups.total-healthy") >= 2
 
-      # Wait for query rate to surpass the maximum for a single executor group plus 20%
+      # Wait for query rate to reach the maximum for a single executor group plus 20%
       min_query_rate = 1.2 * EXECUTOR_SLOTS
-      assert any(workload.get_query_rate() > min_query_rate or sleep(1)
-                 for _ in range(self.STATE_CHANGE_TIMEOUT_S)), \
-                     "Query rate did not surpass %s within %s s" % (
-                     cluster_size, self.STATE_CHANGE_TIMEOUT_S)
+      max_query_rate = 0
+      # This barrier has been flaky in the past so we wait 2x as long as for the other
+      # checks.
+      end = time() + 2 * self.STATE_CHANGE_TIMEOUT_S
+      while time() < end:
+        current_rate = workload.get_query_rate()
+        LOG.info("Current rate: %s" % current_rate)
+        max_query_rate = max(max_query_rate, current_rate)
+        if max_query_rate >= min_query_rate:
+          break
+        sleep(1)
+
+      assert max_query_rate >= min_query_rate, "Query rate did not reach %s within %s " \
+          "s. Maximum was %s. Cluster size is %s." % (min_query_rate,
+          self.STATE_CHANGE_TIMEOUT_S, max_query_rate, cluster_size)
 
       LOG.info("Stopping workload")
       workload.stop()