Posted to commits@impala.apache.org by ta...@apache.org on 2019/11/01 22:33:00 UTC

[impala] 01/02: IMPALA-9073: fix test_executor_concurrency flakiness

This is an automated email from the ASF dual-hosted git repository.

tarmstrong pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit eb8f415f7c93eaac9e712ea245ebd1a8f6338c01
Author: Tim Armstrong <ta...@cloudera.com>
AuthorDate: Thu Oct 31 15:49:25 2019 -0700

    IMPALA-9073: fix test_executor_concurrency flakiness
    
    The test was checking the wrong invariant - the slot
    mechanism only prevents more than that number of queries
    from running on a single backend. More queries can run on
    the cluster as a whole, since a query's backends are freed
    up before the query itself finishes.
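
    For illustration, the invariant that does hold is per-backend
    rather than cluster-wide (hypothetical names, not part of this
    change):

      # Enforced by -admission_control_slots=N on each executor:
      assert all(slots <= N for slots in per_executor_slots_in_use)
      # No such bound exists for the cluster-wide count of running
      # queries, because executor slots are released before the
      # query finishes on the coordinator.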
    
    It was a little tricky picking an appropriate metric
    since there is no strong consistency between the
    metrics, e.g. decrementing a metric after a backend
    finishes may race with admitting the next query.
    
    So I simply used the same metric that the admission
    controller uses when making decisions, which should be
    strongly consistent w.r.t. admission control decisions.
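
    As a rough sketch, that metric can be read per backend from the
    coordinator's /backends?json debug page, using the same json and
    read_debug_webpage calls as the test change below (the helper
    name here is just for illustration, not part of the change):

      import json

      def executor_slots_in_use(impalad_test_service):
        """Return admission_slots_in_use for every executor backend."""
        backends_json = json.loads(
            impalad_test_service.read_debug_webpage('backends?json'))
        return [b['admission_slots_in_use']
                for b in backends_json['backends'] if b['is_executor']]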
    
    Also remove the concurrency limit on the coordinator,
    which seemed inconsistent with the purpose of the
    test: we only want concurrency to be limited by the
    executors.
    
    Testing:
    Looped the test for a bit.
    
    Change-Id: I910028919f248a3bf5de345e9eade9dbc4353ebd
    Reviewed-on: http://gerrit.cloudera.org:8080/14606
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 tests/custom_cluster/test_executor_groups.py | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/tests/custom_cluster/test_executor_groups.py b/tests/custom_cluster/test_executor_groups.py
index da3beee..1bb070a 100644
--- a/tests/custom_cluster/test_executor_groups.py
+++ b/tests/custom_cluster/test_executor_groups.py
@@ -20,6 +20,7 @@
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.util.concurrent_workload import ConcurrentWorkload
 
+import json
 import logging
 import pytest
 from time import sleep
@@ -261,7 +262,6 @@ class TestExecutorGroups(CustomClusterTestSuite):
     client.cancel(q2)
 
   @pytest.mark.execute_serially
-  @CustomClusterTestSuite.with_args(impalad_args="-admission_control_slots=3")
   def test_executor_concurrency(self):
     """Tests that the command line flag to limit query concurrency on executors works as
     expected."""
@@ -282,16 +282,23 @@ class TestExecutorGroups(CustomClusterTestSuite):
                   for _ in range(RAMP_UP_TIMEOUT_S)), \
           "Did not admit enough queries within %s s" % RAMP_UP_TIMEOUT_S
 
-      # Sample the number of running queries for while
-      NUM_RUNNING_SAMPLES = 30
-      num_running = []
-      for _ in xrange(NUM_RUNNING_SAMPLES):
-        num_running.append(self._get_num_running_queries())
+      # Sample the number of admitted queries on each backend for a while.
+      # Note that the total number of queries in the cluster can be higher
+      # than 3 because resources may be released on some backends, allowing
+      # a new query to fit (see IMPALA-9073).
+      NUM_SAMPLES = 30
+      executor_slots_in_use = []
+      for _ in xrange(NUM_SAMPLES):
+        backends_json = json.loads(
+            self.impalad_test_service.read_debug_webpage('backends?json'))
+        for backend in backends_json['backends']:
+          if backend['is_executor']:
+            executor_slots_in_use.append(backend['admission_slots_in_use'])
         sleep(1)
 
       # Must reach 3 but not exceed it
-      assert max(num_running) == 3, \
-          "Unexpected number of running queries: %s" % num_running
+      assert max(executor_slots_in_use) == 3, \
+          "Unexpected number of slots in use: %s" % executor_slots_in_use
 
     finally:
       LOG.info("Stopping workload")