You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by st...@apache.org on 2022/09/09 05:21:41 UTC
[impala] 02/03: IMPALA-7864: (Addendum) Deflake test_replan_limit by postponing catalog fetches

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch branch-4.1.1
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 2aeb6013fa44e53031d82b7e7ca59d771037d60a
Author: stiga-huang <hu...@gmail.com>
AuthorDate: Wed May 18 16:51:37 2022 +0800

    IMPALA-7864: (Addendum) Deflake test_replan_limit by postponing catalog fetches
    
    TestLocalCatalogRetries.test_replan_limit runs REFRESH and SELECT
    queries concurrently on a table, and expects one of the queries to hit
    inconsistent metadata.
    
    This patch increases the chance of inconsistent metadata by injecting
    a latency (500ms) before each catalog fetch. So it's more likely that a
    request is fetching stale metadata. Also bump up the timeout of
    thread.join() so we can try out all the attempts.
    
    Test
     - Run test_replan_limit 1000 times without any error.
     - Run all tests of TestLocalCatalogRetries 100 times without any error.
    
    Change-Id: Ia5bdca7402039f1f24b7bf19595c2541fa32d0ad
    Reviewed-on: http://gerrit.cloudera.org:8080/18537
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
    Reviewed-on: http://gerrit.cloudera.org:8080/18951
    Reviewed-by: Csaba Ringhofer <cs...@cloudera.com>
    Tested-by: Quanlong Huang <hu...@gmail.com>
---
 be/src/exec/catalog-op-executor.cc         |  5 +++++
 tests/custom_cluster/test_local_catalog.py | 12 ++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/be/src/exec/catalog-op-executor.cc b/be/src/exec/catalog-op-executor.cc
index 646e6aa52..c6c245428 100644
--- a/be/src/exec/catalog-op-executor.cc
+++ b/be/src/exec/catalog-op-executor.cc
@@ -55,6 +55,8 @@ DECLARE_int32(catalog_client_connection_num_retries);
 DECLARE_int32(catalog_client_rpc_timeout_ms);
 DECLARE_int32(catalog_client_rpc_retry_interval_ms);
 
+DEFINE_int32_hidden(inject_latency_before_catalog_fetch_ms, 0,
+    "Latency (ms) to be injected before fetching catalog data from the catalogd");
 DEFINE_int32_hidden(inject_latency_after_catalog_fetch_ms, 0,
     "Latency (ms) to be injected after fetching catalog data from the catalogd");
 
@@ -366,6 +368,9 @@ Status CatalogOpExecutor::GetPartialCatalogObject(
   DCHECK(FLAGS_use_local_catalog || TestInfo::is_test());
   const TNetworkAddress& address =
       MakeNetworkAddress(FLAGS_catalog_service_host, FLAGS_catalog_service_port);
+  if (FLAGS_inject_latency_before_catalog_fetch_ms > 0) {
+    SleepForMs(FLAGS_inject_latency_before_catalog_fetch_ms);
+  }
   int attempt = 0; // Used for debug action only.
   CatalogServiceConnection::RpcStatus rpc_status =
       CatalogServiceConnection::DoRpcWithRetry(env_->catalogd_client_cache(), address,
diff --git a/tests/custom_cluster/test_local_catalog.py b/tests/custom_cluster/test_local_catalog.py
index 63b0cbb91..6e74a4de0 100644
--- a/tests/custom_cluster/test_local_catalog.py
+++ b/tests/custom_cluster/test_local_catalog.py
@@ -273,8 +273,9 @@ class TestLocalCatalogRetries(CustomClusterTestSuite):
           q = random.choice(queries)
           attempt += 1
           try:
+            print 'Attempt', attempt, 'client', str(client)
             ret = self.execute_query_unchecked(client, q)
-          except Exception, e:
+          except Exception as e:
             if 'InconsistentMetadataFetchException' in str(e):
               with inconsistent_seen_lock:
                 inconsistent_seen[0] += 1
@@ -287,7 +288,8 @@ class TestLocalCatalogRetries(CustomClusterTestSuite):
         t.start()
       for t in threads:
         # When there are failures, they're observed quickly.
-        t.join(30)
+        # 600s is enough for 200 attempts.
+        t.join(600)
 
       assert failed_queries.empty(),\
           "Failed query count non zero: %s" % list(failed_queries.queue)
@@ -318,7 +320,8 @@ class TestLocalCatalogRetries(CustomClusterTestSuite):
 
   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(
-      impalad_args="--use_local_catalog=true --local_catalog_max_fetch_retries=0",
+      impalad_args="--use_local_catalog=true --local_catalog_max_fetch_retries=0"
+                   " --inject_latency_before_catalog_fetch_ms=500",
       catalogd_args="--catalog_topic_mode=minimal")
   def test_replan_limit(self):
     """
@@ -326,7 +329,8 @@ class TestLocalCatalogRetries(CustomClusterTestSuite):
     an inconsistent metadata exception when running concurrent reads/writes
     is seen. With the max retries set to 0, no retries are expected and with
     the concurrent read/write workload, an inconsistent metadata exception is
-    expected.
+    expected. Setting inject_latency_before_catalog_fetch_ms increases the
+    possibility of a stale request which throws the expected exception.
     """
     queries = [
       'refresh functional.alltypes',