You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@impala.apache.org by wz...@apache.org on 2023/08/08 22:52:12 UTC

[impala] branch master updated: IMPALA-12340: Fix flaky test_two_catalogd_with_force_active

This is an automated email from the ASF dual-hosted git repository.

wzhou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


The following commit(s) were added to refs/heads/master by this push:
     new 3a9b52faf IMPALA-12340: Fix flaky test_two_catalogd_with_force_active
3a9b52faf is described below

commit 3a9b52faf18ab76724aaeb0bc3ea35e379c3c044
Author: wzhou-code <wz...@cloudera.com>
AuthorDate: Sun Aug 6 23:31:36 2023 -0700

    IMPALA-12340: Fix flaky test_two_catalogd_with_force_active
    
    The issue could be reproduced by repeatedly running the test case
    about 50 times in Jenkins or on a local machine.
    The issue was introduced by IMPALA-12304, in which we tried not to
    wake up the thread for the update_catalogd RPC if there is no change
    in the elected active catalogd. Since we cannot hold the mutex when
    calling SendUpdateCatalogdNotification(), the notification to the
    condition variable may not be processed. To fix the issue, revert the
    optimization added in IMPALA-12304, and add a sleep in
    test_two_catalogd_with_force_active.
    
    Testing:
     - Repeatedly ran test_two_catalogd_with_force_active 8000 times on a
       local machine without failure, and ran the test 1000 times in
       Jenkins without failure.
       Repeatedly ran all test cases in test_catalogd_ha.py for 21 hours in
       Jenkins without failure.
     - Passed core test.
    
    Change-Id: Ifc213422a5a3360d07a22046e1f46fdf0be1d2fd
    Reviewed-on: http://gerrit.cloudera.org:8080/20323
    Reviewed-by: Impala Public Jenkins <im...@cloudera.com>
    Tested-by: Impala Public Jenkins <im...@cloudera.com>
---
 be/src/statestore/statestore.cc          | 8 +++-----
 tests/custom_cluster/test_catalogd_ha.py | 2 ++
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/be/src/statestore/statestore.cc b/be/src/statestore/statestore.cc
index 12e545412..f946e54b5 100644
--- a/be/src/statestore/statestore.cc
+++ b/be/src/statestore/statestore.cc
@@ -1244,11 +1244,7 @@ void Statestore::DoSubscriberUpdate(UpdateKind update_kind, int thread_id,
   while (1) {
     {
       unique_lock<mutex> l(*catalog_manager_.GetLock());
-      if (rpc_receivers.empty()) {
-        update_catalod_cv_.Wait(l);
-      } else {
-        update_catalod_cv_.WaitFor(l, timeout_us);
-      }
+      update_catalod_cv_.WaitFor(l, timeout_us);
     }
     SendUpdateCatalogdNotification(&last_active_catalogd_version, rpc_receivers);
   }
@@ -1272,6 +1268,8 @@ void Statestore::SendUpdateCatalogdNotification(int64_t* last_active_catalogd_ve
   bool resend_rpc = false;
   if (active_catalogd_version > *last_active_catalogd_version) {
     // Send notification for the latest elected active catalogd.
+    LOG(INFO) << "Send notification for active catalogd version: "
+              << active_catalogd_version;
     active_catalogd_address_metric_->SetValue(
         TNetworkAddressToString(catalogd_registration.address));
     rpc_receivers.clear();
diff --git a/tests/custom_cluster/test_catalogd_ha.py b/tests/custom_cluster/test_catalogd_ha.py
index dcdcf744a..6cc966e61 100644
--- a/tests/custom_cluster/test_catalogd_ha.py
+++ b/tests/custom_cluster/test_catalogd_ha.py
@@ -367,6 +367,8 @@ class TestCatalogdHA(CustomClusterTestSuite):
     Verify that one and only one catalogd is active."""
     catalogds = self.cluster.catalogds()
     assert(len(catalogds) == 2)
+    sleep_time_s = build_flavor_timeout(2, slow_build_timeout=5)
+    sleep(sleep_time_s)
     catalogd_service_1 = catalogds[0].service
     catalogd_service_2 = catalogds[1].service
     assert(catalogd_service_1.get_metric_value("catalog-server.active-status")