You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by bb...@apache.org on 2020/11/06 21:55:52 UTC

[geode-native] branch develop updated: GEODE-8688: Fix flaky C++ native client integration tests (#686)

This is an automated email from the ASF dual-hosted git repository.

bbender pushed a commit to branch develop
in repository https://gitbox.apache.org/repos/asf/geode-native.git


The following commit(s) were added to refs/heads/develop by this push:
     new 5c7d47d  GEODE-8688: Fix flaky C++ native client integration tests (#686)
5c7d47d is described below

commit 5c7d47d34c2b8a53874ec6f53e66c2290fd0427c
Author: Alberto Gomez <al...@est.tech>
AuthorDate: Fri Nov 6 22:52:10 2020 +0100

    GEODE-8688: Fix flaky C++ native client integration tests (#686)
    
    * GEODE-8688: Fix flaky C++ native client integration tests
    
    The following integration test cases under
    integration/test (new integration tests)
    ar flaky (do not
    fail normally when run locally but fail very often
    when run in CI).
    
    - PartitionRegionOpsTest.getPartitionedRegionWithRedundancyServerGoesDownSingleHop
    - PartitionRegionOpsTest.putPartitionedRegionWithRedundancyServerGoesDownSingleHop
    
    There were two reasons that can make them fail.
    
    One of them is that sometimes the connections to the server have expired
    before the server is restarted and therefore, when traffic is sent
    to the restarted server, no errors are found. To fix this,
    the pool configuration for the test client
    has been changed so that connections do not expire.
    
    The other reason is that sometimes the error in the connection is
    found by the ping thread that is invoking the
    ThinClientPoolDM::sendRequestToEP() method and in this method,
    when the IO error or TIMEOUT error are encountered,
    the endpoint is not removed from the metadata (by means of the
    removeBucketServerLocation method).
    The code has been updated to remove the metadata also in this
    case.
    
    With these two changes, the test cases are not flaky anymore.
---
 .../integration/test/PartitionRegionOpsTest.cpp    |  8 +++---
 cppcache/src/ThinClientPoolDM.cpp                  | 30 +++++++++++-----------
 cppcache/src/ThinClientPoolDM.hpp                  |  2 ++
 3 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/cppcache/integration/test/PartitionRegionOpsTest.cpp b/cppcache/integration/test/PartitionRegionOpsTest.cpp
index 27fea22..29989d7 100644
--- a/cppcache/integration/test/PartitionRegionOpsTest.cpp
+++ b/cppcache/integration/test/PartitionRegionOpsTest.cpp
@@ -74,6 +74,8 @@ std::shared_ptr<Pool> createPool(Cluster& cluster, Cache& cache,
   auto poolFactory = cache.getPoolManager().createFactory();
   cluster.applyLocators(poolFactory);
   poolFactory.setPRSingleHopEnabled(singleHop);
+  poolFactory.setLoadConditioningInterval(std::chrono::milliseconds::zero());
+  poolFactory.setIdleTimeout(std::chrono::milliseconds::zero());
   return poolFactory.create("default");
 }
 
@@ -144,9 +146,9 @@ void verifyMetadataWasRemovedAtFirstError() {
       }
     }
   }
-  ASSERT_TRUE((timeoutErrors == metadataRemovedDueToTimeout) &&
-              (ioErrors == metadataRemovedDueToIoErr) &&
-              (metadataRemovedDueToTimeout != metadataRemovedDueToIoErr));
+  EXPECT_EQ(timeoutErrors, metadataRemovedDueToTimeout);
+  EXPECT_EQ(ioErrors, metadataRemovedDueToIoErr);
+  EXPECT_NE(metadataRemovedDueToTimeout, metadataRemovedDueToIoErr);
 }
 
 void putPartitionedRegionWithRedundancyServerGoesDown(bool singleHop) {
diff --git a/cppcache/src/ThinClientPoolDM.cpp b/cppcache/src/ThinClientPoolDM.cpp
index dcc816b..95c9867 100644
--- a/cppcache/src/ThinClientPoolDM.cpp
+++ b/cppcache/src/ThinClientPoolDM.cpp
@@ -1426,14 +1426,7 @@ GfErrType ThinClientPoolDM::sendSyncRequest(
             GF_SAFE_DELETE_CON(conn);
           }
           excludeServers.insert(ServerLocation(ep->name()));
-          if ((error == GF_IOERR || error == GF_TIMEOUT) &&
-              m_clientMetadataService) {
-            auto sl = std::make_shared<BucketServerLocation>(ep->name());
-            LOGINFO("Removing bucketServerLocation %s due to %s",
-                    sl->toString().c_str(),
-                    (error == GF_IOERR ? "GF_IOERR" : "GF_TIMEOUT"));
-            m_clientMetadataService->removeBucketServerLocation(sl);
-          }
+          removeEPFromMetadataIfError(error, ep);
         }
       } else {
         return error;  // server exception while sending credential message to
@@ -1533,6 +1526,17 @@ GfErrType ThinClientPoolDM::sendSyncRequest(
   return error;
 }
 
+void ThinClientPoolDM::removeEPFromMetadataIfError(const GfErrType& error,
+                                                   const TcrEndpoint* ep) {
+  if ((error == GF_IOERR || error == GF_TIMEOUT) && (m_clientMetadataService)) {
+    auto sl = std::make_shared<BucketServerLocation>(ep->name());
+    LOGINFO("Removing bucketServerLocation %s due to %s",
+            sl->toString().c_str(),
+            (error == GF_IOERR ? "GF_IOERR" : "GF_TIMEOUT"));
+    m_clientMetadataService->removeBucketServerLocation(sl);
+  }
+}
+
 void ThinClientPoolDM::removeEPConnections(int numConn,
                                            bool triggerManageConn) {
   // TODO: Delete EP
@@ -1941,6 +1945,7 @@ GfErrType ThinClientPoolDM::sendRequestToEP(const TcrMessage& request,
       if (putConnInPool) {
         removeEPConnections(1);
       }
+      removeEPFromMetadataIfError(error, currentEndpoint);
     }
 
     if (error == GF_NOERR || error == GF_CACHESERVER_EXCEPTION ||
@@ -2343,13 +2348,8 @@ TcrConnection* ThinClientPoolDM::getConnectionFromQueueW(
                   version);
         }
         return nullptr;
-      } else if ((*error == GF_IOERR || *error == GF_TIMEOUT) &&
-                 m_clientMetadataService) {
-        auto sl = std::make_shared<BucketServerLocation>(theEP->name());
-        LOGINFO("Removing bucketServerLocation %s due to %s",
-                sl->toString().c_str(),
-                (*error == GF_IOERR ? "GF_IOERR" : "GF_TIMEOUT"));
-        m_clientMetadataService->removeBucketServerLocation(sl);
+      } else {
+        removeEPFromMetadataIfError(*error, theEP);
       }
     }
   }
diff --git a/cppcache/src/ThinClientPoolDM.hpp b/cppcache/src/ThinClientPoolDM.hpp
index ffd3d90..1c2d789 100644
--- a/cppcache/src/ThinClientPoolDM.hpp
+++ b/cppcache/src/ThinClientPoolDM.hpp
@@ -312,6 +312,8 @@ class ThinClientPoolDM
   static const char* NC_Ping_Thread;
   static const char* NC_MC_Thread;
   int m_primaryServerQueueSize;
+  void removeEPFromMetadataIfError(const GfErrType& error,
+                                   const TcrEndpoint* ep);
 };
 
 class FunctionExecution : public PooledWork<GfErrType> {