You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@geode.apache.org by on...@apache.org on 2021/06/02 22:01:19 UTC

[geode] 01/05: GEODE-7861: Improve error reporting in GMSJoinLeave.join() (#5839)

This is an automated email from the ASF dual-hosted git repository.

onichols pushed a commit to branch support/1.13
in repository https://gitbox.apache.org/repos/asf/geode.git

commit 53d32c325c808d6e965a45cdf36aca1a71db2183
Author: Kamilla Aslami <ka...@vmware.com>
AuthorDate: Fri Jan 8 14:57:02 2021 -0800

    GEODE-7861: Improve error reporting in GMSJoinLeave.join() (#5839)
    
    * GEODE-7861: Improve error reporting in GMSJoinLeave.join()
    
    * Fix LocatorDUnitTest.testNoLocator
    
    * Changes after the code review
    
    * Fix typo
    
    (cherry picked from commit 089c1ba7e20606f8201a4cd8f7221f6adc60ba5c)
---
 .../apache/geode/distributed/LocatorDUnitTest.java |   3 +-
 .../gms/membership/GMSJoinLeaveJUnitTest.java      | 100 ++++++++++++++++-----
 .../internal/membership/gms/GMSMembership.java     |   7 +-
 .../membership/gms/interfaces/JoinLeave.java       |  12 ++-
 .../membership/gms/membership/GMSJoinLeave.java    |  76 ++++++++++------
 5 files changed, 143 insertions(+), 55 deletions(-)

diff --git a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
index d3c1733..e0f8966 100644
--- a/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
+++ b/geode-core/src/distributedTest/java/org/apache/geode/distributed/LocatorDUnitTest.java
@@ -1002,7 +1002,8 @@ public class LocatorDUnitTest implements Serializable {
 
     } catch (GemFireConfigException ex) {
       String s = ex.getMessage();
-      assertThat(s.contains("Locator does not exist")).isTrue();
+      assertThat(s.contains("Could not contact any of the locators"))
+          .isTrue();
     }
   }
 
diff --git a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
index 3a86a1e..fd15db6 100644
--- a/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
+++ b/geode-membership/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeaveJUnitTest.java
@@ -22,12 +22,14 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
+import static org.mockito.ArgumentMatchers.any;
 import static org.mockito.ArgumentMatchers.anyBoolean;
 import static org.mockito.ArgumentMatchers.anyInt;
 import static org.mockito.ArgumentMatchers.eq;
 import static org.mockito.Matchers.isA;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.timeout;
 import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
@@ -115,11 +117,18 @@ public class GMSJoinLeaveJUnitTest {
 
   public void initMocks(boolean enableNetworkPartition, boolean useTestGMSJoinLeave)
       throws Exception {
+    String locator = "localhost[12345]";
+    initMocks(enableNetworkPartition, useTestGMSJoinLeave, locator, locator);
+  }
+
+  public void initMocks(boolean enableNetworkPartition, boolean useTestGMSJoinLeave,
+      String locators, String startLocator)
+      throws Exception {
     mockConfig = mock(MembershipConfig.class);
     when(mockConfig.isNetworkPartitionDetectionEnabled()).thenReturn(enableNetworkPartition);
     when(mockConfig.getSecurityUDPDHAlgo()).thenReturn("");
-    when(mockConfig.getStartLocator()).thenReturn("localhost[12345]");
-    when(mockConfig.getLocators()).thenReturn("localhost[12345]");
+    when(mockConfig.getStartLocator()).thenReturn(startLocator);
+    when(mockConfig.getLocators()).thenReturn(locators);
     when(mockConfig.getMcastPort()).thenReturn(0);
     when(mockConfig.getMemberTimeout()).thenReturn(2000L);
 
@@ -1423,14 +1432,7 @@ public class GMSJoinLeaveJUnitTest {
   @Test
   public void testCoordinatorFindRequestSuccess() throws Exception {
     initMocks(false);
-    HashSet<MemberIdentifier> registrants = new HashSet<>();
-    registrants.add(mockMembers[0]);
-    FindCoordinatorResponse fcr = new FindCoordinatorResponse(mockMembers[0], mockMembers[0], false,
-        null, registrants, false, true, null);
-
-    when(locatorClient.requestToServer(isA(HostAndPort.class),
-        isA(FindCoordinatorRequest.class), anyInt(), anyBoolean()))
-            .thenReturn(fcr);
+    mockRequestToServer(isA(HostAndPort.class));
 
     boolean foundCoordinator = gmsJoinLeave.findCoordinator();
     assertTrue(gmsJoinLeave.searchState.toString(), foundCoordinator);
@@ -1441,24 +1443,82 @@ public class GMSJoinLeaveJUnitTest {
   public void testCoordinatorFindRequestFailure() throws Exception {
     try {
       initMocks(false);
-      HashSet<MemberIdentifier> registrants = new HashSet<>();
-      registrants.add(mockMembers[0]);
-      FindCoordinatorResponse fcr = new FindCoordinatorResponse(mockMembers[0], mockMembers[0],
-          false, null, registrants, false, true, null);
+      mockRequestToServer(eq(new HostAndPort("localhost", 12346)));
       GMSMembershipView view = createView();
       JoinResponseMessage jrm = new JoinResponseMessage(mockMembers[0], view, 0);
       gmsJoinLeave.setJoinResponseMessage(jrm);
 
-      when(locatorClient.requestToServer(eq(new HostAndPort("localhost", 12346)),
-          isA(FindCoordinatorRequest.class), anyInt(), anyBoolean()))
-              .thenReturn(fcr);
-
-      assertFalse("Should not be able to join ", gmsJoinLeave.join());
+      assertThatThrownBy(gmsJoinLeave::join)
+          .isInstanceOf(MembershipConfigurationException.class);
     } finally {
-
     }
   }
 
+  @Test
+  public void testJoinFailureWhenSleepInterrupted() throws Exception {
+    initMocks(false);
+    mockRequestToServer(isA(HostAndPort.class));
+
+    when(mockConfig.getMemberTimeout()).thenReturn(100L);
+    when(mockConfig.getJoinTimeout()).thenReturn(1000L);
+
+    GMSJoinLeave spyGmsJoinLeave = spy(gmsJoinLeave);
+    when(spyGmsJoinLeave.pauseIfThereIsNoCoordinator(-1, GMSJoinLeave.JOIN_RETRY_SLEEP))
+        .thenThrow(new InterruptedException());
+
+    assertThatThrownBy(spyGmsJoinLeave::join)
+        .isInstanceOf(MembershipConfigurationException.class)
+        .hasMessageContaining("Retry sleep interrupted");
+  }
+
+  @Test
+  public void testJoinFailureWhenTimeout() throws Exception {
+    initMocks(false);
+    mockRequestToServer(isA(HostAndPort.class));
+
+    assertThatThrownBy(() -> gmsJoinLeave.join())
+        .isInstanceOf(MembershipConfigurationException.class)
+        .hasMessageContaining("Operation timed out");
+  }
+
+  @Test
+  public void testPauseIfThereIsNoCoordinator() throws InterruptedException {
+    locatorClient = mock(TcpClient.class);
+    gmsJoinLeave = new GMSJoinLeave(locatorClient);
+    assertThat(gmsJoinLeave.pauseIfThereIsNoCoordinator(-1, GMSJoinLeave.JOIN_RETRY_SLEEP))
+        .isFalse();
+    assertThat(gmsJoinLeave.pauseIfThereIsNoCoordinator(1, GMSJoinLeave.JOIN_RETRY_SLEEP)).isTrue();
+  }
+
+  @Test
+  public void testJoinFailureWhenNoLocator() throws Exception {
+    final String locator1 = "locator1[12345]";
+    final String locator2 = "locator2[54321]";
+    locatorClient = mock(TcpClient.class);
+
+    initMocks(false, false, locator1 + ',' + locator2, locator1);
+    when(locatorClient.requestToServer(any(), any(), anyInt(), anyBoolean()))
+        .thenThrow(IOException.class);
+
+    assertThatThrownBy(gmsJoinLeave::join)
+        .isInstanceOf(MembershipConfigurationException.class)
+        .hasMessageContaining(
+            "Could not contact any of the locators: [HostAndPort[locator1:12345], HostAndPort[locator2:54321]]")
+        .hasCauseInstanceOf(IOException.class);
+  }
+
+  private void mockRequestToServer(HostAndPort hostAndPort)
+      throws IOException, ClassNotFoundException {
+    HashSet<MemberIdentifier> registrants = new HashSet<>();
+    registrants.add(mockMembers[0]);
+
+    FindCoordinatorResponse fcr = new FindCoordinatorResponse(mockMembers[0], mockMembers[0], false,
+        null, registrants, false, true, null);
+    when(locatorClient.requestToServer(hostAndPort,
+        isA(FindCoordinatorRequest.class), anyInt(), anyBoolean()))
+            .thenReturn(fcr);
+  }
+
   private void waitForViewAndFinalCheckInProgress(int viewId) throws InterruptedException {
     // wait for the view processing thread to collect and process the requests
     int sleeps = 0;
diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
index 1986931..434093c 100644
--- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
+++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/GMSMembership.java
@@ -571,12 +571,7 @@ public class GMSMembership<ID extends MemberIdentifier> implements Membership<ID
         this.isJoining = true; // added for bug #44373
 
         // connect
-        boolean ok = services.getJoinLeave().join();
-
-        if (!ok) {
-          throw new MembershipConfigurationException("Unable to join the distributed system.  "
-              + "Operation either timed out, was stopped or Locator does not exist.");
-        }
+        services.getJoinLeave().join();
 
         MembershipView<ID> initialView = createGeodeView(services.getJoinLeave().getView());
         latestView = new MembershipView<>(initialView, initialView.getViewId());
diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java
index 7228162..1880a26 100755
--- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java
+++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/interfaces/JoinLeave.java
@@ -16,6 +16,7 @@ package org.apache.geode.distributed.internal.membership.gms.interfaces;
 
 import org.apache.geode.distributed.internal.membership.api.MemberIdentifier;
 import org.apache.geode.distributed.internal.membership.api.MemberStartupException;
+import org.apache.geode.distributed.internal.membership.api.MembershipConfigurationException;
 import org.apache.geode.distributed.internal.membership.gms.GMSMembershipView;
 
 /**
@@ -26,10 +27,15 @@ import org.apache.geode.distributed.internal.membership.gms.GMSMembershipView;
 public interface JoinLeave<ID extends MemberIdentifier> extends Service<ID> {
 
   /**
-   * joins the distributed system and returns true if successful, false if not. Throws
-   * MemberStartupException and MemberConfigurationException
+   * joins the distributed system.
+   *
+   * @throws MemberStartupException if there was a problem joining the cluster after membership
+   *         configuration has
+   *         completed.
+   * @throws MembershipConfigurationException if operation either timed out, was stopped or locator
+   *         does not exist.
    */
-  boolean join() throws MemberStartupException;
+  void join() throws MemberStartupException;
 
   /**
    * leaves the distributed system. Should be invoked before stop()
diff --git a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
index 208cfb5..87587d5 100644
--- a/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
+++ b/geode-membership/src/main/java/org/apache/geode/distributed/internal/membership/gms/membership/GMSJoinLeave.java
@@ -273,11 +273,13 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID>
     int lastFindCoordinatorInViewId = -1000;
     final Set<FindCoordinatorResponse<ID>> responses = new HashSet<>();
     public int responsesExpected;
+    Exception lastLocatorException;
 
     void cleanup() {
       alreadyTried.clear();
       possibleCoordinator = null;
       view = null;
+      lastLocatorException = null;
       synchronized (responses) {
         responses.clear();
       }
@@ -315,14 +317,14 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID>
    * @return true if successful, false if not
    */
   @Override
-  public boolean join() throws MemberStartupException {
+  public void join() throws MemberStartupException {
 
     try {
       if (Boolean.getBoolean(BYPASS_DISCOVERY_PROPERTY)) {
         synchronized (viewInstallationLock) {
           becomeCoordinator();
         }
-        return true;
+        return;
       }
 
       SearchState<ID> state = searchState;
@@ -355,11 +357,11 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID>
               synchronized (viewInstallationLock) {
                 becomeCoordinator();
               }
-              return true;
+              return;
             }
           } else {
             if (attemptToJoin()) {
-              return true;
+              return;
             }
             if (this.isStopping) {
               break;
@@ -383,40 +385,45 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID>
             break;
           }
         }
-        try {
-          if (found && !state.hasContactedAJoinedLocator) {
-            // if locators are restarting they may be handing out IDs from a stale view that
-            // we should go through quickly. Otherwise we should sleep a bit to let failure
-            // detection select a new coordinator
-            if (state.possibleCoordinator.getVmViewId() < 0) {
-              logger.debug("sleeping for {} before making another attempt to find the coordinator",
-                  retrySleep);
-              Thread.sleep(retrySleep);
-            } else {
+        if (found && !state.hasContactedAJoinedLocator) {
+          try {
+            if (pauseIfThereIsNoCoordinator(state.possibleCoordinator.getVmViewId(), retrySleep)) {
               // since we were given a coordinator that couldn't be used we should keep trying
               tries = 0;
               giveupTime = System.currentTimeMillis() + timeout;
             }
+          } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new MembershipConfigurationException(
+                "Retry sleep interrupted. Giving up on joining the distributed system.");
           }
-        } catch (InterruptedException e) {
-          logger.debug("retry sleep interrupted - giving up on joining the distributed system");
-          return false;
         }
       } // for
 
       if (!this.isJoined) {
         logger.debug("giving up attempting to join the distributed system after "
             + (System.currentTimeMillis() - startTime) + "ms");
-      }
 
-      // to preserve old behavior we need to throw a MemberStartupException if
-      // unable to contact any of the locators
-      if (!this.isJoined && state.hasContactedAJoinedLocator) {
-        throw new MemberStartupException("Unable to join the distributed system in "
-            + (System.currentTimeMillis() - startTime) + "ms");
-      }
+        // to preserve old behavior we need to throw a MemberStartupException if
+        // unable to contact any of the locators
+        if (state.hasContactedAJoinedLocator) {
+          throw new MemberStartupException("Unable to join the distributed system in "
+              + (System.currentTimeMillis() - startTime) + "ms");
+        }
 
-      return this.isJoined;
+        if (state.locatorsContacted == 0) {
+          throw new MembershipConfigurationException(
+              "Unable to join the distributed system. Could not contact any of the locators: "
+                  + locators,
+              state.lastLocatorException);
+        }
+
+        if (System.currentTimeMillis() > giveupTime) {
+          throw new MembershipConfigurationException(
+              "Unable to join the distributed system. Operation timed out");
+        }
+      }
+      return;
     } finally {
       // notify anyone waiting on the address to be completed
       if (this.isJoined) {
@@ -428,6 +435,24 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID>
     }
   }
 
+  boolean pauseIfThereIsNoCoordinator(int viewId, long retrySleep)
+      throws InterruptedException {
+    // if locators are restarting they may be handing out IDs from a stale view that
+    // we should go through quickly. Otherwise we should sleep a bit to let failure
+    // detection select a new coordinator
+    if (viewId < 0) {
+      // the process hasn't finished joining the cluster.
+      logger.debug("sleeping for {} before making another attempt to find the coordinator",
+          retrySleep);
+      Thread.sleep(retrySleep);
+    } else {
+      // the member has joined the cluster.
+      return true;
+    }
+
+    return false;
+  }
+
   /**
    * send a join request and wait for a reply. Process the reply. This may throw a
    * MemberStartupException or an exception from the authenticator, if present.
@@ -1199,6 +1224,7 @@ public class GMSJoinLeave<ID extends MemberIdentifier> implements JoinLeave<ID>
         } catch (IOException | ClassNotFoundException problem) {
           logger.info("Unable to contact locator " + laddr + ": " + problem);
           logger.debug("Exception thrown when contacting a locator", problem);
+          state.lastLocatorException = problem;
           if (state.locatorsContacted == 0 && System.currentTimeMillis() < giveUpTime) {
             try {
               Thread.sleep(FIND_LOCATOR_RETRY_SLEEP);