You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by jm...@apache.org on 2021/08/27 19:51:15 UTC

[cassandra] branch cassandra-4.0 updated: Tolerate missing DNS entry when completing host replacement

This is an automated email from the ASF dual-hosted git repository.

jmckenzie pushed a commit to branch cassandra-4.0
in repository https://gitbox.apache.org/repos/asf/cassandra.git


The following commit(s) were added to refs/heads/cassandra-4.0 by this push:
     new f59411f  Tolerate missing DNS entry when completing host replacement
f59411f is described below

commit f59411f1c985043850154971e9c4066013f355cb
Author: Chris Lohfink <cl...@apple.com>
AuthorDate: Fri Aug 13 11:07:34 2021 -0400

    Tolerate missing DNS entry when completing host replacement
    
    patch by Chris Lohfink; reviewed by Brandon Williams for CASSANDRA-16873
    
    Co-authored by Chris Lohfink <cl...@apple.com>
    Co-authored by Josh McKenzie <jm...@apache.org>
---
 CHANGES.txt                                        |  1 +
 .../apache/cassandra/service/StorageService.java   | 37 ++++++++++++++++++++--
 .../service/StorageServiceServerTest.java          | 34 +++++++++++++++++++-
 3 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index b9d4aa1..4236e0c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 4.0.1
+ * Tolerate missing DNS entry when completing a host replacement (CASSANDRA-16873)
  * Harden PrunableArrayQueue against Pruner implementations that might throw exceptions (CASSANDRA-16866)
  * Move RepairedDataInfo to the execution controller rather than the ReadCommand to avoid unintended sharing (CASSANDRA-16721)
  * Bump zstd-jni version to 1.5.0-4 (CASSANDRA-16884)
diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java
index 9e2d020..6c72682 100644
--- a/src/java/org/apache/cassandra/service/StorageService.java
+++ b/src/java/org/apache/cassandra/service/StorageService.java
@@ -2719,6 +2719,38 @@ public class StorageService extends NotificationBroadcasterSupport implements IE
             SystemKeyspace.updateTokens(endpoint, tokensToUpdateInSystemKeyspace);
     }
 
+    @VisibleForTesting
+    public boolean isReplacingSameHostAddressAndHostId(UUID hostId)
+    {
+        try
+        {
+            return isReplacingSameAddress() &&
+                    Gossiper.instance.getEndpointStateForEndpoint(DatabaseDescriptor.getReplaceAddress()) != null
+                    && hostId.equals(Gossiper.instance.getHostId(DatabaseDescriptor.getReplaceAddress()));
+        }
+        catch (RuntimeException ex)
+        {
+            // If a host is decommissioned and the DNS entry is removed before the
+            // bootstrap completes, when it completes and advertises NORMAL state to other nodes, they will be unable
+            // to resolve it to an InetAddress unless it happens to be cached. This could happen on nodes
+            // storing large amounts of data or with long index rebuild times or if new instances have been added
+            // to the cluster through expansion or additional host replacement.
+            //
+            // The original host replacement must have been able to resolve the replacing address on startup
+            // when setting StorageService.replacing, so if it is impossible to resolve now it is probably
+            // decommissioned and did not have the same IP address or host id.  Allow the handleStateNormal
+            // handling to proceed, otherwise gossip state will be inconsistent with some nodes believing the
+            // replacement host to be normal, and nodes unable to resolve the hostname will be left in JOINING.
+            if (ex.getCause() != null && ex.getCause().getClass() == UnknownHostException.class) // exact class match on the direct cause only
+            {
+                logger.info("Suppressed exception while checking isReplacingSameHostAddressAndHostId({}). Original host was probably decommissioned. ({})",
+                        hostId, ex.getMessage());
+                return false;
+            }
+            throw ex; // otherwise rethrow
+        }
+    }
+
     /**
      * Handle node move to normal state. That is, node is entering token ring and participating
      * in reads.
@@ -2764,9 +2796,10 @@ public class StorageService extends NotificationBroadcasterSupport implements IE
         // Order Matters, TM.updateHostID() should be called before TM.updateNormalToken(), (see CASSANDRA-4300).
         UUID hostId = Gossiper.instance.getHostId(endpoint);
         InetAddressAndPort existing = tokenMetadata.getEndpointForHostId(hostId);
-        if (replacing && isReplacingSameAddress() && Gossiper.instance.getEndpointStateForEndpoint(DatabaseDescriptor.getReplaceAddress()) != null
-            && (hostId.equals(Gossiper.instance.getHostId(DatabaseDescriptor.getReplaceAddress()))))
+        if (replacing && isReplacingSameHostAddressAndHostId(hostId))
+        {
             logger.warn("Not updating token metadata for {} because I am replacing it", endpoint);
+        }
         else
         {
             if (existing != null && !existing.equals(endpoint))
diff --git a/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java b/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java
index c0071b4..9dda9f9 100644
--- a/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java
+++ b/test/unit/org/apache/cassandra/service/StorageServiceServerTest.java
@@ -24,11 +24,13 @@ import java.io.FileWriter;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.net.InetAddress;
+import java.net.UnknownHostException;
 import java.util.*;
 
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Multimap;
 
+import org.apache.cassandra.db.SystemKeyspace;
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -697,4 +699,34 @@ public class StorageServiceServerTest
         assertTrue(AuditLogManager.instance.isEnabled());
         StorageService.instance.disableAuditLog();
     }
-}
+
+    @Test
+    public void isReplacingSameHostAddressAndHostIdTest() throws UnknownHostException
+    {
+        try
+        {
+            UUID differentHostId = UUID.randomUUID();
+            Assert.assertFalse(StorageService.instance.isReplacingSameHostAddressAndHostId(differentHostId)); // this test has not set replace_address yet
+
+            final String hostAddress = FBUtilities.getBroadcastAddressAndPort().getHostAddress(false);
+            UUID localHostId = SystemKeyspace.getLocalHostId();
+            Gossiper.instance.initializeNodeUnsafe(FBUtilities.getBroadcastAddressAndPort(), localHostId, 1);
+
+            // Replacing the same host address with the same host id must be detected
+            System.setProperty("cassandra.replace_address", hostAddress);
+            Assert.assertTrue(StorageService.instance.isReplacingSameHostAddressAndHostId(localHostId));
+
+            // Replacing the same host address with a different host id must not be reported as a match
+            System.setProperty("cassandra.replace_address", hostAddress); // same value as set above
+            Assert.assertFalse(StorageService.instance.isReplacingSameHostAddressAndHostId(differentHostId));
+
+            // An unresolvable replace_address (DNS entry gone) must yield false rather than an exception
+            System.setProperty("cassandra.replace_address", "unresolvable.host.local.");
+            Assert.assertFalse(StorageService.instance.isReplacingSameHostAddressAndHostId(differentHostId));
+        }
+        finally
+        {
+            System.clearProperty("cassandra.replace_address"); // always undo the system property this test sets
+        }
+    }
+}
\ No newline at end of file

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@cassandra.apache.org
For additional commands, e-mail: commits-help@cassandra.apache.org