You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cassandra.apache.org by dc...@apache.org on 2022/06/01 17:50:33 UTC

[cassandra] branch trunk updated: When a node is bootstrapping it gets the whole gossip state but applies in random order causing some cases where StorageService will fail causing an instance to not show up in TokenMetadata

This is an automated email from the ASF dual-hosted git repository.

dcapwell pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/cassandra.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 740cec41d2 When a node is bootstrapping it gets the whole gossip state but applies in random order causing some cases where StorageService will fail causing an instance to not show up in TokenMetadata
740cec41d2 is described below

commit 740cec41d2d67783a463bd18f70221de331928df
Author: David Capwell <dc...@apache.org>
AuthorDate: Wed Jun 1 08:49:44 2022 -0700

    When a node is bootstrapping it gets the whole gossip state but applies in random order causing some cases where StorageService will fail causing an instance to not show up in TokenMetadata
    
    patch by David Capwell; reviewed by Blake Eggleston for CASSANDRA-17676
---
 CHANGES.txt                                        |  1 +
 src/java/org/apache/cassandra/gms/Gossiper.java    | 46 +++++++++++++++++++++-
 .../org/apache/cassandra/gms/VersionedValue.java   |  3 ++
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index fb75bc510d..30dfdf763c 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 4.2
+ * When a node is bootstrapping it gets the whole gossip state but applies in random order causing some cases where StorageService will fail causing an instance to not show up in TokenMetadata (CASSANDRA-17676)
  * Add CQLSH command SHOW REPLICAS (CASSANDRA-17577)
  * Add guardrail to allow disabling of SimpleStrategy (CASSANDRA-17647)
  * Change default directory permission to 750 in packaging (CASSANDRA-17470)
diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java b/src/java/org/apache/cassandra/gms/Gossiper.java
index 4c72166a9d..8041eed034 100644
--- a/src/java/org/apache/cassandra/gms/Gossiper.java
+++ b/src/java/org/apache/cassandra/gms/Gossiper.java
@@ -85,6 +85,7 @@ import static org.apache.cassandra.net.Verb.GOSSIP_DIGEST_SYN;
 import static org.apache.cassandra.utils.FBUtilities.getBroadcastAddressAndPort;
 import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis;
 import static org.apache.cassandra.utils.Clock.Global.nanoTime;
+import static org.apache.cassandra.gms.VersionedValue.BOOTSTRAPPING_STATUS;
 
 /**
  * This module is responsible for Gossiping information for the local endpoint. This abstraction
@@ -1500,11 +1501,54 @@ public class Gossiper implements IFailureDetectionEventListener, GossiperMBean
         return pieces[0];
     }
 
+    /**
+     * Gossip offers no happens-before relationship, but downstream subscribers assume a happens-before relationship
+     * before being notified!  To attempt to be nicer to subscribers, this {@link Comparator} attempts to order EndpointState
+     * within a map based off a few heuristics:
+     * <ol>
+     *     <li>STATUS - some STATUS depends on other instance STATUS, so make sure they are last; eg. BOOT, and BOOT_REPLACE</li>
+     *     <li>generation - normally defined as system clock millis, this can be skewed and is a best effort</li>
+     *     <li>address - tie breaker to make sure order is consistent</li>
+     * </ol>
+     * <p>
+     * Problems:
+     * Generation is normally defined as system clock millis, which can be skewed and in-consistent cross nodes
+     * (generations do not have a happens-before relationship, so ordering is sketchy at best).
+     * <p>
+     * Motivations:
+     * {@link Map#entrySet()} returns data in effectivlly random order, so can get into a situation such as the following example.
+     * {@code
+     * 3 node cluster: n1, n2, and n3
+     * n2 goes down and n4 does host replacement and fails before completion
+     * h5 tries to do a host replacement against n4 (ignore the fact this doesn't make sense)
+     * }
+     * In that case above, the {@link Map#entrySet()} ordering can be random, causing h4 to apply before h2, which will
+     * be rejected by subscripers (only after updating gossip causing zero retries).
+     */
+    private static Comparator<Entry<InetAddressAndPort, EndpointState>> STATE_MAP_ORDERING =
+    ((Comparator<Entry<InetAddressAndPort, EndpointState>>) (e1, e2) -> {
+        // check status first, make sure bootstrap status happens-after all others
+        if (BOOTSTRAPPING_STATUS.contains(getGossipStatus(e1.getValue())))
+            return 1;
+        if (BOOTSTRAPPING_STATUS.contains(getGossipStatus(e2.getValue())))
+            return -1;
+        return 0;
+    })
+    .thenComparingInt((Entry<InetAddressAndPort, EndpointState> e) -> e.getValue().getHeartBeatState().getGeneration())
+    .thenComparing(Entry::getKey);
+
+    private static Iterable<Entry<InetAddressAndPort, EndpointState>> order(Map<InetAddressAndPort, EndpointState> epStateMap)
+    {
+        List<Entry<InetAddressAndPort, EndpointState>> list = new ArrayList<>(epStateMap.entrySet());
+        Collections.sort(list, STATE_MAP_ORDERING);
+        return list;
+    }
+
     @VisibleForTesting
     public void applyStateLocally(Map<InetAddressAndPort, EndpointState> epStateMap)
     {
         checkProperThreadForStateMutation();
-        for (Entry<InetAddressAndPort, EndpointState> entry : epStateMap.entrySet())
+        for (Entry<InetAddressAndPort, EndpointState> entry : order(epStateMap))
         {
             InetAddressAndPort ep = entry.getKey();
             if (ep.equals(getBroadcastAddressAndPort()) && !isInShadowRound())
diff --git a/src/java/org/apache/cassandra/gms/VersionedValue.java b/src/java/org/apache/cassandra/gms/VersionedValue.java
index 26644e17cc..519fffa9b8 100644
--- a/src/java/org/apache/cassandra/gms/VersionedValue.java
+++ b/src/java/org/apache/cassandra/gms/VersionedValue.java
@@ -27,6 +27,7 @@ import java.util.stream.Collectors;
 import static java.nio.charset.StandardCharsets.ISO_8859_1;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Iterables;
 
 import org.apache.cassandra.db.TypeSizes;
@@ -84,6 +85,8 @@ public class VersionedValue implements Comparable<VersionedValue>
     // values for ApplicationState.REMOVAL_COORDINATOR
     public final static String REMOVAL_COORDINATOR = "REMOVER";
 
+    public static Set<String> BOOTSTRAPPING_STATUS = ImmutableSet.of(STATUS_BOOTSTRAPPING, STATUS_BOOTSTRAPPING_REPLACE);
+
     public final int version;
     public final String value;
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@cassandra.apache.org
For additional commands, e-mail: commits-help@cassandra.apache.org