You are viewing a plain text version of this content. The canonical link for it is here.
Posted to pr@cassandra.apache.org by "Maxwell-Guo (via GitHub)" <gi...@apache.org> on 2023/06/06 05:37:18 UTC

[GitHub] [cassandra] Maxwell-Guo commented on a diff in pull request #2374: A new nodetool/JMX command that tells whether node's decommission fai…

Maxwell-Guo commented on code in PR #2374:
URL: https://github.com/apache/cassandra/pull/2374#discussion_r1218906517


##########
src/java/org/apache/cassandra/service/StorageService.java:
##########
@@ -5127,100 +5128,113 @@ private void startLeaving()
 
     public void decommission(boolean force) throws InterruptedException
     {
-        TokenMetadata metadata = tokenMetadata.cloneAfterAllLeft();
-        if (operationMode != Mode.LEAVING)
+        try
         {
-            if (!tokenMetadata.isMember(FBUtilities.getBroadcastAddressAndPort()))
-                throw new UnsupportedOperationException("local node is not a member of the token ring yet");
-            if (metadata.getAllEndpoints().size() < 2)
+            TokenMetadata metadata = tokenMetadata.cloneAfterAllLeft();
+            if (operationMode != Mode.LEAVING)
+            {
+                if (!tokenMetadata.isMember(FBUtilities.getBroadcastAddressAndPort()))
+                    throw new UnsupportedOperationException("local node is not a member of the token ring yet");
+                if (metadata.getAllEndpoints().size() < 2)
                     throw new UnsupportedOperationException("no other normal nodes in the ring; decommission would be pointless");
-            if (operationMode != Mode.NORMAL)
-                throw new UnsupportedOperationException("Node in " + operationMode + " state; wait for status to become normal or restart");
-        }
-        if (!isDecommissioning.compareAndSet(false, true))
-            throw new IllegalStateException("Node is still decommissioning. Check nodetool netstats.");
+                if (operationMode != Mode.NORMAL)
+                    throw new UnsupportedOperationException("Node in " + operationMode + " state; wait for status to become normal or restart");
+            }
+            if (!isDecommissioning.compareAndSet(false, true))
+                throw new IllegalStateException("Node is still decommissioning. Check nodetool netstats.");
 
-        if (logger.isDebugEnabled())
-            logger.debug("DECOMMISSIONING");
+            if (logger.isDebugEnabled())
+                logger.debug("DECOMMISSIONING");
 
-        try
-        {
-            PendingRangeCalculatorService.instance.blockUntilFinished();
+            try
+            {
+                PendingRangeCalculatorService.instance.blockUntilFinished();
 
-            String dc = DatabaseDescriptor.getEndpointSnitch().getLocalDatacenter();
+                String dc = DatabaseDescriptor.getEndpointSnitch().getLocalDatacenter();
 
-            if (operationMode != Mode.LEAVING) // If we're already decommissioning there is no point checking RF/pending ranges
-            {
-                int rf, numNodes;
-                for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces().names())
+                if (operationMode != Mode.LEAVING) // If we're already decommissioning there is no point checking RF/pending ranges
                 {
-                    if (!force)
+                    int rf, numNodes;
+                    for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces().names())
                     {
-                        Keyspace keyspace = Keyspace.open(keyspaceName);
-                        if (keyspace.getReplicationStrategy() instanceof NetworkTopologyStrategy)
+                        if (!force)
                         {
-                            NetworkTopologyStrategy strategy = (NetworkTopologyStrategy) keyspace.getReplicationStrategy();
-                            rf = strategy.getReplicationFactor(dc).allReplicas;
-                            numNodes = metadata.getTopology().getDatacenterEndpoints().get(dc).size();
-                        }
-                        else
-                        {
-                            numNodes = metadata.getAllEndpoints().size();
-                            rf = keyspace.getReplicationStrategy().getReplicationFactor().allReplicas;
-                        }
+                            Keyspace keyspace = Keyspace.open(keyspaceName);
+                            if (keyspace.getReplicationStrategy() instanceof NetworkTopologyStrategy)
+                            {
+                                NetworkTopologyStrategy strategy = (NetworkTopologyStrategy) keyspace.getReplicationStrategy();
+                                rf = strategy.getReplicationFactor(dc).allReplicas;
+                                numNodes = metadata.getTopology().getDatacenterEndpoints().get(dc).size();
+                            }
+                            else
+                            {
+                                numNodes = metadata.getAllEndpoints().size();
+                                rf = keyspace.getReplicationStrategy().getReplicationFactor().allReplicas;
+                            }
 
-                        if (numNodes <= rf)
-                            throw new UnsupportedOperationException("Not enough live nodes to maintain replication factor in keyspace "
-                                                                    + keyspaceName + " (RF = " + rf + ", N = " + numNodes + ")."
-                                                                    + " Perform a forceful decommission to ignore.");
+                            if (numNodes <= rf)
+                                throw new UnsupportedOperationException("Not enough live nodes to maintain replication factor in keyspace "
+                                                                        + keyspaceName + " (RF = " + rf + ", N = " + numNodes + ")."
+                                                                        + " Perform a forceful decommission to ignore.");
+                        }
+                        // TODO: do we care about fixing transient/full self-movements here? probably
+                        if (tokenMetadata.getPendingRanges(keyspaceName, FBUtilities.getBroadcastAddressAndPort()).size() > 0)
+                            throw new UnsupportedOperationException("data is currently moving to this node; unable to leave the ring");
                     }
-                    // TODO: do we care about fixing transient/full self-movements here? probably
-                    if (tokenMetadata.getPendingRanges(keyspaceName, FBUtilities.getBroadcastAddressAndPort()).size() > 0)
-                        throw new UnsupportedOperationException("data is currently moving to this node; unable to leave the ring");
                 }
-            }
 
-            startLeaving();
-            long timeout = Math.max(RING_DELAY_MILLIS, BatchlogManager.instance.getBatchlogTimeout());
-            setMode(Mode.LEAVING, "sleeping " + timeout + " ms for batch processing and pending range setup", true);
-            Thread.sleep(timeout);
+                startLeaving();
+                long timeout = Math.max(RING_DELAY_MILLIS, BatchlogManager.instance.getBatchlogTimeout());
+                setMode(Mode.LEAVING, "sleeping " + timeout + " ms for batch processing and pending range setup", true);
+                Thread.sleep(timeout);
 
-            Runnable finishLeaving = new Runnable()
-            {
-                public void run()
+                Runnable finishLeaving = new Runnable()
                 {
-                    shutdownClientServers();
-                    Gossiper.instance.stop();
-                    try
-                    {
-                        MessagingService.instance().shutdown();
-                    }
-                    catch (IOError ioe)
+                    public void run()
                     {
-                        logger.info("failed to shutdown message service: {}", ioe);
-                    }
+                        shutdownClientServers();
+                        Gossiper.instance.stop();
+                        try
+                        {
+                            MessagingService.instance().shutdown();
+                        }
+                        catch (IOError ioe)
+                        {
+                            logger.info("failed to shutdown message service: {}", ioe);
+                        }
 
-                    Stage.shutdownNow();
-                    SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.DECOMMISSIONED);
-                    setMode(Mode.DECOMMISSIONED, true);
-                    // let op be responsible for killing the process
-                }
-            };
-            unbootstrap(finishLeaving);
-        }
-        catch (InterruptedException e)
-        {
-            throw new UncheckedInterruptedException(e);
-        }
-        catch (ExecutionException e)
-        {
-            logger.error("Error while decommissioning node ", e.getCause());
-            throw new RuntimeException("Error while decommissioning node: " + e.getCause().getMessage());
-        }
-        finally
-        {
-            isDecommissioning.set(false);
-        }
+                        Stage.shutdownNow();
+                        SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.DECOMMISSIONED);
+                        setMode(Mode.DECOMMISSIONED, true);
+                        // let op be responsible for killing the process
+                    }
+                };
+                unbootstrap(finishLeaving);
+            }
+            catch (InterruptedException e)
+            {
+                throw new UncheckedInterruptedException(e);
+            }
+            catch (ExecutionException e)
+            {
+                logger.error("Error while decommissioning node ", e.getCause());
+                throw new RuntimeException("Error while decommissioning node: " + e.getCause().getMessage());
+            }
+            finally
+            {
+                isDecommissioning.set(false);
+            }
+        } catch (Exception e)

Review Comment:
   I think there is a problem for code format with cath '{'



##########
test/unit/org/apache/cassandra/service/StorageServiceServerTest.java:
##########
@@ -658,4 +660,26 @@ public void isReplacingSameHostAddressAndHostIdTest() throws UnknownHostExceptio
             Assert.assertFalse(StorageService.instance.isReplacingSameHostAddressAndHostId(differentHostId));
         }
     }
-}
\ No newline at end of file
+
+    @Test
+    public void testIsDecommissionNotFailed()
+    {
+        assertFalse(StorageService.instance.isDecommissionFailed());
+        assertEquals(0, StorageMetrics.errorDecommissiong.getCount());
+    }
+
+    @Test

Review Comment:
   I think we can also add a new test case use ToolRunner.invokeNodetool see GossipInfoTest
   we can use at frist set the node's mode to Un-normal and then use ToolRunner.invokeNodetool  to get the status of descommission.
   



##########
src/java/org/apache/cassandra/service/StorageService.java:
##########
@@ -5236,6 +5250,21 @@ private void leaveRing()
         Uninterruptibles.sleepUninterruptibly(delay, MILLISECONDS);
     }
 
+    public boolean isDecommissionFailed()
+    {
+        if (operationMode == Mode.LEAVING && hasDecommissionFailed)

Review Comment:
   I think we can change this to `if (hasDecommissionFailed && operationMode == Mode.LEAVING)`
   if hasDecommissionFailed is false we will not need to do the comparison with operationMode
   Edit :
   or why we just return 
   hasDecommissionFailed && operationMode == Mode.LEAVING
   ? and not need for if () else 
   



##########
src/java/org/apache/cassandra/service/StorageService.java:
##########
@@ -433,6 +433,7 @@ public enum Mode { STARTING, NORMAL, JOINING, LEAVING, DECOMMISSIONED, MOVING, D
     private static final boolean allowSimultaneousMoves = CONSISTENT_SIMULTANEOUS_MOVES_ALLOW.getBoolean();
     private static final boolean joinRing = JOIN_RING.getBoolean();
     private boolean replacing;
+    private volatile boolean hasDecommissionFailed = false;

Review Comment:
   I think we can change the name to isDecommissionFailed as src/java/org/apache/cassandra/service/StorageServiceMBean.java  method isDecommissionFailed();
   Edit :
   can we just AtomicBoolean 



##########
src/java/org/apache/cassandra/service/StorageService.java:
##########
@@ -5127,100 +5128,113 @@ private void startLeaving()
 
     public void decommission(boolean force) throws InterruptedException
     {
-        TokenMetadata metadata = tokenMetadata.cloneAfterAllLeft();
-        if (operationMode != Mode.LEAVING)
+        try
         {
-            if (!tokenMetadata.isMember(FBUtilities.getBroadcastAddressAndPort()))
-                throw new UnsupportedOperationException("local node is not a member of the token ring yet");
-            if (metadata.getAllEndpoints().size() < 2)
+            TokenMetadata metadata = tokenMetadata.cloneAfterAllLeft();
+            if (operationMode != Mode.LEAVING)
+            {
+                if (!tokenMetadata.isMember(FBUtilities.getBroadcastAddressAndPort()))
+                    throw new UnsupportedOperationException("local node is not a member of the token ring yet");
+                if (metadata.getAllEndpoints().size() < 2)
                     throw new UnsupportedOperationException("no other normal nodes in the ring; decommission would be pointless");
-            if (operationMode != Mode.NORMAL)
-                throw new UnsupportedOperationException("Node in " + operationMode + " state; wait for status to become normal or restart");
-        }
-        if (!isDecommissioning.compareAndSet(false, true))
-            throw new IllegalStateException("Node is still decommissioning. Check nodetool netstats.");
+                if (operationMode != Mode.NORMAL)
+                    throw new UnsupportedOperationException("Node in " + operationMode + " state; wait for status to become normal or restart");
+            }
+            if (!isDecommissioning.compareAndSet(false, true))
+                throw new IllegalStateException("Node is still decommissioning. Check nodetool netstats.");
 
-        if (logger.isDebugEnabled())
-            logger.debug("DECOMMISSIONING");
+            if (logger.isDebugEnabled())
+                logger.debug("DECOMMISSIONING");
 
-        try
-        {
-            PendingRangeCalculatorService.instance.blockUntilFinished();
+            try
+            {
+                PendingRangeCalculatorService.instance.blockUntilFinished();
 
-            String dc = DatabaseDescriptor.getEndpointSnitch().getLocalDatacenter();
+                String dc = DatabaseDescriptor.getEndpointSnitch().getLocalDatacenter();
 
-            if (operationMode != Mode.LEAVING) // If we're already decommissioning there is no point checking RF/pending ranges
-            {
-                int rf, numNodes;
-                for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces().names())
+                if (operationMode != Mode.LEAVING) // If we're already decommissioning there is no point checking RF/pending ranges
                 {
-                    if (!force)
+                    int rf, numNodes;
+                    for (String keyspaceName : Schema.instance.getNonLocalStrategyKeyspaces().names())
                     {
-                        Keyspace keyspace = Keyspace.open(keyspaceName);
-                        if (keyspace.getReplicationStrategy() instanceof NetworkTopologyStrategy)
+                        if (!force)
                         {
-                            NetworkTopologyStrategy strategy = (NetworkTopologyStrategy) keyspace.getReplicationStrategy();
-                            rf = strategy.getReplicationFactor(dc).allReplicas;
-                            numNodes = metadata.getTopology().getDatacenterEndpoints().get(dc).size();
-                        }
-                        else
-                        {
-                            numNodes = metadata.getAllEndpoints().size();
-                            rf = keyspace.getReplicationStrategy().getReplicationFactor().allReplicas;
-                        }
+                            Keyspace keyspace = Keyspace.open(keyspaceName);
+                            if (keyspace.getReplicationStrategy() instanceof NetworkTopologyStrategy)
+                            {
+                                NetworkTopologyStrategy strategy = (NetworkTopologyStrategy) keyspace.getReplicationStrategy();
+                                rf = strategy.getReplicationFactor(dc).allReplicas;
+                                numNodes = metadata.getTopology().getDatacenterEndpoints().get(dc).size();
+                            }
+                            else
+                            {
+                                numNodes = metadata.getAllEndpoints().size();
+                                rf = keyspace.getReplicationStrategy().getReplicationFactor().allReplicas;
+                            }
 
-                        if (numNodes <= rf)
-                            throw new UnsupportedOperationException("Not enough live nodes to maintain replication factor in keyspace "
-                                                                    + keyspaceName + " (RF = " + rf + ", N = " + numNodes + ")."
-                                                                    + " Perform a forceful decommission to ignore.");
+                            if (numNodes <= rf)
+                                throw new UnsupportedOperationException("Not enough live nodes to maintain replication factor in keyspace "
+                                                                        + keyspaceName + " (RF = " + rf + ", N = " + numNodes + ")."
+                                                                        + " Perform a forceful decommission to ignore.");
+                        }
+                        // TODO: do we care about fixing transient/full self-movements here? probably
+                        if (tokenMetadata.getPendingRanges(keyspaceName, FBUtilities.getBroadcastAddressAndPort()).size() > 0)
+                            throw new UnsupportedOperationException("data is currently moving to this node; unable to leave the ring");
                     }
-                    // TODO: do we care about fixing transient/full self-movements here? probably
-                    if (tokenMetadata.getPendingRanges(keyspaceName, FBUtilities.getBroadcastAddressAndPort()).size() > 0)
-                        throw new UnsupportedOperationException("data is currently moving to this node; unable to leave the ring");
                 }
-            }
 
-            startLeaving();
-            long timeout = Math.max(RING_DELAY_MILLIS, BatchlogManager.instance.getBatchlogTimeout());
-            setMode(Mode.LEAVING, "sleeping " + timeout + " ms for batch processing and pending range setup", true);
-            Thread.sleep(timeout);
+                startLeaving();
+                long timeout = Math.max(RING_DELAY_MILLIS, BatchlogManager.instance.getBatchlogTimeout());
+                setMode(Mode.LEAVING, "sleeping " + timeout + " ms for batch processing and pending range setup", true);
+                Thread.sleep(timeout);
 
-            Runnable finishLeaving = new Runnable()
-            {
-                public void run()
+                Runnable finishLeaving = new Runnable()
                 {
-                    shutdownClientServers();
-                    Gossiper.instance.stop();
-                    try
-                    {
-                        MessagingService.instance().shutdown();
-                    }
-                    catch (IOError ioe)
+                    public void run()
                     {
-                        logger.info("failed to shutdown message service: {}", ioe);
-                    }
+                        shutdownClientServers();
+                        Gossiper.instance.stop();
+                        try
+                        {
+                            MessagingService.instance().shutdown();
+                        }
+                        catch (IOError ioe)
+                        {
+                            logger.info("failed to shutdown message service: {}", ioe);
+                        }
 
-                    Stage.shutdownNow();
-                    SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.DECOMMISSIONED);
-                    setMode(Mode.DECOMMISSIONED, true);
-                    // let op be responsible for killing the process
-                }
-            };
-            unbootstrap(finishLeaving);
-        }
-        catch (InterruptedException e)
-        {
-            throw new UncheckedInterruptedException(e);
-        }
-        catch (ExecutionException e)
-        {
-            logger.error("Error while decommissioning node ", e.getCause());
-            throw new RuntimeException("Error while decommissioning node: " + e.getCause().getMessage());
-        }
-        finally
-        {
-            isDecommissioning.set(false);
-        }
+                        Stage.shutdownNow();
+                        SystemKeyspace.setBootstrapState(SystemKeyspace.BootstrapState.DECOMMISSIONED);
+                        setMode(Mode.DECOMMISSIONED, true);
+                        // let op be responsible for killing the process
+                    }
+                };
+                unbootstrap(finishLeaving);
+            }
+            catch (InterruptedException e)
+            {
+                throw new UncheckedInterruptedException(e);
+            }
+            catch (ExecutionException e)
+            {
+                logger.error("Error while decommissioning node ", e.getCause());
+                throw new RuntimeException("Error while decommissioning node: " + e.getCause().getMessage());
+            }
+            finally
+            {
+                isDecommissioning.set(false);
+            }
+        } catch (Exception e)

Review Comment:
   Besides , can we use Throwable instand of Exception ?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: pr-unsubscribe@cassandra.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: pr-unsubscribe@cassandra.apache.org
For additional commands, e-mail: pr-help@cassandra.apache.org