You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by so...@apache.org on 2023/04/12 19:11:34 UTC

[ozone] branch master updated: HDDS-8336. ReplicationManager: RatisUnderReplicationHandler should partially recover the container if not enough nodes (#4561)

This is an automated email from the ASF dual-hosted git repository.

sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new f625a9f72c HDDS-8336. ReplicationManager: RatisUnderReplicationHandler should partially recover the container if not enough nodes (#4561)
f625a9f72c is described below

commit f625a9f72cbd6b9bda91e19b6ab6bd19ec15922b
Author: Stephen O'Donnell <st...@gmail.com>
AuthorDate: Wed Apr 12 20:11:28 2023 +0100

    HDDS-8336. ReplicationManager: RatisUnderReplicationHandler should partially recover the container if not enough nodes (#4561)
---
 .../replication/MisReplicationHandler.java         |  7 ++--
 .../replication/RatisUnderReplicationHandler.java  | 42 +++++++++++++++++-----
 .../container/replication/ReplicationTestUtil.java | 36 +++++++++++++++++--
 .../TestRatisUnderReplicationHandler.java          | 20 +++++++++++
 4 files changed, 90 insertions(+), 15 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/MisReplicationHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/MisReplicationHandler.java
index f84306b096..abf7afe309 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/MisReplicationHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/MisReplicationHandler.java
@@ -90,10 +90,9 @@ public abstract class MisReplicationHandler implements
       }
     }
     throw new SCMException(String.format("Placement Policy: %s did not return"
-                    + " any number of nodes. Number of required "
-                    + "Nodes %d, Datasize Required: %d",
-            containerPlacement.getClass(), requiredNodes, dataSizeRequired),
-            SCMException.ResultCodes.FAILED_TO_FIND_SUITABLE_NODE);
+        + " any nodes. Number of required Nodes %d, Datasize Required: %d",
+        containerPlacement.getClass(), requiredNodes, dataSizeRequired),
+        SCMException.ResultCodes.FAILED_TO_FIND_SUITABLE_NODE);
   }
 
   private Set<ContainerReplica> filterSources(Set<ContainerReplica> replicas) {
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
index 04b6564579..3f82bbb0eb 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
@@ -26,7 +26,9 @@ import org.apache.hadoop.hdds.scm.PlacementPolicy;
 import org.apache.hadoop.hdds.scm.ScmConfigKeys;
 import org.apache.hadoop.hdds.scm.container.ContainerInfo;
 import org.apache.hadoop.hdds.scm.container.ContainerReplica;
+import org.apache.hadoop.hdds.scm.exceptions.SCMException;
 import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
+import org.apache.hadoop.hdds.scm.pipeline.InsufficientDatanodesException;
 import org.apache.hadoop.ozone.protocol.commands.ReplicateContainerCommand;
 import org.apache.ratis.protocol.exceptions.NotLeaderException;
 import org.slf4j.Logger;
@@ -108,13 +110,24 @@ public class RatisUnderReplicationHandler
     // find targets to send replicas to
     List<DatanodeDetails> targetDatanodes =
         getTargets(withUnhealthy, pendingOps);
-    if (targetDatanodes.isEmpty()) {
-      LOG.warn("Cannot replicate container {} because no eligible targets " +
-          "were found.", containerInfo);
-      return 0;
-    }
-    return sendReplicationCommands(
+
+    int commandsSent = sendReplicationCommands(
         containerInfo, sourceDatanodes, targetDatanodes);
+
+    if (targetDatanodes.size() < withUnhealthy.additionalReplicaNeeded()) {
+      // The placement policy failed to find enough targets to satisfy fix
+      // the under replication. There fore even though some commands were sent,
+      // we throw an exception to indicate that the container is still under
+      // replicated and should be re-queued for another attempt later.
+      LOG.debug("Placement policy failed to find enough targets to satisfy " +
+          "under replication for container {}. Targets found: {}, " +
+          "additional replicas needed: {}",
+          containerInfo, targetDatanodes.size(),
+          withUnhealthy.additionalReplicaNeeded());
+      throw new InsufficientDatanodesException(
+          withUnhealthy.additionalReplicaNeeded(), targetDatanodes.size());
+    }
+    return commandsSent;
   }
 
   /**
@@ -235,8 +248,21 @@ public class RatisUnderReplicationHandler
     final long dataSizeRequired =
         Math.max(replicaCount.getContainer().getUsedBytes(),
             currentContainerSize);
-    return placementPolicy.chooseDatanodes(excludeList, null,
-        replicaCount.additionalReplicaNeeded(), 0, dataSizeRequired);
+    int requiredNodes = replicaCount.additionalReplicaNeeded();
+    while (requiredNodes > 0) {
+      try {
+        return placementPolicy.chooseDatanodes(excludeList, null,
+            requiredNodes, 0, dataSizeRequired);
+      } catch (IOException e) {
+        LOG.debug("Placement policy was not able to return {} nodes. ",
+            requiredNodes, e);
+        requiredNodes--;
+      }
+    }
+    throw new SCMException(String.format("Placement Policy: %s did not return"
+            + " any nodes. Number of required Nodes %d, Datasize Required: %d",
+        placementPolicy.getClass(), requiredNodes, dataSizeRequired),
+        SCMException.ResultCodes.FAILED_TO_FIND_SUITABLE_NODE);
   }
 
   private int sendReplicationCommands(
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
index d0b1f6322a..bdfb89c53c 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
@@ -42,6 +42,7 @@ import org.mockito.Mockito;
 import org.mockito.stubbing.Answer;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
@@ -286,9 +287,7 @@ public final class ReplicationTestUtil {
           throw new SCMException("Insufficient Nodes available to choose",
               SCMException.ResultCodes.FAILED_TO_FIND_HEALTHY_NODES);
         }
-        List<DatanodeDetails> dns = new ArrayList<>();
-        dns.add(nodeToReturn);
-        return dns;
+        return Collections.singletonList(nodeToReturn);
       }
 
       @Override
@@ -319,6 +318,37 @@ public final class ReplicationTestUtil {
     };
   }
 
+  /**
+   * Placement policy that throws an exception when the number of requested
+   * nodes is greater or equal to throwWhenThisOrMoreNodesRequested, otherwise
+   * returns a random node.
+   */
+  public static PlacementPolicy getInsufficientNodesTestPlacementPolicy(
+      final NodeManager nodeManager, final OzoneConfiguration conf,
+      int throwWhenThisOrMoreNodesRequested) {
+    return new SCMCommonPlacementPolicy(nodeManager, conf) {
+      @Override
+      protected List<DatanodeDetails> chooseDatanodesInternal(
+          List<DatanodeDetails> usedNodes,
+          List<DatanodeDetails> excludedNodes,
+          List<DatanodeDetails> favoredNodes, int nodesRequiredToChoose,
+          long metadataSizeRequired, long dataSizeRequired)
+          throws SCMException {
+        if (nodesRequiredToChoose >= throwWhenThisOrMoreNodesRequested) {
+          throw new SCMException("No nodes available",
+              FAILED_TO_FIND_SUITABLE_NODE);
+        }
+        return Collections
+            .singletonList(MockDatanodeDetails.randomDatanodeDetails());
+      }
+
+      @Override
+      public DatanodeDetails chooseNode(List<DatanodeDetails> healthyNodes) {
+        return null;
+      }
+    };
+  }
+
   /**
    * Given a Mockito mock of ReplicationManager, this method will mock the
    * SendThrottledReplicationCommand method so that it adds the command created
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
index f545809107..a2f9bdcfd8 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager.Repli
 import org.apache.hadoop.hdds.scm.node.NodeManager;
 import org.apache.hadoop.hdds.scm.node.NodeStatus;
 import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
+import org.apache.hadoop.hdds.scm.pipeline.InsufficientDatanodesException;
 import org.apache.hadoop.ozone.container.common.SCMTestUtils;
 import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
 import org.apache.ratis.protocol.exceptions.NotLeaderException;
@@ -209,6 +210,25 @@ public class TestRatisUnderReplicationHandler {
             Collections.emptyList(), getUnderReplicatedHealthResult(), 2));
   }
 
+  @Test
+  public void testInsufficientTargetsFoundBecauseOfPlacementPolicy() {
+    policy = ReplicationTestUtil.getInsufficientNodesTestPlacementPolicy(
+        nodeManager, conf, 2);
+    RatisUnderReplicationHandler handler =
+        new RatisUnderReplicationHandler(policy, conf, replicationManager);
+
+    // Only one replica is available, so we need to create 2 new ones.
+    Set<ContainerReplica> replicas
+        = createReplicas(container.containerID(), State.CLOSED, 0);
+
+    Assert.assertThrows(InsufficientDatanodesException.class,
+        () -> handler.processAndSendCommands(replicas,
+            Collections.emptyList(), getUnderReplicatedHealthResult(), 2));
+    // One command should be sent to the replication manager as we could only
+    // fine one node rather than two.
+    Assert.assertEquals(1, commandsSent.size());
+  }
+
   @Test
   public void testUnhealthyReplicasAreReplicatedWhenHealthyAreUnavailable()
       throws IOException {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org