You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by so...@apache.org on 2023/04/12 19:11:34 UTC
[ozone] branch master updated: HDDS-8336. ReplicationManager: RatisUnderReplicationHandler should partially recover the container if not enough nodes (#4561)
This is an automated email from the ASF dual-hosted git repository.
sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new f625a9f72c HDDS-8336. ReplicationManager: RatisUnderReplicationHandler should partially recover the container if not enough nodes (#4561)
f625a9f72c is described below
commit f625a9f72cbd6b9bda91e19b6ab6bd19ec15922b
Author: Stephen O'Donnell <st...@gmail.com>
AuthorDate: Wed Apr 12 20:11:28 2023 +0100
HDDS-8336. ReplicationManager: RatisUnderReplicationHandler should partially recover the container if not enough nodes (#4561)
---
.../replication/MisReplicationHandler.java | 7 ++--
.../replication/RatisUnderReplicationHandler.java | 42 +++++++++++++++++-----
.../container/replication/ReplicationTestUtil.java | 36 +++++++++++++++++--
.../TestRatisUnderReplicationHandler.java | 20 +++++++++++
4 files changed, 90 insertions(+), 15 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/MisReplicationHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/MisReplicationHandler.java
index f84306b096..abf7afe309 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/MisReplicationHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/MisReplicationHandler.java
@@ -90,10 +90,9 @@ public abstract class MisReplicationHandler implements
}
}
throw new SCMException(String.format("Placement Policy: %s did not return"
- + " any number of nodes. Number of required "
- + "Nodes %d, Datasize Required: %d",
- containerPlacement.getClass(), requiredNodes, dataSizeRequired),
- SCMException.ResultCodes.FAILED_TO_FIND_SUITABLE_NODE);
+ + " any nodes. Number of required Nodes %d, Datasize Required: %d",
+ containerPlacement.getClass(), requiredNodes, dataSizeRequired),
+ SCMException.ResultCodes.FAILED_TO_FIND_SUITABLE_NODE);
}
private Set<ContainerReplica> filterSources(Set<ContainerReplica> replicas) {
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
index 04b6564579..3f82bbb0eb 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/RatisUnderReplicationHandler.java
@@ -26,7 +26,9 @@ import org.apache.hadoop.hdds.scm.PlacementPolicy;
import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
+import org.apache.hadoop.hdds.scm.exceptions.SCMException;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
+import org.apache.hadoop.hdds.scm.pipeline.InsufficientDatanodesException;
import org.apache.hadoop.ozone.protocol.commands.ReplicateContainerCommand;
import org.apache.ratis.protocol.exceptions.NotLeaderException;
import org.slf4j.Logger;
@@ -108,13 +110,24 @@ public class RatisUnderReplicationHandler
// find targets to send replicas to
List<DatanodeDetails> targetDatanodes =
getTargets(withUnhealthy, pendingOps);
- if (targetDatanodes.isEmpty()) {
- LOG.warn("Cannot replicate container {} because no eligible targets " +
- "were found.", containerInfo);
- return 0;
- }
- return sendReplicationCommands(
+
+ int commandsSent = sendReplicationCommands(
containerInfo, sourceDatanodes, targetDatanodes);
+
+ if (targetDatanodes.size() < withUnhealthy.additionalReplicaNeeded()) {
+ // The placement policy failed to find enough targets to satisfy fix
+ // the under replication. There fore even though some commands were sent,
+ // we throw an exception to indicate that the container is still under
+ // replicated and should be re-queued for another attempt later.
+ LOG.debug("Placement policy failed to find enough targets to satisfy " +
+ "under replication for container {}. Targets found: {}, " +
+ "additional replicas needed: {}",
+ containerInfo, targetDatanodes.size(),
+ withUnhealthy.additionalReplicaNeeded());
+ throw new InsufficientDatanodesException(
+ withUnhealthy.additionalReplicaNeeded(), targetDatanodes.size());
+ }
+ return commandsSent;
}
/**
@@ -235,8 +248,21 @@ public class RatisUnderReplicationHandler
final long dataSizeRequired =
Math.max(replicaCount.getContainer().getUsedBytes(),
currentContainerSize);
- return placementPolicy.chooseDatanodes(excludeList, null,
- replicaCount.additionalReplicaNeeded(), 0, dataSizeRequired);
+ int requiredNodes = replicaCount.additionalReplicaNeeded();
+ while (requiredNodes > 0) {
+ try {
+ return placementPolicy.chooseDatanodes(excludeList, null,
+ requiredNodes, 0, dataSizeRequired);
+ } catch (IOException e) {
+ LOG.debug("Placement policy was not able to return {} nodes. ",
+ requiredNodes, e);
+ requiredNodes--;
+ }
+ }
+ throw new SCMException(String.format("Placement Policy: %s did not return"
+ + " any nodes. Number of required Nodes %d, Datasize Required: %d",
+ placementPolicy.getClass(), requiredNodes, dataSizeRequired),
+ SCMException.ResultCodes.FAILED_TO_FIND_SUITABLE_NODE);
}
private int sendReplicationCommands(
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
index d0b1f6322a..bdfb89c53c 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationTestUtil.java
@@ -42,6 +42,7 @@ import org.mockito.Mockito;
import org.mockito.stubbing.Answer;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
@@ -286,9 +287,7 @@ public final class ReplicationTestUtil {
throw new SCMException("Insufficient Nodes available to choose",
SCMException.ResultCodes.FAILED_TO_FIND_HEALTHY_NODES);
}
- List<DatanodeDetails> dns = new ArrayList<>();
- dns.add(nodeToReturn);
- return dns;
+ return Collections.singletonList(nodeToReturn);
}
@Override
@@ -319,6 +318,37 @@ public final class ReplicationTestUtil {
};
}
+ /**
+ * Placement policy that throws an exception when the number of requested
+ * nodes is greater or equal to throwWhenThisOrMoreNodesRequested, otherwise
+ * returns a random node.
+ */
+ public static PlacementPolicy getInsufficientNodesTestPlacementPolicy(
+ final NodeManager nodeManager, final OzoneConfiguration conf,
+ int throwWhenThisOrMoreNodesRequested) {
+ return new SCMCommonPlacementPolicy(nodeManager, conf) {
+ @Override
+ protected List<DatanodeDetails> chooseDatanodesInternal(
+ List<DatanodeDetails> usedNodes,
+ List<DatanodeDetails> excludedNodes,
+ List<DatanodeDetails> favoredNodes, int nodesRequiredToChoose,
+ long metadataSizeRequired, long dataSizeRequired)
+ throws SCMException {
+ if (nodesRequiredToChoose >= throwWhenThisOrMoreNodesRequested) {
+ throw new SCMException("No nodes available",
+ FAILED_TO_FIND_SUITABLE_NODE);
+ }
+ return Collections
+ .singletonList(MockDatanodeDetails.randomDatanodeDetails());
+ }
+
+ @Override
+ public DatanodeDetails chooseNode(List<DatanodeDetails> healthyNodes) {
+ return null;
+ }
+ };
+ }
+
/**
* Given a Mockito mock of ReplicationManager, this method will mock the
* SendThrottledReplicationCommand method so that it adds the command created
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
index f545809107..a2f9bdcfd8 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/TestRatisUnderReplicationHandler.java
@@ -34,6 +34,7 @@ import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager.Repli
import org.apache.hadoop.hdds.scm.node.NodeManager;
import org.apache.hadoop.hdds.scm.node.NodeStatus;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
+import org.apache.hadoop.hdds.scm.pipeline.InsufficientDatanodesException;
import org.apache.hadoop.ozone.container.common.SCMTestUtils;
import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
import org.apache.ratis.protocol.exceptions.NotLeaderException;
@@ -209,6 +210,25 @@ public class TestRatisUnderReplicationHandler {
Collections.emptyList(), getUnderReplicatedHealthResult(), 2));
}
+ @Test
+ public void testInsufficientTargetsFoundBecauseOfPlacementPolicy() {
+ policy = ReplicationTestUtil.getInsufficientNodesTestPlacementPolicy(
+ nodeManager, conf, 2);
+ RatisUnderReplicationHandler handler =
+ new RatisUnderReplicationHandler(policy, conf, replicationManager);
+
+ // Only one replica is available, so we need to create 2 new ones.
+ Set<ContainerReplica> replicas
+ = createReplicas(container.containerID(), State.CLOSED, 0);
+
+ Assert.assertThrows(InsufficientDatanodesException.class,
+ () -> handler.processAndSendCommands(replicas,
+ Collections.emptyList(), getUnderReplicatedHealthResult(), 2));
+ // One command should be sent to the replication manager as we could only
+ // fine one node rather than two.
+ Assert.assertEquals(1, commandsSent.size());
+ }
+
@Test
public void testUnhealthyReplicasAreReplicatedWhenHealthyAreUnavailable()
throws IOException {
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org