Posted to commits@ozone.apache.org by ja...@apache.org on 2022/07/13 09:55:04 UTC
[ozone] branch master updated: HDDS-6989: EC: Add Test for RECOVERING container cleanup when failure. (#3587)
This is an automated email from the ASF dual-hosted git repository.
jacksonyao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new e1b6b76f0a HDDS-6989: EC: Add Test for RECOVERING container cleanup when failure. (#3587)
e1b6b76f0a is described below
commit e1b6b76f0a9b2042dd2ab4771be5e8b3d3ecc222
Author: Uma Maheswara Rao G <um...@apache.org>
AuthorDate: Wed Jul 13 02:54:59 2022 -0700
HDDS-6989: EC: Add Test for RECOVERING container cleanup when failure. (#3587)
---
.../reconstruction/ECContainerOperationClient.java | 17 ++-
.../hdds/scm/storage/TestContainerCommandsEC.java | 148 ++++++++++++++++-----
2 files changed, 130 insertions(+), 35 deletions(-)
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
index 43a96cb524..e5dd293102 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
@@ -111,13 +111,28 @@ public class ECContainerOperationClient implements Closeable {
}
}
+ /**
+ * Deletes the container at the given DN. Before deleting, it pre-checks
+ * whether the container is in the RECOVERING state. Since the RECOVERING
+ * check and the delete are not one atomic operation, a non-recovering
+ * container created in the window between the existence check and the
+ * delete could still get deleted. Callers of this API need to keep this
+ * scenario in mind and use it only when that is acceptable.
+ *
+ * TODO: Alternatively, we can extend this API to pass a flag so the check is
+ * performed on the server side, making it an atomic operation.
+ * @param containerID - Container ID.
+ * @param dn - Datanode details.
+ * @param repConfig - Replication config.
+ * @param encodedToken - Token.
+ */
public void deleteRecoveringContainer(long containerID, DatanodeDetails dn,
ECReplicationConfig repConfig, String encodedToken) throws IOException {
XceiverClientSpi xceiverClient = this.xceiverClientManager
.acquireClient(singleNodePipeline(dn, repConfig));
try {
// Before deleting the recovering container, just make sure that state is
- // Recovering. There will be still race condition, but that will avoid
+ // Recovering. There will be still race condition, but this will avoid
// most usual case.
ContainerProtos.ReadContainerResponseProto readContainerResponseProto =
ContainerProtocolCalls
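The hunk's context ends mid-statement above. For orientation, a minimal sketch of the check-then-delete flow the new javadoc describes might look like the following; the ContainerProtocolCalls.readContainer/deleteContainer signatures and the RECOVERING state constant are assumptions based on the surrounding diff, not a copy of the committed method body:

  // Illustrative sketch only (assumed signatures); mirrors the non-atomic
  // check-then-delete flow described in the javadoc added above.
  void deleteIfStillRecovering(XceiverClientManager manager, long containerID,
      DatanodeDetails dn, ECReplicationConfig repConfig, String encodedToken)
      throws IOException {
    XceiverClientSpi client =
        manager.acquireClient(singleNodePipeline(dn, repConfig));
    try {
      ContainerProtos.ReadContainerResponseProto response =
          ContainerProtocolCalls.readContainer(client, containerID,
              encodedToken);
      if (response.getContainerData().getState()
          == ContainerProtos.ContainerDataProto.State.RECOVERING) {
        // The gap between this check and the delete is not atomic: a replica
        // that just left RECOVERING in that window could still be deleted.
        ContainerProtocolCalls.deleteContainer(client, containerID, true,
            encodedToken);
      }
    } finally {
      manager.releaseClient(client, false);
    }
  }

The client-side pre-check narrows the race window but does not close it, which is what the TODO about moving the check to the server side addresses.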
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
index 0b037af8c1..61efbeadb1 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
@@ -26,6 +26,7 @@ import org.apache.hadoop.hdds.client.ECReplicationConfig.EcCodec;
import org.apache.hadoop.hdds.client.ReplicationConfig;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.BlockData;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ListBlockResponseProto;
@@ -60,6 +61,7 @@ import org.apache.hadoop.ozone.client.io.InsufficientLocationsException;
import org.apache.hadoop.ozone.client.io.KeyOutputStream;
import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
import org.apache.hadoop.ozone.common.ChunkBuffer;
+import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
import org.apache.hadoop.ozone.common.utils.BufferUtils;
import org.apache.hadoop.ozone.container.ContainerTestHelper;
import org.apache.hadoop.ozone.container.ec.reconstruction.ECContainerOperationClient;
@@ -91,6 +93,7 @@ import java.util.SortedMap;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@@ -211,19 +214,19 @@ public class TestContainerCommandsEC {
public void testListBlock() throws Exception {
for (int i = 0; i < datanodeDetails.size(); i++) {
final int minKeySize = i < EC_DATA ? i * EC_CHUNK_SIZE : 0;
- final int numExpectedBlocks =
+ final int minNumExpectedBlocks =
(int) Arrays.stream(values).mapToInt(v -> v.length)
.filter(s -> s > minKeySize).count();
Function<Integer, Integer> expectedChunksFunc = chunksInReplicaFunc(i);
- final int numExpectedChunks =
+ final int minNumExpectedChunks =
Arrays.stream(values).mapToInt(v -> v.length)
.map(expectedChunksFunc::apply).sum();
- if (numExpectedBlocks == 0) {
+ if (minNumExpectedBlocks == 0) {
final int j = i;
Throwable t = Assertions.assertThrows(StorageContainerException.class,
() -> ContainerProtocolCalls
.listBlock(clients.get(j), containerID, null,
- numExpectedBlocks + 1, containerToken));
+ minNumExpectedBlocks + 1, containerToken));
Assertions
.assertEquals("ContainerID " + containerID + " does not exist",
t.getMessage());
@@ -232,15 +235,17 @@ public class TestContainerCommandsEC {
ListBlockResponseProto response = ContainerProtocolCalls
.listBlock(clients.get(i), containerID, null, Integer.MAX_VALUE,
containerToken);
- Assertions.assertEquals(numExpectedBlocks,
- response.getBlockDataList().stream().filter(
+ Assertions.assertTrue(
+ minNumExpectedBlocks <= response.getBlockDataList().stream().filter(
k -> k.getChunksCount() > 0 && k.getChunks(0).getLen() > 0)
.collect(Collectors.toList()).size(),
- "blocks count doesn't match on DN " + i);
- Assertions.assertEquals(numExpectedChunks,
- response.getBlockDataList().stream()
+ "blocks count should be same or more than min expected" +
+ " blocks count on DN " + i);
+ Assertions.assertTrue(
+ minNumExpectedChunks <= response.getBlockDataList().stream()
.mapToInt(BlockData::getChunksCount).sum(),
- "chunks count doesn't match on DN " + i);
+ "chunks count should be same or more than min expected" +
+ " chunks count on DN " + i);
}
}
@@ -381,20 +386,9 @@ public class TestContainerCommandsEC {
objectStore.getVolume(volumeName).createBucket(bucketName);
OzoneVolume volume = objectStore.getVolume(volumeName);
OzoneBucket bucket = volume.getBucket(bucketName);
- for (int i = 0; i < EC_DATA; i++) {
- inputChunks[i] = getBytesWith(i + 1, EC_CHUNK_SIZE);
- }
XceiverClientManager xceiverClientManager =
new XceiverClientManager(config);
- try (OzoneOutputStream out = bucket.createKey(keyString, 4096,
- new ECReplicationConfig(3, 2, ECReplicationConfig.EcCodec.RS, 1024),
- new HashMap<>())) {
- Assert.assertTrue(out.getOutputStream() instanceof KeyOutputStream);
- for (int i = 0; i < inputChunks.length; i++) {
- out.write(inputChunks[i]);
- }
- }
-
+ createKeyAndWriteData(keyString, bucket);
ECReconstructionCoordinator coordinator =
new ECReconstructionCoordinator(config, certClient);
@@ -404,12 +398,7 @@ public class TestContainerCommandsEC {
.generateToken(ANY_USER, new ContainerID(conID));
//Close the container first.
- scm.getContainerManager().getContainerStateManager().updateContainerState(
- HddsProtos.ContainerID.newBuilder().setId(conID).build(),
- HddsProtos.LifeCycleEvent.FINALIZE);
- scm.getContainerManager().getContainerStateManager().updateContainerState(
- HddsProtos.ContainerID.newBuilder().setId(conID).build(),
- HddsProtos.LifeCycleEvent.CLOSE);
+ closeContainer(conID);
Pipeline containerPipeline = scm.getPipelineManager().getPipeline(
scm.getContainerManager().getContainer(ContainerID.valueOf(conID))
@@ -507,6 +496,103 @@ public class TestContainerCommandsEC {
}
+ private void createKeyAndWriteData(String keyString, OzoneBucket bucket)
+ throws IOException {
+ for (int i = 0; i < EC_DATA; i++) {
+ inputChunks[i] = getBytesWith(i + 1, EC_CHUNK_SIZE);
+ }
+ try (OzoneOutputStream out = bucket.createKey(keyString, 4096,
+ new ECReplicationConfig(3, 2, EcCodec.RS, 1024), new HashMap<>())) {
+ Assert.assertTrue(out.getOutputStream() instanceof KeyOutputStream);
+ for (int i = 0; i < inputChunks.length; i++) {
+ out.write(inputChunks[i]);
+ }
+ }
+ }
+
+ @Test
+ public void testECReconstructionCoordinatorShouldCleanupContainersOnFailure()
+ throws Exception {
+ List<Integer> missingIndexes = ImmutableList.of(1, 3);
+ ObjectStore objectStore = rpcClient.getObjectStore();
+ String keyString = UUID.randomUUID().toString();
+ String volumeName = UUID.randomUUID().toString();
+ String bucketName = volumeName;
+ objectStore.createVolume(volumeName);
+ objectStore.getVolume(volumeName).createBucket(bucketName);
+ OzoneVolume volume = objectStore.getVolume(volumeName);
+ OzoneBucket bucket = volume.getBucket(bucketName);
+ createKeyAndWriteData(keyString, bucket);
+
+ OzoneKeyDetails key = bucket.getKey(keyString);
+ long conID = key.getOzoneKeyLocations().get(0).getContainerID();
+ Token<ContainerTokenIdentifier> cToken =
+ containerTokenGenerator.generateToken(ANY_USER, new ContainerID(conID));
+ closeContainer(conID);
+
+ Pipeline containerPipeline = scm.getPipelineManager().getPipeline(
+ scm.getContainerManager().getContainer(ContainerID.valueOf(conID))
+ .getPipelineID());
+
+ List<DatanodeDetails> nodeSet = containerPipeline.getNodes();
+ SortedMap<Integer, DatanodeDetails> sourceNodeMap = new TreeMap<>();
+ nodeSet.stream().filter(k -> {
+ int replIndex = containerPipeline.getReplicaIndex(k);
+ return !missingIndexes.contains(replIndex);
+ }).forEach(dn -> {
+ sourceNodeMap.put(containerPipeline.getReplicaIndex(dn), dn);
+ });
+
+ //Find a good node outside of pipeline
+ List<DatanodeDetails> clusterDnsList =
+ cluster.getHddsDatanodes().stream().map(k -> k.getDatanodeDetails())
+ .collect(Collectors.toList());
+ DatanodeDetails goodTargetNode = null;
+ for (DatanodeDetails clusterDN : clusterDnsList) {
+ if (!nodeSet.contains(clusterDN)) {
+ goodTargetNode = clusterDN;
+ break;
+ }
+ }
+
+ // Give a new target on which to reconstruct the container.
+ SortedMap<Integer, DatanodeDetails> targetNodeMap = new TreeMap<>();
+ targetNodeMap.put(1, goodTargetNode);
+ // Replace one target node with an invalid one to simulate failure at the target.
+ DatanodeDetails invalidTargetNode =
+ MockDatanodeDetails.randomDatanodeDetails();
+ targetNodeMap.put(3, invalidTargetNode);
+
+ Assert.assertThrows(IOException.class, () -> {
+ ECReconstructionCoordinator coordinator =
+ new ECReconstructionCoordinator(config, certClient);
+ coordinator.reconstructECContainerGroup(conID,
+ (ECReplicationConfig) containerPipeline.getReplicationConfig(),
+ sourceNodeMap, targetNodeMap);
+ });
+ final DatanodeDetails targetDNToCheckContainerCLeaned = goodTargetNode;
+ StorageContainerException ex =
+ Assert.assertThrows(StorageContainerException.class, () -> {
+ ECContainerOperationClient client =
+ new ECContainerOperationClient(new OzoneConfiguration(),
+ certClient);
+ client.listBlock(conID, targetDNToCheckContainerCLeaned,
+ new ECReplicationConfig(3, 2), cToken);
+ });
+ Assert.assertEquals("ContainerID 1 does not exist", ex.getMessage());
+ }
+
+ private void closeContainer(long conID)
+ throws IOException, InvalidStateTransitionException, TimeoutException {
+ //Close the container first.
+ scm.getContainerManager().getContainerStateManager().updateContainerState(
+ HddsProtos.ContainerID.newBuilder().setId(conID).build(),
+ HddsProtos.LifeCycleEvent.FINALIZE);
+ scm.getContainerManager().getContainerStateManager().updateContainerState(
+ HddsProtos.ContainerID.newBuilder().setId(conID).build(),
+ HddsProtos.LifeCycleEvent.CLOSE);
+ }
+
private void checkBlockData(
org.apache.hadoop.ozone.container.common.helpers.BlockData[] blockData,
org.apache.hadoop.ozone.container.common.helpers.BlockData[]
@@ -526,12 +612,6 @@ public class TestContainerCommandsEC {
Assert.assertEquals(chunkInfo, newBlockDataChunks.get(j));
}
}
-
- /* Assert.assertEquals(
- Arrays.stream(blockData).map(b -> b.getProtoBufMessage())
- .collect(Collectors.toList()),
- Arrays.stream(reconstructedBlockData).map(b -> b.getProtoBufMessage())
- .collect(Collectors.toList()));*/
}
public static void startCluster(OzoneConfiguration conf) throws Exception {
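The new test above expects that when reconstructECContainerGroup fails (here, because one target is an unreachable mock datanode), the coordinator removes the RECOVERING replica it had already opened on the good target, so a later listBlock reports the container as missing. A rough sketch of that caller-side cleanup pattern, using only the deleteRecoveringContainer and reconstructECContainerGroup signatures visible in this diff and treating everything else (the wrapping method, the logger) as placeholders:

  // Hedged illustration of the cleanup-on-failure pattern the new test
  // verifies; names not visible in the diff are placeholders.
  void reconstructWithCleanup(ECReconstructionCoordinator coordinator,
      ECContainerOperationClient containerClient, long containerID,
      ECReplicationConfig repConfig,
      SortedMap<Integer, DatanodeDetails> sourceNodeMap,
      SortedMap<Integer, DatanodeDetails> targetNodeMap,
      String encodedToken) throws IOException {
    try {
      coordinator.reconstructECContainerGroup(containerID, repConfig,
          sourceNodeMap, targetNodeMap);
    } catch (IOException e) {
      // Reconstruction failed part-way: delete the RECOVERING replicas that
      // were already opened on the targets so they do not linger as orphans.
      for (DatanodeDetails target : targetNodeMap.values()) {
        try {
          containerClient.deleteRecoveringContainer(containerID, target,
              repConfig, encodedToken);
        } catch (IOException cleanupFailure) {
          // Best-effort cleanup; keep going for the remaining targets.
          LOG.warn("Could not clean up recovering container {} on {}",
              containerID, target, cleanupFailure);
        }
      }
      throw e;
    }
  }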