You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by ja...@apache.org on 2022/07/13 09:55:04 UTC

[ozone] branch master updated: HDDS-6989: EC: Add Test for RECOVERING container cleanup when failure. (#3587)

This is an automated email from the ASF dual-hosted git repository.

jacksonyao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new e1b6b76f0a HDDS-6989: EC: Add Test for RECOVERING container cleanup when failure. (#3587)
e1b6b76f0a is described below

commit e1b6b76f0a9b2042dd2ab4771be5e8b3d3ecc222
Author: Uma Maheswara Rao G <um...@apache.org>
AuthorDate: Wed Jul 13 02:54:59 2022 -0700

    HDDS-6989: EC: Add Test for RECOVERING container cleanup when failure. (#3587)
---
 .../reconstruction/ECContainerOperationClient.java |  17 ++-
 .../hdds/scm/storage/TestContainerCommandsEC.java  | 148 ++++++++++++++++-----
 2 files changed, 130 insertions(+), 35 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
index 43a96cb524..e5dd293102 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ec/reconstruction/ECContainerOperationClient.java
@@ -111,13 +111,28 @@ public class ECContainerOperationClient implements Closeable {
     }
   }
 
+  /**
+   * Deletes the container at the given DN. Before deleting it will pre-check
+   * whether the container is in RECOVERING state. As this is not an atomic
+   * check for RECOVERING container, there is a chance that non-recovering
+   * containers could get deleted if they were just created in the window
+   * between the existence check and the delete op. So, the user of this API
+   * needs to keep this scenario in mind and use it only if it is still safe.
+   *
+   * TODO: Alternatively, we can extend this API to pass a flag to perform the
+   *       check on the server side, so that it becomes an atomic op.
+   * @param containerID - Container ID.
+   * @param dn - Datanode details.
+   * @param repConfig - Replication config.
+   * @param encodedToken - Token
+   */
   public void deleteRecoveringContainer(long containerID, DatanodeDetails dn,
       ECReplicationConfig repConfig, String encodedToken) throws IOException {
     XceiverClientSpi xceiverClient = this.xceiverClientManager
         .acquireClient(singleNodePipeline(dn, repConfig));
     try {
       // Before deleting the recovering container, just make sure that state is
-      // Recovering. There will be still race condition, but that will avoid
+      // Recovering. There will be still race condition, but this will avoid
       // most usual case.
       ContainerProtos.ReadContainerResponseProto readContainerResponseProto =
           ContainerProtocolCalls
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
index 0b037af8c1..61efbeadb1 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/hdds/scm/storage/TestContainerCommandsEC.java
@@ -26,6 +26,7 @@ import org.apache.hadoop.hdds.client.ECReplicationConfig.EcCodec;
 import org.apache.hadoop.hdds.client.ReplicationConfig;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
 import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
 import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.BlockData;
 import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ListBlockResponseProto;
@@ -60,6 +61,7 @@ import org.apache.hadoop.ozone.client.io.InsufficientLocationsException;
 import org.apache.hadoop.ozone.client.io.KeyOutputStream;
 import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
 import org.apache.hadoop.ozone.common.ChunkBuffer;
+import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException;
 import org.apache.hadoop.ozone.common.utils.BufferUtils;
 import org.apache.hadoop.ozone.container.ContainerTestHelper;
 import org.apache.hadoop.ozone.container.ec.reconstruction.ECContainerOperationClient;
@@ -91,6 +93,7 @@ import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.UUID;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
@@ -211,19 +214,19 @@ public class TestContainerCommandsEC {
   public void testListBlock() throws Exception {
     for (int i = 0; i < datanodeDetails.size(); i++) {
       final int minKeySize = i < EC_DATA ? i * EC_CHUNK_SIZE : 0;
-      final int numExpectedBlocks =
+      final int minNumExpectedBlocks =
           (int) Arrays.stream(values).mapToInt(v -> v.length)
               .filter(s -> s > minKeySize).count();
       Function<Integer, Integer> expectedChunksFunc = chunksInReplicaFunc(i);
-      final int numExpectedChunks =
+      final int minNumExpectedChunks =
           Arrays.stream(values).mapToInt(v -> v.length)
               .map(expectedChunksFunc::apply).sum();
-      if (numExpectedBlocks == 0) {
+      if (minNumExpectedBlocks == 0) {
         final int j = i;
         Throwable t = Assertions.assertThrows(StorageContainerException.class,
             () -> ContainerProtocolCalls
                 .listBlock(clients.get(j), containerID, null,
-                    numExpectedBlocks + 1, containerToken));
+                    minNumExpectedBlocks + 1, containerToken));
         Assertions
             .assertEquals("ContainerID " + containerID + " does not exist",
                 t.getMessage());
@@ -232,15 +235,17 @@ public class TestContainerCommandsEC {
       ListBlockResponseProto response = ContainerProtocolCalls
           .listBlock(clients.get(i), containerID, null, Integer.MAX_VALUE,
               containerToken);
-      Assertions.assertEquals(numExpectedBlocks,
-          response.getBlockDataList().stream().filter(
+      Assertions.assertTrue(
+          minNumExpectedBlocks <= response.getBlockDataList().stream().filter(
               k -> k.getChunksCount() > 0 && k.getChunks(0).getLen() > 0)
               .collect(Collectors.toList()).size(),
-          "blocks count doesn't match on DN " + i);
-      Assertions.assertEquals(numExpectedChunks,
-          response.getBlockDataList().stream()
+          "blocks count should be same or more than min expected" +
+              " blocks count on DN " + i);
+      Assertions.assertTrue(
+          minNumExpectedChunks <= response.getBlockDataList().stream()
               .mapToInt(BlockData::getChunksCount).sum(),
-          "chunks count doesn't match on DN " + i);
+          "chunks count should be same or more than min expected" +
+              " chunks count on DN " + i);
     }
   }
 
@@ -381,20 +386,9 @@ public class TestContainerCommandsEC {
     objectStore.getVolume(volumeName).createBucket(bucketName);
     OzoneVolume volume = objectStore.getVolume(volumeName);
     OzoneBucket bucket = volume.getBucket(bucketName);
-    for (int i = 0; i < EC_DATA; i++) {
-      inputChunks[i] = getBytesWith(i + 1, EC_CHUNK_SIZE);
-    }
     XceiverClientManager xceiverClientManager =
         new XceiverClientManager(config);
-    try (OzoneOutputStream out = bucket.createKey(keyString, 4096,
-        new ECReplicationConfig(3, 2, ECReplicationConfig.EcCodec.RS, 1024),
-        new HashMap<>())) {
-      Assert.assertTrue(out.getOutputStream() instanceof KeyOutputStream);
-      for (int i = 0; i < inputChunks.length; i++) {
-        out.write(inputChunks[i]);
-      }
-    }
-
+    createKeyAndWriteData(keyString, bucket);
     ECReconstructionCoordinator coordinator =
         new ECReconstructionCoordinator(config, certClient);
 
@@ -404,12 +398,7 @@ public class TestContainerCommandsEC {
         .generateToken(ANY_USER, new ContainerID(conID));
 
     //Close the container first.
-    scm.getContainerManager().getContainerStateManager().updateContainerState(
-        HddsProtos.ContainerID.newBuilder().setId(conID).build(),
-        HddsProtos.LifeCycleEvent.FINALIZE);
-    scm.getContainerManager().getContainerStateManager().updateContainerState(
-        HddsProtos.ContainerID.newBuilder().setId(conID).build(),
-        HddsProtos.LifeCycleEvent.CLOSE);
+    closeContainer(conID);
 
     Pipeline containerPipeline = scm.getPipelineManager().getPipeline(
         scm.getContainerManager().getContainer(ContainerID.valueOf(conID))
@@ -507,6 +496,103 @@ public class TestContainerCommandsEC {
 
   }
 
+  private void createKeyAndWriteData(String keyString, OzoneBucket bucket)
+      throws IOException {
+    for (int i = 0; i < EC_DATA; i++) {
+      inputChunks[i] = getBytesWith(i + 1, EC_CHUNK_SIZE);
+    }
+    try (OzoneOutputStream out = bucket.createKey(keyString, 4096,
+        new ECReplicationConfig(3, 2, EcCodec.RS, 1024), new HashMap<>())) {
+      Assert.assertTrue(out.getOutputStream() instanceof KeyOutputStream);
+      for (int i = 0; i < inputChunks.length; i++) {
+        out.write(inputChunks[i]);
+      }
+    }
+  }
+
+  @Test
+  public void testECReconstructionCoordinatorShouldCleanupContainersOnFailure()
+      throws Exception {
+    List<Integer> missingIndexes = ImmutableList.of(1, 3);
+    ObjectStore objectStore = rpcClient.getObjectStore();
+    String keyString = UUID.randomUUID().toString();
+    String volumeName = UUID.randomUUID().toString();
+    String bucketName = volumeName;
+    objectStore.createVolume(volumeName);
+    objectStore.getVolume(volumeName).createBucket(bucketName);
+    OzoneVolume volume = objectStore.getVolume(volumeName);
+    OzoneBucket bucket = volume.getBucket(bucketName);
+    createKeyAndWriteData(keyString, bucket);
+
+    OzoneKeyDetails key = bucket.getKey(keyString);
+    long conID = key.getOzoneKeyLocations().get(0).getContainerID();
+    Token<ContainerTokenIdentifier> cToken =
+        containerTokenGenerator.generateToken(ANY_USER, new ContainerID(conID));
+    closeContainer(conID);
+
+    Pipeline containerPipeline = scm.getPipelineManager().getPipeline(
+        scm.getContainerManager().getContainer(ContainerID.valueOf(conID))
+            .getPipelineID());
+
+    List<DatanodeDetails> nodeSet = containerPipeline.getNodes();
+    SortedMap<Integer, DatanodeDetails> sourceNodeMap = new TreeMap<>();
+    nodeSet.stream().filter(k -> {
+      int replIndex = containerPipeline.getReplicaIndex(k);
+      return !missingIndexes.contains(replIndex);
+    }).forEach(dn -> {
+      sourceNodeMap.put(containerPipeline.getReplicaIndex(dn), dn);
+    });
+
+    //Find a good node outside of pipeline
+    List<DatanodeDetails> clusterDnsList =
+        cluster.getHddsDatanodes().stream().map(k -> k.getDatanodeDetails())
+            .collect(Collectors.toList());
+    DatanodeDetails goodTargetNode = null;
+    for (DatanodeDetails clusterDN : clusterDnsList) {
+      if (!nodeSet.contains(clusterDN)) {
+        goodTargetNode = clusterDN;
+        break;
+      }
+    }
+
+    //Give the new target to reconstruct the container
+    SortedMap<Integer, DatanodeDetails> targetNodeMap = new TreeMap<>();
+    targetNodeMap.put(1, goodTargetNode);
+    // Replace one of the target node with wrong to simulate failure at target.
+    DatanodeDetails invalidTargetNode =
+        MockDatanodeDetails.randomDatanodeDetails();
+    targetNodeMap.put(3, invalidTargetNode);
+
+    Assert.assertThrows(IOException.class, () -> {
+      ECReconstructionCoordinator coordinator =
+          new ECReconstructionCoordinator(config, certClient);
+      coordinator.reconstructECContainerGroup(conID,
+          (ECReplicationConfig) containerPipeline.getReplicationConfig(),
+          sourceNodeMap, targetNodeMap);
+    });
+    final DatanodeDetails targetDNToCheckContainerCLeaned = goodTargetNode;
+    StorageContainerException ex =
+        Assert.assertThrows(StorageContainerException.class, () -> {
+          ECContainerOperationClient client =
+              new ECContainerOperationClient(new OzoneConfiguration(),
+                  certClient);
+          client.listBlock(conID, targetDNToCheckContainerCLeaned,
+              new ECReplicationConfig(3, 2), cToken);
+        });
+    Assert.assertEquals("ContainerID 1 does not exist", ex.getMessage());
+  }
+
+  private void closeContainer(long conID)
+      throws IOException, InvalidStateTransitionException, TimeoutException {
+    //Close the container first.
+    scm.getContainerManager().getContainerStateManager().updateContainerState(
+        HddsProtos.ContainerID.newBuilder().setId(conID).build(),
+        HddsProtos.LifeCycleEvent.FINALIZE);
+    scm.getContainerManager().getContainerStateManager().updateContainerState(
+        HddsProtos.ContainerID.newBuilder().setId(conID).build(),
+        HddsProtos.LifeCycleEvent.CLOSE);
+  }
+
   private void checkBlockData(
       org.apache.hadoop.ozone.container.common.helpers.BlockData[] blockData,
       org.apache.hadoop.ozone.container.common.helpers.BlockData[]
@@ -526,12 +612,6 @@ public class TestContainerCommandsEC {
         Assert.assertEquals(chunkInfo, newBlockDataChunks.get(j));
       }
     }
-
-   /* Assert.assertEquals(
-        Arrays.stream(blockData).map(b -> b.getProtoBufMessage())
-            .collect(Collectors.toList()),
-        Arrays.stream(reconstructedBlockData).map(b -> b.getProtoBufMessage())
-            .collect(Collectors.toList()));*/
   }
 
   public static void startCluster(OzoneConfiguration conf) throws Exception {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org