You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by bh...@apache.org on 2021/12/14 04:06:10 UTC

[ozone] branch master updated: HDDS-6018 Closing state to quasiclosed (#2854)

This is an automated email from the ASF dual-hosted git repository.

bharat pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new dfcc69c  HDDS-6018 Closing state to quasiclosed (#2854)
dfcc69c is described below

commit dfcc69cda7589c8c7640b3f6414e3e1d590863cf
Author: Ritesh H Shukla <ke...@gmail.com>
AuthorDate: Mon Dec 13 20:05:46 2021 -0800

    HDDS-6018 Closing state to quasiclosed (#2854)
---
 .../ozone/container/common/interfaces/Handler.java |  2 +-
 .../common/statemachine/DatanodeStateMachine.java  |  2 +-
 .../CloseContainerCommandHandler.java              | 17 ++---
 .../transport/server/ratis/XceiverServerRatis.java |  2 +-
 .../TestCloseContainerCommandHandler.java          |  4 +-
 .../rpc/TestContainerStateMachineFailures.java     | 83 ++++++++++++++++++++++
 6 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java
index e585234..2f92434 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java
@@ -138,7 +138,7 @@ public abstract class Handler {
       throws IOException;
 
   /**
-   * Marks the container Unhealthy. Moves the container to UHEALTHY state.
+   * Marks the container Unhealthy. Moves the container to UNHEALTHY state.
    *
    * @param container container to update
    * @throws IOException in case of exception
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java
index 575af3e..ee5e87a 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java
@@ -111,7 +111,7 @@ public class DatanodeStateMachine implements Closeable {
   private final ReplicationSupervisorMetrics replicationSupervisorMetrics;
 
   /**
-   * Constructs a a datanode state machine.
+   * Constructs a datanode state machine.
    * @param datanodeDetails - DatanodeDetails used to identify a datanode
    * @param conf - Configuration.
    * @param certClient - Datanode Certificate client, required if security is
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CloseContainerCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CloseContainerCommandHandler.java
index b92752b..e3d2551 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CloseContainerCommandHandler.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CloseContainerCommandHandler.java
@@ -62,14 +62,13 @@ public class CloseContainerCommandHandler implements CommandHandler {
    * Handles a given SCM command.
    *
    * @param command           - SCM Command
-   * @param ozoneContainer         - Ozone Container.
+   * @param ozoneContainer    - Ozone Container.
    * @param context           - Current Context.
    * @param connectionManager - The SCMs that we are talking to.
    */
   @Override
   public void handle(SCMCommand command, OzoneContainer ozoneContainer,
       StateContext context, SCMConnectionManager connectionManager) {
-    LOG.debug("Processing Close Container command.");
     invocationCount.incrementAndGet();
     final long startTime = Time.monotonicNow();
     final DatanodeDetails datanodeDetails = context.getParent()
@@ -78,6 +77,8 @@ public class CloseContainerCommandHandler implements CommandHandler {
         ((CloseContainerCommand)command).getProto();
     final ContainerController controller = ozoneContainer.getController();
     final long containerId = closeCommand.getContainerID();
+    LOG.debug("Processing Close Container command container #{}",
+        containerId);
     try {
       final Container container = controller.getContainer(containerId);
 
@@ -103,10 +104,8 @@ public class CloseContainerCommandHandler implements CommandHandler {
           ozoneContainer.getWriteChannel()
               .submitRequest(request, closeCommand.getPipelineID());
         } else {
-          // Container should not exist in CLOSING state without a pipeline
-          controller.markContainerUnhealthy(containerId);
-          LOG.info("Marking UNHEALTHY as Container should not be in " +
-              "CLOSING state without pipeline, ContainerID: {}", containerId);
+          controller.quasiCloseContainer(containerId);
+          LOG.info("Marking Container {} quasi closed", containerId);
         }
         break;
       case QUASI_CLOSED:
@@ -118,10 +117,8 @@ public class CloseContainerCommandHandler implements CommandHandler {
         break;
       case UNHEALTHY:
       case INVALID:
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("Cannot close the container #{}, the container is"
-              + " in {} state.", containerId, container.getContainerState());
-        }
+        LOG.debug("Cannot close the container #{}, the container is"
+            + " in {} state.", containerId, container.getContainerState());
         break;
       default:
         break;
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java
index 8c6d3fe..c04e5e9 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java
@@ -850,7 +850,7 @@ public final class XceiverServerRatis implements XceiverServerSpi {
     return minIndex == null ? -1 : minIndex.longValue();
   }
 
-  void notifyGroupRemove(RaftGroupId gid) {
+  public void notifyGroupRemove(RaftGroupId gid) {
     raftGids.remove(gid);
     // Remove any entries for group leader map
     groupLeaderMap.remove(gid);
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCloseContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCloseContainerCommandHandler.java
index a1ae60b..7e1ea76 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCloseContainerCommandHandler.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCloseContainerCommandHandler.java
@@ -141,7 +141,7 @@ public class TestCloseContainerCommandHandler {
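-    // Container in CLOSING state is moved to UNHEALTHY if pipeline does not
-    // exist. Container should not exist in CLOSING state without a pipeline.
+    // Container in CLOSING state is moved to QUASI_CLOSED if the pipeline
+    // does not exist, rather than being marked UNHEALTHY.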
     verify(containerHandler)
-        .markContainerUnhealthy(container);
+        .quasiCloseContainer(container);
   }
 
   @Test
@@ -168,7 +168,7 @@ public class TestCloseContainerCommandHandler {
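-    // Container in CLOSING state is moved to UNHEALTHY if pipeline does not
-    // exist. Container should not exist in CLOSING state without a pipeline.
+    // Container in CLOSING state is moved to QUASI_CLOSED if the pipeline
+    // does not exist, rather than being marked UNHEALTHY.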
     verify(containerHandler)
-        .markContainerUnhealthy(container);
+        .quasiCloseContainer(container);
   }
 
   @Test
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java
index f7dafdf..4f7e183 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java
@@ -25,9 +25,11 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Random;
+import java.util.Set;
 import java.util.UUID;
 import java.util.concurrent.CountDownLatch;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.hadoop.fs.FileUtil;
@@ -60,18 +62,25 @@ import org.apache.hadoop.ozone.container.common.impl.ContainerData;
 import org.apache.hadoop.ozone.container.common.impl.ContainerDataYaml;
 import org.apache.hadoop.ozone.container.common.impl.HddsDispatcher;
 import org.apache.hadoop.ozone.container.common.transport.server.ratis.ContainerStateMachine;
+import org.apache.hadoop.ozone.container.common.transport.server.ratis.XceiverServerRatis;
 import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
 import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer;
 import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
+import org.apache.hadoop.ozone.protocol.commands.CloseContainerCommand;
+import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
+import org.apache.hadoop.security.authentication.client.AuthenticationException;
 import org.apache.ozone.test.LambdaTestUtils;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_COMMAND_STATUS_REPORT_INTERVAL;
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_CONTAINER_REPORT_INTERVAL;
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_PIPELINE_REPORT_INTERVAL;
+import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.QUASI_CLOSED;
 import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY;
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_PIPELINE_DESTROY_TIMEOUT;
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
+
+import org.apache.ratis.protocol.RaftGroupId;
 import org.apache.ratis.protocol.exceptions.StateMachineException;
 import org.apache.ratis.server.storage.FileInfo;
 import org.apache.ratis.statemachine.impl.SimpleStateMachineStorage;
@@ -84,6 +93,7 @@ import static org.junit.Assert.assertThat;
 import static org.junit.Assert.fail;
 import org.junit.BeforeClass;
 import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
 
 /**
  * Tests the containerStateMachine failure handling.
@@ -159,6 +169,16 @@ public class TestContainerStateMachineFailures {
     random = new Random();
   }
 
+  /** Restarts every datanode, then the SCM, ahead of each test case. */
+  @org.junit.Before
+  public void restartDatanode() throws InterruptedException,
+      TimeoutException, AuthenticationException, IOException {
+    for (int i = 0; i < cluster.getHddsDatanodes().size(); i++) {
+      cluster.restartHddsDatanode(i, true);
+    }
+    cluster.restartStorageContainerManager(true);
+  }
+
   /**
    * Shutdown MiniDFSCluster.
    */
@@ -170,6 +190,69 @@ public class TestContainerStateMachineFailures {
   }
 
   @Test
+  public void testContainerStateMachineCloseOnMissingPipeline()
+      throws Exception {
+    // This integration test is a bit of a hack to see if the highly
+    // improbable event where the Datanode does not have the pipeline
+    // in its Ratis channel but still receives a close container command
+    // for a container that is open or in closing state.
+    // Bugs in code can lead to this sequence of events but for this test
+    // to inject this state, it removes the pipeline by directly calling
+    // the underlying method.
+
+    OzoneOutputStream key =
+        objectStore.getVolume(volumeName).getBucket(bucketName)
+            .createKey("testQuasiClosed1", 1024, ReplicationType.RATIS,
+                ReplicationFactor.THREE, new HashMap<>());
+    key.write("ratis".getBytes(UTF_8));
+    key.flush();
+
+    KeyOutputStream groupOutputStream = (KeyOutputStream) key.
+        getOutputStream();
+    List<OmKeyLocationInfo> locationInfoList =
+        groupOutputStream.getLocationInfoList();
+    Assert.assertEquals(1, locationInfoList.size());
+
+    OmKeyLocationInfo omKeyLocationInfo = locationInfoList.get(0);
+
+    Set<HddsDatanodeService> datanodeSet =
+        TestHelper.getDatanodeServices(cluster,
+            omKeyLocationInfo.getPipeline());
+
+    long containerID = omKeyLocationInfo.getContainerID();
+
+    for (HddsDatanodeService dn : datanodeSet) {
+      XceiverServerRatis wc = (XceiverServerRatis)
+          dn.getDatanodeStateMachine().getContainer().getWriteChannel();
+      if (wc == null) {
+        // Test applicable only for RATIS based channel.
+        return;
+      }
+      wc.notifyGroupRemove(RaftGroupId
+          .valueOf(omKeyLocationInfo.getPipeline().getId().getId()));
+      SCMCommand<?> command = new CloseContainerCommand(
+          containerID, omKeyLocationInfo.getPipeline().getId());
+      command.setTerm(
+          cluster
+              .getStorageContainerManager()
+              .getScmContext()
+              .getTermOfLeader());
+      cluster.getStorageContainerManager().getScmNodeManager()
+          .addDatanodeCommand(dn.getDatanodeDetails().getUuid(), command);
+    }
+
+
+    for (HddsDatanodeService dn : datanodeSet) {
+      LambdaTestUtils.await(20000, 1000,
+          () -> (dn.getDatanodeStateMachine()
+                .getContainer().getContainerSet()
+                .getContainer(containerID)
+                .getContainerState().equals(QUASI_CLOSED)));
+    }
+    key.close();
+  }
+
+  @Test
   public void testContainerStateMachineFailures() throws Exception {
     OzoneOutputStream key =
             objectStore.getVolume(volumeName).getBucket(bucketName)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org