You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by bh...@apache.org on 2021/12/14 04:06:10 UTC
[ozone] branch master updated: HDDS-6018 Closing state to quasiclosed (#2854)
This is an automated email from the ASF dual-hosted git repository.
bharat pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new dfcc69c HDDS-6018 Closing state to quasiclosed (#2854)
dfcc69c is described below
commit dfcc69cda7589c8c7640b3f6414e3e1d590863cf
Author: Ritesh H Shukla <ke...@gmail.com>
AuthorDate: Mon Dec 13 20:05:46 2021 -0800
HDDS-6018 Closing state to quasiclosed (#2854)
---
.../ozone/container/common/interfaces/Handler.java | 2 +-
.../common/statemachine/DatanodeStateMachine.java | 2 +-
.../CloseContainerCommandHandler.java | 17 ++---
.../transport/server/ratis/XceiverServerRatis.java | 2 +-
.../TestCloseContainerCommandHandler.java | 4 +-
.../rpc/TestContainerStateMachineFailures.java | 83 ++++++++++++++++++++++
6 files changed, 95 insertions(+), 15 deletions(-)
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java
index e585234..2f92434 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/interfaces/Handler.java
@@ -138,7 +138,7 @@ public abstract class Handler {
throws IOException;
/**
- * Marks the container Unhealthy. Moves the container to UHEALTHY state.
+ * Marks the container Unhealthy. Moves the container to UNHEALTHY state.
*
* @param container container to update
* @throws IOException in case of exception
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java
index 575af3e..ee5e87a 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/DatanodeStateMachine.java
@@ -111,7 +111,7 @@ public class DatanodeStateMachine implements Closeable {
private final ReplicationSupervisorMetrics replicationSupervisorMetrics;
/**
- * Constructs a a datanode state machine.
+ * Constructs a datanode state machine.
* @param datanodeDetails - DatanodeDetails used to identify a datanode
* @param conf - Configuration.
* @param certClient - Datanode Certificate client, required if security is
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CloseContainerCommandHandler.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CloseContainerCommandHandler.java
index b92752b..e3d2551 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CloseContainerCommandHandler.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/CloseContainerCommandHandler.java
@@ -62,14 +62,13 @@ public class CloseContainerCommandHandler implements CommandHandler {
* Handles a given SCM command.
*
* @param command - SCM Command
- * @param ozoneContainer - Ozone Container.
+ * @param ozoneContainer - Ozone Container.
* @param context - Current Context.
* @param connectionManager - The SCMs that we are talking to.
*/
@Override
public void handle(SCMCommand command, OzoneContainer ozoneContainer,
StateContext context, SCMConnectionManager connectionManager) {
- LOG.debug("Processing Close Container command.");
invocationCount.incrementAndGet();
final long startTime = Time.monotonicNow();
final DatanodeDetails datanodeDetails = context.getParent()
@@ -78,6 +77,8 @@ public class CloseContainerCommandHandler implements CommandHandler {
((CloseContainerCommand)command).getProto();
final ContainerController controller = ozoneContainer.getController();
final long containerId = closeCommand.getContainerID();
+ LOG.debug("Processing Close Container command container #{}",
+ containerId);
try {
final Container container = controller.getContainer(containerId);
@@ -103,10 +104,8 @@ public class CloseContainerCommandHandler implements CommandHandler {
ozoneContainer.getWriteChannel()
.submitRequest(request, closeCommand.getPipelineID());
} else {
- // Container should not exist in CLOSING state without a pipeline
- controller.markContainerUnhealthy(containerId);
- LOG.info("Marking UNHEALTHY as Container should not be in " +
- "CLOSING state without pipeline, ContainerID: {}", containerId);
+ controller.quasiCloseContainer(containerId);
+ LOG.info("Marking Container {} quasi closed", containerId);
}
break;
case QUASI_CLOSED:
@@ -118,10 +117,8 @@ public class CloseContainerCommandHandler implements CommandHandler {
break;
case UNHEALTHY:
case INVALID:
- if (LOG.isDebugEnabled()) {
- LOG.debug("Cannot close the container #{}, the container is"
- + " in {} state.", containerId, container.getContainerState());
- }
+ LOG.debug("Cannot close the container #{}, the container is"
+ + " in {} state.", containerId, container.getContainerState());
break;
default:
break;
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java
index 8c6d3fe..c04e5e9 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/transport/server/ratis/XceiverServerRatis.java
@@ -850,7 +850,7 @@ public final class XceiverServerRatis implements XceiverServerSpi {
return minIndex == null ? -1 : minIndex.longValue();
}
- void notifyGroupRemove(RaftGroupId gid) {
+ public void notifyGroupRemove(RaftGroupId gid) {
raftGids.remove(gid);
// Remove any entries for group leader map
groupLeaderMap.remove(gid);
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCloseContainerCommandHandler.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCloseContainerCommandHandler.java
index a1ae60b..7e1ea76 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCloseContainerCommandHandler.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/statemachine/commandhandler/TestCloseContainerCommandHandler.java
@@ -141,7 +141,7 @@ public class TestCloseContainerCommandHandler {
// Container in CLOSING state is moved to QUASI_CLOSED if pipeline does not
// exist. Container should not exist in CLOSING state without a pipeline.
verify(containerHandler)
- .markContainerUnhealthy(container);
+ .quasiCloseContainer(container);
}
@Test
@@ -168,7 +168,7 @@ public class TestCloseContainerCommandHandler {
// Container in CLOSING state is moved to QUASI_CLOSED if pipeline does not
// exist. Container should not exist in CLOSING state without a pipeline.
verify(containerHandler)
- .markContainerUnhealthy(container);
+ .quasiCloseContainer(container);
}
@Test
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java
index f7dafdf..4f7e183 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailures.java
@@ -25,9 +25,11 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Random;
+import java.util.Set;
import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.fs.FileUtil;
@@ -60,18 +62,25 @@ import org.apache.hadoop.ozone.container.common.impl.ContainerData;
import org.apache.hadoop.ozone.container.common.impl.ContainerDataYaml;
import org.apache.hadoop.ozone.container.common.impl.HddsDispatcher;
import org.apache.hadoop.ozone.container.common.transport.server.ratis.ContainerStateMachine;
+import org.apache.hadoop.ozone.container.common.transport.server.ratis.XceiverServerRatis;
import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer;
import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
+import org.apache.hadoop.ozone.protocol.commands.CloseContainerCommand;
+import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
+import org.apache.hadoop.security.authentication.client.AuthenticationException;
import org.apache.ozone.test.LambdaTestUtils;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_COMMAND_STATUS_REPORT_INTERVAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_CONTAINER_REPORT_INTERVAL;
import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_PIPELINE_REPORT_INTERVAL;
+import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.QUASI_CLOSED;
import static org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.UNHEALTHY;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_PIPELINE_DESTROY_TIMEOUT;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
+
+import org.apache.ratis.protocol.RaftGroupId;
import org.apache.ratis.protocol.exceptions.StateMachineException;
import org.apache.ratis.server.storage.FileInfo;
import org.apache.ratis.statemachine.impl.SimpleStateMachineStorage;
@@ -84,6 +93,7 @@ import static org.junit.Assert.assertThat;
import static org.junit.Assert.fail;
import org.junit.BeforeClass;
import org.junit.Test;
+import org.junit.jupiter.api.BeforeEach;
/**
* Tests the containerStateMachine failure handling.
@@ -159,6 +169,16 @@ public class TestContainerStateMachineFailures {
random = new Random();
}
+ @BeforeEach
+ public void restartDatanode()
+ throws InterruptedException, TimeoutException, AuthenticationException,
+ IOException {
+ for (int i=0; i < cluster.getHddsDatanodes().size(); i++) {
+ cluster.restartHddsDatanode(i, true);
+ }
+ cluster.restartStorageContainerManager(true);
+ }
+
/**
* Shutdown MiniDFSCluster.
*/
@@ -170,6 +190,69 @@ public class TestContainerStateMachineFailures {
}
@Test
+ public void testContainerStateMachineCloseOnMissingPipeline()
+ throws Exception {
+ // This integration test is a bit of a hack that exercises the highly
+ // improbable event where the Datanode does not have the pipeline
+ // in its Ratis channel but still receives a close container command
+ // for a container that is open or in closing state.
+ // Bugs in code can lead to this sequence of events; to inject this
+ // state, the test removes the pipeline by directly calling
+ // the underlying method.
+
+ OzoneOutputStream key =
+ objectStore.getVolume(volumeName).getBucket(bucketName)
+ .createKey("testQuasiClosed1", 1024, ReplicationType.RATIS,
+ ReplicationFactor.THREE, new HashMap<>());
+ key.write("ratis".getBytes(UTF_8));
+ key.flush();
+
+ KeyOutputStream groupOutputStream = (KeyOutputStream) key.
+ getOutputStream();
+ List<OmKeyLocationInfo> locationInfoList =
+ groupOutputStream.getLocationInfoList();
+ Assert.assertEquals(1, locationInfoList.size());
+
+ OmKeyLocationInfo omKeyLocationInfo = locationInfoList.get(0);
+
+ Set<HddsDatanodeService> datanodeSet =
+ TestHelper.getDatanodeServices(cluster,
+ omKeyLocationInfo.getPipeline());
+
+ long containerID = omKeyLocationInfo.getContainerID();
+
+ for (HddsDatanodeService dn : datanodeSet) {
+ XceiverServerRatis wc = (XceiverServerRatis)
+ dn.getDatanodeStateMachine().getContainer().getWriteChannel();
+ if (wc == null) {
+ // Test applicable only for RATIS based channel.
+ return;
+ }
+ wc.notifyGroupRemove(RaftGroupId
+ .valueOf(omKeyLocationInfo.getPipeline().getId().getId()));
+ SCMCommand<?> command = new CloseContainerCommand(
+ containerID, omKeyLocationInfo.getPipeline().getId());
+ command.setTerm(
+ cluster
+ .getStorageContainerManager()
+ .getScmContext()
+ .getTermOfLeader());
+ cluster.getStorageContainerManager().getScmNodeManager()
+ .addDatanodeCommand(dn.getDatanodeDetails().getUuid(), command);
+ }
+
+
+ for (HddsDatanodeService dn : datanodeSet) {
+ LambdaTestUtils.await(20000, 1000,
+ () -> (dn.getDatanodeStateMachine()
+ .getContainer().getContainerSet()
+ .getContainer(containerID)
+ .getContainerState().equals(QUASI_CLOSED)));
+ }
+ key.close();
+ }
+
+ @Test
public void testContainerStateMachineFailures() throws Exception {
OzoneOutputStream key =
objectStore.getVolume(volumeName).getBucket(bucketName)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org