You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by ad...@apache.org on 2020/03/01 20:08:50 UTC

[hadoop-ozone] branch master updated: HDDS-2716. Add integration test to verify pipeline closed on read statemachine failure (#576)

This is an automated email from the ASF dual-hosted git repository.

adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new f5ea42a  HDDS-2716. Add integration test to verify pipeline closed on read statemachine failure (#576)
f5ea42a is described below

commit f5ea42a653e4aecdd2566486c14580ba05f4bf79
Author: Siddharth <sw...@apache.org>
AuthorDate: Sun Mar 1 12:08:43 2020 -0800

    HDDS-2716. Add integration test to verify pipeline closed on read statemachine failure (#576)
---
 .../hdds/scm/pipeline/PipelineReportHandler.java   |   6 +-
 .../TestContainerStateMachineFailureOnRead.java    | 224 +++++++++++++++++++++
 2 files changed, 228 insertions(+), 2 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/PipelineReportHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/PipelineReportHandler.java
index 0d1f0e3..068c328 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/PipelineReportHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/PipelineReportHandler.java
@@ -114,8 +114,10 @@ public class PipelineReportHandler implements
     }
 
     if (pipeline.getPipelineState() == Pipeline.PipelineState.ALLOCATED) {
-      LOGGER.info("Pipeline {} {} reported by {}", pipeline.getFactor(),
-          pipeline.getId(), dn);
+      if (LOGGER.isDebugEnabled()) {
+        LOGGER.debug("Pipeline {} {} reported by {}", pipeline.getFactor(),
+            pipeline.getId(), dn);
+      }
       if (pipeline.isHealthy()) {
         pipelineManager.openPipeline(pipelineID);
         if (pipelineAvailabilityCheck && scmSafeModeManager.getInSafeMode()) {
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailureOnRead.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailureOnRead.java
new file mode 100644
index 0000000..bfa4c63
--- /dev/null
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/client/rpc/TestContainerStateMachineFailureOnRead.java
@@ -0,0 +1,224 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.ozone.client.rpc;
+
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_COMMAND_STATUS_REPORT_INTERVAL;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_CONTAINER_REPORT_INTERVAL;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_PIPELINE_REPORT_INTERVAL;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.HDDS_SCM_WATCHER_TIMEOUT;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_PIPELINE_DESTROY_TIMEOUT;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
+
+import java.io.File;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Optional;
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.hdds.client.ReplicationFactor;
+import org.apache.hadoop.hdds.client.ReplicationType;
+import org.apache.hadoop.hdds.conf.DatanodeRatisServerConfig;
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.ratis.RatisHelper;
+import org.apache.hadoop.hdds.scm.XceiverClientRatis;
+import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
+import org.apache.hadoop.hdds.scm.pipeline.PipelineNotFoundException;
+import org.apache.hadoop.ozone.HddsDatanodeService;
+import org.apache.hadoop.ozone.MiniOzoneCluster;
+import org.apache.hadoop.ozone.OzoneConfigKeys;
+import org.apache.hadoop.ozone.client.ObjectStore;
+import org.apache.hadoop.ozone.client.OzoneClient;
+import org.apache.hadoop.ozone.client.OzoneClientFactory;
+import org.apache.hadoop.ozone.client.io.KeyOutputStream;
+import org.apache.hadoop.ozone.client.io.OzoneOutputStream;
+import org.apache.hadoop.ozone.container.ContainerTestHelper;
+import org.apache.hadoop.ozone.container.ozoneimpl.TestOzoneContainer;
+import org.apache.hadoop.ozone.om.helpers.OmKeyLocationInfo;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.ratis.grpc.server.GrpcLogAppender;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test to verify pipeline is closed on readStateMachine failure.
+ */
+public class TestContainerStateMachineFailureOnRead {
+  private MiniOzoneCluster cluster;
+  private OzoneClient client;
+  private ObjectStore objectStore;
+  private String volumeName;
+  private String bucketName;
+  private OzoneConfiguration conf;
+
+  /** Starts a 3-datanode cluster and creates the test volume and bucket. */
+  @Before
+  public void setup() throws Exception {
+    conf = new OzoneConfiguration();
+    String path = GenericTestUtils.getTempPath(
+        TestContainerStateMachineFailureOnRead.class.getSimpleName());
+    File baseDir = new File(path);
+    baseDir.mkdirs();
+
+    conf.setTimeDuration(HDDS_CONTAINER_REPORT_INTERVAL, 200,
+        TimeUnit.MILLISECONDS);
+    conf.setTimeDuration(HDDS_COMMAND_STATUS_REPORT_INTERVAL, 200,
+        TimeUnit.MILLISECONDS);
+    conf.setTimeDuration(HDDS_PIPELINE_REPORT_INTERVAL, 200,
+        TimeUnit.MILLISECONDS);
+    conf.setTimeDuration(HDDS_SCM_WATCHER_TIMEOUT, 1000, TimeUnit.MILLISECONDS);
+    conf.setTimeDuration(OZONE_SCM_STALENODE_INTERVAL, 1200, TimeUnit.SECONDS);
+    conf.setTimeDuration(OZONE_SCM_PIPELINE_DESTROY_TIMEOUT, 1000,
+        TimeUnit.SECONDS);
+    conf.setInt(OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY, 10);
+    String ratisServerPrefix =
+        RatisHelper.HDDS_DATANODE_RATIS_SERVER_PREFIX_KEY + ".";
+    conf.setTimeDuration(ratisServerPrefix
+        + DatanodeRatisServerConfig.RATIS_FOLLOWER_SLOWNESS_TIMEOUT_KEY,
+        1000, TimeUnit.SECONDS);
+    conf.setTimeDuration(ratisServerPrefix
+        + DatanodeRatisServerConfig.RATIS_SERVER_NO_LEADER_TIMEOUT_KEY,
+        1000, TimeUnit.SECONDS);
+    conf.setTimeDuration(ratisServerPrefix
+        + DatanodeRatisServerConfig.RATIS_SERVER_REQUEST_TIMEOUT_KEY,
+        3, TimeUnit.SECONDS);
+    conf.setTimeDuration(ratisServerPrefix
+        + DatanodeRatisServerConfig.RATIS_SERVER_WATCH_REQUEST_TIMEOUT_KEY,
+        3, TimeUnit.SECONDS);
+    String ratisClientPrefix =
+        RatisHelper.HDDS_DATANODE_RATIS_CLIENT_PREFIX_KEY + ".";
+    conf.setTimeDuration(ratisClientPrefix + "rpc.request.timeout",
+        3, TimeUnit.SECONDS);
+    conf.setTimeDuration(ratisClientPrefix + "watch.request.timeout",
+        3, TimeUnit.SECONDS);
+
+    conf.setQuietMode(false);
+    cluster = MiniOzoneCluster.newBuilder(conf)
+        .setNumDatanodes(3)
+        .setHbInterval(200)
+        .build();
+    cluster.waitForClusterToBeReady();
+    client = OzoneClientFactory.getClient(conf);
+    objectStore = client.getObjectStore();
+    volumeName = "testcontainerstatemachinefailures";
+    bucketName = volumeName;
+    objectStore.createVolume(volumeName);
+    objectStore.getVolume(volumeName).createBucket(bucketName);
+    Logger.getLogger(GrpcLogAppender.class).setLevel(Level.WARN);
+  }
+
+  /** Closes the client, then shuts down the cluster even if close fails. */
+  @After
+  public void teardown() throws Exception {
+    try {
+      if (client != null) {
+        client.close();
+      }
+    } finally {
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+    }
+  }
+
+  /** Deletes a leader container dir; pipeline must end up CLOSED or gone. */
+  @Test(timeout = 300000)
+  @SuppressWarnings("squid:S3655")
+  public void testReadStateMachineFailureClosesPipeline() throws Exception {
+    // Stop one follower datanode
+    List<Pipeline> pipelines = cluster.getStorageContainerManager()
+        .getPipelineManager().getPipelines(HddsProtos.ReplicationType.RATIS,
+            HddsProtos.ReplicationFactor.THREE);
+    Assert.assertEquals(1, pipelines.size());
+    Pipeline ratisPipeline = pipelines.iterator().next();
+
+    Optional<HddsDatanodeService> dnToStop =
+        cluster.getHddsDatanodes().stream().filter(s -> {
+          try {
+            return ContainerTestHelper.isRatisFollower(s, ratisPipeline);
+          } catch (Exception e) {
+            e.printStackTrace();
+            return false;
+          }
+        }).findFirst();
+
+    Assert.assertTrue(dnToStop.isPresent());
+    cluster.shutdownHddsDatanode(dnToStop.get().getDatanodeDetails());
+    // Verify healthy pipeline before creating key
+    try (XceiverClientRatis xceiverClientRatis =
+        XceiverClientRatis.newXceiverClientRatis(ratisPipeline, conf)) {
+      xceiverClientRatis.connect();
+      TestOzoneContainer.createContainerForTesting(xceiverClientRatis, 100L);
+    }
+
+    OzoneOutputStream key = objectStore.getVolume(volumeName)
+        .getBucket(bucketName).createKey("ratis", 1024,
+            ReplicationType.RATIS, ReplicationFactor.THREE, new HashMap<>());
+    // First write and flush creates a container in the datanode
+    key.write("ratis".getBytes());
+    key.flush();
+
+    // get the name of a valid container
+    KeyOutputStream groupOutputStream = (KeyOutputStream) key.getOutputStream();
+    List<OmKeyLocationInfo> locationInfoList =
+        groupOutputStream.getLocationInfoList();
+    Assert.assertEquals(1, locationInfoList.size());
+    OmKeyLocationInfo omKeyLocationInfo = locationInfoList.get(0);
+    key.close();
+    groupOutputStream.close();
+
+    Optional<HddsDatanodeService> leaderDn =
+        cluster.getHddsDatanodes().stream().filter(dn -> {
+          try {
+            return ContainerTestHelper.isRatisLeader(dn, ratisPipeline);
+          } catch (Exception e) {
+            e.printStackTrace();
+            return false;
+          }
+        }).findFirst();
+
+    Assert.assertTrue(leaderDn.isPresent());
+    // delete the container dir from leader
+    FileUtil.fullyDelete(new File(leaderDn.get().getDatanodeStateMachine()
+        .getContainer().getContainerSet()
+        .getContainer(omKeyLocationInfo.getContainerID()).getContainerData()
+        .getContainerPath()));
+    // Start the stopped datanode
+    // Do not wait on restart since on stop will take long time due to
+    // stale interval timeout for the test
+    cluster.restartHddsDatanode(dnToStop.get().getDatanodeDetails(), false);
+    cluster.waitForClusterToBeReady();
+    // Give the cluster time to react before checking the pipeline state
+    Thread.sleep(10000);
+
+    try {
+      Pipeline pipeline = cluster.getStorageContainerManager()
+          .getPipelineManager().getPipeline(ratisPipeline.getId());
+      Assert.assertEquals(
+          "Pipeline " + pipeline.getId() + " should be in CLOSED state",
+          Pipeline.PipelineState.CLOSED, pipeline.getPipelineState());
+    } catch (PipelineNotFoundException e) {
+      // Also acceptable: SCM no longer knows the pipeline
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: ozone-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: ozone-commits-help@hadoop.apache.org