You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by av...@apache.org on 2021/06/02 17:18:39 UTC

[ozone] branch master updated: HDDS-5277. Recon shows operational status as "DECOMMISSIONING" for "DECOMMISSIONED" DNs (#2286)

This is an automated email from the ASF dual-hosted git repository.

avijayan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new f3f258a  HDDS-5277. Recon shows operational status as "DECOMMISSIONING" for "DECOMMISSIONED" DNs (#2286)
f3f258a is described below

commit f3f258af7d006062aa6adc0d7346e87d1c633bbe
Author: avijayanhwx <14...@users.noreply.github.com>
AuthorDate: Wed Jun 2 10:18:23 2021 -0700

    HDDS-5277. Recon shows operational status as "DECOMMISSIONING" for "DECOMMISSIONED" DNs (#2286)
---
 .../hadoop/hdds/scm/node/DeadNodeHandler.java      |  3 +
 .../apache/hadoop/hdds/scm/node/NodeStatus.java    |  2 +-
 .../hadoop/ozone/recon/scm/PipelineSyncTask.java   | 46 ++++++++++++-
 .../ozone/recon/scm/ReconDeadNodeHandler.java      | 79 ++++++++++++++++++++++
 .../hadoop/ozone/recon/scm/ReconNodeManager.java   | 20 ++++++
 .../scm/ReconStorageContainerManagerFacade.java    |  5 +-
 .../recon/spi/StorageContainerServiceProvider.java |  6 ++
 .../impl/StorageContainerServiceProviderImpl.java  |  8 +++
 .../hadoop/ozone/recon/tasks/ReconTaskConfig.java  |  4 +-
 .../ozone/recon/scm/TestReconNodeManager.java      | 41 ++++++++++-
 10 files changed, 207 insertions(+), 7 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DeadNodeHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DeadNodeHandler.java
index 7f9b942..304e075 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DeadNodeHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DeadNodeHandler.java
@@ -171,5 +171,8 @@ public class DeadNodeHandler implements EventHandler<DatanodeDetails> {
         });
   }
 
+  protected NodeManager getNodeManager() {
+    return nodeManager;
+  }
 
 }
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java
index 72ca015..69dcd0f 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java
@@ -200,7 +200,7 @@ public class NodeStatus {
   @Override
   public String toString() {
     return "OperationalState: "+operationalState+" Health: "+health+
-        " OperastionStateExpiry: "+opStateExpiryEpochSeconds;
+        " OperationStateExpiry: "+opStateExpiryEpochSeconds;
   }
 
 }
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/PipelineSyncTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/PipelineSyncTask.java
index d5de2bc..1ec3d3b 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/PipelineSyncTask.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/PipelineSyncTask.java
@@ -18,8 +18,16 @@
 
 package org.apache.hadoop.ozone.recon.scm;
 
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DEAD;
+
+import java.io.IOException;
 import java.util.List;
+import java.util.stream.Collectors;
 
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.Node;
+import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
 import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
 import org.apache.hadoop.ozone.recon.spi.StorageContainerServiceProvider;
 import org.apache.hadoop.ozone.recon.tasks.ReconTaskConfig;
@@ -30,7 +38,8 @@ import org.slf4j.LoggerFactory;
 
 /**
  * Background pipeline sync task that queries pipelines in SCM, and removes
- * any obsolete pipeline.
+ * any obsolete pipeline. Also syncs operational state of dead nodes with SCM
+ * state.
  */
 public class PipelineSyncTask extends ReconScmTask {
 
@@ -39,15 +48,18 @@ public class PipelineSyncTask extends ReconScmTask {
 
   private StorageContainerServiceProvider scmClient;
   private ReconPipelineManager reconPipelineManager;
+  private ReconNodeManager nodeManager;
   private final long interval;
 
   public PipelineSyncTask(ReconPipelineManager pipelineManager,
+      ReconNodeManager nodeManager,
       StorageContainerServiceProvider scmClient,
       ReconTaskStatusDao reconTaskStatusDao,
       ReconTaskConfig reconTaskConfig) {
     super(reconTaskStatusDao);
     this.scmClient = scmClient;
     this.reconPipelineManager = pipelineManager;
+    this.nodeManager = nodeManager;
     this.interval = reconTaskConfig.getPipelineSyncTaskInterval().toMillis();
   }
 
@@ -58,6 +70,7 @@ public class PipelineSyncTask extends ReconScmTask {
         long start = Time.monotonicNow();
         List<Pipeline> pipelinesFromScm = scmClient.getPipelines();
         reconPipelineManager.initializePipelines(pipelinesFromScm);
+        syncOperationalStateOnDeadNodes();
         LOG.info("Pipeline sync Thread took {} milliseconds.",
             Time.monotonicNow() - start);
         recordSingleRunCompletion();
@@ -67,4 +80,35 @@ public class PipelineSyncTask extends ReconScmTask {
       LOG.error("Exception in Pipeline sync Thread.", t);
     }
   }
+
+  /**
+   * For every dead node in Recon, update Operational state with that on SCM
+   * if different.
+   * @throws IOException on Error
+   * @throws NodeNotFoundException if node not found in Recon.
+   */
+  private void syncOperationalStateOnDeadNodes()
+      throws IOException, NodeNotFoundException {
+    List<DatanodeDetails> deadNodesOnRecon = nodeManager.getNodes(null, DEAD);
+
+    if (!deadNodesOnRecon.isEmpty()) {
+      List<Node> scmNodes = scmClient.getNodes();
+      List<Node> filteredScmNodes = scmNodes.stream()
+              .filter(n -> deadNodesOnRecon.contains(
+                  DatanodeDetails.getFromProtoBuf(n.getNodeID())))
+              .collect(Collectors.toList());
+
+      for (Node deadNode : filteredScmNodes) {
+        DatanodeDetails dnDetails =
+            DatanodeDetails.getFromProtoBuf(deadNode.getNodeID());
+
+        HddsProtos.NodeState scmNodeState = deadNode.getNodeStates(0);
+        if (scmNodeState != DEAD) {
+          LOG.warn("Node {} DEAD in Recon, but SCM reports it as {}",
+              dnDetails.getHostName(), scmNodeState);
+        }
+        nodeManager.updateNodeOperationalStateFromScm(deadNode, dnDetails);
+      }
+    }
+  }
 }
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconDeadNodeHandler.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconDeadNodeHandler.java
new file mode 100644
index 0000000..9868851
--- /dev/null
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconDeadNodeHandler.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.scm;
+
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.Node;
+import org.apache.hadoop.hdds.scm.container.ContainerManagerV2;
+import org.apache.hadoop.hdds.scm.node.DeadNodeHandler;
+import org.apache.hadoop.hdds.scm.node.NodeManager;
+import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
+import org.apache.hadoop.hdds.server.events.EventPublisher;
+import org.apache.hadoop.ozone.recon.spi.StorageContainerServiceProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Recon's handling of Dead node.
+ */
+public class ReconDeadNodeHandler extends DeadNodeHandler {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(ReconDeadNodeHandler.class);
+
+
+  private StorageContainerServiceProvider scmClient;
+
+  public ReconDeadNodeHandler(NodeManager nodeManager,
+                              PipelineManager pipelineManager,
+                              ContainerManagerV2 containerManager,
+                              StorageContainerServiceProvider scmClient) {
+    super(nodeManager, pipelineManager, containerManager);
+    this.scmClient = scmClient;
+  }
+
+  @Override
+  public void onMessage(final DatanodeDetails datanodeDetails,
+                        final EventPublisher publisher) {
+
+    super.onMessage(datanodeDetails, publisher);
+    ReconNodeManager nodeManager = (ReconNodeManager) getNodeManager();
+    try {
+      List<Node> nodes = scmClient.getNodes();
+      Optional<Node> matchedDn = nodes.stream()
+              .filter(n -> n.getNodeID().getUuid()
+                  .equals(datanodeDetails.getUuidString()))
+              .findAny();
+
+      if (matchedDn.isPresent()) {
+        nodeManager.updateNodeOperationalStateFromScm(matchedDn.get(),
+            datanodeDetails);
+      } else {
+        LOG.warn("Node {} has reached DEAD state, but SCM does not have " +
+            "information about it.", datanodeDetails);
+      }
+    } catch (Exception ioEx) {
+      LOG.error("Error trying to verify Node operational state from SCM.",
+          ioEx);
+    }
+  }
+}
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java
index 3545efb..e908723 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java
@@ -27,9 +27,11 @@ import java.util.UUID;
 
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
 import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type;
 import org.apache.hadoop.hdds.scm.ha.SCMContext;
 import org.apache.hadoop.hdds.scm.net.NetworkTopology;
+import org.apache.hadoop.hdds.scm.node.NodeStatus;
 import org.apache.hadoop.hdds.scm.node.SCMNodeManager;
 import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
 import org.apache.hadoop.hdds.scm.server.SCMStorageConfig;
@@ -149,4 +151,22 @@ public class ReconNodeManager extends SCMNodeManager {
         reportedDn.getPersistedOpState(),
         reportedDn.getPersistedOpStateExpiryEpochSec());
   }
+
+  public void updateNodeOperationalStateFromScm(HddsProtos.Node scmNode,
+                                                DatanodeDetails dnDetails)
+      throws NodeNotFoundException {
+    NodeStatus nodeStatus = getNodeStatus(dnDetails);
+    HddsProtos.NodeOperationalState nodeOperationalStateFromScm =
+        scmNode.getNodeOperationalStates(0);
+    if (nodeOperationalStateFromScm != nodeStatus.getOperationalState()) {
+      LOG.info("Updating Node operational state for {}, in SCM = {}, in " +
+              "Recon = {}", dnDetails.getHostName(),
+          nodeOperationalStateFromScm,
+          nodeStatus.getOperationalState());
+
+      setNodeOperationalState(dnDetails, nodeOperationalStateFromScm);
+      DatanodeDetails scmDnd = getNodeByUuid(dnDetails.getUuidString());
+      scmDnd.setPersistedOpState(nodeOperationalStateFromScm);
+    }
+  }
 }
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java
index 2272448..c7f4e22 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java
@@ -159,8 +159,8 @@ public class ReconStorageContainerManagerFacade
 
     StaleNodeHandler staleNodeHandler =
         new StaleNodeHandler(nodeManager, pipelineManager, conf);
-    DeadNodeHandler deadNodeHandler = new DeadNodeHandler(nodeManager,
-        pipelineManager, containerManager);
+    DeadNodeHandler deadNodeHandler = new ReconDeadNodeHandler(nodeManager,
+        pipelineManager, containerManager, scmServiceProvider);
 
     ContainerReportHandler containerReportHandler =
         new ReconContainerReportHandler(nodeManager, containerManager);
@@ -189,6 +189,7 @@ public class ReconStorageContainerManagerFacade
     ReconTaskConfig reconTaskConfig = conf.getObject(ReconTaskConfig.class);
     reconScmTasks.add(new PipelineSyncTask(
         pipelineManager,
+        nodeManager,
         scmServiceProvider,
         reconTaskStatusDao,
         reconTaskConfig));
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/StorageContainerServiceProvider.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/StorageContainerServiceProvider.java
index d3a9354..be2c7cb 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/StorageContainerServiceProvider.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/StorageContainerServiceProvider.java
@@ -60,4 +60,10 @@ public interface StorageContainerServiceProvider {
    */
   List<ContainerWithPipeline> getExistContainerWithPipelinesInBatch(
       List<Long> containerIDs);
+
+  /**
+   * Returns list of nodes from SCM.
+   */
+  List<HddsProtos.Node> getNodes() throws IOException;
+
 }
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/StorageContainerServiceProviderImpl.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/StorageContainerServiceProviderImpl.java
index 47def4c..1e609e8 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/StorageContainerServiceProviderImpl.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/StorageContainerServiceProviderImpl.java
@@ -18,6 +18,8 @@
 
 package org.apache.hadoop.ozone.recon.spi.impl;
 
+import static org.apache.hadoop.ozone.ClientVersions.CURRENT_VERSION;
+
 import java.io.IOException;
 import java.util.List;
 
@@ -66,4 +68,10 @@ public class StorageContainerServiceProviderImpl
       List<Long> containerIDs) {
     return scmClient.getExistContainerWithPipelinesInBatch(containerIDs);
   }
+
+  @Override
+  public List<HddsProtos.Node> getNodes() throws IOException {
+    return scmClient.queryNode(null, null, HddsProtos.QueryScope.CLUSTER,
+        "", CURRENT_VERSION);
+  }
 }
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ReconTaskConfig.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ReconTaskConfig.java
index 813baf5..9788bf6 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ReconTaskConfig.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ReconTaskConfig.java
@@ -33,12 +33,12 @@ public class ReconTaskConfig {
 
   @Config(key = "pipelinesync.interval",
       type = ConfigType.TIME,
-      defaultValue = "600s",
+      defaultValue = "300s",
       tags = { ConfigTag.RECON, ConfigTag.OZONE },
       description = "The time interval of periodic sync of pipeline state " +
           "from SCM to Recon."
   )
-  private long pipelineSyncTaskInterval = Duration.ofMinutes(10).toMillis();
+  private long pipelineSyncTaskInterval = Duration.ofMinutes(5).toMillis();
 
   public Duration getPipelineSyncTaskInterval() {
     return Duration.ofMillis(pipelineSyncTaskInterval);
diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconNodeManager.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconNodeManager.java
index 0cacd7a..4c68ed6 100644
--- a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconNodeManager.java
+++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconNodeManager.java
@@ -19,11 +19,16 @@
 package org.apache.hadoop.ozone.recon.scm;
 
 import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.DECOMMISSIONING;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NAMES;
 import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_METADATA_DIRS;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
 
 import java.io.IOException;
 import java.util.List;
@@ -43,6 +48,7 @@ import org.apache.hadoop.hdds.utils.db.Table;
 import org.apache.hadoop.ozone.protocol.commands.ReregisterCommand;
 import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
 import org.apache.hadoop.ozone.protocol.commands.SetNodeOperationalStateCommand;
+import org.apache.hadoop.test.LambdaTestUtils;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Rule;
@@ -104,7 +110,7 @@ public class TestReconNodeManager {
     // This command should never be returned by Recon
     reconNodeManager.addDatanodeCommand(datanodeDetails.getUuid(),
         new SetNodeOperationalStateCommand(1234,
-        HddsProtos.NodeOperationalState.DECOMMISSIONING, 0));
+        DECOMMISSIONING, 0));
 
     // This one should be returned
     reconNodeManager.addDatanodeCommand(datanodeDetails.getUuid(),
@@ -156,4 +162,37 @@ public class TestReconNodeManager {
     assertNotNull(
         reconNodeManager.getNodeByUuid(datanodeDetails.getUuidString()));
   }
+
+  @Test
+  public void testUpdateNodeOperationalStateFromScm() throws Exception {
+    ReconStorageConfig scmStorageConfig = new ReconStorageConfig(conf);
+    EventQueue eventQueue = new EventQueue();
+    NetworkTopology clusterMap = new NetworkTopologyImpl(conf);
+    Table<UUID, DatanodeDetails> nodeTable =
+        ReconSCMDBDefinition.NODES.getTable(store);
+    ReconNodeManager reconNodeManager = new ReconNodeManager(conf,
+        scmStorageConfig, eventQueue, clusterMap, nodeTable);
+
+
+    DatanodeDetails datanodeDetails = randomDatanodeDetails();
+    HddsProtos.Node node = mock(HddsProtos.Node.class);
+
+    LambdaTestUtils.intercept(NodeNotFoundException.class, () -> {
+      reconNodeManager.updateNodeOperationalStateFromScm(node, datanodeDetails);
+    });
+
+    reconNodeManager.register(datanodeDetails, null, null);
+    assertEquals(IN_SERVICE, reconNodeManager
+        .getNodeByUuid(datanodeDetails.getUuidString()).getPersistedOpState());
+
+    when(node.getNodeOperationalStates(eq(0)))
+        .thenReturn(DECOMMISSIONING);
+    reconNodeManager.updateNodeOperationalStateFromScm(node, datanodeDetails);
+    assertEquals(DECOMMISSIONING, reconNodeManager
+        .getNodeByUuid(datanodeDetails.getUuidString()).getPersistedOpState());
+    List<DatanodeDetails> nodes =
+        reconNodeManager.getNodes(DECOMMISSIONING, null);
+    assertEquals(1, nodes.size());
+    assertEquals(datanodeDetails.getUuid(), nodes.get(0).getUuid());
+  }
 }

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org