You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by av...@apache.org on 2021/06/02 17:18:39 UTC
[ozone] branch master updated: HDDS-5277. Recon shows operational status as "DECOMMISSIONING" for "DECOMMISSIONED" DNs (#2286)
This is an automated email from the ASF dual-hosted git repository.
avijayan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new f3f258a HDDS-5277. Recon shows operational status as "DECOMMISSIONING" for "DECOMMISSIONED" DNs (#2286)
f3f258a is described below
commit f3f258af7d006062aa6adc0d7346e87d1c633bbe
Author: avijayanhwx <14...@users.noreply.github.com>
AuthorDate: Wed Jun 2 10:18:23 2021 -0700
HDDS-5277. Recon shows operational status as "DECOMMISSIONING" for "DECOMMISSIONED" DNs (#2286)
---
.../hadoop/hdds/scm/node/DeadNodeHandler.java | 3 +
.../apache/hadoop/hdds/scm/node/NodeStatus.java | 2 +-
.../hadoop/ozone/recon/scm/PipelineSyncTask.java | 46 ++++++++++++-
.../ozone/recon/scm/ReconDeadNodeHandler.java | 79 ++++++++++++++++++++++
.../hadoop/ozone/recon/scm/ReconNodeManager.java | 20 ++++++
.../scm/ReconStorageContainerManagerFacade.java | 5 +-
.../recon/spi/StorageContainerServiceProvider.java | 6 ++
.../impl/StorageContainerServiceProviderImpl.java | 8 +++
.../hadoop/ozone/recon/tasks/ReconTaskConfig.java | 4 +-
.../ozone/recon/scm/TestReconNodeManager.java | 41 ++++++++++-
10 files changed, 207 insertions(+), 7 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DeadNodeHandler.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DeadNodeHandler.java
index 7f9b942..304e075 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DeadNodeHandler.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DeadNodeHandler.java
@@ -171,5 +171,8 @@ public class DeadNodeHandler implements EventHandler<DatanodeDetails> {
});
}
+ protected NodeManager getNodeManager() {
+ return nodeManager;
+ }
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java
index 72ca015..69dcd0f 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeStatus.java
@@ -200,7 +200,7 @@ public class NodeStatus {
@Override
public String toString() {
return "OperationalState: "+operationalState+" Health: "+health+
- " OperastionStateExpiry: "+opStateExpiryEpochSeconds;
+ " OperationStateExpiry: "+opStateExpiryEpochSeconds;
}
}
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/PipelineSyncTask.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/PipelineSyncTask.java
index d5de2bc..1ec3d3b 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/PipelineSyncTask.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/PipelineSyncTask.java
@@ -18,8 +18,16 @@
package org.apache.hadoop.ozone.recon.scm;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState.DEAD;
+
+import java.io.IOException;
import java.util.List;
+import java.util.stream.Collectors;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.Node;
+import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
import org.apache.hadoop.ozone.recon.spi.StorageContainerServiceProvider;
import org.apache.hadoop.ozone.recon.tasks.ReconTaskConfig;
@@ -30,7 +38,8 @@ import org.slf4j.LoggerFactory;
/**
* Background pipeline sync task that queries pipelines in SCM, and removes
- * any obsolete pipeline.
+ * any obsolete pipeline. Also syncs operational state of dead nodes with SCM
+ * state.
*/
public class PipelineSyncTask extends ReconScmTask {
@@ -39,15 +48,18 @@ public class PipelineSyncTask extends ReconScmTask {
private StorageContainerServiceProvider scmClient;
private ReconPipelineManager reconPipelineManager;
+ private ReconNodeManager nodeManager;
private final long interval;
public PipelineSyncTask(ReconPipelineManager pipelineManager,
+ ReconNodeManager nodeManager,
StorageContainerServiceProvider scmClient,
ReconTaskStatusDao reconTaskStatusDao,
ReconTaskConfig reconTaskConfig) {
super(reconTaskStatusDao);
this.scmClient = scmClient;
this.reconPipelineManager = pipelineManager;
+ this.nodeManager = nodeManager;
this.interval = reconTaskConfig.getPipelineSyncTaskInterval().toMillis();
}
@@ -58,6 +70,7 @@ public class PipelineSyncTask extends ReconScmTask {
long start = Time.monotonicNow();
List<Pipeline> pipelinesFromScm = scmClient.getPipelines();
reconPipelineManager.initializePipelines(pipelinesFromScm);
+ syncOperationalStateOnDeadNodes();
LOG.info("Pipeline sync Thread took {} milliseconds.",
Time.monotonicNow() - start);
recordSingleRunCompletion();
@@ -67,4 +80,35 @@ public class PipelineSyncTask extends ReconScmTask {
LOG.error("Exception in Pipeline sync Thread.", t);
}
}
+
+ /**
+ * For every dead node in Recon, update Operational state with that on SCM
+ * if different.
+ * @throws IOException on Error
+ * @throws NodeNotFoundException if node not found in Recon.
+ */
+ private void syncOperationalStateOnDeadNodes()
+ throws IOException, NodeNotFoundException {
+ List<DatanodeDetails> deadNodesOnRecon = nodeManager.getNodes(null, DEAD);
+
+ if (!deadNodesOnRecon.isEmpty()) {
+ List<Node> scmNodes = scmClient.getNodes();
+ List<Node> filteredScmNodes = scmNodes.stream()
+ .filter(n -> deadNodesOnRecon.contains(
+ DatanodeDetails.getFromProtoBuf(n.getNodeID())))
+ .collect(Collectors.toList());
+
+ for (Node deadNode : filteredScmNodes) {
+ DatanodeDetails dnDetails =
+ DatanodeDetails.getFromProtoBuf(deadNode.getNodeID());
+
+ HddsProtos.NodeState scmNodeState = deadNode.getNodeStates(0);
+ if (scmNodeState != DEAD) {
+ LOG.warn("Node {} DEAD in Recon, but SCM reports it as {}",
+ dnDetails.getHostName(), scmNodeState);
+ }
+ nodeManager.updateNodeOperationalStateFromScm(deadNode, dnDetails);
+ }
+ }
+ }
}
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconDeadNodeHandler.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconDeadNodeHandler.java
new file mode 100644
index 0000000..9868851
--- /dev/null
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconDeadNodeHandler.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.ozone.recon.scm;
+
+import java.util.List;
+import java.util.Optional;
+
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos.Node;
+import org.apache.hadoop.hdds.scm.container.ContainerManagerV2;
+import org.apache.hadoop.hdds.scm.node.DeadNodeHandler;
+import org.apache.hadoop.hdds.scm.node.NodeManager;
+import org.apache.hadoop.hdds.scm.pipeline.PipelineManager;
+import org.apache.hadoop.hdds.server.events.EventPublisher;
+import org.apache.hadoop.ozone.recon.spi.StorageContainerServiceProvider;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Recon's handling of Dead node.
+ */
+public class ReconDeadNodeHandler extends DeadNodeHandler {
+
+ private static final Logger LOG =
+ LoggerFactory.getLogger(ReconDeadNodeHandler.class);
+
+
+ private StorageContainerServiceProvider scmClient;
+
+ public ReconDeadNodeHandler(NodeManager nodeManager,
+ PipelineManager pipelineManager,
+ ContainerManagerV2 containerManager,
+ StorageContainerServiceProvider scmClient) {
+ super(nodeManager, pipelineManager, containerManager);
+ this.scmClient = scmClient;
+ }
+
+ @Override
+ public void onMessage(final DatanodeDetails datanodeDetails,
+ final EventPublisher publisher) {
+
+ super.onMessage(datanodeDetails, publisher);
+ ReconNodeManager nodeManager = (ReconNodeManager) getNodeManager();
+ try {
+ List<Node> nodes = scmClient.getNodes();
+ Optional<Node> matchedDn = nodes.stream()
+ .filter(n -> n.getNodeID().getUuid()
+ .equals(datanodeDetails.getUuidString()))
+ .findAny();
+
+ if (matchedDn.isPresent()) {
+ nodeManager.updateNodeOperationalStateFromScm(matchedDn.get(),
+ datanodeDetails);
+ } else {
+ LOG.warn("Node {} has reached DEAD state, but SCM does not have " +
+ "information about it.", datanodeDetails);
+ }
+ } catch (Exception ioEx) {
+ LOG.error("Error trying to verify Node operational state from SCM.",
+ ioEx);
+ }
+ }
+}
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java
index 3545efb..e908723 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconNodeManager.java
@@ -27,9 +27,11 @@ import java.util.UUID;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.SCMCommandProto.Type;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
import org.apache.hadoop.hdds.scm.net.NetworkTopology;
+import org.apache.hadoop.hdds.scm.node.NodeStatus;
import org.apache.hadoop.hdds.scm.node.SCMNodeManager;
import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
import org.apache.hadoop.hdds.scm.server.SCMStorageConfig;
@@ -149,4 +151,22 @@ public class ReconNodeManager extends SCMNodeManager {
reportedDn.getPersistedOpState(),
reportedDn.getPersistedOpStateExpiryEpochSec());
}
+
+ public void updateNodeOperationalStateFromScm(HddsProtos.Node scmNode,
+ DatanodeDetails dnDetails)
+ throws NodeNotFoundException {
+ NodeStatus nodeStatus = getNodeStatus(dnDetails);
+ HddsProtos.NodeOperationalState nodeOperationalStateFromScm =
+ scmNode.getNodeOperationalStates(0);
+ if (nodeOperationalStateFromScm != nodeStatus.getOperationalState()) {
+ LOG.info("Updating Node operational state for {}, in SCM = {}, in " +
+ "Recon = {}", dnDetails.getHostName(),
+ nodeOperationalStateFromScm,
+ nodeStatus.getOperationalState());
+
+ setNodeOperationalState(dnDetails, nodeOperationalStateFromScm);
+ DatanodeDetails scmDnd = getNodeByUuid(dnDetails.getUuidString());
+ scmDnd.setPersistedOpState(nodeOperationalStateFromScm);
+ }
+ }
}
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java
index 2272448..c7f4e22 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java
@@ -159,8 +159,8 @@ public class ReconStorageContainerManagerFacade
StaleNodeHandler staleNodeHandler =
new StaleNodeHandler(nodeManager, pipelineManager, conf);
- DeadNodeHandler deadNodeHandler = new DeadNodeHandler(nodeManager,
- pipelineManager, containerManager);
+ DeadNodeHandler deadNodeHandler = new ReconDeadNodeHandler(nodeManager,
+ pipelineManager, containerManager, scmServiceProvider);
ContainerReportHandler containerReportHandler =
new ReconContainerReportHandler(nodeManager, containerManager);
@@ -189,6 +189,7 @@ public class ReconStorageContainerManagerFacade
ReconTaskConfig reconTaskConfig = conf.getObject(ReconTaskConfig.class);
reconScmTasks.add(new PipelineSyncTask(
pipelineManager,
+ nodeManager,
scmServiceProvider,
reconTaskStatusDao,
reconTaskConfig));
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/StorageContainerServiceProvider.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/StorageContainerServiceProvider.java
index d3a9354..be2c7cb 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/StorageContainerServiceProvider.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/StorageContainerServiceProvider.java
@@ -60,4 +60,10 @@ public interface StorageContainerServiceProvider {
*/
List<ContainerWithPipeline> getExistContainerWithPipelinesInBatch(
List<Long> containerIDs);
+
+ /**
+ * Returns list of nodes from SCM.
+ */
+ List<HddsProtos.Node> getNodes() throws IOException;
+
}
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/StorageContainerServiceProviderImpl.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/StorageContainerServiceProviderImpl.java
index 47def4c..1e609e8 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/StorageContainerServiceProviderImpl.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/spi/impl/StorageContainerServiceProviderImpl.java
@@ -18,6 +18,8 @@
package org.apache.hadoop.ozone.recon.spi.impl;
+import static org.apache.hadoop.ozone.ClientVersions.CURRENT_VERSION;
+
import java.io.IOException;
import java.util.List;
@@ -66,4 +68,10 @@ public class StorageContainerServiceProviderImpl
List<Long> containerIDs) {
return scmClient.getExistContainerWithPipelinesInBatch(containerIDs);
}
+
+ @Override
+ public List<HddsProtos.Node> getNodes() throws IOException {
+ return scmClient.queryNode(null, null, HddsProtos.QueryScope.CLUSTER,
+ "", CURRENT_VERSION);
+ }
}
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ReconTaskConfig.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ReconTaskConfig.java
index 813baf5..9788bf6 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ReconTaskConfig.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/tasks/ReconTaskConfig.java
@@ -33,12 +33,12 @@ public class ReconTaskConfig {
@Config(key = "pipelinesync.interval",
type = ConfigType.TIME,
- defaultValue = "600s",
+ defaultValue = "300s",
tags = { ConfigTag.RECON, ConfigTag.OZONE },
description = "The time interval of periodic sync of pipeline state " +
"from SCM to Recon."
)
- private long pipelineSyncTaskInterval = Duration.ofMinutes(10).toMillis();
+ private long pipelineSyncTaskInterval = Duration.ofMinutes(5).toMillis();
public Duration getPipelineSyncTaskInterval() {
return Duration.ofMillis(pipelineSyncTaskInterval);
diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconNodeManager.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconNodeManager.java
index 0cacd7a..4c68ed6 100644
--- a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconNodeManager.java
+++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconNodeManager.java
@@ -19,11 +19,16 @@
package org.apache.hadoop.ozone.recon.scm;
import static org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.DECOMMISSIONING;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NAMES;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_METADATA_DIRS;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
+import static org.mockito.ArgumentMatchers.eq;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
import java.io.IOException;
import java.util.List;
@@ -43,6 +48,7 @@ import org.apache.hadoop.hdds.utils.db.Table;
import org.apache.hadoop.ozone.protocol.commands.ReregisterCommand;
import org.apache.hadoop.ozone.protocol.commands.SCMCommand;
import org.apache.hadoop.ozone.protocol.commands.SetNodeOperationalStateCommand;
+import org.apache.hadoop.test.LambdaTestUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Rule;
@@ -104,7 +110,7 @@ public class TestReconNodeManager {
// This command should never be returned by Recon
reconNodeManager.addDatanodeCommand(datanodeDetails.getUuid(),
new SetNodeOperationalStateCommand(1234,
- HddsProtos.NodeOperationalState.DECOMMISSIONING, 0));
+ DECOMMISSIONING, 0));
// This one should be returned
reconNodeManager.addDatanodeCommand(datanodeDetails.getUuid(),
@@ -156,4 +162,37 @@ public class TestReconNodeManager {
assertNotNull(
reconNodeManager.getNodeByUuid(datanodeDetails.getUuidString()));
}
+
+ @Test
+ public void testUpdateNodeOperationalStateFromScm() throws Exception {
+ ReconStorageConfig scmStorageConfig = new ReconStorageConfig(conf);
+ EventQueue eventQueue = new EventQueue();
+ NetworkTopology clusterMap = new NetworkTopologyImpl(conf);
+ Table<UUID, DatanodeDetails> nodeTable =
+ ReconSCMDBDefinition.NODES.getTable(store);
+ ReconNodeManager reconNodeManager = new ReconNodeManager(conf,
+ scmStorageConfig, eventQueue, clusterMap, nodeTable);
+
+
+ DatanodeDetails datanodeDetails = randomDatanodeDetails();
+ HddsProtos.Node node = mock(HddsProtos.Node.class);
+
+ LambdaTestUtils.intercept(NodeNotFoundException.class, () -> {
+ reconNodeManager.updateNodeOperationalStateFromScm(node, datanodeDetails);
+ });
+
+ reconNodeManager.register(datanodeDetails, null, null);
+ assertEquals(IN_SERVICE, reconNodeManager
+ .getNodeByUuid(datanodeDetails.getUuidString()).getPersistedOpState());
+
+ when(node.getNodeOperationalStates(eq(0)))
+ .thenReturn(DECOMMISSIONING);
+ reconNodeManager.updateNodeOperationalStateFromScm(node, datanodeDetails);
+ assertEquals(DECOMMISSIONING, reconNodeManager
+ .getNodeByUuid(datanodeDetails.getUuidString()).getPersistedOpState());
+ List<DatanodeDetails> nodes =
+ reconNodeManager.getNodes(DECOMMISSIONING, null);
+ assertEquals(1, nodes.size());
+ assertEquals(datanodeDetails.getUuid(), nodes.get(0).getUuid());
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org