Posted to commits@ozone.apache.org by ca...@apache.org on 2022/11/09 06:15:53 UTC

[ozone] branch ozone-1.3 updated (6d73f6f6dd -> 2911f933b9)

This is an automated email from the ASF dual-hosted git repository.

captainzmc pushed a change to branch ozone-1.3
in repository https://gitbox.apache.org/repos/asf/ozone.git


    from 6d73f6f6dd HDDS-7326. Intermittent timeout in TestECContainerRecovery.testContainerRecoveryOverReplicationProcessing (#3941)
     new 47411f96ae HDDS-2642. Expose decommission / maintenance metrics via JMX (#3781)
     new 2911f933b9 HDDS-7453. Check certificate expiration at service startup, renew if necessary. (#3930)

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../org/apache/hadoop/hdds/HddsConfigKeys.java     |  10 +
 .../hadoop/hdds/security/x509/SecurityConfig.java  |  18 ++
 .../apache/hadoop/ozone/common/StorageInfo.java    |   4 +
 .../common/src/main/resources/ozone-default.xml    |   8 +
 .../apache/hadoop/ozone/HddsDatanodeService.java   |   5 +
 .../hadoop/ozone/TestHddsSecureDatanodeInit.java   |   2 +-
 .../x509/certificate/client/CertificateClient.java |   3 +-
 .../client/CommonCertificateClient.java            |   6 +
 .../client/DefaultCertificateClient.java           |  60 +++-
 .../certificate/client/SCMCertificateClient.java   |   3 +
 .../client/TestCertificateClientInit.java          |   2 +-
 .../client/TestDefaultCertificateClient.java       |  52 ++++
 .../hadoop/hdds/scm/node/DatanodeAdminMonitor.java |   2 +-
 .../hdds/scm/node/DatanodeAdminMonitorImpl.java    |  65 +++-
 .../hdds/scm/node/NodeDecommissionManager.java     |   8 +-
 .../hdds/scm/node/NodeDecommissionMetrics.java     | 290 +++++++++++++++++
 .../scm/node/DatanodeAdminMonitorTestUtil.java     | 141 +++++++++
 .../hdds/scm/node/TestDatanodeAdminMonitor.java    | 164 +++-------
 .../hdds/scm/node/TestNodeDecommissionMetrics.java | 343 +++++++++++++++++++++
 .../hadoop/ozone/om/TestSecureOzoneManager.java    |   2 +-
 .../java/org/apache/hadoop/ozone/om/OMStorage.java |   4 +
 .../org/apache/hadoop/ozone/om/OzoneManager.java   |  73 +++--
 .../org/apache/hadoop/ozone/recon/ReconServer.java |  25 +-
 .../hadoop/ozone/recon/scm/ReconStorageConfig.java |   4 +
 24 files changed, 1107 insertions(+), 187 deletions(-)
 create mode 100644 hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionMetrics.java
 create mode 100644 hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorTestUtil.java
 create mode 100644 hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionMetrics.java


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org


[ozone] 01/02: HDDS-2642. Expose decommission / maintenance metrics via JMX (#3781)

Posted by ca...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

captainzmc pushed a commit to branch ozone-1.3
in repository https://gitbox.apache.org/repos/asf/ozone.git

commit 47411f96aeadb86e356684bccf5a958eec5f04dc
Author: Neil Joshi <ne...@gmail.com>
AuthorDate: Tue Nov 8 19:31:56 2022 -0700

    HDDS-2642. Expose decommission / maintenance metrics via JMX (#3781)
    
    * Expose decommission / maintenance metrics via JMX
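
For operators who want to read these gauges from a running SCM: the source
registers with the Hadoop metrics system, so its values are exposed as JMX
attributes. A minimal in-process probe might look like the sketch below;
the "service=" portion of the ObjectName is an assumption about how the SCM
process registers with JMX, not something defined in this commit.

    import java.lang.management.ManagementFactory;
    import javax.management.MBeanServer;
    import javax.management.ObjectName;

    public final class JmxProbe {
      public static void main(String[] args) throws Exception {
        MBeanServer mbs = ManagementFactory.getPlatformMBeanServer();
        // Hypothetical ObjectName; adjust "service=" to match the actual
        // registration of the SCM process.
        ObjectName name = new ObjectName(
            "Hadoop:service=StorageContainerManager,"
                + "name=NodeDecommissionMetrics");
        Object value = mbs.getAttribute(name,
            "DecommissioningMaintenanceNodesTotal");
        System.out.println("Tracked nodes: " + value);
      }
    }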
---
 .../hadoop/hdds/scm/node/DatanodeAdminMonitor.java |   2 +-
 .../hdds/scm/node/DatanodeAdminMonitorImpl.java    |  65 +++-
 .../hdds/scm/node/NodeDecommissionManager.java     |   8 +-
 .../hdds/scm/node/NodeDecommissionMetrics.java     | 290 +++++++++++++++++
 .../scm/node/DatanodeAdminMonitorTestUtil.java     | 141 +++++++++
 .../hdds/scm/node/TestDatanodeAdminMonitor.java    | 164 +++-------
 .../hdds/scm/node/TestNodeDecommissionMetrics.java | 343 +++++++++++++++++++++
 7 files changed, 884 insertions(+), 129 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java
index 3466547751..bd224e45a7 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitor.java
@@ -30,5 +30,5 @@ public interface DatanodeAdminMonitor extends Runnable {
   void startMonitoring(DatanodeDetails dn);
   void stopMonitoring(DatanodeDetails dn);
   Set<DatanodeDetails> getTrackedNodes();
-
+  void setMetrics(NodeDecommissionMetrics metrics);
 }
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
index e676fc1ced..4c446acce7 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.hdds.scm.container.ContainerReplica;
 import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaCount;
 import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
 import org.apache.hadoop.hdds.scm.events.SCMEvents;
+import org.apache.hadoop.hdds.scm.node.NodeDecommissionMetrics.ContainerStateInWorkflow;
 import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
 import org.apache.hadoop.hdds.scm.pipeline.PipelineID;
 import org.apache.hadoop.hdds.server.events.EventPublisher;
@@ -38,8 +39,10 @@ import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashSet;
+import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Map;
 import java.util.Queue;
 import java.util.Set;
 import java.util.stream.Collectors;
@@ -74,6 +77,15 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
   private Queue<DatanodeDetails> pendingNodes = new ArrayDeque();
   private Queue<DatanodeDetails> cancelledNodes = new ArrayDeque();
   private Set<DatanodeDetails> trackedNodes = new HashSet<>();
+  private NodeDecommissionMetrics metrics;
+  private long pipelinesWaitingToClose = 0;
+  private long sufficientlyReplicatedContainers = 0;
+  private long trackedDecomMaintenance = 0;
+  private long trackedRecommission = 0;
+  private long unhealthyContainers = 0;
+  private long underReplicatedContainers = 0;
+
+  private Map<String, ContainerStateInWorkflow> containerStateByHost;
 
   private static final Logger LOG =
       LoggerFactory.getLogger(DatanodeAdminMonitorImpl.class);
@@ -90,6 +102,8 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
     this.eventQueue = eventQueue;
     this.nodeManager = nodeManager;
     this.replicationManager = replicationManager;
+
+    containerStateByHost = new HashMap<>();
   }
 
   /**
@@ -117,6 +131,10 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
     cancelledNodes.add(dn);
   }
 
+  public synchronized void setMetrics(NodeDecommissionMetrics metrics) {
+    this.metrics = metrics;
+  }
+
   /**
    * Get the set of nodes which are currently tracked in the decommissioned
    * and maintenance workflow.
@@ -139,16 +157,20 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
   @Override
   public void run() {
     try {
+      containerStateByHost.clear();
       synchronized (this) {
+        trackedRecommission = getCancelledCount();
         processCancelledNodes();
         processPendingNodes();
+        trackedDecomMaintenance = getTrackedNodeCount();
       }
       processTransitioningNodes();
       if (trackedNodes.size() > 0 || pendingNodes.size() > 0) {
         LOG.info("There are {} nodes tracked for decommission and " +
-                "maintenance. {} pending nodes.",
+            "maintenance.  {} pending nodes.",
             trackedNodes.size(), pendingNodes.size());
       }
+      setMetricsToGauge();
     } catch (Exception e) {
       LOG.error("Caught an error in the DatanodeAdminMonitor", e);
       // Intentionally do not re-throw, as if we do the monitor thread
@@ -168,6 +190,28 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
     return trackedNodes.size();
   }
 
+  synchronized void setMetricsToGauge() {
+    synchronized (metrics) {
+      metrics.setContainersUnhealthyTotal(unhealthyContainers);
+      metrics.setRecommissionNodesTotal(trackedRecommission);
+      metrics.setDecommissioningMaintenanceNodesTotal(
+          trackedDecomMaintenance);
+      metrics.setContainersUnderReplicatedTotal(
+          underReplicatedContainers);
+      metrics.setContainersSufficientlyReplicatedTotal(
+          sufficientlyReplicatedContainers);
+      metrics.setPipelinesWaitingToCloseTotal(pipelinesWaitingToClose);
+      metrics.metricRecordOfContainerStateByHost(containerStateByHost);
+    }
+  }
+
+  void resetContainerMetrics() {
+    pipelinesWaitingToClose = 0;
+    sufficientlyReplicatedContainers = 0;
+    unhealthyContainers = 0;
+    underReplicatedContainers = 0;
+  }
+
   private void processCancelledNodes() {
     while (!cancelledNodes.isEmpty()) {
       DatanodeDetails dn = cancelledNodes.poll();
@@ -188,7 +232,9 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
   }
 
   private void processTransitioningNodes() {
+    resetContainerMetrics();
     Iterator<DatanodeDetails> iterator = trackedNodes.iterator();
+
     while (iterator.hasNext()) {
       DatanodeDetails dn = iterator.next();
       try {
@@ -256,8 +302,8 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
       NodeStatus nodeStatus) {
     if (!nodeStatus.isDecommission() && !nodeStatus.isMaintenance()) {
       LOG.warn("Datanode {} has an operational state of {} when it should " +
-              "be undergoing decommission or maintenance. Aborting admin for " +
-              "this node.", dn, nodeStatus.getOperationalState());
+          "be undergoing decommission or maintenance. Aborting admin for " +
+          "this node.", dn, nodeStatus.getOperationalState());
       return false;
     }
     if (nodeStatus.isDead() && !nodeStatus.isInMaintenance()) {
@@ -278,6 +324,10 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
     } else {
       LOG.info("Waiting for pipelines to close for {}. There are {} " +
           "pipelines", dn, pipelines.size());
+      containerStateByHost.put(dn.getHostName(),
+        new ContainerStateInWorkflow(dn.getHostName(), 0L, 0L, 0L,
+            pipelines.size()));
+      pipelinesWaitingToClose += pipelines.size();
       return false;
     }
   }
@@ -327,6 +377,15 @@ public class DatanodeAdminMonitorImpl implements DatanodeAdminMonitor {
     LOG.info("{} has {} sufficientlyReplicated, {} underReplicated and {} " +
         "unhealthy containers",
         dn, sufficientlyReplicated, underReplicated, unhealthy);
+    containerStateByHost.put(dn.getHostName(),
+        new ContainerStateInWorkflow(dn.getHostName(),
+            sufficientlyReplicated,
+            underReplicated,
+            unhealthy,
+            0L));
+    sufficientlyReplicatedContainers += sufficientlyReplicated;
+    underReplicatedContainers += underReplicated;
+    unhealthyContainers += unhealthy;
     if (LOG.isDebugEnabled() && underReplicatedIDs.size() < 10000 &&
         unhealthyIDs.size() < 10000) {
       LOG.debug("{} has {} underReplicated [{}] and {} unhealthy [{}] " +
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
index 1ea04cdfc3..a84b07d513 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionManager.java
@@ -60,6 +60,9 @@ public class NodeDecommissionManager {
   private boolean useHostnames;
   private long monitorInterval;
 
+  // Decommissioning and Maintenance mode progress related metrics.
+  private NodeDecommissionMetrics metrics;
+
   private static final Logger LOG =
       LoggerFactory.getLogger(NodeDecommissionManager.class);
 
@@ -181,6 +184,7 @@ public class NodeDecommissionManager {
     this.scmContext = scmContext;
     this.eventQueue = eventQueue;
     this.replicationManager = rm;
+    this.metrics = null;
 
     executor = Executors.newScheduledThreadPool(1,
         new ThreadFactoryBuilder().setNameFormat("DatanodeAdminManager-%d")
@@ -208,7 +212,8 @@ public class NodeDecommissionManager {
 
     monitor = new DatanodeAdminMonitorImpl(conf, eventQueue, nodeManager,
         replicationManager);
-
+    this.metrics = NodeDecommissionMetrics.create();
+    monitor.setMetrics(this.metrics);
     executor.scheduleAtFixedRate(monitor, monitorInterval, monitorInterval,
         TimeUnit.SECONDS);
   }
@@ -373,6 +378,7 @@ public class NodeDecommissionManager {
    *  Stops the decommission monitor from running when SCM is shutdown.
    */
   public void stop() {
+    metrics.unRegister();
     if (executor != null) {
       executor.shutdown();
     }
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionMetrics.java
new file mode 100644
index 0000000000..8be3d2a6d0
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionMetrics.java
@@ -0,0 +1,290 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with this
+ * work for additional information regarding copyright ownership.  The ASF
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+ * License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+package org.apache.hadoop.hdds.scm.node;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.metrics2.MetricsCollector;
+import org.apache.hadoop.metrics2.MetricsInfo;
+import org.apache.hadoop.metrics2.MetricsRecordBuilder;
+import org.apache.hadoop.metrics2.MetricsSource;
+import org.apache.hadoop.metrics2.MetricsTag;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.annotation.Metrics;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.Interns;
+import org.apache.hadoop.metrics2.lib.MetricsRegistry;
+import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
+import org.apache.hadoop.ozone.OzoneConsts;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Class contains metrics related to the NodeDecommissionManager.
+ */
+@Metrics(about = "Node Decommission Metrics", context = OzoneConsts.OZONE)
+public final class NodeDecommissionMetrics implements MetricsSource {
+  public static final String METRICS_SOURCE_NAME =
+      org.apache.hadoop.hdds.scm.node.NodeDecommissionMetrics
+          .class.getSimpleName();
+
+  @Metric("Number of nodes tracked for decommissioning and maintenance.")
+  private MutableGaugeLong decommissioningMaintenanceNodesTotal;
+
+  @Metric("Number of nodes tracked for recommissioning.")
+  private MutableGaugeLong recommissionNodesTotal;
+
+  @Metric("Number of nodes tracked with pipelines waiting to close.")
+  private MutableGaugeLong pipelinesWaitingToCloseTotal;
+
+  @Metric("Number of containers under replicated in tracked nodes.")
+  private MutableGaugeLong containersUnderReplicatedTotal;
+
+  @Metric("Number of containers unhealthy in tracked nodes.")
+  private MutableGaugeLong containersUnhealthyTotal;
+
+  @Metric("Number of containers sufficiently replicated in tracked nodes.")
+  private MutableGaugeLong containersSufficientlyReplicatedTotal;
+
+  /**
+   * Inner class for snapshot of Datanode ContainerState in
+   * Decommissioning and Maintenance mode workflow.
+   */
+  public static final class ContainerStateInWorkflow {
+    private long sufficientlyReplicated = 0;
+    private long unhealthyContainers = 0;
+    private long underReplicatedContainers = 0;
+    private String host = "";
+    private long pipelinesWaitingToClose = 0;
+
+    private static final MetricsInfo HOST_UNDER_REPLICATED = Interns.info(
+        "UnderReplicatedDN",
+        "Number of under-replicated containers "
+            + "for host in decommissioning and "
+            + "maintenance mode");
+
+    private static final MetricsInfo HOST_PIPELINES_TO_CLOSE = Interns.info(
+        "PipelinesWaitingToCloseDN",
+        "Number of pipelines waiting to close for "
+            + "host in decommissioning and "
+            + "maintenance mode");
+
+    private static final MetricsInfo HOST_SUFFICIENTLY_REPLICATED = Interns
+        .info(
+            "SufficientlyReplicatedDN",
+        "Number of sufficiently replicated containers "
+            + "for host in decommissioning and "
+            + "maintenance mode");
+
+    private static final MetricsInfo HOST_UNHEALTHY_CONTAINERS = Interns.info(
+        "UnhealthyContainersDN",
+        "Number of unhealthy containers "
+            + "for host in decommissioning and "
+            + "maintenance mode");
+
+
+    public ContainerStateInWorkflow(String host,
+                                    long sufficiently,
+                                    long under,
+                                    long unhealthy,
+                                    long pipelinesToClose) {
+      this.host = host;
+      sufficientlyReplicated = sufficiently;
+      underReplicatedContainers = under;
+      unhealthyContainers = unhealthy;
+      pipelinesWaitingToClose = pipelinesToClose;
+    }
+
+    public String getHost() {
+      return host;
+    }
+
+    public long getSufficientlyReplicated() {
+      return sufficientlyReplicated;
+    }
+
+    public long getPipelinesWaitingToClose() {
+      return pipelinesWaitingToClose;
+    }
+
+    public long getUnderReplicatedContainers() {
+      return underReplicatedContainers;
+    }
+
+    public long getUnhealthyContainers() {
+      return unhealthyContainers;
+    }
+  }
+
+  private MetricsRegistry registry;
+
+  private Map<String, ContainerStateInWorkflow> metricsByHost;
+
+  /** Private constructor. */
+  private NodeDecommissionMetrics() {
+    this.registry = new MetricsRegistry(METRICS_SOURCE_NAME);
+    metricsByHost = new HashMap<>();
+  }
+
+  /**
+   * Creates and returns a NodeDecommissionMetrics instance.
+   *
+   * @return NodeDecommissionMetrics
+   */
+  public static NodeDecommissionMetrics create() {
+    return DefaultMetricsSystem.instance().register(METRICS_SOURCE_NAME,
+        "Metrics tracking the progress of nodes in the "
+            + "Decommissioning and Maintenance workflows.  "
+            + "Tracks num nodes in mode and container "
+            + "replications state and pipelines waiting to close",
+        new NodeDecommissionMetrics());
+  }
+
+  /**
+   * Get aggregated gauge metrics.
+   */
+  @Override
+  public synchronized void getMetrics(MetricsCollector collector, boolean all) {
+    MetricsRecordBuilder builder = collector
+        .addRecord(METRICS_SOURCE_NAME);
+    decommissioningMaintenanceNodesTotal.snapshot(builder, all);
+    recommissionNodesTotal.snapshot(builder, all);
+    pipelinesWaitingToCloseTotal.snapshot(builder, all);
+    containersUnderReplicatedTotal.snapshot(builder, all);
+    containersUnhealthyTotal.snapshot(builder, all);
+    containersSufficientlyReplicatedTotal.snapshot(builder, all);
+
+    MetricsRecordBuilder recordBuilder = builder;
+    for (Map.Entry<String, ContainerStateInWorkflow> e :
+        metricsByHost.entrySet()) {
+      recordBuilder = recordBuilder.endRecord().addRecord(METRICS_SOURCE_NAME)
+          .add(new MetricsTag(Interns.info("datanode",
+              "datanode host in decommission maintenance workflow"),
+              e.getValue().getHost()))
+          .addGauge(ContainerStateInWorkflow.HOST_PIPELINES_TO_CLOSE,
+              e.getValue().getPipelinesWaitingToClose())
+          .addGauge(ContainerStateInWorkflow.HOST_UNDER_REPLICATED,
+              e.getValue().getUnderReplicatedContainers())
+          .addGauge(ContainerStateInWorkflow.HOST_SUFFICIENTLY_REPLICATED,
+              e.getValue().getSufficientlyReplicated())
+          .addGauge(ContainerStateInWorkflow.HOST_UNHEALTHY_CONTAINERS,
+              e.getValue().getUnhealthyContainers());
+    }
+    recordBuilder.endRecord();
+  }
+
+  /**
+   * Unregister the metrics instance.
+   */
+  public void unRegister() {
+    DefaultMetricsSystem.instance().unregisterSource(METRICS_SOURCE_NAME);
+  }
+
+  public synchronized void setDecommissioningMaintenanceNodesTotal(
+            long numNodesTracked) {
+    decommissioningMaintenanceNodesTotal
+        .set(numNodesTracked);
+  }
+
+  public synchronized void setRecommissionNodesTotal(
+          long numNodesTrackedRecommissioned) {
+    recommissionNodesTotal.set(numNodesTrackedRecommissioned);
+  }
+
+  public synchronized void setPipelinesWaitingToCloseTotal(
+          long numTrackedPipelinesWaitToClose) {
+    pipelinesWaitingToCloseTotal
+        .set(numTrackedPipelinesWaitToClose);
+  }
+
+  public synchronized void setContainersUnderReplicatedTotal(
+          long numTrackedUnderReplicated) {
+    containersUnderReplicatedTotal
+        .set(numTrackedUnderReplicated);
+  }
+
+  public synchronized void setContainersUnhealthyTotal(
+          long numTrackedUnhealthy) {
+    containersUnhealthyTotal
+        .set(numTrackedUnhealthy);
+  }
+
+  public synchronized void setContainersSufficientlyReplicatedTotal(
+          long numTrackedSufficientlyReplicated) {
+    containersSufficientlyReplicatedTotal
+        .set(numTrackedSufficientlyReplicated);
+  }
+
+  public synchronized long getDecommissioningMaintenanceNodesTotal() {
+    return decommissioningMaintenanceNodesTotal.value();
+  }
+
+  public synchronized long getRecommissionNodesTotal() {
+    return recommissionNodesTotal.value();
+  }
+
+  public synchronized long getPipelinesWaitingToCloseTotal() {
+    return pipelinesWaitingToCloseTotal.value();
+  }
+
+  public synchronized long getContainersUnderReplicatedTotal() {
+    return containersUnderReplicatedTotal.value();
+  }
+
+  public synchronized long getContainersUnhealthyTotal() {
+    return containersUnhealthyTotal.value();
+  }
+
+  public synchronized long getContainersSufficientlyReplicatedTotal() {
+    return containersSufficientlyReplicatedTotal.value();
+  }
+
+  public synchronized void metricRecordOfContainerStateByHost(
+      Map<String, ContainerStateInWorkflow> containerStatesByHost) {
+    metricsByHost.clear();
+    metricsByHost.putAll(containerStatesByHost);
+  }
+
+  @VisibleForTesting
+  public Long getPipelinesWaitingToCloseByHost(String host) {
+    ContainerStateInWorkflow workflowMetrics = metricsByHost.get(host);
+    return workflowMetrics == null ? null :
+        workflowMetrics.getPipelinesWaitingToClose();
+  }
+
+  @VisibleForTesting
+  public Long getSufficientlyReplicatedByHost(String host) {
+    ContainerStateInWorkflow workflowMetrics = metricsByHost.get(host);
+    return workflowMetrics == null ? null :
+        workflowMetrics.getSufficientlyReplicated();
+  }
+
+  @VisibleForTesting
+  public Long getUnderReplicatedByHost(String host) {
+    ContainerStateInWorkflow workflowMetrics = metricsByHost.get(host);
+    return workflowMetrics == null ? null :
+        workflowMetrics.getUnderReplicatedContainers();
+  }
+
+  @VisibleForTesting
+  public Long getUnhealthyContainersByHost(String host) {
+    ContainerStateInWorkflow workflowMetrics = metricsByHost.get(host);
+    return workflowMetrics == null ? null :
+        workflowMetrics.getUnhealthyContainers();
+  }
+}
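
A quick usage sketch of the API added above, JShell-style (it mirrors the
unit tests later in this mail): create the source, set a gauge, read it
back, and unregister on shutdown.

    import org.apache.hadoop.hdds.scm.node.NodeDecommissionMetrics;

    NodeDecommissionMetrics metrics = NodeDecommissionMetrics.create();
    metrics.setDecommissioningMaintenanceNodesTotal(2);
    assert metrics.getDecommissioningMaintenanceNodesTotal() == 2;
    metrics.unRegister();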
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorTestUtil.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorTestUtil.java
new file mode 100644
index 0000000000..fb4b4e2ad9
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorTestUtil.java
@@ -0,0 +1,141 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdds.scm.node;
+
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
+import org.apache.hadoop.hdds.scm.container.ContainerReplica;
+import org.apache.hadoop.hdds.scm.container.RatisContainerReplicaCount;
+import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaCount;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.apache.hadoop.hdds.server.events.EventHandler;
+import org.apache.hadoop.hdds.server.events.EventPublisher;
+import org.mockito.Mockito;
+
+import java.util.HashSet;
+import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.CLOSED;
+import static org.mockito.Mockito.reset;
+
+/**
+ * Helper class to provide common methods used to test DatanodeAdminMonitor
+ * and NodeDecommissionMetrics for tracking decommission and maintenance mode
+ * workflow progress.
+ */
+public final class DatanodeAdminMonitorTestUtil {
+  private DatanodeAdminMonitorTestUtil() {
+  }
+
+  /**
+   * Generate a new ContainerReplica with the given containerID and State.
+   * @param containerID The ID the replica is associated with
+   * @param nodeState The persistedOpState stored in datanodeDetails.
+   * @param replicaState The state of the generated replica.
+   * @return A containerReplica with the given ID and state
+   */
+  public static ContainerReplica generateReplica(
+      ContainerID containerID,
+      HddsProtos.NodeOperationalState nodeState,
+      StorageContainerDatanodeProtocolProtos.ContainerReplicaProto
+          .State replicaState) {
+    DatanodeDetails dn = MockDatanodeDetails.randomDatanodeDetails();
+    dn.setPersistedOpState(nodeState);
+    return ContainerReplica.newBuilder()
+        .setContainerState(replicaState)
+        .setContainerID(containerID)
+        .setSequenceId(1)
+        .setDatanodeDetails(dn)
+        .build();
+  }
+
+  /**
+   * Create a ContainerReplicaCount object, including a container with the
+   * requested ContainerID and state, along with a set of replicas of the given
+   * states.
+   * @param containerID The ID of the container to create an included
+   * @param containerState The state of the container
+   * @param states Create a replica for each of the given states.
+   * @return A ContainerReplicaCount containing the generated container and
+   *         replica set
+   */
+  public static ContainerReplicaCount generateReplicaCount(
+      ContainerID containerID,
+      HddsProtos.LifeCycleState containerState,
+      HddsProtos.NodeOperationalState...states) {
+    Set<ContainerReplica> replicas = new HashSet<>();
+    for (HddsProtos.NodeOperationalState s : states) {
+      replicas.add(generateReplica(containerID, s, CLOSED));
+    }
+    ContainerInfo container = new ContainerInfo.Builder()
+        .setContainerID(containerID.getId())
+        .setState(containerState)
+        .build();
+
+    return new RatisContainerReplicaCount(container, replicas, 0, 0, 3, 2);
+  }
+
+  /**
+   * The only interaction the DatanodeAdminMonitor has with the
+   * ReplicationManager, is to request a ContainerReplicaCount object for each
+   * container on nodes being deocmmissioned or moved to maintenance. This
+   * method mocks that interface to return a ContainerReplicaCount with a
+   * container in the given containerState and a set of replias in the given
+   * replicaStates.
+   * @param containerState
+   * @param replicaStates
+   * @throws ContainerNotFoundException
+   */
+  public static void mockGetContainerReplicaCount(
+      ReplicationManager repManager,
+      HddsProtos.LifeCycleState containerState,
+      HddsProtos.NodeOperationalState...replicaStates)
+      throws ContainerNotFoundException {
+    reset(repManager);
+    Mockito.when(repManager.getContainerReplicaCount(
+        Mockito.any(ContainerID.class)))
+        .thenAnswer(invocation ->
+            generateReplicaCount((ContainerID)invocation.getArguments()[0],
+                containerState, replicaStates));
+  }
+
+  /**
+   * This simple internal class is used to track and handle any DatanodeAdmin
+   * events fired by the DatanodeAdminMonitor during tests.
+   */
+  public static class DatanodeAdminHandler implements
+          EventHandler<DatanodeDetails> {
+    private AtomicInteger invocation = new AtomicInteger(0);
+
+    @Override
+    public void onMessage(final DatanodeDetails dn,
+                          final EventPublisher publisher) {
+      invocation.incrementAndGet();
+    }
+
+    public int getInvocation() {
+      return invocation.get();
+    }
+  }
+}
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
index 8afe2a1810..e2b4e13037 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
@@ -21,20 +21,12 @@ import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.protocol.DatanodeDetails;
 import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
-import org.apache.hadoop.hdds.protocol.proto
-    .StorageContainerDatanodeProtocolProtos.ContainerReplicaProto;
 import org.apache.hadoop.hdds.scm.container.ContainerID;
-import org.apache.hadoop.hdds.scm.container.RatisContainerReplicaCount;
-import org.apache.hadoop.hdds.scm.container.ContainerInfo;
 import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
-import org.apache.hadoop.hdds.scm.container.ContainerReplica;
-import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaCount;
 import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
 import org.apache.hadoop.hdds.scm.container.SimpleMockNodeManager;
 import org.apache.hadoop.hdds.scm.events.SCMEvents;
 import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
-import org.apache.hadoop.hdds.server.events.EventHandler;
-import org.apache.hadoop.hdds.server.events.EventPublisher;
 import org.apache.hadoop.hdds.server.events.EventQueue;
 import org.apache.hadoop.security.authentication.client.AuthenticationException;
 import org.junit.jupiter.api.BeforeEach;
@@ -43,7 +35,6 @@ import org.mockito.Mockito;
 import java.io.IOException;
 import java.util.HashSet;
 import java.util.Set;
-import java.util.concurrent.atomic.AtomicInteger;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -51,8 +42,6 @@ import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalSt
 import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE;
 import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_MAINTENANCE;
 import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
-import static org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State.CLOSED;
-import static org.mockito.Mockito.reset;
 
 /**
  * Tests to ensure the DatanodeAdminMonitor is working correctly. This class
@@ -64,7 +53,8 @@ public class TestDatanodeAdminMonitor {
   private SimpleMockNodeManager nodeManager;
   private OzoneConfiguration conf;
   private DatanodeAdminMonitorImpl monitor;
-  private DatanodeAdminHandler startAdminHandler;
+  private DatanodeAdminMonitorTestUtil
+          .DatanodeAdminHandler startAdminHandler;
   private ReplicationManager repManager;
   private EventQueue eventQueue;
 
@@ -73,7 +63,8 @@ public class TestDatanodeAdminMonitor {
     conf = new OzoneConfiguration();
 
     eventQueue = new EventQueue();
-    startAdminHandler = new DatanodeAdminHandler();
+    startAdminHandler = new DatanodeAdminMonitorTestUtil
+            .DatanodeAdminHandler();
     eventQueue.addHandler(SCMEvents.START_ADMIN_ON_NODE, startAdminHandler);
 
     nodeManager = new SimpleMockNodeManager();
@@ -82,6 +73,7 @@ public class TestDatanodeAdminMonitor {
 
     monitor =
         new DatanodeAdminMonitorImpl(conf, eventQueue, nodeManager, repManager);
+    monitor.setMetrics(NodeDecommissionMetrics.create());
   }
 
   @Test
@@ -173,11 +165,13 @@ public class TestDatanodeAdminMonitor {
     nodeManager.setContainers(dn1, generateContainers(3));
     // Mock Replication Manager to return ContainerReplicaCount's which
     // always have a DECOMMISSIONED replica.
-    mockGetContainerReplicaCount(
-        HddsProtos.LifeCycleState.CLOSED,
-        DECOMMISSIONED,
-        IN_SERVICE,
-        IN_SERVICE);
+    DatanodeAdminMonitorTestUtil
+            .mockGetContainerReplicaCount(
+                    repManager,
+                    HddsProtos.LifeCycleState.CLOSED,
+                    DECOMMISSIONED,
+                    IN_SERVICE,
+                    IN_SERVICE);
 
     // Run the monitor for the first time and the node will transition to
     // REPLICATE_CONTAINERS as there are no pipelines to close.
@@ -197,11 +191,13 @@ public class TestDatanodeAdminMonitor {
     // Now change the replicationManager mock to return 3 CLOSED replicas
     // and the node should complete the REPLICATE_CONTAINERS step, moving to
     // complete which will end the decommission workflow
-    mockGetContainerReplicaCount(
-        HddsProtos.LifeCycleState.CLOSED,
-        IN_SERVICE,
-        IN_SERVICE,
-        IN_SERVICE);
+    DatanodeAdminMonitorTestUtil
+            .mockGetContainerReplicaCount(
+                    repManager,
+                    HddsProtos.LifeCycleState.CLOSED,
+                    IN_SERVICE,
+                    IN_SERVICE,
+                    IN_SERVICE);
 
     monitor.run();
 
@@ -219,11 +215,13 @@ public class TestDatanodeAdminMonitor {
             HddsProtos.NodeState.HEALTHY));
 
     nodeManager.setContainers(dn1, generateContainers(3));
-    mockGetContainerReplicaCount(
-        HddsProtos.LifeCycleState.CLOSED,
-        DECOMMISSIONED,
-        IN_SERVICE,
-        IN_SERVICE);
+    DatanodeAdminMonitorTestUtil
+            .mockGetContainerReplicaCount(
+                    repManager,
+                    HddsProtos.LifeCycleState.CLOSED,
+                    DECOMMISSIONED,
+                    IN_SERVICE,
+                    IN_SERVICE);
 
     // Add the node to the monitor, it should have 3 under-replicated containers
     // after the first run
@@ -254,9 +252,13 @@ public class TestDatanodeAdminMonitor {
             HddsProtos.NodeState.HEALTHY));
 
     nodeManager.setContainers(dn1, generateContainers(3));
-    mockGetContainerReplicaCount(
-        HddsProtos.LifeCycleState.CLOSED,
-        DECOMMISSIONED, IN_SERVICE, IN_SERVICE);
+    DatanodeAdminMonitorTestUtil
+            .mockGetContainerReplicaCount(
+                    repManager,
+                    HddsProtos.LifeCycleState.CLOSED,
+                    DECOMMISSIONED,
+                    IN_SERVICE,
+                    IN_SERVICE);
 
     // Add the node to the monitor, it should have 3 under-replicated containers
     // after the first run
@@ -344,11 +346,13 @@ public class TestDatanodeAdminMonitor {
             HddsProtos.NodeState.HEALTHY));
 
     nodeManager.setContainers(dn1, generateContainers(3));
-    mockGetContainerReplicaCount(
-        HddsProtos.LifeCycleState.CLOSED,
-        IN_MAINTENANCE,
-        ENTERING_MAINTENANCE,
-        IN_MAINTENANCE);
+    DatanodeAdminMonitorTestUtil
+            .mockGetContainerReplicaCount(
+                    repManager,
+                    HddsProtos.LifeCycleState.CLOSED,
+                    IN_MAINTENANCE,
+                    ENTERING_MAINTENANCE,
+                    IN_MAINTENANCE);
 
    // Add the node to the monitor; it should transition to
     // REPLICATE_CONTAINERS as the containers are under-replicated for
@@ -432,51 +436,6 @@ public class TestDatanodeAdminMonitor {
     return containers;
   }
 
-  /**
-   * Create a ContainerReplicaCount object, including a container with the
-   * requested ContainerID and state, along with a set of replicas of the given
-   * states.
-   * @param containerID The ID of the container to create an included
-   * @param containerState The state of the container
-   * @param states Create a replica for each of the given states.
-   * @return A ContainerReplicaCount containing the generated container and
-   *         replica set
-   */
-  private ContainerReplicaCount generateReplicaCount(ContainerID containerID,
-      HddsProtos.LifeCycleState containerState,
-      HddsProtos.NodeOperationalState...states) {
-    Set<ContainerReplica> replicas = new HashSet<>();
-    for (HddsProtos.NodeOperationalState s : states) {
-      replicas.add(generateReplica(containerID, s, CLOSED));
-    }
-    ContainerInfo container = new ContainerInfo.Builder()
-        .setContainerID(containerID.getId())
-        .setState(containerState)
-        .build();
-
-    return new RatisContainerReplicaCount(container, replicas, 0, 0, 3, 2);
-  }
-
-  /**
-   * Generate a new ContainerReplica with the given containerID and State.
-   * @param containerID The ID the replica is associated with
-   * @param nodeState The persistedOpState stored in datanodeDetails.
-   * @param replicaState The state of the generated replica.
-   * @return A containerReplica with the given ID and state
-   */
-  private ContainerReplica generateReplica(ContainerID containerID,
-      HddsProtos.NodeOperationalState nodeState,
-      ContainerReplicaProto.State replicaState) {
-    DatanodeDetails dn = MockDatanodeDetails.randomDatanodeDetails();
-    dn.setPersistedOpState(nodeState);
-    return ContainerReplica.newBuilder()
-        .setContainerState(replicaState)
-        .setContainerID(containerID)
-        .setSequenceId(1)
-        .setDatanodeDetails(dn)
-        .build();
-  }
-
   /**
    * Helper method to get the first node from the set of trackedNodes within
    * the monitor.
@@ -486,47 +445,4 @@ public class TestDatanodeAdminMonitor {
     return
         monitor.getTrackedNodes().toArray(new DatanodeDetails[0])[0];
   }
-
-  /**
-   * The only interaction the DatanodeAdminMonitor has with the
-   * ReplicationManager, is to request a ContainerReplicaCount object for each
-   * container on nodes being deocmmissioned or moved to maintenance. This
-   * method mocks that interface to return a ContainerReplicaCount with a
-   * container in the given containerState and a set of replias in the given
-   * replicaStates.
-   * @param containerState
-   * @param replicaStates
-   * @throws ContainerNotFoundException
-   */
-  private void mockGetContainerReplicaCount(
-      HddsProtos.LifeCycleState containerState,
-      HddsProtos.NodeOperationalState...replicaStates)
-      throws ContainerNotFoundException {
-    reset(repManager);
-    Mockito.when(repManager.getContainerReplicaCount(
-        Mockito.any(ContainerID.class)))
-        .thenAnswer(invocation ->
-            generateReplicaCount((ContainerID)invocation.getArguments()[0],
-                containerState, replicaStates));
-  }
-
-  /**
-   * This simple internal class is used to track and handle any DatanodeAdmin
-   * events fired by the DatanodeAdminMonitor during tests.
-   */
-  private static class DatanodeAdminHandler implements
-      EventHandler<DatanodeDetails> {
-
-    private AtomicInteger invocation = new AtomicInteger(0);
-
-    @Override
-    public void onMessage(final DatanodeDetails dn,
-                          final EventPublisher publisher) {
-      invocation.incrementAndGet();
-    }
-
-    public int getInvocation() {
-      return invocation.get();
-    }
-  }
 }
\ No newline at end of file
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionMetrics.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionMetrics.java
new file mode 100644
index 0000000000..0383ef122d
--- /dev/null
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionMetrics.java
@@ -0,0 +1,343 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdds.scm.node;
+
+import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
+import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
+import org.apache.hadoop.hdds.scm.container.ContainerID;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
+import org.apache.hadoop.hdds.scm.container.SimpleMockNodeManager;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
+import org.apache.hadoop.hdds.scm.events.SCMEvents;
+import org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException;
+import org.apache.hadoop.hdds.server.events.EventQueue;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.DECOMMISSIONED;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.DECOMMISSIONING;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.ENTERING_MAINTENANCE;
+import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState.IN_SERVICE;
+
+/**
+ * Tests for the NodeDecommissionMetrics class.
+ */
+public class TestNodeDecommissionMetrics {
+  private NodeDecommissionMetrics metrics;
+  private SimpleMockNodeManager nodeManager;
+  private OzoneConfiguration conf;
+  private DatanodeAdminMonitorImpl monitor;
+  private DatanodeAdminMonitorTestUtil
+          .DatanodeAdminHandler startAdminHandler;
+  private ReplicationManager repManager;
+  private EventQueue eventQueue;
+
+
+  @BeforeEach
+  public void setup() {
+    conf = new OzoneConfiguration();
+    eventQueue = new EventQueue();
+    startAdminHandler = new DatanodeAdminMonitorTestUtil
+        .DatanodeAdminHandler();
+    eventQueue.addHandler(SCMEvents.START_ADMIN_ON_NODE, startAdminHandler);
+    nodeManager = new SimpleMockNodeManager();
+    repManager = Mockito.mock(ReplicationManager.class);
+    monitor =
+        new DatanodeAdminMonitorImpl(
+            conf, eventQueue, nodeManager, repManager);
+    metrics = NodeDecommissionMetrics.create();
+    monitor.setMetrics(metrics);
+  }
+
+  @AfterEach
+  public void after() {
+    metrics.unRegister();
+  }
+
+  /**
+   * Test for collecting metric for nodes tracked in decommissioning
+   * and maintenance workflow. The DN is entering maintenance mode.
+   */
+  @Test
+  public void testDecommMonitorCollectTrackedNodes() {
+    DatanodeDetails dn1 = MockDatanodeDetails.randomDatanodeDetails();
+    nodeManager.register(dn1,
+        new NodeStatus(ENTERING_MAINTENANCE,
+            HddsProtos.NodeState.HEALTHY));
+    monitor.startMonitoring(dn1);
+    monitor.run();
+    Assertions.assertEquals(1,
+        metrics.getDecommissioningMaintenanceNodesTotal());
+  }
+
+  /**
+   * Test for collecting metric for nodes tracked in workflow that are
+   * in recommission workflow. The DN is decommissioned, then recommissioned.
+   */
+  @Test
+  public void testDecommMonitorCollectRecommissionNodes() {
+    DatanodeDetails dn1 = MockDatanodeDetails.randomDatanodeDetails();
+    nodeManager.register(dn1,
+        new NodeStatus(DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+    monitor.startMonitoring(dn1);
+    monitor.run();
+    // Recommission by putting node back in service.
+    // Stop monitor and run.
+    monitor.stopMonitoring(dn1);
+    monitor.run();
+    Assertions.assertEquals(0,
+        metrics.getDecommissioningMaintenanceNodesTotal());
+    Assertions.assertEquals(1,
+        metrics.getRecommissionNodesTotal());
+  }
+
+  /**
+   * Test for collecting metric for pipelines waiting to be closed when
+   * datanode enters decommissioning workflow.
+   */
+  @Test
+  public void testDecommMonitorCollectPipelinesWaitingClosed() {
+    DatanodeDetails dn1 = MockDatanodeDetails.createDatanodeDetails(
+        "datanode_host1",
+        "/r1/ng1");
+    nodeManager.register(dn1,
+        new NodeStatus(HddsProtos.NodeOperationalState.DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+    // Ensure the node has some pipelines
+    nodeManager.setPipelines(dn1, 2);
+    // Add the node to the monitor
+    monitor.startMonitoring(dn1);
+    monitor.run();
+    // Ensure a StartAdmin event was fired
+    eventQueue.processAll(20000);
+    Assertions.assertEquals(2,
+        metrics.getPipelinesWaitingToCloseTotal());
+
+    // should have host specific metric collected
+    // for datanode_host1
+    Assertions.assertEquals(2,
+        metrics.getPipelinesWaitingToCloseByHost(
+            "datanode_host1"));
+    // Clear the pipelines and the metric collected for
+    // datanode_host1 should clear
+    nodeManager.setPipelines(dn1, 0);
+    monitor.run();
+    eventQueue.processAll(20000);
+    Assertions.assertEquals(0,
+        metrics.getPipelinesWaitingToCloseByHost(
+            "datanode_host1"));
+  }
+
+  /**
+   * Test for collecting metric for under replicated containers
+   * from nodes in decommissioning and maintenance workflow.
+   */
+  @Test
+  public void testDecommMonitorCollectUnderReplicated()
+          throws ContainerNotFoundException, NodeNotFoundException {
+    DatanodeDetails dn1 = MockDatanodeDetails.createDatanodeDetails(
+        "datanode_host1",
+        "/r1/ng1");
+    nodeManager.register(dn1,
+        new NodeStatus(HddsProtos.NodeOperationalState.DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+
+    Set<ContainerID> containers = new HashSet<>();
+    containers.add(ContainerID.valueOf(1));
+
+    // Create a container with 3 replicas: 2 in-service and
+    // 1 decommissioned, which results in an under-replicated
+    // container.
+    nodeManager.setContainers(dn1, containers);
+    DatanodeAdminMonitorTestUtil
+        .mockGetContainerReplicaCount(repManager,
+            HddsProtos.LifeCycleState.CLOSED,
+            DECOMMISSIONED,
+            IN_SERVICE,
+            IN_SERVICE);
+
+    // Add the node to the monitor, it should have 1 under-replicated
+    // container after the first run
+    monitor.startMonitoring(dn1);
+    monitor.run();
+    Assertions.assertEquals(1,
+        metrics.getContainersUnderReplicatedTotal());
+
+    // should have host specific metric collected
+    // for datanode_host1
+    Assertions.assertEquals(1,
+        metrics.getUnderReplicatedByHost("datanode_host1"));
+  }
+
+  /**
+   * Test for collecting metric for sufficiently replicated containers
+   * from nodes in decommissioning and maintenance workflow.
+   */
+  @Test
+  public void testDecommMonitorCollectSufficientlyReplicated()
+          throws ContainerNotFoundException, NodeNotFoundException {
+    DatanodeDetails dn1 = MockDatanodeDetails.createDatanodeDetails(
+        "datanode_host1",
+        "/r1/ng1");
+    nodeManager.register(dn1,
+        new NodeStatus(DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+    Set<ContainerID> containers = new HashSet<>();
+    containers.add(ContainerID.valueOf(1));
+
+    // create container with 3 replicas,
+    // all in-service
+    nodeManager.setContainers(dn1, containers);
+    DatanodeAdminMonitorTestUtil
+        .mockGetContainerReplicaCount(repManager,
+            HddsProtos.LifeCycleState.CLOSED,
+            IN_SERVICE,
+            IN_SERVICE,
+            IN_SERVICE);
+    monitor.startMonitoring(dn1);
+
+    monitor.run();
+    // expect dn in decommissioning workflow with container
+    // sufficiently replicated
+    Assertions.assertEquals(1,
+        metrics.getContainersSufficientlyReplicatedTotal());
+
+    // should have host specific metric collected
+    // for datanode_host1
+    Assertions.assertEquals(1,
+        metrics.getSufficientlyReplicatedByHost("datanode_host1"));
+  }
+
+  /**
+   * Test for collecting metric for unhealthy containers
+   * from nodes in decommissioning and maintenance workflow.
+   */
+  @Test
+  public void testDecommMonitorCollectUnhealthyContainers()
+      throws ContainerNotFoundException, NodeNotFoundException {
+    DatanodeDetails dn1 = MockDatanodeDetails.createDatanodeDetails(
+        "datanode_host1",
+        "/r1/ng1");
+    nodeManager.register(dn1,
+        new NodeStatus(DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+    Set<ContainerID> containers = new HashSet<>();
+    containers.add(ContainerID.valueOf(1));
+
+    // Set an OPEN container whose single replica is in CLOSED state on an
+    // in-service node; this yields a monitored unhealthy container replica.
+    nodeManager.setContainers(dn1, containers);
+    DatanodeAdminMonitorTestUtil
+        .mockGetContainerReplicaCount(repManager,
+            HddsProtos.LifeCycleState.OPEN,
+            IN_SERVICE);
+    monitor.startMonitoring(dn1);
+
+    monitor.run();
+    Assertions.assertEquals(1,
+        metrics.getContainersUnhealthyTotal());
+
+    // should have host specific metric collected
+    // for datanode_host1
+    Assertions.assertEquals(1,
+        metrics.getUnhealthyContainersByHost(
+            "datanode_host1"));
+  }
+
+  /**
+   * Test for collecting aggregated metric for replicated state -
+   * total number of under-replicated containers over multiple
+   * datanodes in the decommissioning and maintenance workflow.
+   */
+  @Test
+  public void testDecommMonitorCollectionMultipleDnContainers()
+      throws ContainerNotFoundException, NodeNotFoundException {
+    // test metric aggregation over several datanodes
+    DatanodeDetails dn1 = MockDatanodeDetails.randomDatanodeDetails();
+    DatanodeDetails dn2 = MockDatanodeDetails.randomDatanodeDetails();
+
+    nodeManager.register(dn1,
+        new NodeStatus(DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+    nodeManager.register(dn2,
+        new NodeStatus(DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+
+    Set<ContainerID> containersDn1 = new HashSet<>();
+    containersDn1.add(ContainerID.valueOf(1));
+    containersDn1.add(ContainerID.valueOf(2));
+
+    nodeManager.setContainers(dn1, containersDn1);
+
+    Set<ContainerID> containersDn2 = new HashSet<>();
+    containersDn2.add(ContainerID.valueOf(3));
+
+    nodeManager.setContainers(dn2, containersDn2);
+    DatanodeAdminMonitorTestUtil
+        .mockGetContainerReplicaCount(repManager,
+            HddsProtos.LifeCycleState.CLOSED,
+            DECOMMISSIONED,
+            IN_SERVICE,
+            IN_SERVICE);
+
+    monitor.startMonitoring(dn1);
+    monitor.startMonitoring(dn2);
+
+    monitor.run();
+    Assertions.assertEquals(3,
+        metrics.getContainersUnderReplicatedTotal());
+  }
+
+  /**
+   * Test for collecting aggregated metric for total number
+   * of pipelines waiting to close - over multiple
+   * datanodes in the decommissioning and maintenance workflow.
+   */
+  @Test
+  public void testDecommMonitorCollectionMultipleDnPipelines() {
+    // test metric aggregation over several datanodes
+    DatanodeDetails dn1 = MockDatanodeDetails.randomDatanodeDetails();
+    DatanodeDetails dn2 = MockDatanodeDetails.randomDatanodeDetails();
+
+    nodeManager.register(dn1,
+        new NodeStatus(DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+    nodeManager.register(dn2,
+        new NodeStatus(DECOMMISSIONING,
+            HddsProtos.NodeState.HEALTHY));
+
+    nodeManager.setPipelines(dn1, 2);
+    nodeManager.setPipelines(dn2, 1);
+
+    monitor.startMonitoring(dn1);
+    monitor.startMonitoring(dn2);
+
+    monitor.run();
+    Assertions.assertEquals(3,
+        metrics.getPipelinesWaitingToCloseTotal());
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org


[ozone] 02/02: HDDS-7453. Check certificate expiration at service startup, renew if necessary. (#3930)

Posted by ca...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

captainzmc pushed a commit to branch ozone-1.3
in repository https://gitbox.apache.org/repos/asf/ozone.git

commit 2911f933b9351c098629b441cb1c97f58f0d3fd2
Author: Istvan Fajth <fa...@gmail.com>
AuthorDate: Wed Nov 9 04:33:49 2022 +0100

    HDDS-7453. Check certificate expiration at service startup, renew if necessary. (#3930)
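
In outline, the startup check added here compares the certificate's notAfter
date against a renewal grace period (the new hdds.x509.renew.grace.duration
key below, default P28D) and renews when expiry falls inside that window. A
minimal sketch of that decision, using a hypothetical helper rather than the
actual DefaultCertificateClient API:

    import java.security.cert.X509Certificate;
    import java.time.Duration;
    import java.time.Instant;

    final class StartupCertCheck {
      // Hypothetical helper: true when the certificate expires within the
      // grace period and should be renewed before serving traffic.
      static boolean needsRenewal(X509Certificate cert, Duration grace) {
        Instant expiry = cert.getNotAfter().toInstant();
        return Instant.now().isAfter(expiry.minus(grace));
      }
    }

    // Example: Duration.parse("P28D") yields the default grace period.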
---
 .../org/apache/hadoop/hdds/HddsConfigKeys.java     | 10 +++
 .../hadoop/hdds/security/x509/SecurityConfig.java  | 18 ++++++
 .../apache/hadoop/ozone/common/StorageInfo.java    |  4 ++
 .../common/src/main/resources/ozone-default.xml    |  8 +++
 .../apache/hadoop/ozone/HddsDatanodeService.java   |  5 ++
 .../hadoop/ozone/TestHddsSecureDatanodeInit.java   |  2 +-
 .../x509/certificate/client/CertificateClient.java |  3 +-
 .../client/CommonCertificateClient.java            |  6 ++
 .../client/DefaultCertificateClient.java           | 60 ++++++++++++++++--
 .../certificate/client/SCMCertificateClient.java   |  3 +
 .../client/TestCertificateClientInit.java          |  2 +-
 .../client/TestDefaultCertificateClient.java       | 52 +++++++++++++++
 .../hadoop/ozone/om/TestSecureOzoneManager.java    |  2 +-
 .../java/org/apache/hadoop/ozone/om/OMStorage.java |  4 ++
 .../org/apache/hadoop/ozone/om/OzoneManager.java   | 73 +++++++++++-----------
 .../org/apache/hadoop/ozone/recon/ReconServer.java | 25 ++++----
 .../hadoop/ozone/recon/scm/ReconStorageConfig.java |  4 ++
 17 files changed, 223 insertions(+), 58 deletions(-)

diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
index 5f8f6f5259..4c017e831c 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/HddsConfigKeys.java
@@ -184,6 +184,16 @@ public final class HddsConfigKeys {
   // Default Certificate duration to one year.
   public static final String HDDS_X509_DEFAULT_DURATION_DEFAULT = "P365D";
 
+  /**
+   * Duration of the grace period within which a certificate should be
+   * renewed before the current one expires.
+   * Default is 28 days.
+   */
+  public static final String HDDS_X509_RENEW_GRACE_DURATION =
+      "hdds.x509.renew.grace.duration";
+
+  public static final String HDDS_X509_RENEW_GRACE_DURATION_DEFAULT = "P28D";
+
   /**
    * Do not instantiate.
    */
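
The two duration defaults above ("P365D" and "P28D") are ISO-8601 strings parsed with java.time.Duration, which is also how SecurityConfig consumes them in the next hunk. A minimal sketch, assuming only the java.time API:

    import java.time.Duration;

    public final class GraceDurationDemo {
      public static void main(String[] args) {
        // ISO-8601 duration strings as used by the hdds.x509.* keys above.
        Duration grace = Duration.parse("P28D");   // renewal grace period
        Duration life = Duration.parse("P365D");   // certificate lifetime
        // Renewal becomes due once less than the grace period remains:
        System.out.println(life.minus(grace).toDays());  // prints 337
      }
    }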
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/security/x509/SecurityConfig.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/security/x509/SecurityConfig.java
index b02ce1bdeb..751711c5ce 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/security/x509/SecurityConfig.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/security/x509/SecurityConfig.java
@@ -63,6 +63,8 @@ import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_FILE_NAME;
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_FILE_NAME_DEFAULT;
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_MAX_DURATION;
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_MAX_DURATION_DEFAULT;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_RENEW_GRACE_DURATION;
+import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_RENEW_GRACE_DURATION_DEFAULT;
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_SIGNATURE_ALGO;
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_X509_SIGNATURE_ALGO_DEFAULT;
 import static org.apache.hadoop.hdds.HddsConfigKeys.OZONE_METADATA_DIRS;
@@ -100,6 +102,7 @@ public class SecurityConfig {
   private final String certificateFileName;
   private final boolean grpcTlsEnabled;
   private final Duration defaultCertDuration;
+  private final Duration renewalGracePeriod;
   private final boolean isSecurityEnabled;
   private final String crlName;
   private boolean grpcTlsUseTestCert;
@@ -164,6 +167,10 @@ public class SecurityConfig {
         this.configuration.get(HDDS_X509_DEFAULT_DURATION,
             HDDS_X509_DEFAULT_DURATION_DEFAULT);
     defaultCertDuration = Duration.parse(certDurationString);
+    String renewalGraceDurationString = this.configuration.get(
+        HDDS_X509_RENEW_GRACE_DURATION,
+        HDDS_X509_RENEW_GRACE_DURATION_DEFAULT);
+    renewalGracePeriod = Duration.parse(renewalGraceDurationString);
 
     if (maxCertDuration.compareTo(defaultCertDuration) < 0) {
       LOG.error("Certificate duration {} should not be greater than Maximum " +
@@ -216,6 +223,17 @@ public class SecurityConfig {
     return defaultCertDuration;
   }
 
+  /**
+   * Duration of the grace period within which a certificate should be
+   * renewed before the current one expires.
+   * Default is 28 days.
+   *
+   * @return the value of hdds.x509.renew.grace.duration property
+   */
+  public Duration getRenewalGracePeriod() {
+    return renewalGracePeriod;
+  }
+
   /**
    * Returns the Standard Certificate file name.
    *
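
The getter above feeds the renewal decision made later in DefaultCertificateClient.init(): a certificate is due for renewal once "now plus the grace period" passes its notAfter instant. A minimal restatement of that comparison as a helper (the class and method names here are illustrative):

    import java.security.cert.X509Certificate;
    import java.time.Duration;
    import java.time.Instant;

    final class RenewalCheck {
      // Renew when less than gracePeriod remains before the certificate's
      // notAfter date; this mirrors the check init() applies further below.
      static boolean shouldRenew(X509Certificate cert, Duration gracePeriod) {
        return Instant.now().plus(gracePeriod)
            .isAfter(cert.getNotAfter().toInstant());
      }
    }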
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/StorageInfo.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/StorageInfo.java
index 6ba438456e..f7d92e5d2b 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/StorageInfo.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/common/StorageInfo.java
@@ -154,6 +154,10 @@ public class StorageInfo {
     properties.setProperty(key, value);
   }
 
+  public void unsetProperty(String key) {
+    properties.remove(key);
+  }
+
   public void setClusterId(String clusterId) {
     properties.setProperty(CLUSTER_ID, clusterId);
   }
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index a481ba5e36..0dae783e69 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -2077,6 +2077,14 @@
       valid. The formats accepted are based on the ISO-8601 duration format
       PnDTnHnMn.nS</description>
   </property>
+  <property>
+    <name>hdds.x509.renew.grace.duration</name>
+    <value>P28D</value>
+    <tag>OZONE, HDDS, SECURITY</tag>
+    <description>Duration of the grace period within which a certificate should be
+      renewed before the current one expires. Default is 28 days.
+    </description>
+  </property>
   <property>
     <name>hdds.x509.dir.name</name>
     <value>certs</value>
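
Deployments that want a different renewal window can override the default in ozone-site.xml; the P14D value below is only an illustrative example:

    <!-- Example override in ozone-site.xml; P14D is an illustrative value. -->
    <property>
      <name>hdds.x509.renew.grace.duration</name>
      <value>P14D</value>
    </property>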
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java
index d36a178550..d775a426da 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/HddsDatanodeService.java
@@ -339,6 +339,11 @@ public class HddsDatanodeService extends GenericCli implements ServicePlugin {
     LOG.info("Initializing secure Datanode.");
 
     CertificateClient.InitResponse response = dnCertClient.init();
+    if (response.equals(CertificateClient.InitResponse.REINIT)) {
+      LOG.info("Re-initialize certificate client.");
+      dnCertClient = new DNCertificateClient(new SecurityConfig(conf));
+      response = dnCertClient.init();
+    }
     LOG.info("Init response: {}", response);
     switch (response) {
     case SUCCESS:
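
The same init-once, retry-on-REINIT shape reappears for OzoneManager and ReconServer later in this patch. A condensed sketch of the shared control flow, with a hypothetical factory standing in for the component-specific client construction:

    import java.util.function.Supplier;

    import org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient;

    final class ReinitSketch {
      // Hypothetical helper mirroring the pattern used by the datanode, OM
      // and Recon paths: run init() once; on REINIT the expiring material
      // has already been removed, so rebuild the client and init() again.
      static CertificateClient.InitResponse initWithRenewal(
          Supplier<CertificateClient> factory) throws Exception {
        CertificateClient client = factory.get();
        CertificateClient.InitResponse response = client.init();
        if (response == CertificateClient.InitResponse.REINIT) {
          client = factory.get();
          response = client.init();
        }
        return response;
      }
    }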
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/TestHddsSecureDatanodeInit.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/TestHddsSecureDatanodeInit.java
index 2d0cb37141..39e36baae0 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/TestHddsSecureDatanodeInit.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/TestHddsSecureDatanodeInit.java
@@ -102,7 +102,7 @@ public class TestHddsSecureDatanodeInit {
     X509Certificate x509Certificate = null;
 
     x509Certificate = KeyStoreTestUtil.generateCertificate(
-        "CN=Test", new KeyPair(publicKey, privateKey), 10,
+        "CN=Test", new KeyPair(publicKey, privateKey), 365,
         securityConfig.getSignatureAlgo());
     certHolder = new X509CertificateHolder(x509Certificate.getEncoded());
 
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/CertificateClient.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/CertificateClient.java
index 396452fbb2..3357bb3089 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/CertificateClient.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/CertificateClient.java
@@ -203,7 +203,8 @@ public interface CertificateClient {
     SUCCESS,
     FAILURE,
     GETCERT,
-    RECOVER
+    RECOVER,
+    REINIT
   }
 
   /**
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/CommonCertificateClient.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/CommonCertificateClient.java
index dbf634f915..bb122955a5 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/CommonCertificateClient.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/CommonCertificateClient.java
@@ -25,6 +25,7 @@ import org.slf4j.Logger;
 import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.FAILURE;
 import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.GETCERT;
 import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.RECOVER;
+import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.REINIT;
 import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.SUCCESS;
 
 /**
@@ -102,6 +103,11 @@ public class CommonCertificateClient extends DefaultCertificateClient {
       } else {
         return FAILURE;
       }
+    case EXPIRED_CERT:
+      getLogger().info("Component certificate is about to expire. " +
+          "Initiating renewal.");
+      removeMaterial();
+      return REINIT;
     default:
       log.error("Unexpected case: {} (private/public/cert)",
           Integer.toBinaryString(init.ordinal()));
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/DefaultCertificateClient.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/DefaultCertificateClient.java
index e84826d481..a55db9a427 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/DefaultCertificateClient.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/DefaultCertificateClient.java
@@ -38,6 +38,7 @@ import java.security.SignatureException;
 import java.security.cert.CertStore;
 import java.security.cert.X509Certificate;
 import java.security.spec.InvalidKeySpecException;
+import java.time.Instant;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -47,6 +48,7 @@ import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReentrantLock;
 
+import org.apache.commons.io.FileUtils;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.protocol.SCMSecurityProtocol;
 import org.apache.hadoop.hdds.security.x509.crl.CRLInfo;
@@ -63,8 +65,10 @@ import org.apache.commons.io.FilenameUtils;
 import org.apache.commons.lang3.RandomStringUtils;
 import org.apache.commons.lang3.math.NumberUtils;
 import org.apache.commons.validator.routines.DomainValidator;
+
 import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.FAILURE;
 import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.GETCERT;
+import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.REINIT;
 import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.SUCCESS;
 import static org.apache.hadoop.hdds.security.x509.exceptions.CertificateException.ErrorCode.BOOTSTRAP_ERROR;
 import static org.apache.hadoop.hdds.security.x509.exceptions.CertificateException.ErrorCode.CERTIFICATE_ERROR;
@@ -74,7 +78,6 @@ import static org.apache.hadoop.hdds.security.x509.exceptions.CertificateExcepti
 import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmSecurityClient;
 import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmSecurityClientWithMaxRetry;
 
-import org.apache.ratis.util.FileUtils;
 import org.bouncycastle.cert.X509CertificateHolder;
 import org.slf4j.Logger;
 
@@ -636,7 +639,10 @@ public abstract class DefaultCertificateClient implements CertificateClient {
    * 6. PUBLICKEY_PRIVATEKEY  indicates private and public key were read
    *                          successfully from configured location but
    *                          Certificate.
-   * 7. All                   Keypair as well as certificate is present.
+   * 7. ALL                   Keypair as well as certificate is present.
+   * 8. EXPIRED_CERT          The certificate is present, but it has either
+   *                          already expired or is about to expire within
+   *                          the grace period provided in the configuration.
    *
    * */
   protected enum InitCase {
@@ -647,7 +653,8 @@ public abstract class DefaultCertificateClient implements CertificateClient {
     PRIVATE_KEY,
     PRIVATEKEY_CERT,
     PUBLICKEY_PRIVATEKEY,
-    ALL
+    ALL,
+    EXPIRED_CERT
   }
 
   /**
@@ -657,6 +664,9 @@ public abstract class DefaultCertificateClient implements CertificateClient {
    * 2. Generates and stores a keypair.
    * 3. Try to recover public key if private key and certificate is present
    *    but public key is missing.
+   * 4. Checks whether the certificate is about to expire or has already
+   *    expired; if so, removes the key material and the certificate and
+   *    asks for re-initialization in the result.
    *
    * Truth table:
    *  +--------------+-----------------+--------------+----------------+
@@ -688,6 +698,13 @@ public abstract class DefaultCertificateClient implements CertificateClient {
    * 2. When keypair (public/private key) is available but certificate is
    *    missing.
    *
+   * Returns REINIT in the following case:
+   *    If it would return SUCCESS, but the certificate's expiration date
+   *    falls within the configured grace period, or the certificate has
+   *    already expired.
+   *    The grace period is configured by the hdds.x509.renew.grace.duration
+   *    configuration property.
+   *
    */
   @Override
   public synchronized InitResponse init() throws CertificateException {
@@ -695,7 +712,6 @@ public abstract class DefaultCertificateClient implements CertificateClient {
     PrivateKey pvtKey = getPrivateKey();
     PublicKey pubKey = getPublicKey();
     X509Certificate certificate = getCertificate();
-
     if (pvtKey != null) {
       initCase = initCase | 1 << 2;
     }
@@ -705,8 +721,21 @@ public abstract class DefaultCertificateClient implements CertificateClient {
     if (certificate != null) {
       initCase = initCase | 1;
     }
+
+    boolean successCase =
+        initCase == InitCase.ALL.ordinal() ||
+            initCase == InitCase.PRIVATEKEY_CERT.ordinal();
+    boolean shouldRenew =
+        certificate != null &&
+            Instant.now().plus(securityConfig.getRenewalGracePeriod())
+                .isAfter(certificate.getNotAfter().toInstant());
+
+    if (successCase && shouldRenew) {
+      initCase = InitCase.EXPIRED_CERT.ordinal();
+    }
+
     getLogger().info("Certificate client init case: {}", initCase);
-    Preconditions.checkArgument(initCase < 8, "Not a " +
+    Preconditions.checkArgument(initCase < InitCase.values().length, "Not a " +
         "valid case.");
     InitCase init = InitCase.values()[initCase];
     return handleCase(init);
@@ -766,6 +795,11 @@ public abstract class DefaultCertificateClient implements CertificateClient {
       } else {
         return FAILURE;
       }
+    case EXPIRED_CERT:
+      getLogger().info("Component certificate is about to expire. " +
+          "Initiating renewal.");
+      removeMaterial();
+      return REINIT;
     default:
       getLogger().error("Unexpected case: {} (private/public/cert)",
           Integer.toBinaryString(init.ordinal()));
@@ -774,6 +808,20 @@ public abstract class DefaultCertificateClient implements CertificateClient {
     }
   }
 
+  protected void removeMaterial() throws CertificateException {
+    try {
+      FileUtils.deleteDirectory(
+          securityConfig.getKeyLocation(component).toFile());
+      getLogger().info("Certificate renewal: key material is removed.");
+      FileUtils.deleteDirectory(
+          securityConfig.getCertificateLocation(component).toFile());
+      getLogger().info("Certificate renewal: certificates are removed.");
+    } catch (IOException e) {
+      throw new CertificateException("Certificate renewal failed: could not" +
+          " remove key material.", e);
+    }
+  }
+
   /**
    * Validate keypair and certificate.
    * */
@@ -988,7 +1036,7 @@ public abstract class DefaultCertificateClient implements CertificateClient {
             certName = ROOT_CA_CERT_PREFIX + certName;
           }
 
-          FileUtils.deleteFileQuietly(basePath.resolve(certName).toFile());
+          FileUtils.deleteQuietly(basePath.resolve(certName).toFile());
           // remove in memory
           certificateMap.remove(certId);
 
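For readers tracing the init() arithmetic above: the three artifacts are encoded as bits (private key = bit 2, public key = bit 1, certificate = bit 0), so ALL is 0b111 = 7 and PRIVATEKEY_CERT is 0b101 = 5, and only those two success encodings can be rewritten to EXPIRED_CERT (ordinal 8). A compact demonstration of the encoding, as a sketch:

    public final class InitCaseDemo {
      public static void main(String[] args) {
        // bit 2 = private key, bit 1 = public key, bit 0 = certificate
        int all = (1 << 2) | (1 << 1) | 1;   // 7 == InitCase.ALL.ordinal()
        int pvtKeyCert = (1 << 2) | 1;       // 5 == PRIVATEKEY_CERT.ordinal()
        System.out.println(all + " " + pvtKeyCert);  // prints "7 5"
        // Only these two success encodings are eligible for the override to
        // EXPIRED_CERT (ordinal 8), after which handleCase() returns REINIT.
      }
    }
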
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/SCMCertificateClient.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/SCMCertificateClient.java
index d941d88e43..91acc1e767 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/SCMCertificateClient.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/security/x509/certificate/client/SCMCertificateClient.java
@@ -109,6 +109,9 @@ public class SCMCertificateClient extends DefaultCertificateClient {
       } else {
         return FAILURE;
       }
+    case EXPIRED_CERT:
+      LOG.warn("SCM CA certificate is about to expire!");
+      return SUCCESS;
     default:
       LOG.error("Unexpected case: {} (private/public/cert)",
           Integer.toBinaryString(init.ordinal()));
diff --git a/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/security/x509/certificate/client/TestCertificateClientInit.java b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/security/x509/certificate/client/TestCertificateClientInit.java
index 47b02a9a2e..72730858fc 100644
--- a/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/security/x509/certificate/client/TestCertificateClientInit.java
+++ b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/security/x509/certificate/client/TestCertificateClientInit.java
@@ -212,6 +212,6 @@ public class TestCertificateClientInit {
 
   private X509Certificate getX509Certificate() throws Exception {
     return KeyStoreTestUtil.generateCertificate(
-        "CN=Test", keyPair, 10, securityConfig.getSignatureAlgo());
+        "CN=Test", keyPair, 365, securityConfig.getSignatureAlgo());
   }
 }
\ No newline at end of file
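
The 10-to-365-day bumps in the test certificates across this patch follow directly from the new default: a 10-day certificate is already inside the 28-day grace window the moment it is created, so init() would return REINIT instead of the response the tests expect. The arithmetic, as a sketch:

    import java.time.Duration;
    import java.time.Instant;

    public final class GraceWindowDemo {
      public static void main(String[] args) {
        Duration grace = Duration.parse("P28D");
        Instant now = Instant.now();
        // now + 28d is after now + 10d: a 10-day cert triggers renewal.
        System.out.println(now.plus(grace)
            .isAfter(now.plus(Duration.parse("P10D"))));   // true
        // now + 28d is before now + 365d: a 365-day cert does not.
        System.out.println(now.plus(grace)
            .isAfter(now.plus(Duration.parse("P365D"))));  // false
      }
    }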
diff --git a/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/security/x509/certificate/client/TestDefaultCertificateClient.java b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/security/x509/certificate/client/TestDefaultCertificateClient.java
index fbc5c34f30..394f4b0a3b 100644
--- a/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/security/x509/certificate/client/TestDefaultCertificateClient.java
+++ b/hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/security/x509/certificate/client/TestDefaultCertificateClient.java
@@ -18,6 +18,7 @@
  */
 package org.apache.hadoop.hdds.security.x509.certificate.client;
 
+import org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse;
 import org.apache.hadoop.hdds.security.x509.certificate.utils.CertificateCodec;
 import org.apache.hadoop.hdds.security.x509.exceptions.CertificateException;
 import org.apache.hadoop.hdds.security.x509.keys.KeyCodec;
@@ -35,6 +36,9 @@ import java.security.PrivateKey;
 import java.security.PublicKey;
 import java.security.Signature;
 import java.security.cert.X509Certificate;
+import java.time.Duration;
+import java.util.Calendar;
+import java.util.Date;
 import java.util.UUID;
 
 import org.apache.commons.io.FileUtils;
@@ -48,18 +52,25 @@ import org.apache.hadoop.hdds.security.x509.keys.HDDSKeyGenerator;
 import org.apache.hadoop.security.ssl.KeyStoreTestUtil;
 import org.apache.ozone.test.GenericTestUtils;
 import org.apache.ozone.test.LambdaTestUtils;
+import org.slf4j.Logger;
 
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY;
 import static org.apache.hadoop.hdds.HddsConfigKeys.HDDS_METADATA_DIR_NAME;
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_NAMES;
 import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.FAILURE;
+import static org.apache.hadoop.hdds.security.x509.certificate.client.CertificateClient.InitResponse.REINIT;
 import static org.apache.hadoop.hdds.security.x509.certificate.utils.CertificateCodec.getPEMEncodedString;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.mockito.ArgumentMatchers.anyString;
+import static org.mockito.Mockito.atLeastOnce;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.verify;
+import static org.mockito.Mockito.when;
 
 /**
  * Test class for {@link DefaultCertificateClient}.
@@ -474,4 +485,45 @@ public class TestDefaultCertificateClient {
     omClientLog.clearOutput();
   }
 
+  @Test
+  public void testCertificateExpirationHandlingInInit() throws Exception {
+    String certId = "1L";
+    String compName = "TEST";
+
+    Logger mockLogger = mock(Logger.class);
+
+    SecurityConfig config = mock(SecurityConfig.class);
+    Path nonexistent = Paths.get("nonexistent");
+    when(config.getCertificateLocation(anyString())).thenReturn(nonexistent);
+    when(config.getKeyLocation(anyString())).thenReturn(nonexistent);
+    when(config.getRenewalGracePeriod()).thenReturn(Duration.ofDays(28));
+
+    Calendar cal = Calendar.getInstance();
+    cal.add(Calendar.DAY_OF_YEAR, 2);
+    Date expiration = cal.getTime();
+    X509Certificate mockCert = mock(X509Certificate.class);
+    when(mockCert.getNotAfter()).thenReturn(expiration);
+
+    DefaultCertificateClient client =
+        new DefaultCertificateClient(config, mockLogger, certId, compName) {
+          @Override
+          public PrivateKey getPrivateKey() {
+            return mock(PrivateKey.class);
+          }
+
+          @Override
+          public PublicKey getPublicKey() {
+            return mock(PublicKey.class);
+          }
+
+          @Override
+          public X509Certificate getCertificate() {
+            return mockCert;
+          }
+        };
+
+    InitResponse resp = client.init();
+    verify(mockLogger, atLeastOnce()).info(anyString());
+    assertEquals(REINIT, resp);
+  }
 }
\ No newline at end of file
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestSecureOzoneManager.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestSecureOzoneManager.java
index 32cd7beb17..6347a4f965 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestSecureOzoneManager.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestSecureOzoneManager.java
@@ -164,7 +164,7 @@ public class TestSecureOzoneManager {
     CertificateCodec certCodec =
         new CertificateCodec(securityConfig, COMPONENT);
     X509Certificate x509Certificate = KeyStoreTestUtil.generateCertificate(
-        "CN=Test", new KeyPair(publicKey, privateKey), 10,
+        "CN=Test", new KeyPair(publicKey, privateKey), 365,
         securityConfig.getSignatureAlgo());
     certCodec.writeCertificate(new X509CertificateHolder(
         x509Certificate.getEncoded()));
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMStorage.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMStorage.java
index 31c2aefd1e..0cde1047d5 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMStorage.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OMStorage.java
@@ -55,6 +55,10 @@ public class OMStorage extends Storage {
     getStorageInfo().setProperty(OM_CERT_SERIAL_ID, certSerialId);
   }
 
+  public void unsetOmCertSerialId() throws IOException {
+    getStorageInfo().unsetProperty(OM_CERT_SERIAL_ID);
+  }
+
   public void setOmId(String omId) throws IOException {
     if (getState() == StorageState.INITIALIZED) {
       throw new IOException("OM is already initialized.");
diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
index b5be6c02d7..dd9375e5bf 100644
--- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
+++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/OzoneManager.java
@@ -1228,52 +1228,44 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
     loginOMUserIfSecurityEnabled(conf);
     OMStorage omStorage = new OMStorage(conf);
     StorageState state = omStorage.getState();
-    if (state != StorageState.INITIALIZED) {
-      try {
-        ScmInfo scmInfo = getScmInfo(conf);
-        String clusterId = scmInfo.getClusterId();
-        String scmId = scmInfo.getScmId();
-        if (clusterId == null || clusterId.isEmpty()) {
-          throw new IOException("Invalid Cluster ID");
-        }
-        if (scmId == null || scmId.isEmpty()) {
-          throw new IOException("Invalid SCM ID");
-        }
+    String scmId = null;
+    try {
+      ScmInfo scmInfo = getScmInfo(conf);
+      scmId = scmInfo.getScmId();
+      if (scmId == null || scmId.isEmpty()) {
+        throw new IOException("Invalid SCM ID");
+      }
+      String clusterId = scmInfo.getClusterId();
+      if (clusterId == null || clusterId.isEmpty()) {
+        throw new IOException("Invalid Cluster ID");
+      }
+
+      if (state != StorageState.INITIALIZED) {
         omStorage.setClusterId(clusterId);
-        if (OzoneSecurityUtil.isSecurityEnabled(conf)) {
-          initializeSecurity(conf, omStorage, scmId);
-        }
         omStorage.initialize();
         System.out.println(
             "OM initialization succeeded.Current cluster id for sd="
                 + omStorage.getStorageDir() + ";cid=" + omStorage
                 .getClusterID() + ";layoutVersion=" + omStorage
                 .getLayoutVersion());
-
-        return true;
-      } catch (IOException ioe) {
-        LOG.error("Could not initialize OM version file", ioe);
-        return false;
-      }
-    } else {
-      if (OzoneSecurityUtil.isSecurityEnabled(conf) &&
-          omStorage.getOmCertSerialId() == null) {
-        ScmInfo scmInfo = HAUtils.getScmInfo(conf);
-        String scmId = scmInfo.getScmId();
-        if (scmId == null || scmId.isEmpty()) {
-          throw new IOException("Invalid SCM ID");
-        }
-        LOG.info("OM storage is already initialized. Initializing security");
-        initializeSecurity(conf, omStorage, scmId);
-        omStorage.persistCurrentState();
+      } else {
+        System.out.println(
+            "OM already initialized. Reusing existing cluster id for sd="
+                + omStorage.getStorageDir() + ";cid=" + omStorage
+                .getClusterID() + ";layoutVersion=" + omStorage
+                .getLayoutVersion());
       }
-      System.out.println(
-          "OM already initialized.Reusing existing cluster id for sd="
-              + omStorage.getStorageDir() + ";cid=" + omStorage
-              .getClusterID() + ";layoutVersion=" + omStorage
-              .getLayoutVersion());
-      return true;
+    } catch (IOException ioe) {
+      LOG.error("Could not initialize OM version file", ioe);
+      return false;
+    }
+
+    if (OzoneSecurityUtil.isSecurityEnabled(conf)) {
+      LOG.info("OM storage initialized. Initializing security");
+      initializeSecurity(conf, omStorage, scmId);
     }
+    omStorage.persistCurrentState();
+    return true;
   }
 
   /**
@@ -1289,6 +1281,13 @@ public final class OzoneManager extends ServiceRuntimeInfoImpl
         new OMCertificateClient(new SecurityConfig(conf),
             omStore.getOmCertSerialId());
     CertificateClient.InitResponse response = certClient.init();
+    if (response.equals(CertificateClient.InitResponse.REINIT)) {
+      LOG.info("Re-initialize certificate client.");
+      omStore.unsetOmCertSerialId();
+      omStore.persistCurrentState();
+      certClient = new OMCertificateClient(new SecurityConfig(conf));
+      response = certClient.init();
+    }
     LOG.info("Init response: {}", response);
     switch (response) {
     case SUCCESS:
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconServer.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconServer.java
index 5fb1d2b02b..55e5856f98 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconServer.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/ReconServer.java
@@ -117,19 +117,14 @@ public class ReconServer extends GenericCli {
       loginReconUserIfSecurityEnabled(configuration);
       try {
         if (reconStorage.getState() != INITIALIZED) {
-          if (OzoneSecurityUtil.isSecurityEnabled(configuration)) {
-            initializeCertificateClient(configuration);
-          }
           reconStorage.initialize();
-        } else {
-          if (OzoneSecurityUtil.isSecurityEnabled(configuration) &&
-              reconStorage.getReconCertSerialId() == null) {
-            LOG.info("ReconStorageConfig is already initialized." +
-                "Initializing certificate.");
-            initializeCertificateClient(configuration);
-            reconStorage.persistCurrentState();
-          }
         }
+        if (OzoneSecurityUtil.isSecurityEnabled(configuration)) {
+          LOG.info("ReconStorageConfig initialized. " +
+              "Initializing certificate.");
+          initializeCertificateClient(configuration);
+        }
+        reconStorage.persistCurrentState();
       } catch (Exception e) {
         LOG.error("Error during initializing Recon certificate", e);
       }
@@ -182,6 +177,14 @@ public class ReconServer extends GenericCli {
         reconStorage.getReconCertSerialId());
 
     CertificateClient.InitResponse response = certClient.init();
+    if (response.equals(CertificateClient.InitResponse.REINIT)) {
+      LOG.info("Re-initialize certificate client.");
+      reconStorage.unsetReconCertSerialId();
+      reconStorage.persistCurrentState();
+      certClient = new ReconCertificateClient(new SecurityConfig(configuration),
+          reconStorage.getReconCertSerialId());
+      response = certClient.init();
+    }
     LOG.info("Init response: {}", response);
     switch (response) {
     case SUCCESS:
diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageConfig.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageConfig.java
index 7a82542e6e..6e13408b41 100644
--- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageConfig.java
+++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageConfig.java
@@ -90,5 +90,9 @@ public class ReconStorageConfig extends SCMStorageConfig {
     return getStorageInfo().getProperty(RECON_CERT_SERIAL_ID);
   }
 
+  public void unsetReconCertSerialId() {
+    getStorageInfo().unsetProperty(RECON_CERT_SERIAL_ID);
+  }
+
 
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org