You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by ad...@apache.org on 2021/11/17 14:23:31 UTC

[ozone] branch master updated: HDDS-5848. Introduce more replication metrics (#2758)

This is an automated email from the ASF dual-hosted git repository.

adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 83e5aa3  HDDS-5848. Introduce more replication metrics (#2758)
83e5aa3 is described below

commit 83e5aa3d403c47395a9287374bc1fa0c9d24cce7
Author: Doroszlai, Attila <64...@users.noreply.github.com>
AuthorDate: Wed Nov 17 15:23:13 2021 +0100

    HDDS-5848. Introduce more replication metrics (#2758)
---
 .../container/replication/MeasuredReplicator.java  | 34 ++++++++++++++----
 .../replication/ReplicationSupervisor.java         |  1 -
 .../replication/ReplicationSupervisorMetrics.java  |  1 -
 .../replication/TestMeasuredReplicator.java        | 34 +++++++++++++-----
 .../hdds/scm/container/ReplicationManager.java     | 30 ++++++++++------
 .../replication/ReplicationManagerMetrics.java     | 41 ++++++++++++++++++++++
 .../hdds/scm/container/TestReplicationManager.java | 10 ++++++
 7 files changed, 123 insertions(+), 28 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/MeasuredReplicator.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/MeasuredReplicator.java
index 5d99cc4..d1175ef 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/MeasuredReplicator.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/MeasuredReplicator.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
 import org.apache.hadoop.ozone.container.replication.ReplicationTask.Status;
 
 import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.util.Time;
 
 /**
  * ContainerReplicator wrapper with additional metrics.
@@ -38,19 +39,25 @@ public class MeasuredReplicator implements ContainerReplicator, AutoCloseable {
 
   private final ContainerReplicator delegate;
 
-  @Metric
+  @Metric(about = "Number of successful replication tasks")
   private MutableCounterLong success;
 
-  @Metric
+  @Metric(about = "Time spent on successful replication tasks")
   private MutableGaugeLong successTime;
 
-  @Metric
+  @Metric(about = "Number of failed replication attempts")
   private MutableCounterLong failure;
 
-  @Metric
+  @Metric(about = "Time spent waiting in the queue before starting the task")
   private MutableGaugeLong queueTime;
 
-  @Metric
+  @Metric(about = "Time spent on failed replication attempts")
+  private MutableGaugeLong failureTime;
+
+  @Metric(about = "Bytes transferred for failed replication attempts")
+  private MutableGaugeLong failureBytes;
+
+  @Metric(about = "Bytes transferred for successful replication tasks")
   private MutableGaugeLong transferredBytes;
 
   public MeasuredReplicator(ContainerReplicator delegate) {
@@ -61,18 +68,21 @@ public class MeasuredReplicator implements ContainerReplicator, AutoCloseable {
 
   @Override
   public void replicate(ReplicationTask task) {
-    long start = System.currentTimeMillis();
+    long start = Time.monotonicNow();
 
     long msInQueue =
         (Instant.now().getNano() - task.getQueued().getNano()) / 1_000_000;
     queueTime.incr(msInQueue);
     delegate.replicate(task);
+    long elapsed = Time.monotonicNow() - start;
     if (task.getStatus() == Status.FAILED) {
       failure.incr();
+      failureBytes.incr(task.getTransferredBytes());
+      failureTime.incr(elapsed);
     } else if (task.getStatus() == Status.DONE) {
       transferredBytes.incr(task.getTransferredBytes());
       success.incr();
-      successTime.incr(System.currentTimeMillis() - start);
+      successTime.incr(elapsed);
     }
   }
 
@@ -92,6 +102,11 @@ public class MeasuredReplicator implements ContainerReplicator, AutoCloseable {
   }
 
   @VisibleForTesting
+  public MutableGaugeLong getFailureTime() {
+    return failureTime;
+  }
+
+  @VisibleForTesting
   public MutableCounterLong getFailure() {
     return failure;
   }
@@ -106,4 +121,9 @@ public class MeasuredReplicator implements ContainerReplicator, AutoCloseable {
     return transferredBytes;
   }
 
+  @VisibleForTesting
+  public MutableGaugeLong getFailureBytes() {
+    return failureBytes;
+  }
+
 }
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java
index 86e9cb4..05a4173 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java
@@ -120,7 +120,6 @@ public class ReplicationSupervisor {
    *
    * @return Count of in-flight replications.
    */
-  @VisibleForTesting
   int getInFlightReplications() {
     return containersInFlight.size();
   }
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisorMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisorMetrics.java
index 6dc40db..df48abd 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisorMetrics.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisorMetrics.java
@@ -56,7 +56,6 @@ public class ReplicationSupervisorMetrics implements MetricsSource {
   }
 
   @Override
-  @SuppressWarnings("SuspiciousMethodCalls")
   public void getMetrics(MetricsCollector collector, boolean all) {
     collector.addRecord(SOURCE)
         .addGauge(Interns.info("numInFlightReplications",
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestMeasuredReplicator.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestMeasuredReplicator.java
index 618935e..f8f0b93 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestMeasuredReplicator.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestMeasuredReplicator.java
@@ -59,7 +59,7 @@ public class TestMeasuredReplicator {
   }
 
   @Test
-  public void measureFailureSuccessAndBytes() throws Exception {
+  public void measureFailureSuccessAndBytes() {
     //WHEN
     measuredReplicator.replicate(new ReplicationTask(1L, new ArrayList<>()));
     measuredReplicator.replicate(new ReplicationTask(2L, new ArrayList<>()));
@@ -71,12 +71,14 @@ public class TestMeasuredReplicator {
     Assert.assertEquals(1, measuredReplicator.getFailure().value());
 
     //sum of container ids (success) in kb
-    Assert.assertEquals(4 * 1024,
+    Assert.assertEquals((1 + 3) * 1024,
         measuredReplicator.getTransferredBytes().value());
+    Assert.assertEquals(2 * 1024,
+        measuredReplicator.getFailureBytes().value());
   }
 
   @Test
-  public void testSuccessTime() throws Exception {
+  public void testReplicationTime() throws Exception {
     //WHEN
     //will wait at least the 300ms
     measuredReplicator.replicate(new ReplicationTask(101L, new ArrayList<>()));
@@ -85,18 +87,32 @@ public class TestMeasuredReplicator {
 
     //THEN
     //even containers should be failed
+    long successTime = measuredReplicator.getSuccessTime().value();
+    long failureTime = measuredReplicator.getFailureTime().value();
     Assert.assertTrue(
-        "Measured time should be at least 300 ms but was "
-            + measuredReplicator.getSuccessTime().value(),
-        measuredReplicator.getSuccessTime().value() >= 300L);
+        "Measured time should be at least 300 ms but was " + successTime,
+        successTime >= 300L);
+    Assert.assertTrue(
+        "Measured time should be at least 300 ms but was " + failureTime,
+        failureTime >= 300L);
   }
 
   @Test
-  public void testSuccessTimeFailureExcluded() throws Exception {
+  public void testFailureTimeSuccessExcluded() {
+    //WHEN
+    //will wait at least the 15ms
+    measuredReplicator.replicate(new ReplicationTask(15L, new ArrayList<>()));
 
+    //THEN
+    //even containers should be failed, supposed to be zero
+    Assert.assertEquals(0, measuredReplicator.getFailureTime().value());
+  }
+
+  @Test
+  public void testSuccessTimeFailureExcluded() {
     //WHEN
-    //will wait at least the 300ms
-    measuredReplicator.replicate(new ReplicationTask(300L, new ArrayList<>()));
+    //will wait at least the 10ms
+    measuredReplicator.replicate(new ReplicationTask(10L, new ArrayList<>()));
 
     //THEN
     //even containers should be failed, supposed to be zero
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
index 91d8578..00409b7 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
@@ -459,18 +459,13 @@ public class ReplicationManager implements SCMService {
             action -> replicas.stream()
                 .anyMatch(r -> r.getDatanodeDetails().equals(action.datanode)),
             ()-> metrics.incrNumReplicationCmdsTimeout(),
-            () -> {
-              metrics.incrNumReplicationCmdsCompleted();
-              metrics.incrNumReplicationBytesCompleted(
-                  container.getUsedBytes());
-            });
+            action -> updateCompletedReplicationMetrics(container, action));
 
         updateInflightAction(container, inflightDeletion,
             action -> replicas.stream()
-                .noneMatch(r ->
-                    r.getDatanodeDetails().equals(action.datanode)),
+                .noneMatch(r -> r.getDatanodeDetails().equals(action.datanode)),
             () -> metrics.incrNumDeletionCmdsTimeout(),
-            () -> metrics.incrNumDeletionCmdsCompleted());
+            action -> updateCompletedDeletionMetrics(container, action));
 
         /*
          * If container is under deleting and all it's replicas are deleted,
@@ -546,6 +541,20 @@ public class ReplicationManager implements SCMService {
     }
   }
 
+  private void updateCompletedReplicationMetrics(ContainerInfo container,
+      InflightAction action) {
+    metrics.incrNumReplicationCmdsCompleted();
+    metrics.incrNumReplicationBytesCompleted(container.getUsedBytes());
+    metrics.addReplicationTime(clock.millis() - action.time);
+  }
+
+  private void updateCompletedDeletionMetrics(ContainerInfo container,
+      InflightAction action) {
+    metrics.incrNumDeletionCmdsCompleted();
+    metrics.incrNumDeletionBytesCompleted(container.getUsedBytes());
+    metrics.addDeletionTime(clock.millis() - action.time);
+  }
+
   /**
    * Reconciles the InflightActions for a given container.
    *
@@ -559,7 +568,7 @@ public class ReplicationManager implements SCMService {
       final Map<ContainerID, List<InflightAction>> inflightActions,
       final Predicate<InflightAction> filter,
       final Runnable timeoutCounter,
-      final Runnable completedCounter) {
+      final Consumer<InflightAction> completedCounter) {
     final ContainerID id = container.containerID();
     final long deadline = clock.millis() - rmConf.getEventTimeout();
     if (inflightActions.containsKey(id)) {
@@ -581,7 +590,7 @@ public class ReplicationManager implements SCMService {
             if (isTimeout) {
               timeoutCounter.run();
             } else if (isCompleted) {
-              completedCounter.run();
+              completedCounter.accept(a);
             }
 
             updateMoveIfNeeded(isUnhealthy, isCompleted, isTimeout,
@@ -1553,6 +1562,7 @@ public class ReplicationManager implements SCMService {
         action -> inflightDeletion.get(id).add(action));
 
     metrics.incrNumDeletionCmdsSent();
+    metrics.incrNumDeletionBytesTotal(container.getUsedBytes());
   }
 
   /**
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
index b5080d5..9a70f69 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.metrics2.lib.Interns;
 import org.apache.hadoop.metrics2.lib.MetricsRegistry;
 import org.apache.hadoop.metrics2.lib.MutableCounterLong;
+import org.apache.hadoop.metrics2.lib.MutableRate;
 import org.apache.hadoop.ozone.OzoneConsts;
 
 /**
@@ -74,6 +75,18 @@ public final class ReplicationManagerMetrics implements MetricsSource {
   @Metric("Number of replication bytes completed.")
   private MutableCounterLong numReplicationBytesCompleted;
 
+  @Metric("Number of deletion bytes total.")
+  private MutableCounterLong numDeletionBytesTotal;
+
+  @Metric("Number of deletion bytes completed.")
+  private MutableCounterLong numDeletionBytesCompleted;
+
+  @Metric("Time elapsed for replication")
+  private MutableRate replicationTime;
+
+  @Metric("Time elapsed for deletion")
+  private MutableRate deletionTime;
+
   private MetricsRegistry registry;
 
   private ReplicationManager replicationManager;
@@ -105,6 +118,10 @@ public final class ReplicationManagerMetrics implements MetricsSource {
     numDeletionCmdsTimeout.snapshot(builder, all);
     numReplicationBytesTotal.snapshot(builder, all);
     numReplicationBytesCompleted.snapshot(builder, all);
+    numDeletionBytesTotal.snapshot(builder, all);
+    numDeletionBytesCompleted.snapshot(builder, all);
+    replicationTime.snapshot(builder, all);
+    deletionTime.snapshot(builder, all);
   }
 
   public void unRegister() {
@@ -143,6 +160,22 @@ public final class ReplicationManagerMetrics implements MetricsSource {
     this.numReplicationBytesCompleted.incr(bytes);
   }
 
+  public void incrNumDeletionBytesTotal(long bytes) {
+    this.numDeletionBytesTotal.incr(bytes);
+  }
+
+  public void incrNumDeletionBytesCompleted(long bytes) {
+    this.numDeletionBytesCompleted.incr(bytes);
+  }
+
+  public void addReplicationTime(long millis) {
+    this.replicationTime.add(millis);
+  }
+
+  public void addDeletionTime(long millis) {
+    this.deletionTime.add(millis);
+  }
+
   public long getInflightReplication() {
     return replicationManager.getInflightReplication().size();
   }
@@ -179,6 +212,14 @@ public final class ReplicationManagerMetrics implements MetricsSource {
     return this.numDeletionCmdsTimeout.value();
   }
 
+  public long getNumDeletionBytesTotal() {
+    return this.numDeletionBytesTotal.value();
+  }
+
+  public long getNumDeletionBytesCompleted() {
+    return this.numDeletionBytesCompleted.value();
+  }
+
   public long getNumReplicationBytesTotal() {
     return this.numReplicationBytesTotal.value();
   }
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java
index 1fd06b6..ea42292 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java
@@ -491,6 +491,7 @@ public class TestReplicationManager {
   @Test
   public void testOverReplicatedQuasiClosedContainer() throws IOException {
     final ContainerInfo container = getContainer(LifeCycleState.QUASI_CLOSED);
+    container.setUsedBytes(101);
     final ContainerID id = container.containerID();
     final UUID originNodeId = UUID.randomUUID();
     final ContainerReplica replicaOne = getReplicas(
@@ -541,6 +542,8 @@ public class TestReplicationManager {
 
     final long currentDeleteCommandCompleted = replicationManager.getMetrics()
         .getNumDeletionCmdsCompleted();
+    final long deleteBytesCompleted =
+        replicationManager.getMetrics().getNumDeletionBytesCompleted();
 
     replicationManager.processAll();
     eventQueue.processAll(1000);
@@ -549,6 +552,8 @@ public class TestReplicationManager {
         .getInflightDeletion());
     Assert.assertEquals(currentDeleteCommandCompleted + 1,
         replicationManager.getMetrics().getNumDeletionCmdsCompleted());
+    Assert.assertEquals(deleteBytesCompleted + 101,
+        replicationManager.getMetrics().getNumDeletionBytesCompleted());
   }
 
   /**
@@ -696,6 +701,7 @@ public class TestReplicationManager {
       throws IOException, InterruptedException,
       TimeoutException {
     final ContainerInfo container = getContainer(LifeCycleState.QUASI_CLOSED);
+    container.setUsedBytes(99);
     final ContainerID id = container.containerID();
     final UUID originNodeId = UUID.randomUUID();
     final ContainerReplica replicaOne = getReplicas(
@@ -711,6 +717,8 @@ public class TestReplicationManager {
         .getInvocationCount(SCMCommandProto.Type.replicateContainerCommand);
     final int currentDeleteCommandCount = datanodeCommandHandler
         .getInvocationCount(SCMCommandProto.Type.deleteContainerCommand);
+    final long currentBytesToDelete = replicationManager.getMetrics()
+        .getNumDeletionBytesTotal();
 
     replicationManager.processAll();
     GenericTestUtils.waitFor(
@@ -747,6 +755,8 @@ public class TestReplicationManager {
         replicaTwo.getDatanodeDetails()));
     Assert.assertEquals(currentDeleteCommandCount + 1,
         replicationManager.getMetrics().getNumDeletionCmdsSent());
+    Assert.assertEquals(currentBytesToDelete + 99,
+        replicationManager.getMetrics().getNumDeletionBytesTotal());
     Assert.assertEquals(1, replicationManager.getInflightDeletion().size());
     Assert.assertEquals(1, replicationManager.getMetrics()
         .getInflightDeletion());

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org