You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by ad...@apache.org on 2021/11/17 14:23:31 UTC
[ozone] branch master updated: HDDS-5848. Introduce more replication metrics (#2758)
This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 83e5aa3 HDDS-5848. Introduce more replication metrics (#2758)
83e5aa3 is described below
commit 83e5aa3d403c47395a9287374bc1fa0c9d24cce7
Author: Doroszlai, Attila <64...@users.noreply.github.com>
AuthorDate: Wed Nov 17 15:23:13 2021 +0100
HDDS-5848. Introduce more replication metrics (#2758)
---
.../container/replication/MeasuredReplicator.java | 34 ++++++++++++++----
.../replication/ReplicationSupervisor.java | 1 -
.../replication/ReplicationSupervisorMetrics.java | 1 -
.../replication/TestMeasuredReplicator.java | 34 +++++++++++++-----
.../hdds/scm/container/ReplicationManager.java | 30 ++++++++++------
.../replication/ReplicationManagerMetrics.java | 41 ++++++++++++++++++++++
.../hdds/scm/container/TestReplicationManager.java | 10 ++++++
7 files changed, 123 insertions(+), 28 deletions(-)
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/MeasuredReplicator.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/MeasuredReplicator.java
index 5d99cc4..d1175ef 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/MeasuredReplicator.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/MeasuredReplicator.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
import org.apache.hadoop.ozone.container.replication.ReplicationTask.Status;
import com.google.common.annotations.VisibleForTesting;
+import org.apache.hadoop.util.Time;
/**
* ContainerReplicator wrapper with additional metrics.
@@ -38,19 +39,25 @@ public class MeasuredReplicator implements ContainerReplicator, AutoCloseable {
private final ContainerReplicator delegate;
- @Metric
+ @Metric(about = "Number of successful replication tasks")
private MutableCounterLong success;
- @Metric
+ @Metric(about = "Time spent on successful replication tasks")
private MutableGaugeLong successTime;
- @Metric
+ @Metric(about = "Number of failed replication attempts")
private MutableCounterLong failure;
- @Metric
+ @Metric(about = "Time spent waiting in the queue before starting the task")
private MutableGaugeLong queueTime;
- @Metric
+ @Metric(about = "Time spent on failed replication attempts")
+ private MutableGaugeLong failureTime;
+
+ @Metric(about = "Bytes transferred for failed replication attempts")
+ private MutableGaugeLong failureBytes;
+
+ @Metric(about = "Bytes transferred for successful replication tasks")
private MutableGaugeLong transferredBytes;
public MeasuredReplicator(ContainerReplicator delegate) {
@@ -61,18 +68,21 @@ public class MeasuredReplicator implements ContainerReplicator, AutoCloseable {
@Override
public void replicate(ReplicationTask task) {
- long start = System.currentTimeMillis();
+ long start = Time.monotonicNow();
long msInQueue =
(Instant.now().getNano() - task.getQueued().getNano()) / 1_000_000;
queueTime.incr(msInQueue);
delegate.replicate(task);
+ long elapsed = Time.monotonicNow() - start;
if (task.getStatus() == Status.FAILED) {
failure.incr();
+ failureBytes.incr(task.getTransferredBytes());
+ failureTime.incr(elapsed);
} else if (task.getStatus() == Status.DONE) {
transferredBytes.incr(task.getTransferredBytes());
success.incr();
- successTime.incr(System.currentTimeMillis() - start);
+ successTime.incr(elapsed);
}
}
@@ -92,6 +102,11 @@ public class MeasuredReplicator implements ContainerReplicator, AutoCloseable {
}
@VisibleForTesting
+ public MutableGaugeLong getFailureTime() {
+ return failureTime;
+ }
+
+ @VisibleForTesting
public MutableCounterLong getFailure() {
return failure;
}
@@ -106,4 +121,9 @@ public class MeasuredReplicator implements ContainerReplicator, AutoCloseable {
return transferredBytes;
}
+ @VisibleForTesting
+ public MutableGaugeLong getFailureBytes() {
+ return failureBytes;
+ }
+
}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java
index 86e9cb4..05a4173 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisor.java
@@ -120,7 +120,6 @@ public class ReplicationSupervisor {
*
* @return Count of in-flight replications.
*/
- @VisibleForTesting
int getInFlightReplications() {
return containersInFlight.size();
}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisorMetrics.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisorMetrics.java
index 6dc40db..df48abd 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisorMetrics.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/replication/ReplicationSupervisorMetrics.java
@@ -56,7 +56,6 @@ public class ReplicationSupervisorMetrics implements MetricsSource {
}
@Override
- @SuppressWarnings("SuspiciousMethodCalls")
public void getMetrics(MetricsCollector collector, boolean all) {
collector.addRecord(SOURCE)
.addGauge(Interns.info("numInFlightReplications",
diff --git a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestMeasuredReplicator.java b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestMeasuredReplicator.java
index 618935e..f8f0b93 100644
--- a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestMeasuredReplicator.java
+++ b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/replication/TestMeasuredReplicator.java
@@ -59,7 +59,7 @@ public class TestMeasuredReplicator {
}
@Test
- public void measureFailureSuccessAndBytes() throws Exception {
+ public void measureFailureSuccessAndBytes() {
//WHEN
measuredReplicator.replicate(new ReplicationTask(1L, new ArrayList<>()));
measuredReplicator.replicate(new ReplicationTask(2L, new ArrayList<>()));
@@ -71,12 +71,14 @@ public class TestMeasuredReplicator {
Assert.assertEquals(1, measuredReplicator.getFailure().value());
//sum of container ids (success) in kb
- Assert.assertEquals(4 * 1024,
+ Assert.assertEquals((1 + 3) * 1024,
measuredReplicator.getTransferredBytes().value());
+ Assert.assertEquals(2 * 1024,
+ measuredReplicator.getFailureBytes().value());
}
@Test
- public void testSuccessTime() throws Exception {
+ public void testReplicationTime() throws Exception {
//WHEN
//will wait at least the 300ms
measuredReplicator.replicate(new ReplicationTask(101L, new ArrayList<>()));
@@ -85,18 +87,32 @@ public class TestMeasuredReplicator {
//THEN
//even containers should be failed
+ long successTime = measuredReplicator.getSuccessTime().value();
+ long failureTime = measuredReplicator.getFailureTime().value();
Assert.assertTrue(
- "Measured time should be at least 300 ms but was "
- + measuredReplicator.getSuccessTime().value(),
- measuredReplicator.getSuccessTime().value() >= 300L);
+ "Measured time should be at least 300 ms but was " + successTime,
+ successTime >= 300L);
+ Assert.assertTrue(
+ "Measured time should be at least 300 ms but was " + failureTime,
+ failureTime >= 300L);
}
@Test
- public void testSuccessTimeFailureExcluded() throws Exception {
+ public void testFailureTimeSuccessExcluded() {
+ //WHEN
+ //will wait at least the 15ms
+ measuredReplicator.replicate(new ReplicationTask(15L, new ArrayList<>()));
+ //THEN
+ //even containers should be failed, supposed to be zero
+ Assert.assertEquals(0, measuredReplicator.getFailureTime().value());
+ }
+
+ @Test
+ public void testSuccessTimeFailureExcluded() {
//WHEN
- //will wait at least the 300ms
- measuredReplicator.replicate(new ReplicationTask(300L, new ArrayList<>()));
+ //will wait at least the 10ms
+ measuredReplicator.replicate(new ReplicationTask(10L, new ArrayList<>()));
//THEN
//even containers should be failed, supposed to be zero
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
index 91d8578..00409b7 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
@@ -459,18 +459,13 @@ public class ReplicationManager implements SCMService {
action -> replicas.stream()
.anyMatch(r -> r.getDatanodeDetails().equals(action.datanode)),
()-> metrics.incrNumReplicationCmdsTimeout(),
- () -> {
- metrics.incrNumReplicationCmdsCompleted();
- metrics.incrNumReplicationBytesCompleted(
- container.getUsedBytes());
- });
+ action -> updateCompletedReplicationMetrics(container, action));
updateInflightAction(container, inflightDeletion,
action -> replicas.stream()
- .noneMatch(r ->
- r.getDatanodeDetails().equals(action.datanode)),
+ .noneMatch(r -> r.getDatanodeDetails().equals(action.datanode)),
() -> metrics.incrNumDeletionCmdsTimeout(),
- () -> metrics.incrNumDeletionCmdsCompleted());
+ action -> updateCompletedDeletionMetrics(container, action));
/*
* If container is under deleting and all it's replicas are deleted,
@@ -546,6 +541,20 @@ public class ReplicationManager implements SCMService {
}
}
+ private void updateCompletedReplicationMetrics(ContainerInfo container,
+ InflightAction action) {
+ metrics.incrNumReplicationCmdsCompleted();
+ metrics.incrNumReplicationBytesCompleted(container.getUsedBytes());
+ metrics.addReplicationTime(clock.millis() - action.time);
+ }
+
+ private void updateCompletedDeletionMetrics(ContainerInfo container,
+ InflightAction action) {
+ metrics.incrNumDeletionCmdsCompleted();
+ metrics.incrNumDeletionBytesCompleted(container.getUsedBytes());
+ metrics.addDeletionTime(clock.millis() - action.time);
+ }
+
/**
* Reconciles the InflightActions for a given container.
*
@@ -559,7 +568,7 @@ public class ReplicationManager implements SCMService {
final Map<ContainerID, List<InflightAction>> inflightActions,
final Predicate<InflightAction> filter,
final Runnable timeoutCounter,
- final Runnable completedCounter) {
+ final Consumer<InflightAction> completedCounter) {
final ContainerID id = container.containerID();
final long deadline = clock.millis() - rmConf.getEventTimeout();
if (inflightActions.containsKey(id)) {
@@ -581,7 +590,7 @@ public class ReplicationManager implements SCMService {
if (isTimeout) {
timeoutCounter.run();
} else if (isCompleted) {
- completedCounter.run();
+ completedCounter.accept(a);
}
updateMoveIfNeeded(isUnhealthy, isCompleted, isTimeout,
@@ -1553,6 +1562,7 @@ public class ReplicationManager implements SCMService {
action -> inflightDeletion.get(id).add(action));
metrics.incrNumDeletionCmdsSent();
+ metrics.incrNumDeletionBytesTotal(container.getUsedBytes());
}
/**
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
index b5080d5..9a70f69 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManagerMetrics.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.Interns;
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
+import org.apache.hadoop.metrics2.lib.MutableRate;
import org.apache.hadoop.ozone.OzoneConsts;
/**
@@ -74,6 +75,18 @@ public final class ReplicationManagerMetrics implements MetricsSource {
@Metric("Number of replication bytes completed.")
private MutableCounterLong numReplicationBytesCompleted;
+ @Metric("Number of deletion bytes total.")
+ private MutableCounterLong numDeletionBytesTotal;
+
+ @Metric("Number of deletion bytes completed.")
+ private MutableCounterLong numDeletionBytesCompleted;
+
+ @Metric("Time elapsed for replication")
+ private MutableRate replicationTime;
+
+ @Metric("Time elapsed for deletion")
+ private MutableRate deletionTime;
+
private MetricsRegistry registry;
private ReplicationManager replicationManager;
@@ -105,6 +118,10 @@ public final class ReplicationManagerMetrics implements MetricsSource {
numDeletionCmdsTimeout.snapshot(builder, all);
numReplicationBytesTotal.snapshot(builder, all);
numReplicationBytesCompleted.snapshot(builder, all);
+ numDeletionBytesTotal.snapshot(builder, all);
+ numDeletionBytesCompleted.snapshot(builder, all);
+ replicationTime.snapshot(builder, all);
+ deletionTime.snapshot(builder, all);
}
public void unRegister() {
@@ -143,6 +160,22 @@ public final class ReplicationManagerMetrics implements MetricsSource {
this.numReplicationBytesCompleted.incr(bytes);
}
+ public void incrNumDeletionBytesTotal(long bytes) {
+ this.numDeletionBytesTotal.incr(bytes);
+ }
+
+ public void incrNumDeletionBytesCompleted(long bytes) {
+ this.numDeletionBytesCompleted.incr(bytes);
+ }
+
+ public void addReplicationTime(long millis) {
+ this.replicationTime.add(millis);
+ }
+
+ public void addDeletionTime(long millis) {
+ this.deletionTime.add(millis);
+ }
+
public long getInflightReplication() {
return replicationManager.getInflightReplication().size();
}
@@ -179,6 +212,14 @@ public final class ReplicationManagerMetrics implements MetricsSource {
return this.numDeletionCmdsTimeout.value();
}
+ public long getNumDeletionBytesTotal() {
+ return this.numDeletionBytesTotal.value();
+ }
+
+ public long getNumDeletionBytesCompleted() {
+ return this.numDeletionBytesCompleted.value();
+ }
+
public long getNumReplicationBytesTotal() {
return this.numReplicationBytesTotal.value();
}
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java
index 1fd06b6..ea42292 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/TestReplicationManager.java
@@ -491,6 +491,7 @@ public class TestReplicationManager {
@Test
public void testOverReplicatedQuasiClosedContainer() throws IOException {
final ContainerInfo container = getContainer(LifeCycleState.QUASI_CLOSED);
+ container.setUsedBytes(101);
final ContainerID id = container.containerID();
final UUID originNodeId = UUID.randomUUID();
final ContainerReplica replicaOne = getReplicas(
@@ -541,6 +542,8 @@ public class TestReplicationManager {
final long currentDeleteCommandCompleted = replicationManager.getMetrics()
.getNumDeletionCmdsCompleted();
+ final long deleteBytesCompleted =
+ replicationManager.getMetrics().getNumDeletionBytesCompleted();
replicationManager.processAll();
eventQueue.processAll(1000);
@@ -549,6 +552,8 @@ public class TestReplicationManager {
.getInflightDeletion());
Assert.assertEquals(currentDeleteCommandCompleted + 1,
replicationManager.getMetrics().getNumDeletionCmdsCompleted());
+ Assert.assertEquals(deleteBytesCompleted + 101,
+ replicationManager.getMetrics().getNumDeletionBytesCompleted());
}
/**
@@ -696,6 +701,7 @@ public class TestReplicationManager {
throws IOException, InterruptedException,
TimeoutException {
final ContainerInfo container = getContainer(LifeCycleState.QUASI_CLOSED);
+ container.setUsedBytes(99);
final ContainerID id = container.containerID();
final UUID originNodeId = UUID.randomUUID();
final ContainerReplica replicaOne = getReplicas(
@@ -711,6 +717,8 @@ public class TestReplicationManager {
.getInvocationCount(SCMCommandProto.Type.replicateContainerCommand);
final int currentDeleteCommandCount = datanodeCommandHandler
.getInvocationCount(SCMCommandProto.Type.deleteContainerCommand);
+ final long currentBytesToDelete = replicationManager.getMetrics()
+ .getNumDeletionBytesTotal();
replicationManager.processAll();
GenericTestUtils.waitFor(
@@ -747,6 +755,8 @@ public class TestReplicationManager {
replicaTwo.getDatanodeDetails()));
Assert.assertEquals(currentDeleteCommandCount + 1,
replicationManager.getMetrics().getNumDeletionCmdsSent());
+ Assert.assertEquals(currentBytesToDelete + 99,
+ replicationManager.getMetrics().getNumDeletionBytesTotal());
Assert.assertEquals(1, replicationManager.getInflightDeletion().size());
Assert.assertEquals(1, replicationManager.getMetrics()
.getInflightDeletion());
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org