You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain text copy.
Posted to commits@ozone.apache.org by ad...@apache.org on 2022/06/30 19:13:44 UTC

[ozone] branch master updated: HDDS-6959. Fix move timeout in latest iteration metric (#3562)

This is an automated email from the ASF dual-hosted git repository.

adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 7838582393 HDDS-6959. Fix move timeout in latest iteration metric (#3562)
7838582393 is described below

commit 7838582393729d694998682df6939d3e35d0c1d4
Author: Ritesh H Shukla <ke...@gmail.com>
AuthorDate: Thu Jun 30 12:13:37 2022 -0700

    HDDS-6959. Fix move timeout in latest iteration metric (#3562)
---
 .../scm/container/balancer/ContainerBalancer.java  |  3 +-
 .../balancer/ContainerBalancerMetrics.java         | 37 ++++++++++++++++++++++
 .../container/replication/ReplicationManager.java  |  2 +-
 .../container/balancer/TestContainerBalancer.java  | 31 ++++++++++++++++++
 4 files changed, 71 insertions(+), 2 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
index 49dd12b8e3..a77f7a8c05 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
@@ -635,6 +635,8 @@ public class ContainerBalancer extends StatefulService {
       future = replicationManager
           .move(containerID, source, moveSelection.getTargetNode())
           .whenComplete((result, ex) -> {
+
+            metrics.incrementCurrentIterationContainerMoveMetric(result, 1);
             if (ex != null) {
               LOG.info("Container move for container {} from source {} to " +
                       "target {} failed with exceptions {}",
@@ -645,7 +647,6 @@ public class ContainerBalancer extends StatefulService {
               if (result == LegacyReplicationManager.MoveResult.COMPLETED) {
                 metrics.incrementDataSizeMovedGBInLatestIteration(
                     containerInfo.getUsedBytes() / OzoneConsts.GB);
-                metrics.incrementNumContainerMovesCompletedInLatestIteration(1);
                 if (LOG.isDebugEnabled()) {
                   LOG.debug(
                       "Container move completed for container {} to target {}",
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
index 3a7ce49ab2..b135c1ca2a 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
@@ -19,6 +19,7 @@
 
 package org.apache.hadoop.hdds.scm.container.balancer;
 
+import org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager.MoveResult;
 import org.apache.hadoop.metrics2.MetricsSystem;
 import org.apache.hadoop.metrics2.annotation.Metric;
 import org.apache.hadoop.metrics2.annotation.Metrics;
@@ -121,6 +122,42 @@ public final class ContainerBalancerMetrics {
     this.numContainerMovesCompletedInLatestIteration.incr(valueToAdd);
   }
 
+  public void incrementCurrentIterationContainerMoveMetric(
+      MoveResult result,
+      long valueToAdd) {
+    if (result == null) {
+      return;
+    }
+    switch (result) {
+    case COMPLETED:
+      this.numContainerMovesCompletedInLatestIteration.incr(valueToAdd);
+      break;
+    case REPLICATION_FAIL_TIME_OUT:
+    case DELETION_FAIL_TIME_OUT:
+      this.numContainerMovesTimeoutInLatestIteration.incr(valueToAdd);
+      break;
+    // TODO: Add metrics for other errors that need to be tracked.
+    case FAIL_NOT_RUNNING:
+    case REPLICATION_FAIL_INFLIGHT_REPLICATION:
+    case FAIL_NOT_LEADER:
+    case REPLICATION_FAIL_NOT_EXIST_IN_SOURCE:
+    case REPLICATION_FAIL_EXIST_IN_TARGET:
+    case REPLICATION_FAIL_CONTAINER_NOT_CLOSED:
+    case REPLICATION_FAIL_INFLIGHT_DELETION:
+    case REPLICATION_FAIL_NODE_NOT_IN_SERVICE:
+    case DELETION_FAIL_NODE_NOT_IN_SERVICE:
+    case REPLICATION_FAIL_NODE_UNHEALTHY:
+    case DELETION_FAIL_NODE_UNHEALTHY:
+    case DELETE_FAIL_POLICY:
+    case PLACEMENT_POLICY_NOT_SATISFIED:
+    case UNEXPECTED_REMOVE_SOURCE_AT_INFLIGHT_REPLICATION:
+    case UNEXPECTED_REMOVE_TARGET_AT_INFLIGHT_DELETION:
+    case FAIL_CAN_NOT_RECORD_TO_DB:
+    default:
+      break;
+    }
+  }
+
   public void resetNumContainerMovesCompletedInLatestIteration() {
     numContainerMovesCompletedInLatestIteration.incr(
         -getNumContainerMovesCompletedInLatestIteration());
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
index 5d4da102d9..fc643af51b 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ReplicationManager.java
@@ -458,7 +458,7 @@ public class ReplicationManager implements SCMService {
 
 
   /**
-  * following functions will be refactored in a seperate jira.
+  * following functions will be refactored in a separate jira.
   */
   public CompletableFuture<LegacyReplicationManager.MoveResult> move(
       ContainerID cid, DatanodeDetails src, DatanodeDetails tgt)
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
index d57226bb30..dd27e8f446 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
@@ -739,6 +739,37 @@ public class TestContainerBalancer {
 
   }
 
+  @Test
+  public void checkIterationResultTimeoutFromReplicationManager()
+      throws NodeNotFoundException, IOException,
+      IllegalContainerBalancerStateException,
+      InvalidContainerBalancerConfigurationException {
+    CompletableFuture<MoveResult> future
+        = CompletableFuture.supplyAsync(() ->
+        MoveResult.REPLICATION_FAIL_TIME_OUT);
+    CompletableFuture<MoveResult> future2
+        = CompletableFuture.supplyAsync(() ->
+        MoveResult.DELETION_FAIL_TIME_OUT);
+    Mockito.when(replicationManager.move(Mockito.any(ContainerID.class),
+            Mockito.any(DatanodeDetails.class),
+            Mockito.any(DatanodeDetails.class)))
+        .thenReturn(future, future2);
+
+    balancerConfiguration.setThreshold(10);
+    balancerConfiguration.setIterations(1);
+    balancerConfiguration.setMaxSizeEnteringTarget(10 * OzoneConsts.GB);
+    balancerConfiguration.setMaxSizeToMovePerIteration(100 * OzoneConsts.GB);
+    balancerConfiguration.setMaxDatanodesPercentageToInvolvePerIteration(100);
+    balancerConfiguration.setMoveTimeout(Duration.ofMillis(1000));
+
+    startBalancer(balancerConfiguration);
+    sleepWhileBalancing(2000);
+
+    Assertions.assertTrue(containerBalancer.getMetrics()
+        .getNumContainerMovesTimeoutInLatestIteration() > 0);
+    stopBalancer();
+  }
+  
   @Test
   public void testStartAndImmediateStopForDeadlock()
       throws IllegalContainerBalancerStateException, IOException,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org