You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by si...@apache.org on 2022/09/27 05:25:19 UTC

[ozone] branch master updated: HDDS-6492. Add metric for failed container moves (#3751)

This is an automated email from the ASF dual-hosted git repository.

siddhant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 69695eabb7 HDDS-6492. Add metric for failed container moves (#3751)
69695eabb7 is described below

commit 69695eabb73861d6d4dcf8286383c9a673844c37
Author: Sumit Agrawal <su...@gmail.com>
AuthorDate: Tue Sep 27 10:55:15 2022 +0530

    HDDS-6492. Add metric for failed container moves (#3751)
---
 .../scm/container/balancer/ContainerBalancer.java  | 33 ++++++++-----
 .../balancer/ContainerBalancerMetrics.java         | 29 +++++++++++
 .../container/balancer/TestContainerBalancer.java  | 56 +++++++++++++++++++---
 3 files changed, 100 insertions(+), 18 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
index aec9e14325..0c833627e9 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
@@ -529,19 +529,10 @@ public class ContainerBalancer extends StatefulService {
       allFuturesResult.get(config.getMoveTimeout().toMillis(),
           TimeUnit.MILLISECONDS);
     } catch (InterruptedException e) {
+      LOG.warn("Container balancer is interrupted");
       Thread.currentThread().interrupt();
     } catch (TimeoutException e) {
-      long timeoutCounts = moveSelectionToFutureMap.entrySet().stream()
-          .filter(entry -> !entry.getValue().isDone())
-          .peek(entry -> {
-            LOG.warn("Container move canceled for container {} from source {}" +
-                    " to target {} due to timeout.",
-                entry.getKey().getContainerID(),
-                containerToSourceMap.get(entry.getKey().getContainerID())
-                    .getUuidString(),
-                entry.getKey().getTargetNode().getUuidString());
-            entry.getValue().cancel(true);
-          }).count();
+      long timeoutCounts = cancelAndCountPendingMoves();
       LOG.warn("{} Container moves are canceled.", timeoutCounts);
       metrics.incrementNumContainerMovesTimeoutInLatestIteration(timeoutCounts);
     } catch (ExecutionException e) {
@@ -560,6 +551,8 @@ public class ContainerBalancer extends StatefulService {
         sizeActuallyMovedInLatestIteration / OzoneConsts.GB);
     metrics.incrementDataSizeMovedGB(
         metrics.getDataSizeMovedGBInLatestIteration());
+    metrics.incrementNumContainerMovesFailed(
+        metrics.getNumContainerMovesFailedInLatestIteration());
     LOG.info("Iteration Summary. Number of Datanodes involved: {}. Size " +
             "moved: {} ({} Bytes). Number of Container moves completed: {}.",
         countDatanodesInvolvedPerIteration,
@@ -568,6 +561,20 @@ public class ContainerBalancer extends StatefulService {
         metrics.getNumContainerMovesCompletedInLatestIteration());
   }
 
+  private long cancelAndCountPendingMoves() {
+    return moveSelectionToFutureMap.entrySet().stream()
+        .filter(entry -> !entry.getValue().isDone())
+        .mapToLong(entry -> { // sum() evaluates every pending entry
+          LOG.warn("Container move timeout for container {} from source {}" +
+                  " to target {}.", entry.getKey().getContainerID(),
+              containerToSourceMap.get(entry.getKey().getContainerID())
+                  .getUuidString(),
+              entry.getKey().getTargetNode().getUuidString());
+          entry.getValue().cancel(true);
+          return 1L; // each canceled move contributes one to the total
+        }).sum();
+  }
+
   /**
    * Match a source datanode with a target datanode and identify the container
    * to move.
@@ -668,6 +675,7 @@ public class ContainerBalancer extends StatefulService {
                   containerID.toString(),
                   source.getUuidString(),
                   moveSelection.getTargetNode().getUuidString(), ex);
+              metrics.incrementNumContainerMovesFailedInLatestIteration(1);
             } else {
               if (result == LegacyReplicationManager.MoveResult.COMPLETED) {
                 sizeActuallyMovedInLatestIteration +=
@@ -690,9 +698,11 @@ public class ContainerBalancer extends StatefulService {
     } catch (ContainerNotFoundException e) {
       LOG.warn("Could not find Container {} for container move",
           containerID, e);
+      metrics.incrementNumContainerMovesFailedInLatestIteration(1);
       return false;
     } catch (NodeNotFoundException | TimeoutException e) {
       LOG.warn("Container move failed for container {}", containerID, e);
+      metrics.incrementNumContainerMovesFailedInLatestIteration(1);
       return false;
     }
 
@@ -871,6 +881,7 @@ public class ContainerBalancer extends StatefulService {
     metrics.resetNumDatanodesInvolvedInLatestIteration();
     metrics.resetDataSizeUnbalancedGB();
     metrics.resetNumDatanodesUnbalanced();
+    metrics.resetNumContainerMovesFailedInLatestIteration();
   }
 
   /**
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
index b135c1ca2a..72c1cda7d0 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
@@ -74,6 +74,14 @@ public final class ContainerBalancerMetrics {
       "Container Balancer.")
   private MutableCounterLong dataSizeMovedGB;
 
+  @Metric(about = "Total number of containers for which moves failed " +
+      "exceptionally across all iterations of Container Balancer.")
+  private MutableCounterLong numContainerMovesFailed;
+
+  @Metric(about = "Total number of containers for which moves failed " +
+      "exceptionally in latest iteration of Container Balancer.")
+  private MutableCounterLong numContainerMovesFailedInLatestIteration;
+
   /**
    * Create and register metrics named {@link ContainerBalancerMetrics#NAME}
    * for {@link ContainerBalancer}.
@@ -267,4 +275,25 @@ public final class ContainerBalancerMetrics {
   public void incrementDataSizeMovedGB(long valueToAdd) {
     dataSizeMovedGB.incr(valueToAdd);
   }
+
+  public long getNumContainerMovesFailed() {
+    return numContainerMovesFailed.value();
+  }
+
+  public void incrementNumContainerMovesFailed(long valueToAdd) {
+    numContainerMovesFailed.incr(valueToAdd);
+  }
+
+  public long getNumContainerMovesFailedInLatestIteration() {
+    return numContainerMovesFailedInLatestIteration.value();
+  }
+
+  public void incrementNumContainerMovesFailedInLatestIteration(
+      long valueToAdd) {
+    numContainerMovesFailedInLatestIteration.incr(valueToAdd);
+  }
+  public void resetNumContainerMovesFailedInLatestIteration() {
+    long current = numContainerMovesFailedInLatestIteration.value();
+    numContainerMovesFailedInLatestIteration.incr(-current); // undo the count
+  }
 }
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
index 2ddb19dceb..6253784946 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
@@ -30,15 +30,16 @@ import org.apache.hadoop.hdds.scm.PlacementPolicy;
 import org.apache.hadoop.hdds.scm.PlacementPolicyValidateProxy;
 import org.apache.hadoop.hdds.scm.container.ContainerID;
 import org.apache.hadoop.hdds.scm.container.ContainerInfo;
-import org.apache.hadoop.hdds.scm.container.ContainerReplica;
 import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
+import org.apache.hadoop.hdds.scm.container.ContainerReplica;
 import org.apache.hadoop.hdds.scm.container.MockNodeManager;
-import org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager;
-import org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager.MoveResult;
-import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
 import org.apache.hadoop.hdds.scm.container.placement.algorithms.ContainerPlacementPolicyFactory;
 import org.apache.hadoop.hdds.scm.container.placement.algorithms.SCMContainerPlacementMetrics;
 import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMNodeStat;
+import org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager;
+import org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager.MoveResult;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
 import org.apache.hadoop.hdds.scm.ha.SCMContext;
 import org.apache.hadoop.hdds.scm.ha.SCMService;
 import org.apache.hadoop.hdds.scm.ha.SCMServiceManager;
@@ -721,7 +722,7 @@ public class TestContainerBalancer {
     Mockito.when(replicationManager.move(Mockito.any(ContainerID.class),
             Mockito.any(DatanodeDetails.class),
             Mockito.any(DatanodeDetails.class)))
-        .thenReturn(genCompletableFuture(500), genCompletableFuture(2000));
+        .thenReturn(genCompletableFuture(500), genCompletableFuture(3000));
 
     balancerConfiguration.setThreshold(10);
     balancerConfiguration.setIterations(1);
@@ -779,7 +780,49 @@ public class TestContainerBalancer {
         .getNumContainerMovesTimeoutInLatestIteration() > 0);
     stopBalancer();
   }
-  
+
+  @Test
+  public void checkIterationResultException()
+      throws NodeNotFoundException, IOException,
+      IllegalContainerBalancerStateException,
+      InvalidContainerBalancerConfigurationException,
+      TimeoutException {
+    // Stub move(): two calls throw, later calls return failing futures.
+    CompletableFuture<MoveResult> f = new CompletableFuture<>();
+    f.completeExceptionally(new RuntimeException("Runtime Exception"));
+    Mockito.when(replicationManager.move(Mockito.any(ContainerID.class),
+            Mockito.any(DatanodeDetails.class),
+            Mockito.any(DatanodeDetails.class)))
+        .thenThrow(new ContainerNotFoundException("Test Container not found"),
+            new NodeNotFoundException("Test Node not found"))
+        .thenReturn(f).thenReturn(CompletableFuture.supplyAsync(() -> {
+          try {
+            Thread.sleep(200);
+          } catch (InterruptedException ex) {
+            Thread.currentThread().interrupt(); // keep the interrupt visible
+          }
+          throw new RuntimeException("Throw");
+        }));
+
+    balancerConfiguration.setThreshold(10);
+    balancerConfiguration.setIterations(1);
+    balancerConfiguration.setMaxSizeEnteringTarget(10 * STORAGE_UNIT);
+    balancerConfiguration.setMaxSizeToMovePerIteration(100 * STORAGE_UNIT);
+    balancerConfiguration.setMaxDatanodesPercentageToInvolvePerIteration(100);
+    balancerConfiguration.setMoveTimeout(Duration.ofMillis(500));
+
+    startBalancer(balancerConfiguration);
+    sleepWhileBalancing(1000);
+
+    Assertions.assertEquals(
+        ContainerBalancer.IterationResult.ITERATION_COMPLETED,
+        containerBalancer.getIterationResult());
+    Assertions.assertTrue(
+        containerBalancer.getMetrics()
+            .getNumContainerMovesFailed() >= 3);
+    stopBalancer();
+  }
+
   @Test
   public void testStartAndImmediateStopForDeadlock()
       throws IllegalContainerBalancerStateException, IOException,
@@ -999,5 +1042,4 @@ public class TestContainerBalancer {
       return LegacyReplicationManager.MoveResult.COMPLETED;
     });
   }
-
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org