You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by lj...@apache.org on 2022/04/26 08:23:19 UTC

[ozone] branch master updated: HDDS-6553. Incorrect timeout when checking balancer iteration result. (#3272)

This is an automated email from the ASF dual-hosted git repository.

ljain pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new f575e238f8 HDDS-6553. Incorrect timeout when checking balancer iteration result. (#3272)
f575e238f8 is described below

commit f575e238f885f1fc607fa2896c6917de0d853811
Author: Symious <yi...@foxmail.com>
AuthorDate: Tue Apr 26 16:23:14 2022 +0800

    HDDS-6553. Incorrect timeout when checking balancer iteration result. (#3272)
---
 .../scm/container/balancer/ContainerBalancer.java  | 125 +++++++++++----------
 .../balancer/ContainerBalancerMetrics.java         |  70 +++++++++---
 .../container/balancer/TestContainerBalancer.java  |  46 ++++++++
 3 files changed, 165 insertions(+), 76 deletions(-)

diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
index 2e7b04feca..09d289e817 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
@@ -461,62 +461,44 @@ public class ContainerBalancer implements SCMService {
    */
   private void checkIterationMoveResults(Set<DatanodeDetails> selectedTargets) {
     this.countDatanodesInvolvedPerIteration = 0;
-    this.sizeMovedPerIteration = 0;
-    for (Map.Entry<ContainerMoveSelection,
-            CompletableFuture<ReplicationManager.MoveResult>>
-        futureEntry : moveSelectionToFutureMap.entrySet()) {
-      ContainerMoveSelection moveSelection = futureEntry.getKey();
-      CompletableFuture<ReplicationManager.MoveResult> future =
-          futureEntry.getValue();
-      try {
-        ReplicationManager.MoveResult result = future.get(
-            config.getMoveTimeout().toMillis(), TimeUnit.MILLISECONDS);
-        if (result == ReplicationManager.MoveResult.COMPLETED) {
-          try {
-            ContainerInfo container =
-                containerManager.getContainer(moveSelection.getContainerID());
-            this.sizeMovedPerIteration += container.getUsedBytes();
-            metrics.incrementNumContainerMovesInLatestIteration(1);
-            LOG.info("Container move completed for container {} to target {}",
-                container.containerID(),
-                moveSelection.getTargetNode().getUuidString());
-          } catch (ContainerNotFoundException e) {
-            LOG.warn("Could not find Container {} while " +
-                    "checking move results in ContainerBalancer",
-                moveSelection.getContainerID(), e);
-          }
-        } else {
-          if (LOG.isDebugEnabled()) {
-            LOG.debug("Container move for container {} to target {} failed: {}",
-                moveSelection.getContainerID(),
-                moveSelection.getTargetNode().getUuidString(), result);
-          }
-        }
-      } catch (InterruptedException e) {
-        LOG.warn("Interrupted while waiting for container move result for " +
-                "container {}.",
-            moveSelection.getContainerID(), e);
-        Thread.currentThread().interrupt();
-      } catch (ExecutionException e) {
-        LOG.warn("Container move for container {} completed exceptionally.",
-            moveSelection.getContainerID(), e);
-      } catch (TimeoutException e) {
-        LOG.warn("Container move for container {} timed out.",
-            moveSelection.getContainerID(), e);
-      }
+
+    CompletableFuture<Void> allFuturesResult = CompletableFuture.allOf(
+        moveSelectionToFutureMap.values()
+            .toArray(new CompletableFuture[moveSelectionToFutureMap.size()]));
+    try {
+      allFuturesResult.get(config.getMoveTimeout().toMillis(),
+          TimeUnit.MILLISECONDS);
+    } catch (InterruptedException e) {
+      Thread.currentThread().interrupt();
+    } catch (TimeoutException e) {
+      long timeoutCounts = moveSelectionToFutureMap.entrySet().stream()
+          .filter(entry -> !entry.getValue().isDone())
+          .peek(entry -> {
+            LOG.warn("Container move canceled for container {} to target {} " +
+                "due to timeout.", entry.getKey().getContainerID(),
+                entry.getKey().getTargetNode().getUuidString());
+            entry.getValue().cancel(true);
+          }).count();
+      LOG.warn("{} Container moves are canceled.", timeoutCounts);
+      metrics.incrementNumContainerMovesTimeoutInLatestIteration(timeoutCounts);
+    } catch (ExecutionException e) {
+      LOG.error("Got exception while checkIterationMoveResults", e);
     }
+
     countDatanodesInvolvedPerIteration =
         sourceToTargetMap.size() + selectedTargets.size();
     metrics.incrementNumDatanodesInvolvedInLatestIteration(
         countDatanodesInvolvedPerIteration);
-    sizeMovedPerIteration /= OzoneConsts.GB;
-    metrics.incrementDataSizeMovedGBInLatestIteration(sizeMovedPerIteration);
-    metrics.incrementNumContainerMoves(
-        metrics.getNumContainerMovesInLatestIteration());
-    metrics.incrementDataSizeMovedGB(sizeMovedPerIteration);
+    metrics.incrementNumContainerMovesCompleted(
+        metrics.getNumContainerMovesCompletedInLatestIteration());
+    metrics.incrementNumContainerMovesTimeout(
+        metrics.getNumContainerMovesTimeoutInLatestIteration());
+    metrics.incrementDataSizeMovedGB(
+        metrics.getDataSizeMovedGBInLatestIteration());
     LOG.info("Number of datanodes involved in this iteration: {}. Size moved " +
             "in this iteration: {}GB.",
-        countDatanodesInvolvedPerIteration, sizeMovedPerIteration);
+        countDatanodesInvolvedPerIteration,
+        metrics.getDataSizeMovedGBInLatestIteration());
   }
 
   /**
@@ -603,25 +585,49 @@ public class ContainerBalancer implements SCMService {
    */
   private boolean moveContainer(DatanodeDetails source,
                                 ContainerMoveSelection moveSelection) {
-    ContainerID container = moveSelection.getContainerID();
+    ContainerID containerID = moveSelection.getContainerID();
     CompletableFuture<ReplicationManager.MoveResult> future;
     try {
+      ContainerInfo containerInfo = containerManager.getContainer(containerID);
       future = replicationManager
-          .move(container, source, moveSelection.getTargetNode());
+          .move(containerID, source, moveSelection.getTargetNode())
+          .whenComplete((result, ex) -> {
+            if (ex != null) {
+              LOG.info("Container move for container {} from source {} to " +
+                      "target {} failed with exceptions {}",
+                  containerID.toString(),
+                  source.getUuidString(),
+                  moveSelection.getTargetNode().getUuidString(), ex);
+            } else {
+              if (result == ReplicationManager.MoveResult.COMPLETED) {
+                metrics.incrementDataSizeMovedGBInLatestIteration(
+                    containerInfo.getUsedBytes() / OzoneConsts.GB);
+                metrics.incrementNumContainerMovesCompletedInLatestIteration(1);
+                if (LOG.isDebugEnabled()) {
+                  LOG.debug(
+                      "Container move completed for container {} to target {}",
+                      containerID,
+                      moveSelection.getTargetNode().getUuidString());
+                }
+              } else {
+                LOG.warn(
+                    "Container move for container {} to target {} failed: {}",
+                    moveSelection.getContainerID(),
+                    moveSelection.getTargetNode().getUuidString(), result);
+              }
+            }
+          });
     } catch (ContainerNotFoundException e) {
-      LOG.warn("Could not find Container {} for container move", container, e);
+      LOG.warn("Could not find Container {} for container move",
+          containerID, e);
       return false;
     } catch (NodeNotFoundException e) {
-      LOG.warn("Container move failed for container {}", container, e);
+      LOG.warn("Container move failed for container {}", containerID, e);
       return false;
     }
+
     if (future.isDone()) {
       if (future.isCompletedExceptionally()) {
-        LOG.info("Container move for container {} from source {} to target {}" +
-                "failed with exceptions",
-            container.toString(),
-            source.getUuidString(),
-            moveSelection.getTargetNode().getUuidString());
         return false;
       } else {
         ReplicationManager.MoveResult result = future.join();
@@ -785,7 +791,8 @@ public class ContainerBalancer implements SCMService {
     this.countDatanodesInvolvedPerIteration = 0;
     this.sizeMovedPerIteration = 0;
     metrics.resetDataSizeMovedGBInLatestIteration();
-    metrics.resetNumContainerMovesInLatestIteration();
+    metrics.resetNumContainerMovesCompletedInLatestIteration();
+    metrics.resetNumContainerMovesTimeoutInLatestIteration();
     metrics.resetNumDatanodesInvolvedInLatestIteration();
     metrics.resetDataSizeUnbalancedGB();
     metrics.resetNumDatanodesUnbalanced();
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
index f40dd443b1..3a7ce49ab2 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
@@ -40,9 +40,13 @@ public final class ContainerBalancerMetrics {
       " in the latest iteration.")
   private MutableCounterLong dataSizeMovedGBInLatestIteration;
 
-  @Metric(about = "Number of container moves performed by Container Balancer " +
-      "in the latest iteration.")
-  private MutableCounterLong numContainerMovesInLatestIteration;
+  @Metric(about = "Number of completed container moves performed by " +
+      "Container Balancer in the latest iteration.")
+  private MutableCounterLong numContainerMovesCompletedInLatestIteration;
+
+  @Metric(about = "Number of timeout container moves performed by " +
+      "Container Balancer in the latest iteration.")
+  private MutableCounterLong numContainerMovesTimeoutInLatestIteration;
 
   @Metric(about = "Number of iterations that Container Balancer has run for.")
   private MutableCounterLong numIterations;
@@ -57,9 +61,13 @@ public final class ContainerBalancerMetrics {
   @Metric(about = "Number of unbalanced datanodes.")
   private MutableCounterLong numDatanodesUnbalanced;
 
-  @Metric(about = "Total number of container moves across all iterations of " +
-      "Container Balancer.")
-  private MutableCounterLong numContainerMoves;
+  @Metric(about = "Total number of completed container moves across all " +
+      "iterations of Container Balancer.")
+  private MutableCounterLong numContainerMovesCompleted;
+
+  @Metric(about = "Total number of timeout container moves across " +
+      "all iterations of Container Balancer.")
+  private MutableCounterLong numContainerMovesTimeout;
 
   @Metric(about = "Total data size in GB moved across all iterations of " +
       "Container Balancer.")
@@ -104,17 +112,37 @@ public final class ContainerBalancerMetrics {
    * latest iteration.
    * @return number of container moves
    */
-  public long getNumContainerMovesInLatestIteration() {
-    return numContainerMovesInLatestIteration.value();
+  public long getNumContainerMovesCompletedInLatestIteration() {
+    return numContainerMovesCompletedInLatestIteration.value();
   }
 
-  public void incrementNumContainerMovesInLatestIteration(long valueToAdd) {
-    this.numContainerMovesInLatestIteration.incr(valueToAdd);
+  public void incrementNumContainerMovesCompletedInLatestIteration(
+      long valueToAdd) {
+    this.numContainerMovesCompletedInLatestIteration.incr(valueToAdd);
   }
 
-  public void resetNumContainerMovesInLatestIteration() {
-    numContainerMovesInLatestIteration.incr(
-        -getNumContainerMovesInLatestIteration());
+  public void resetNumContainerMovesCompletedInLatestIteration() {
+    numContainerMovesCompletedInLatestIteration.incr(
+        -getNumContainerMovesCompletedInLatestIteration());
+  }
+
+  /**
+   * Gets the number of timeout container moves performed by
+   * Container Balancer in the latest iteration.
+   * @return number of timeout container moves
+   */
+  public long getNumContainerMovesTimeoutInLatestIteration() {
+    return numContainerMovesTimeoutInLatestIteration.value();
+  }
+
+  public void incrementNumContainerMovesTimeoutInLatestIteration(
+      long valueToAdd) {
+    this.numContainerMovesTimeoutInLatestIteration.incr(valueToAdd);
+  }
+
+  public void resetNumContainerMovesTimeoutInLatestIteration() {
+    numContainerMovesTimeoutInLatestIteration.incr(
+        -getNumContainerMovesTimeoutInLatestIteration());
   }
 
   /**
@@ -179,12 +207,20 @@ public final class ContainerBalancerMetrics {
     numDatanodesUnbalanced.incr(-getNumDatanodesUnbalanced());
   }
 
-  public long getNumContainerMoves() {
-    return numContainerMoves.value();
+  public long getNumContainerMovesCompleted() {
+    return numContainerMovesCompleted.value();
+  }
+
+  public void incrementNumContainerMovesCompleted(long valueToAdd) {
+    numContainerMovesCompleted.incr(valueToAdd);
+  }
+
+  public long getNumContainerMovesTimeout() {
+    return numContainerMovesTimeout.value();
   }
 
-  public void incrementNumContainerMoves(long valueToAdd) {
-    numContainerMoves.incr(valueToAdd);
+  public void incrementNumContainerMovesTimeout(long valueToAdd) {
+    numContainerMovesTimeout.incr(valueToAdd);
   }
 
   public long getDataSizeMovedGB() {
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
index 0163152134..48069c505d 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
@@ -642,6 +642,40 @@ public class TestContainerBalancer {
     containerBalancer.stop();
   }
 
+  @Test
+  public void checkIterationResultTimeout()
+      throws NodeNotFoundException, ContainerNotFoundException {
+
+    Mockito.when(replicationManager.move(Mockito.any(ContainerID.class),
+            Mockito.any(DatanodeDetails.class),
+            Mockito.any(DatanodeDetails.class)))
+        .thenReturn(genCompletableFuture(500), genCompletableFuture(2000));
+
+    balancerConfiguration.setThreshold(10);
+    balancerConfiguration.setIterations(1);
+    balancerConfiguration.setMaxSizeEnteringTarget(10 * OzoneConsts.GB);
+    balancerConfiguration.setMaxSizeToMovePerIteration(100 * OzoneConsts.GB);
+    balancerConfiguration.setMaxDatanodesPercentageToInvolvePerIteration(100);
+    balancerConfiguration.setMoveTimeout(Duration.ofMillis(1000));
+
+    startBalancer(balancerConfiguration);
+    sleepWhileBalancing(2000);
+
+    /*
+    According to the setup and configurations, this iteration's result should
+    be ITERATION_COMPLETED.
+     */
+    Assert.assertEquals(ContainerBalancer.IterationResult.ITERATION_COMPLETED,
+        containerBalancer.getIterationResult());
+    Assert.assertEquals(1,
+        containerBalancer.getMetrics()
+            .getNumContainerMovesCompletedInLatestIteration());
+    Assert.assertTrue(containerBalancer.getMetrics()
+            .getNumContainerMovesTimeoutInLatestIteration() > 1);
+    containerBalancer.stop();
+
+  }
+
   /**
    * Determines unBalanced nodes, that is, over and under utilized nodes,
    * according to the generated utilization values for nodes and the threshold.
@@ -834,4 +868,16 @@ public class TestContainerBalancer {
     }
   }
 
+  private CompletableFuture<ReplicationManager.MoveResult>
+      genCompletableFuture(int sleepMilSec) {
+    return CompletableFuture.supplyAsync(() -> {
+      try {
+        Thread.sleep(sleepMilSec);
+      } catch (InterruptedException e) {
+        e.printStackTrace();
+      }
+      return ReplicationManager.MoveResult.COMPLETED;
+    });
+  }
+
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org