You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by lj...@apache.org on 2022/04/26 08:23:19 UTC
[ozone] branch master updated: HDDS-6553. Incorrect timeout when checking balancer iteration result. (#3272)
This is an automated email from the ASF dual-hosted git repository.
ljain pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new f575e238f8 HDDS-6553. Incorrect timeout when checking balancer iteration result. (#3272)
f575e238f8 is described below
commit f575e238f885f1fc607fa2896c6917de0d853811
Author: Symious <yi...@foxmail.com>
AuthorDate: Tue Apr 26 16:23:14 2022 +0800
HDDS-6553. Incorrect timeout when checking balancer iteration result. (#3272)
---
.../scm/container/balancer/ContainerBalancer.java | 125 +++++++++++----------
.../balancer/ContainerBalancerMetrics.java | 70 +++++++++---
.../container/balancer/TestContainerBalancer.java | 46 ++++++++
3 files changed, 165 insertions(+), 76 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
index 2e7b04feca..09d289e817 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
@@ -461,62 +461,44 @@ public class ContainerBalancer implements SCMService {
*/
private void checkIterationMoveResults(Set<DatanodeDetails> selectedTargets) {
this.countDatanodesInvolvedPerIteration = 0;
- this.sizeMovedPerIteration = 0;
- for (Map.Entry<ContainerMoveSelection,
- CompletableFuture<ReplicationManager.MoveResult>>
- futureEntry : moveSelectionToFutureMap.entrySet()) {
- ContainerMoveSelection moveSelection = futureEntry.getKey();
- CompletableFuture<ReplicationManager.MoveResult> future =
- futureEntry.getValue();
- try {
- ReplicationManager.MoveResult result = future.get(
- config.getMoveTimeout().toMillis(), TimeUnit.MILLISECONDS);
- if (result == ReplicationManager.MoveResult.COMPLETED) {
- try {
- ContainerInfo container =
- containerManager.getContainer(moveSelection.getContainerID());
- this.sizeMovedPerIteration += container.getUsedBytes();
- metrics.incrementNumContainerMovesInLatestIteration(1);
- LOG.info("Container move completed for container {} to target {}",
- container.containerID(),
- moveSelection.getTargetNode().getUuidString());
- } catch (ContainerNotFoundException e) {
- LOG.warn("Could not find Container {} while " +
- "checking move results in ContainerBalancer",
- moveSelection.getContainerID(), e);
- }
- } else {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Container move for container {} to target {} failed: {}",
- moveSelection.getContainerID(),
- moveSelection.getTargetNode().getUuidString(), result);
- }
- }
- } catch (InterruptedException e) {
- LOG.warn("Interrupted while waiting for container move result for " +
- "container {}.",
- moveSelection.getContainerID(), e);
- Thread.currentThread().interrupt();
- } catch (ExecutionException e) {
- LOG.warn("Container move for container {} completed exceptionally.",
- moveSelection.getContainerID(), e);
- } catch (TimeoutException e) {
- LOG.warn("Container move for container {} timed out.",
- moveSelection.getContainerID(), e);
- }
+
+ CompletableFuture<Void> allFuturesResult = CompletableFuture.allOf(
+ moveSelectionToFutureMap.values()
+ .toArray(new CompletableFuture[moveSelectionToFutureMap.size()]));
+ try {
+ allFuturesResult.get(config.getMoveTimeout().toMillis(),
+ TimeUnit.MILLISECONDS);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ } catch (TimeoutException e) {
+ long timeoutCounts = moveSelectionToFutureMap.entrySet().stream()
+ .filter(entry -> !entry.getValue().isDone())
+ .peek(entry -> {
+ LOG.warn("Container move canceled for container {} to target {} " +
+ "due to timeout.", entry.getKey().getContainerID(),
+ entry.getKey().getTargetNode().getUuidString());
+ entry.getValue().cancel(true);
+ }).count();
+ LOG.warn("{} Container moves are canceled.", timeoutCounts);
+ metrics.incrementNumContainerMovesTimeoutInLatestIteration(timeoutCounts);
+ } catch (ExecutionException e) {
+ LOG.error("Got exception while checkIterationMoveResults", e);
}
+
countDatanodesInvolvedPerIteration =
sourceToTargetMap.size() + selectedTargets.size();
metrics.incrementNumDatanodesInvolvedInLatestIteration(
countDatanodesInvolvedPerIteration);
- sizeMovedPerIteration /= OzoneConsts.GB;
- metrics.incrementDataSizeMovedGBInLatestIteration(sizeMovedPerIteration);
- metrics.incrementNumContainerMoves(
- metrics.getNumContainerMovesInLatestIteration());
- metrics.incrementDataSizeMovedGB(sizeMovedPerIteration);
+ metrics.incrementNumContainerMovesCompleted(
+ metrics.getNumContainerMovesCompletedInLatestIteration());
+ metrics.incrementNumContainerMovesTimeout(
+ metrics.getNumContainerMovesTimeoutInLatestIteration());
+ metrics.incrementDataSizeMovedGB(
+ metrics.getDataSizeMovedGBInLatestIteration());
LOG.info("Number of datanodes involved in this iteration: {}. Size moved " +
"in this iteration: {}GB.",
- countDatanodesInvolvedPerIteration, sizeMovedPerIteration);
+ countDatanodesInvolvedPerIteration,
+ metrics.getDataSizeMovedGBInLatestIteration());
}
/**
@@ -603,25 +585,49 @@ public class ContainerBalancer implements SCMService {
*/
private boolean moveContainer(DatanodeDetails source,
ContainerMoveSelection moveSelection) {
- ContainerID container = moveSelection.getContainerID();
+ ContainerID containerID = moveSelection.getContainerID();
CompletableFuture<ReplicationManager.MoveResult> future;
try {
+ ContainerInfo containerInfo = containerManager.getContainer(containerID);
future = replicationManager
- .move(container, source, moveSelection.getTargetNode());
+ .move(containerID, source, moveSelection.getTargetNode())
+ .whenComplete((result, ex) -> {
+ if (ex != null) {
+ LOG.info("Container move for container {} from source {} to " +
+ "target {} failed with exceptions {}",
+ containerID.toString(),
+ source.getUuidString(),
+ moveSelection.getTargetNode().getUuidString(), ex);
+ } else {
+ if (result == ReplicationManager.MoveResult.COMPLETED) {
+ metrics.incrementDataSizeMovedGBInLatestIteration(
+ containerInfo.getUsedBytes() / OzoneConsts.GB);
+ metrics.incrementNumContainerMovesCompletedInLatestIteration(1);
+ if (LOG.isDebugEnabled()) {
+ LOG.debug(
+ "Container move completed for container {} to target {}",
+ containerID,
+ moveSelection.getTargetNode().getUuidString());
+ }
+ } else {
+ LOG.warn(
+ "Container move for container {} to target {} failed: {}",
+ moveSelection.getContainerID(),
+ moveSelection.getTargetNode().getUuidString(), result);
+ }
+ }
+ });
} catch (ContainerNotFoundException e) {
- LOG.warn("Could not find Container {} for container move", container, e);
+ LOG.warn("Could not find Container {} for container move",
+ containerID, e);
return false;
} catch (NodeNotFoundException e) {
- LOG.warn("Container move failed for container {}", container, e);
+ LOG.warn("Container move failed for container {}", containerID, e);
return false;
}
+
if (future.isDone()) {
if (future.isCompletedExceptionally()) {
- LOG.info("Container move for container {} from source {} to target {}" +
- "failed with exceptions",
- container.toString(),
- source.getUuidString(),
- moveSelection.getTargetNode().getUuidString());
return false;
} else {
ReplicationManager.MoveResult result = future.join();
@@ -785,7 +791,8 @@ public class ContainerBalancer implements SCMService {
this.countDatanodesInvolvedPerIteration = 0;
this.sizeMovedPerIteration = 0;
metrics.resetDataSizeMovedGBInLatestIteration();
- metrics.resetNumContainerMovesInLatestIteration();
+ metrics.resetNumContainerMovesCompletedInLatestIteration();
+ metrics.resetNumContainerMovesTimeoutInLatestIteration();
metrics.resetNumDatanodesInvolvedInLatestIteration();
metrics.resetDataSizeUnbalancedGB();
metrics.resetNumDatanodesUnbalanced();
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
index f40dd443b1..3a7ce49ab2 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
@@ -40,9 +40,13 @@ public final class ContainerBalancerMetrics {
" in the latest iteration.")
private MutableCounterLong dataSizeMovedGBInLatestIteration;
- @Metric(about = "Number of container moves performed by Container Balancer " +
- "in the latest iteration.")
- private MutableCounterLong numContainerMovesInLatestIteration;
+ @Metric(about = "Number of completed container moves performed by " +
+ "Container Balancer in the latest iteration.")
+ private MutableCounterLong numContainerMovesCompletedInLatestIteration;
+
+ @Metric(about = "Number of timeout container moves performed by " +
+ "Container Balancer in the latest iteration.")
+ private MutableCounterLong numContainerMovesTimeoutInLatestIteration;
@Metric(about = "Number of iterations that Container Balancer has run for.")
private MutableCounterLong numIterations;
@@ -57,9 +61,13 @@ public final class ContainerBalancerMetrics {
@Metric(about = "Number of unbalanced datanodes.")
private MutableCounterLong numDatanodesUnbalanced;
- @Metric(about = "Total number of container moves across all iterations of " +
- "Container Balancer.")
- private MutableCounterLong numContainerMoves;
+ @Metric(about = "Total number of completed container moves across all " +
+ "iterations of Container Balancer.")
+ private MutableCounterLong numContainerMovesCompleted;
+
+ @Metric(about = "Total number of timeout container moves across " +
+ "all iterations of Container Balancer.")
+ private MutableCounterLong numContainerMovesTimeout;
@Metric(about = "Total data size in GB moved across all iterations of " +
"Container Balancer.")
@@ -104,17 +112,37 @@ public final class ContainerBalancerMetrics {
* latest iteration.
* @return number of container moves
*/
- public long getNumContainerMovesInLatestIteration() {
- return numContainerMovesInLatestIteration.value();
+ public long getNumContainerMovesCompletedInLatestIteration() {
+ return numContainerMovesCompletedInLatestIteration.value();
}
- public void incrementNumContainerMovesInLatestIteration(long valueToAdd) {
- this.numContainerMovesInLatestIteration.incr(valueToAdd);
+ public void incrementNumContainerMovesCompletedInLatestIteration(
+ long valueToAdd) {
+ this.numContainerMovesCompletedInLatestIteration.incr(valueToAdd);
}
- public void resetNumContainerMovesInLatestIteration() {
- numContainerMovesInLatestIteration.incr(
- -getNumContainerMovesInLatestIteration());
+ public void resetNumContainerMovesCompletedInLatestIteration() {
+ numContainerMovesCompletedInLatestIteration.incr(
+ -getNumContainerMovesCompletedInLatestIteration());
+ }
+
+ /**
+ * Gets the number of timeout container moves performed by
+ * Container Balancer in the latest iteration.
+ * @return number of timeout container moves
+ */
+ public long getNumContainerMovesTimeoutInLatestIteration() {
+ return numContainerMovesTimeoutInLatestIteration.value();
+ }
+
+ public void incrementNumContainerMovesTimeoutInLatestIteration(
+ long valueToAdd) {
+ this.numContainerMovesTimeoutInLatestIteration.incr(valueToAdd);
+ }
+
+ public void resetNumContainerMovesTimeoutInLatestIteration() {
+ numContainerMovesTimeoutInLatestIteration.incr(
+ -getNumContainerMovesTimeoutInLatestIteration());
}
/**
@@ -179,12 +207,20 @@ public final class ContainerBalancerMetrics {
numDatanodesUnbalanced.incr(-getNumDatanodesUnbalanced());
}
- public long getNumContainerMoves() {
- return numContainerMoves.value();
+ public long getNumContainerMovesCompleted() {
+ return numContainerMovesCompleted.value();
+ }
+
+ public void incrementNumContainerMovesCompleted(long valueToAdd) {
+ numContainerMovesCompleted.incr(valueToAdd);
+ }
+
+ public long getNumContainerMovesTimeout() {
+ return numContainerMovesTimeout.value();
}
- public void incrementNumContainerMoves(long valueToAdd) {
- numContainerMoves.incr(valueToAdd);
+ public void incrementNumContainerMovesTimeout(long valueToAdd) {
+ numContainerMovesTimeout.incr(valueToAdd);
}
public long getDataSizeMovedGB() {
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
index 0163152134..48069c505d 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
@@ -642,6 +642,40 @@ public class TestContainerBalancer {
containerBalancer.stop();
}
+ @Test
+ public void checkIterationResultTimeout()
+ throws NodeNotFoundException, ContainerNotFoundException {
+
+ Mockito.when(replicationManager.move(Mockito.any(ContainerID.class),
+ Mockito.any(DatanodeDetails.class),
+ Mockito.any(DatanodeDetails.class)))
+ .thenReturn(genCompletableFuture(500), genCompletableFuture(2000));
+
+ balancerConfiguration.setThreshold(10);
+ balancerConfiguration.setIterations(1);
+ balancerConfiguration.setMaxSizeEnteringTarget(10 * OzoneConsts.GB);
+ balancerConfiguration.setMaxSizeToMovePerIteration(100 * OzoneConsts.GB);
+ balancerConfiguration.setMaxDatanodesPercentageToInvolvePerIteration(100);
+ balancerConfiguration.setMoveTimeout(Duration.ofMillis(1000));
+
+ startBalancer(balancerConfiguration);
+ sleepWhileBalancing(2000);
+
+ /*
+ According to the setup and configurations, this iteration's result should
+ be ITERATION_COMPLETED.
+ */
+ Assert.assertEquals(ContainerBalancer.IterationResult.ITERATION_COMPLETED,
+ containerBalancer.getIterationResult());
+ Assert.assertEquals(1,
+ containerBalancer.getMetrics()
+ .getNumContainerMovesCompletedInLatestIteration());
+ Assert.assertTrue(containerBalancer.getMetrics()
+ .getNumContainerMovesTimeoutInLatestIteration() > 1);
+ containerBalancer.stop();
+
+ }
+
/**
* Determines unBalanced nodes, that is, over and under utilized nodes,
* according to the generated utilization values for nodes and the threshold.
@@ -834,4 +868,16 @@ public class TestContainerBalancer {
}
}
+ private CompletableFuture<ReplicationManager.MoveResult>
+ genCompletableFuture(int sleepMilSec) {
+ return CompletableFuture.supplyAsync(() -> {
+ try {
+ Thread.sleep(sleepMilSec);
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ return ReplicationManager.MoveResult.COMPLETED;
+ });
+ }
+
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org