You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by si...@apache.org on 2022/09/27 05:25:19 UTC
[ozone] branch master updated: HDDS-6492. Add metric for failed container moves (#3751)
This is an automated email from the ASF dual-hosted git repository.
siddhant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 69695eabb7 HDDS-6492. Add metric for failed container moves (#3751)
69695eabb7 is described below
commit 69695eabb73861d6d4dcf8286383c9a673844c37
Author: Sumit Agrawal <su...@gmail.com>
AuthorDate: Tue Sep 27 10:55:15 2022 +0530
HDDS-6492. Add metric for failed container moves (#3751)
---
.../scm/container/balancer/ContainerBalancer.java | 33 ++++++++-----
.../balancer/ContainerBalancerMetrics.java | 29 +++++++++++
.../container/balancer/TestContainerBalancer.java | 56 +++++++++++++++++++---
3 files changed, 100 insertions(+), 18 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
index aec9e14325..0c833627e9 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancer.java
@@ -529,19 +529,10 @@ public class ContainerBalancer extends StatefulService {
allFuturesResult.get(config.getMoveTimeout().toMillis(),
TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
+ LOG.warn("Container balancer is interrupted");
Thread.currentThread().interrupt();
} catch (TimeoutException e) {
- long timeoutCounts = moveSelectionToFutureMap.entrySet().stream()
- .filter(entry -> !entry.getValue().isDone())
- .peek(entry -> {
- LOG.warn("Container move canceled for container {} from source {}" +
- " to target {} due to timeout.",
- entry.getKey().getContainerID(),
- containerToSourceMap.get(entry.getKey().getContainerID())
- .getUuidString(),
- entry.getKey().getTargetNode().getUuidString());
- entry.getValue().cancel(true);
- }).count();
+ long timeoutCounts = cancelAndCountPendingMoves();
LOG.warn("{} Container moves are canceled.", timeoutCounts);
metrics.incrementNumContainerMovesTimeoutInLatestIteration(timeoutCounts);
} catch (ExecutionException e) {
@@ -560,6 +551,8 @@ public class ContainerBalancer extends StatefulService {
sizeActuallyMovedInLatestIteration / OzoneConsts.GB);
metrics.incrementDataSizeMovedGB(
metrics.getDataSizeMovedGBInLatestIteration());
+ metrics.incrementNumContainerMovesFailed(
+ metrics.getNumContainerMovesFailedInLatestIteration());
LOG.info("Iteration Summary. Number of Datanodes involved: {}. Size " +
"moved: {} ({} Bytes). Number of Container moves completed: {}.",
countDatanodesInvolvedPerIteration,
@@ -568,6 +561,20 @@ public class ContainerBalancer extends StatefulService {
metrics.getNumContainerMovesCompletedInLatestIteration());
}
+ /**
+  * Cancels every container move whose future has not completed yet, logging
+  * a warning for each one.
+  *
+  * <p>Used when waiting for the moves of an iteration times out.
+  *
+  * @return the number of moves that were still pending and for which
+  *         cancellation was requested
+  */
+ private long cancelAndCountPendingMoves() {
+   return moveSelectionToFutureMap.entrySet().stream()
+       .filter(entry -> !entry.getValue().isDone())
+       // The log/cancel side effect is part of the mapping function of a
+       // summing reduction instead of peek() + count(): count() is allowed
+       // to skip intermediate operations, whereas sum() has to evaluate the
+       // mapper for every remaining element.
+       .mapToLong(entry -> {
+         LOG.warn("Container move timeout for container {} from source {}" +
+             " to target {}.",
+             entry.getKey().getContainerID(),
+             containerToSourceMap.get(entry.getKey().getContainerID())
+                 .getUuidString(),
+             entry.getKey().getTargetNode().getUuidString());
+         entry.getValue().cancel(true);
+         return 1L;
+       }).sum();
+ }
+
/**
* Match a source datanode with a target datanode and identify the container
* to move.
@@ -668,6 +675,7 @@ public class ContainerBalancer extends StatefulService {
containerID.toString(),
source.getUuidString(),
moveSelection.getTargetNode().getUuidString(), ex);
+ metrics.incrementNumContainerMovesFailedInLatestIteration(1);
} else {
if (result == LegacyReplicationManager.MoveResult.COMPLETED) {
sizeActuallyMovedInLatestIteration +=
@@ -690,9 +698,11 @@ public class ContainerBalancer extends StatefulService {
} catch (ContainerNotFoundException e) {
LOG.warn("Could not find Container {} for container move",
containerID, e);
+ metrics.incrementNumContainerMovesFailedInLatestIteration(1);
return false;
} catch (NodeNotFoundException | TimeoutException e) {
LOG.warn("Container move failed for container {}", containerID, e);
+ metrics.incrementNumContainerMovesFailedInLatestIteration(1);
return false;
}
@@ -871,6 +881,7 @@ public class ContainerBalancer extends StatefulService {
metrics.resetNumDatanodesInvolvedInLatestIteration();
metrics.resetDataSizeUnbalancedGB();
metrics.resetNumDatanodesUnbalanced();
+ metrics.resetNumContainerMovesFailedInLatestIteration();
}
/**
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
index b135c1ca2a..72c1cda7d0 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/balancer/ContainerBalancerMetrics.java
@@ -74,6 +74,14 @@ public final class ContainerBalancerMetrics {
"Container Balancer.")
private MutableCounterLong dataSizeMovedGB;
+ // Cumulative count of failed moves; never reset by the balancer iterations.
+ @Metric(about = "Total number of containers for which moves failed " +
+     "exceptionally across all iterations of Container Balancer.")
+ private MutableCounterLong numContainerMovesFailed;
+
+ // Per-iteration count of failed moves.
+ @Metric(about = "Total number of containers for which moves failed " +
+     "exceptionally in latest iteration of Container Balancer.")
+ private MutableCounterLong numContainerMovesFailedInLatestIteration;
+
/**
* Create and register metrics named {@link ContainerBalancerMetrics#NAME}
* for {@link ContainerBalancer}.
@@ -267,4 +275,25 @@ public final class ContainerBalancerMetrics {
public void incrementDataSizeMovedGB(long valueToAdd) {
dataSizeMovedGB.incr(valueToAdd);
}
+
+ /**
+ * Gets the number of container moves that failed exceptionally across all
+ * iterations of Container Balancer.
+ * @return current value of the cumulative failed-moves counter
+ */
+ public long getNumContainerMovesFailed() {
+ return numContainerMovesFailed.value();
+ }
+
+ /**
+ * Adds the given amount to the cumulative failed-moves counter.
+ * @param valueToAdd number of failed container moves to add
+ */
+ public void incrementNumContainerMovesFailed(long valueToAdd) {
+ numContainerMovesFailed.incr(valueToAdd);
+ }
+
+ /**
+ * Gets the number of container moves that failed exceptionally in the
+ * latest iteration of Container Balancer.
+ * @return current value of the per-iteration failed-moves counter
+ */
+ public long getNumContainerMovesFailedInLatestIteration() {
+ return numContainerMovesFailedInLatestIteration.value();
+ }
+
+ /**
+ * Adds the given amount to the per-iteration failed-moves counter.
+ * @param valueToAdd number of failed container moves to add
+ */
+ public void incrementNumContainerMovesFailedInLatestIteration(
+ long valueToAdd) {
+ numContainerMovesFailedInLatestIteration.incr(valueToAdd);
+ }
+ /**
+ * Brings the per-iteration failed-moves counter back to zero by
+ * incrementing it with the negation of its current value.
+ * NOTE(review): presumably done this way because the counter type offers
+ * no direct reset; confirm against MutableCounterLong.
+ */
+ public void resetNumContainerMovesFailedInLatestIteration() {
+ numContainerMovesFailedInLatestIteration.incr(
+ -getNumContainerMovesFailedInLatestIteration());
+ }
}
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
index 2ddb19dceb..6253784946 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/balancer/TestContainerBalancer.java
@@ -30,15 +30,16 @@ import org.apache.hadoop.hdds.scm.PlacementPolicy;
import org.apache.hadoop.hdds.scm.PlacementPolicyValidateProxy;
import org.apache.hadoop.hdds.scm.container.ContainerID;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
-import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.ContainerManager;
+import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
+import org.apache.hadoop.hdds.scm.container.ContainerReplica;
import org.apache.hadoop.hdds.scm.container.MockNodeManager;
-import org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager;
-import org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager.MoveResult;
-import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
import org.apache.hadoop.hdds.scm.container.placement.algorithms.ContainerPlacementPolicyFactory;
import org.apache.hadoop.hdds.scm.container.placement.algorithms.SCMContainerPlacementMetrics;
import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMNodeStat;
+import org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager;
+import org.apache.hadoop.hdds.scm.container.replication.LegacyReplicationManager.MoveResult;
+import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
import org.apache.hadoop.hdds.scm.ha.SCMService;
import org.apache.hadoop.hdds.scm.ha.SCMServiceManager;
@@ -721,7 +722,7 @@ public class TestContainerBalancer {
Mockito.when(replicationManager.move(Mockito.any(ContainerID.class),
Mockito.any(DatanodeDetails.class),
Mockito.any(DatanodeDetails.class)))
- .thenReturn(genCompletableFuture(500), genCompletableFuture(2000));
+ .thenReturn(genCompletableFuture(500), genCompletableFuture(3000));
balancerConfiguration.setThreshold(10);
balancerConfiguration.setIterations(1);
@@ -779,7 +780,49 @@ public class TestContainerBalancer {
.getNumContainerMovesTimeoutInLatestIteration() > 0);
stopBalancer();
}
-
+
+ /**
+  * Verifies that container moves which fail exceptionally are reflected in
+  * the balancer's failed-moves metric.
+  *
+  * <p>The mocked {@code replicationManager.move(...)} is stubbed to throw
+  * {@link ContainerNotFoundException}, then {@link NodeNotFoundException},
+  * then to return an already-failed future, and finally a future that fails
+  * asynchronously after a short delay.
+  */
+ @Test
+ public void checkIterationResultException()
+     throws NodeNotFoundException, IOException,
+     IllegalContainerBalancerStateException,
+     InvalidContainerBalancerConfigurationException,
+     TimeoutException {
+
+   // Diamond operator instead of the raw type: no unchecked conversion.
+   CompletableFuture<MoveResult> f = new CompletableFuture<>();
+   f.completeExceptionally(new RuntimeException("Runtime Exception"));
+   Mockito.when(replicationManager.move(Mockito.any(ContainerID.class),
+           Mockito.any(DatanodeDetails.class),
+           Mockito.any(DatanodeDetails.class)))
+       .thenThrow(new ContainerNotFoundException("Test Container not found"),
+           new NodeNotFoundException("Test Node not found"))
+       .thenReturn(f).thenReturn(CompletableFuture.supplyAsync(() -> {
+         try {
+           Thread.sleep(200);
+         } catch (InterruptedException ex) {
+           // Do not swallow the interrupt; restore the flag for the pool
+           // thread. The future still completes exceptionally below.
+           Thread.currentThread().interrupt();
+         }
+         throw new RuntimeException("Throw");
+       }));
+
+   balancerConfiguration.setThreshold(10);
+   balancerConfiguration.setIterations(1);
+   balancerConfiguration.setMaxSizeEnteringTarget(10 * STORAGE_UNIT);
+   balancerConfiguration.setMaxSizeToMovePerIteration(100 * STORAGE_UNIT);
+   balancerConfiguration.setMaxDatanodesPercentageToInvolvePerIteration(100);
+   balancerConfiguration.setMoveTimeout(Duration.ofMillis(500));
+
+   startBalancer(balancerConfiguration);
+   sleepWhileBalancing(1000);
+
+   Assertions.assertEquals(
+       ContainerBalancer.IterationResult.ITERATION_COMPLETED,
+       containerBalancer.getIterationResult());
+   Assertions.assertTrue(
+       containerBalancer.getMetrics()
+           .getNumContainerMovesFailed() >= 3);
+   stopBalancer();
+ }
+
@Test
public void testStartAndImmediateStopForDeadlock()
throws IllegalContainerBalancerStateException, IOException,
@@ -999,5 +1042,4 @@ public class TestContainerBalancer {
return LegacyReplicationManager.MoveResult.COMPLETED;
});
}
-
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org