You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by sh...@apache.org on 2021/06/11 07:24:28 UTC
[ozone] branch master updated: HDDS-5263. SCM may stay in safe mode
forever after a unclean shutdown of SCM. (#2294)
This is an automated email from the ASF dual-hosted git repository.
shashikant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new ac7166b HDDS-5263. SCM may stay in safe mode forever after a unclean shutdown of SCM. (#2294)
ac7166b is described below
commit ac7166b88234f89881fa91ecd459e4563b758f92
Author: Bharat Viswanadham <bh...@apache.org>
AuthorDate: Fri Jun 11 12:54:06 2021 +0530
HDDS-5263. SCM may stay in safe mode forever after a unclean shutdown of SCM. (#2294)
---
.../hdds/scm/block/SCMBlockDeletingService.java | 2 +-
.../hdds/scm/container/ReplicationManager.java | 4 +-
.../org/apache/hadoop/hdds/scm/ha/SCMContext.java | 60 +++++++++++++++
.../apache/hadoop/hdds/scm/ha/SCMStateMachine.java | 70 ++++++++++++++----
.../scm/pipeline/BackgroundPipelineCreatorV2.java | 2 +-
.../hdds/scm/safemode/ContainerSafeModeRule.java | 56 ++++++++++++--
.../hdds/scm/safemode/DataNodeSafeModeRule.java | 8 ++
.../scm/safemode/HealthyPipelineSafeModeRule.java | 85 +++++++++++++++-------
.../safemode/OneReplicaPipelineSafeModeRule.java | 70 ++++++++++++------
.../hdds/scm/safemode/SCMSafeModeManager.java | 41 ++++++++++-
.../hadoop/hdds/scm/safemode/SafeModeExitRule.java | 9 +++
.../hdds/scm/server/StorageContainerManager.java | 9 ++-
.../hadoop/hdds/scm/block/TestBlockManager.java | 2 +-
.../apache/hadoop/hdds/scm/ha/TestSCMContext.java | 3 +
.../hdds/scm/pipeline/TestPipelineManagerImpl.java | 5 +-
.../hdds/scm/pipeline/TestSCMPipelineManager.java | 6 +-
.../safemode/TestHealthyPipelineSafeModeRule.java | 6 +-
.../TestOneReplicaPipelineSafeModeRule.java | 2 +-
.../hdds/scm/safemode/TestSCMSafeModeManager.java | 33 ++++++---
.../apache/hadoop/ozone/MiniOzoneClusterImpl.java | 6 ++
.../ozone/scm/TestSCMInstallSnapshotWithHA.java | 1 +
21 files changed, 377 insertions(+), 103 deletions(-)
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
index edf6ca8..46ddc4a 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/block/SCMBlockDeletingService.java
@@ -224,7 +224,7 @@ public class SCMBlockDeletingService extends BackgroundService
public void notifyStatusChanged() {
serviceLock.lock();
try {
- if (scmContext.isLeader()) {
+ if (scmContext.isLeaderReady()) {
serviceStatus = ServiceStatus.RUNNING;
} else {
serviceStatus = ServiceStatus.PAUSING;
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
index 7bc345a..8accae6 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/ReplicationManager.java
@@ -1283,9 +1283,9 @@ public class ReplicationManager implements MetricsSource, SCMService {
public void notifyStatusChanged() {
serviceLock.lock();
try {
- // 1) SCMContext#isLeader returns true.
+ // 1) SCMContext#isLeaderReady returns true.
// 2) not in safe mode.
- if (scmContext.isLeader() && !scmContext.isInSafeMode()) {
+ if (scmContext.isLeaderReady() && !scmContext.isInSafeMode()) {
// transition from PAUSING to RUNNING
if (serviceStatus != ServiceStatus.RUNNING) {
LOG.info("Service {} transitions to RUNNING.", getServiceName());
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMContext.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMContext.java
index 9321e5a..b6e933b 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMContext.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMContext.java
@@ -61,6 +61,7 @@ public final class SCMContext {
* Raft related info.
*/
private boolean isLeader;
+ private boolean isLeaderReady;
private long term;
/**
@@ -77,6 +78,7 @@ public final class SCMContext {
this.term = term;
this.safeModeStatus = safeModeStatus;
this.scm = scm;
+ this.isLeaderReady = false;
}
/**
@@ -90,6 +92,12 @@ public final class SCMContext {
isLeader, term, leader, newTerm);
isLeader = leader;
+ // If it is not leader, set isLeaderReady to false.
+ if (!isLeader) {
+ isLeaderReady = false;
+ LOG.info("update <isLeaderReady> from <{}> to <{}>", isLeaderReady,
+ false);
+ }
term = newTerm;
} finally {
lock.writeLock().unlock();
@@ -97,8 +105,32 @@ public final class SCMContext {
}
/**
+ * Set the isLeaderReady flag to true; this indicates that the leader is
+ * ready to accept transactions.
+ *
+ * On the leader SCM, once all transactions from previous leader terms have
+ * been applied, this is called to set isLeaderReady to true.
+ *
+ */
+ public void setLeaderReady() {
+ lock.writeLock().lock();
+ try {
+ LOG.info("update <isLeaderReady> from <{}> to <{}>",
+ isLeaderReady, true);
+
+ isLeaderReady = true;
+ } finally {
+ lock.writeLock().unlock();
+ }
+ }
+
+ /**
* Check whether current SCM is leader or not.
*
+ * Use this API to know whether SCM can send a command to a DN once it has
+ * been elected as leader.
+ * True - it is leader, else false.
+ *
* @return isLeader
*/
public boolean isLeader() {
@@ -114,6 +146,34 @@ public final class SCMContext {
}
}
+
+ /**
+ * Check whether the current SCM is leader and ready.
+ *
+ * Use this API to know when all transactions from previous leader terms
+ * have been applied and the SCM DB/in-memory state is up to date; only
+ * then should SCM take the particular command/action.
+ *
+ * In general, all background services should use this API to decide when
+ * to start their service.
+ *
+ * True - it is leader and ready (or term is INVALID_TERM), else false.
+ *
+ * @return isLeaderReady
+ */
+ public boolean isLeaderReady() {
+ lock.readLock().lock();
+ try {
+ if (term == INVALID_TERM) {
+ return true;
+ }
+
+ return isLeaderReady;
+ } finally {
+ lock.readLock().unlock();
+ }
+ }
+
/**
* Get term of current leader SCM.
*
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
index 6fdbc5d..f4e13fb 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/ha/SCMStateMachine.java
@@ -26,6 +26,8 @@ import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicLong;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
@@ -80,6 +82,9 @@ public class SCMStateMachine extends BaseStateMachine {
// and reinitialize().
private DBCheckpoint installingDBCheckpoint = null;
+ private AtomicLong currentLeaderTerm = new AtomicLong(-1L);
+ private AtomicBoolean refreshedAfterLeaderReady = new AtomicBoolean(false);
+
public SCMStateMachine(final StorageContainerManager scm,
SCMHADBTransactionBuffer buffer) {
this.scm = scm;
@@ -132,6 +137,12 @@ public class SCMStateMachine extends BaseStateMachine {
final SCMRatisRequest request = SCMRatisRequest.decode(
Message.valueOf(trx.getStateMachineLogEntry().getLogData()));
applyTransactionFuture.complete(process(request));
+
+ // Once previous term transactions are applied, if SCM is still in safe
+ // mode, call refreshAndValidate to update the safemode rule state.
+ if (scm.isInSafeMode() && refreshedAfterLeaderReady.get()) {
+ scm.getScmSafeModeManager().refreshAndValidate();
+ }
transactionBuffer.updateLatestTrxInfo(TransactionInfo.builder()
.setCurrentTerm(trx.getLogEntry().getTerm())
.setTransactionIndex(trx.getLogEntry().getIndex())
@@ -229,21 +240,19 @@ public class SCMStateMachine extends BaseStateMachine {
if (!isInitialized) {
return;
}
+
+ currentLeaderTerm.set(scm.getScmHAManager().getRatisServer().getDivision()
+ .getInfo().getCurrentTerm());
+
if (!groupMemberId.getPeerId().equals(newLeaderId)) {
LOG.info("leader changed, yet current SCM is still follower.");
return;
}
- long term = scm.getScmHAManager()
- .getRatisServer()
- .getDivision()
- .getInfo()
- .getCurrentTerm();
-
- LOG.info("current SCM becomes leader of term {}.", term);
+ LOG.info("current SCM becomes leader of term {}.", currentLeaderTerm);
- scm.getScmContext().updateLeaderAndTerm(true, term);
- scm.getSCMServiceManager().notifyStatusChanged();
+ scm.getScmContext().updateLeaderAndTerm(true,
+ currentLeaderTerm.get());
scm.getSequenceIdGen().invalidateBatch();
DeletedBlockLog deletedBlockLog = scm.getScmBlockManager()
@@ -251,7 +260,6 @@ public class SCMStateMachine extends BaseStateMachine {
Preconditions.checkArgument(
deletedBlockLog instanceof DeletedBlockLogImplV2);
((DeletedBlockLogImplV2) deletedBlockLog).onBecomeLeader();
-
scm.getScmDecommissionManager().onBecomeLeader();
}
@@ -286,17 +294,49 @@ public class SCMStateMachine extends BaseStateMachine {
@Override
public void notifyTermIndexUpdated(long term, long index) {
- if (transactionBuffer != null) {
- transactionBuffer.updateLatestTrxInfo(
- TransactionInfo.builder().setCurrentTerm(term)
- .setTransactionIndex(index).build());
- }
+
// We need to call updateLastApplied here because now in ratis when a
// node becomes leader, it is checking stateMachineIndex >=
// placeHolderIndex (when a node becomes leader, it writes a conf entry
// with some information like its peers and termIndex). So, calling
// updateLastApplied updates lastAppliedTermIndex.
updateLastAppliedTermIndex(term, index);
+
+ // Skip below part if state machine is not initialized.
+
+ if (!isInitialized) {
+ return;
+ }
+
+ if (transactionBuffer != null) {
+ transactionBuffer.updateLatestTrxInfo(
+ TransactionInfo.builder().setCurrentTerm(term)
+ .setTransactionIndex(index).build());
+ }
+
+ if (currentLeaderTerm.get() == term) {
+ // On the leader SCM, once it is ready, set leader ready in SCMContext
+ // and notify the SCM services.
+ if (scm.getScmHAManager().getRatisServer().getDivision().getInfo()
+ .isLeaderReady()) {
+ scm.getScmContext().setLeaderReady();
+ scm.getSCMServiceManager().notifyStatusChanged();
+ }
+
+ // Reaching here means all transactions before this term have been
+ // applied, i.e. after a restart all pending transactions are applied.
+ // Perform the following once:
+ // 1. Refresh Safemode rules state.
+ // 2. Start DN Rpc server.
+ if (!refreshedAfterLeaderReady.get()) {
+ scm.getScmSafeModeManager().refresh();
+ LOG.info("bharat starting from sm");
+ scm.getDatanodeProtocolServer().start();
+
+ refreshedAfterLeaderReady.set(true);
+ }
+ currentLeaderTerm.set(-1L);
+ }
}
@Override
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/BackgroundPipelineCreatorV2.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/BackgroundPipelineCreatorV2.java
index 2ab0b31..a88f422 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/BackgroundPipelineCreatorV2.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/pipeline/BackgroundPipelineCreatorV2.java
@@ -254,7 +254,7 @@ public class BackgroundPipelineCreatorV2 implements SCMService {
try {
// 1) SCMContext#isLeader returns true.
// 2) not in safe mode or createPipelineInSafeMode is true
- if (scmContext.isLeader() &&
+ if (scmContext.isLeaderReady() &&
(!scmContext.isInSafeMode() || createPipelineInSafeMode)) {
// transition from PAUSING to RUNNING
if (serviceStatus != ServiceStatus.RUNNING) {
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
index ffa0209..f2d7c02 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/ContainerSafeModeRule.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerManagerV2;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.server.SCMDatanodeProtocolServer.NodeRegistrationContainerReport;
import org.apache.hadoop.hdds.server.events.EventQueue;
@@ -34,6 +35,8 @@ import org.apache.hadoop.hdds.server.events.TypedEvent;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* Class defining Safe mode exit criteria for Containers.
@@ -41,6 +44,8 @@ import com.google.common.base.Preconditions;
public class ContainerSafeModeRule extends
SafeModeExitRule<NodeRegistrationContainerReport>{
+ public static final Logger LOG =
+ LoggerFactory.getLogger(ContainerSafeModeRule.class);
// Required cutoff % for containers with at least 1 reported replica.
private double safeModeCutoff;
// Containers read from scm db (excluding containers in ALLOCATED state).
@@ -48,11 +53,14 @@ public class ContainerSafeModeRule extends
private double maxContainer;
private AtomicLong containerWithMinReplicas = new AtomicLong(0);
+ private final ContainerManagerV2 containerManager;
public ContainerSafeModeRule(String ruleName, EventQueue eventQueue,
ConfigurationSource conf,
- List<ContainerInfo> containers, SCMSafeModeManager manager) {
+ List<ContainerInfo> containers,
+ ContainerManagerV2 containerManager, SCMSafeModeManager manager) {
super(manager, ruleName, eventQueue);
+ this.containerManager = containerManager;
safeModeCutoff = conf.getDouble(
HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT_DEFAULT);
@@ -77,6 +85,8 @@ public class ContainerSafeModeRule extends
maxContainer = containerMap.size();
long cutOff = (long) Math.ceil(maxContainer * safeModeCutoff);
getSafeModeMetrics().setNumContainerWithOneReplicaReportedThreshold(cutOff);
+
+ LOG.info("containers with one replica threshold count {}", cutOff);
}
@@ -87,12 +97,12 @@ public class ContainerSafeModeRule extends
@Override
- protected boolean validate() {
+ protected synchronized boolean validate() {
return getCurrentContainerThreshold() >= safeModeCutoff;
}
@VisibleForTesting
- public double getCurrentContainerThreshold() {
+ public synchronized double getCurrentContainerThreshold() {
if (maxContainer == 0) {
return 1;
}
@@ -100,7 +110,8 @@ public class ContainerSafeModeRule extends
}
@Override
- protected void process(NodeRegistrationContainerReport reportsProto) {
+ protected synchronized void process(
+ NodeRegistrationContainerReport reportsProto) {
reportsProto.getReport().getReportsList().forEach(c -> {
if (containerMap.containsKey(c.getContainerID())) {
@@ -121,7 +132,7 @@ public class ContainerSafeModeRule extends
}
@Override
- protected void cleanup() {
+ protected synchronized void cleanup() {
containerMap.clear();
}
@@ -134,4 +145,39 @@ public class ContainerSafeModeRule extends
getCurrentContainerThreshold(), this.safeModeCutoff);
}
+
+ @Override
+ public synchronized void refresh(boolean forceRefresh) {
+ if (forceRefresh) {
+ reInitializeRule();
+ } else {
+ if (!validate()) {
+ reInitializeRule();
+ }
+ }
+ }
+
+ private void reInitializeRule() {
+ containerMap.clear();
+ containerManager.getContainers().forEach(container -> {
+ // There can be containers in OPEN/CLOSING state which were never
+ // created by the client. We are not considering these containers for
+ // now. These containers can be handled by tracking pipelines.
+
+ Optional.ofNullable(container.getState())
+ .filter(state -> (state == HddsProtos.LifeCycleState.QUASI_CLOSED ||
+ state == HddsProtos.LifeCycleState.CLOSED))
+ .ifPresent(s -> containerMap.put(container.getContainerID(),
+ container));
+ });
+
+ maxContainer = containerMap.size();
+ long cutOff = (long) Math.ceil(maxContainer * safeModeCutoff);
+
+ LOG.info("Refreshed one replica container threshold {}, " +
+ "currentThreshold {}", cutOff, containerWithMinReplicas.get());
+ getSafeModeMetrics()
+ .setNumContainerWithOneReplicaReportedThreshold(cutOff);
+ }
+
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java
index ea5a78f..0c4ce84 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/DataNodeSafeModeRule.java
@@ -86,4 +86,12 @@ public class DataNodeSafeModeRule extends
.format("registered datanodes (=%d) >= required datanodes (=%d)",
this.registeredDns, this.requiredDns);
}
+
+
+ @Override
+ public void refresh(boolean forceRefresh) {
+ // Do nothing.
+ // This rule does not read anything from SCM DB state, so there is
+ // nothing to refresh or re-validate.
+ }
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java
index 14bc58b..3ecf550 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/HealthyPipelineSafeModeRule.java
@@ -51,24 +51,22 @@ public class HealthyPipelineSafeModeRule extends SafeModeExitRule<Pipeline> {
private int currentHealthyPipelineCount = 0;
private final double healthyPipelinesPercent;
private final Set<PipelineID> processedPipelineIDs = new HashSet<>();
+ private final PipelineManager pipelineManager;
+ private final int minHealthyPipelines;
HealthyPipelineSafeModeRule(String ruleName, EventQueue eventQueue,
PipelineManager pipelineManager,
SCMSafeModeManager manager, ConfigurationSource configuration) {
super(manager, ruleName, eventQueue);
+ this.pipelineManager = pipelineManager;
healthyPipelinesPercent =
configuration.getDouble(HddsConfigKeys.
HDDS_SCM_SAFEMODE_HEALTHY_PIPELINE_THRESHOLD_PCT,
HddsConfigKeys.
HDDS_SCM_SAFEMODE_HEALTHY_PIPELINE_THRESHOLD_PCT_DEFAULT);
- int minDatanodes = configuration.getInt(
- HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE,
- HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE_DEFAULT);
-
// We only care about THREE replica pipeline
- int minHealthyPipelines = minDatanodes /
- HddsProtos.ReplicationFactor.THREE_VALUE;
+ minHealthyPipelines = getMinHealthyPipelines(configuration);
Preconditions.checkArgument(
(healthyPipelinesPercent >= 0.0 && healthyPipelinesPercent <= 1.0),
@@ -76,26 +74,22 @@ public class HealthyPipelineSafeModeRule extends SafeModeExitRule<Pipeline> {
HDDS_SCM_SAFEMODE_HEALTHY_PIPELINE_THRESHOLD_PCT
+ " value should be >= 0.0 and <= 1.0");
- // We want to wait for RATIS THREE factor write pipelines
- int pipelineCount = pipelineManager.getPipelines(
- new RatisReplicationConfig(HddsProtos.ReplicationFactor.THREE),
- Pipeline.PipelineState.OPEN).size();
+ initializeRule(false);
+ }
- // This value will be zero when pipeline count is 0.
- // On a fresh installed cluster, there will be zero pipelines in the SCM
- // pipeline DB.
- healthyPipelineThresholdCount = Math.max(minHealthyPipelines,
- (int) Math.ceil(healthyPipelinesPercent * pipelineCount));
+ private int getMinHealthyPipelines(ConfigurationSource config) {
+ int minDatanodes = config.getInt(
+ HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE,
+ HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE_DEFAULT);
- LOG.info("Total pipeline count is {}, healthy pipeline " +
- "threshold count is {}", pipelineCount, healthyPipelineThresholdCount);
+ // We only care about THREE replica pipeline
+ return minDatanodes / HddsProtos.ReplicationFactor.THREE_VALUE;
- getSafeModeMetrics().setNumHealthyPipelinesThreshold(
- healthyPipelineThresholdCount);
}
@VisibleForTesting
- public void setHealthyPipelineThresholdCount(int actualPipelineCount) {
+ public synchronized void setHealthyPipelineThresholdCount(
+ int actualPipelineCount) {
healthyPipelineThresholdCount =
(int) Math.ceil(healthyPipelinesPercent * actualPipelineCount);
}
@@ -106,12 +100,12 @@ public class HealthyPipelineSafeModeRule extends SafeModeExitRule<Pipeline> {
}
@Override
- protected boolean validate() {
+ protected synchronized boolean validate() {
return currentHealthyPipelineCount >= healthyPipelineThresholdCount;
}
@Override
- protected void process(Pipeline pipeline) {
+ protected synchronized void process(Pipeline pipeline) {
// When SCM is in safe mode for long time, already registered
// datanode can send pipeline report again, or SCMPipelineManager will
@@ -130,22 +124,57 @@ public class HealthyPipelineSafeModeRule extends SafeModeExitRule<Pipeline> {
SCMSafeModeManager.getLogger().info(
"SCM in safe mode. Healthy pipelines reported count is {}, " +
"required healthy pipeline reported count is {}",
- currentHealthyPipelineCount, healthyPipelineThresholdCount);
+ currentHealthyPipelineCount, getHealthyPipelineThresholdCount());
+
}
}
+
+ public synchronized void refresh(boolean forceRefresh) {
+ if (forceRefresh) {
+ initializeRule(true);
+ } else {
+ if (!validate()) {
+ initializeRule(true);
+ }
+ }
+ }
+
+ private synchronized void initializeRule(boolean refresh) {
+ int pipelineCount = pipelineManager.getPipelines(
+ new RatisReplicationConfig(HddsProtos.ReplicationFactor.THREE),
+ Pipeline.PipelineState.OPEN).size();
+
+ healthyPipelineThresholdCount = Math.max(minHealthyPipelines,
+ (int) Math.ceil(healthyPipelinesPercent * pipelineCount));
+
+ if (refresh) {
+ LOG.info("Refreshed total pipeline count is {}, healthy pipeline " +
+ "threshold count is {}", pipelineCount,
+ healthyPipelineThresholdCount);
+ } else {
+ LOG.info("Total pipeline count is {}, healthy pipeline " +
+ "threshold count is {}", pipelineCount,
+ healthyPipelineThresholdCount);
+ }
+
+ getSafeModeMetrics().setNumHealthyPipelinesThreshold(
+ healthyPipelineThresholdCount);
+ }
+
+
@Override
- protected void cleanup() {
+ protected synchronized void cleanup() {
processedPipelineIDs.clear();
}
@VisibleForTesting
- public int getCurrentHealthyPipelineCount() {
+ public synchronized int getCurrentHealthyPipelineCount() {
return currentHealthyPipelineCount;
}
@VisibleForTesting
- public int getHealthyPipelineThresholdCount() {
+ public synchronized int getHealthyPipelineThresholdCount() {
return healthyPipelineThresholdCount;
}
@@ -153,7 +182,7 @@ public class HealthyPipelineSafeModeRule extends SafeModeExitRule<Pipeline> {
public String getStatusText() {
return String.format("healthy Ratis/THREE pipelines (=%d) >= "
+ "healthyPipelineThresholdCount (=%d)",
- this.currentHealthyPipelineCount,
- this.healthyPipelineThresholdCount);
+ getCurrentHealthyPipelineCount(),
+ getHealthyPipelineThresholdCount());
}
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/OneReplicaPipelineSafeModeRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/OneReplicaPipelineSafeModeRule.java
index 3f5bcd6..0303e46 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/OneReplicaPipelineSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/OneReplicaPipelineSafeModeRule.java
@@ -56,6 +56,7 @@ public class OneReplicaPipelineSafeModeRule extends
private Set<PipelineID> oldPipelineIDSet;
private int currentReportedPipelineCount = 0;
private PipelineManager pipelineManager;
+ private final double pipelinePercent;
public OneReplicaPipelineSafeModeRule(String ruleName, EventQueue eventQueue,
@@ -63,32 +64,21 @@ public class OneReplicaPipelineSafeModeRule extends
SCMSafeModeManager safeModeManager, ConfigurationSource configuration) {
super(safeModeManager, ruleName, eventQueue);
- double percent =
+ pipelinePercent =
configuration.getDouble(
HddsConfigKeys.HDDS_SCM_SAFEMODE_ONE_NODE_REPORTED_PIPELINE_PCT,
HddsConfigKeys.
HDDS_SCM_SAFEMODE_ONE_NODE_REPORTED_PIPELINE_PCT_DEFAULT);
- Preconditions.checkArgument((percent >= 0.0 && percent <= 1.0),
+ Preconditions.checkArgument((pipelinePercent >= 0.0
+ && pipelinePercent <= 1.0),
HddsConfigKeys.
HDDS_SCM_SAFEMODE_ONE_NODE_REPORTED_PIPELINE_PCT +
" value should be >= 0.0 and <= 1.0");
this.pipelineManager = pipelineManager;
- oldPipelineIDSet = pipelineManager.getPipelines(
- new RatisReplicationConfig(ReplicationFactor.THREE),
- Pipeline.PipelineState.OPEN)
- .stream().map(p -> p.getId()).collect(Collectors.toSet());
- int totalPipelineCount = oldPipelineIDSet.size();
-
- thresholdCount = (int) Math.ceil(percent * totalPipelineCount);
-
- LOG.info("Total pipeline count is {}, pipeline's with at least one " +
- "datanode reported threshold count is {}", totalPipelineCount,
- thresholdCount);
+ initializeRule(false);
- getSafeModeMetrics().setNumPipelinesWithAtleastOneReplicaReportedThreshold(
- thresholdCount);
}
@Override
@@ -97,12 +87,12 @@ public class OneReplicaPipelineSafeModeRule extends
}
@Override
- protected boolean validate() {
+ protected synchronized boolean validate() {
return currentReportedPipelineCount >= thresholdCount;
}
@Override
- protected void process(PipelineReportFromDatanode report) {
+ protected synchronized void process(PipelineReportFromDatanode report) {
Preconditions.checkNotNull(report);
for (PipelineReport report1 : report.getReport().getPipelineReportList()) {
Pipeline pipeline;
@@ -136,17 +126,17 @@ public class OneReplicaPipelineSafeModeRule extends
}
@Override
- protected void cleanup() {
+ protected synchronized void cleanup() {
reportedPipelineIDSet.clear();
}
@VisibleForTesting
- public int getThresholdCount() {
+ public synchronized int getThresholdCount() {
return thresholdCount;
}
@VisibleForTesting
- public int getCurrentReportedPipelineCount() {
+ public synchronized int getCurrentReportedPipelineCount() {
return currentReportedPipelineCount;
}
@@ -156,7 +146,43 @@ public class OneReplicaPipelineSafeModeRule extends
.format(
"reported Ratis/THREE pipelines with at least one datanode (=%d) "
+ ">= threshold (=%d)",
- this.currentReportedPipelineCount,
- this.thresholdCount);
+ getCurrentReportedPipelineCount(),
+ getThresholdCount());
+ }
+
+ @Override
+ public synchronized void refresh(boolean forceRefresh) {
+ if (forceRefresh) {
+ initializeRule(true);
+ } else {
+ if (!validate()) {
+ initializeRule(true);
+ }
+ }
+ }
+
+ private void initializeRule(boolean refresh) {
+
+ oldPipelineIDSet = pipelineManager.getPipelines(
+ new RatisReplicationConfig(ReplicationFactor.THREE),
+ Pipeline.PipelineState.OPEN)
+ .stream().map(p -> p.getId()).collect(Collectors.toSet());
+
+ int totalPipelineCount = oldPipelineIDSet.size();
+
+ thresholdCount = (int) Math.ceil(pipelinePercent * totalPipelineCount);
+
+ if (refresh) {
+ LOG.info("Refreshed Total pipeline count is {}, pipeline's with at " +
+ "least one datanode reported threshold count is {}",
+ totalPipelineCount, thresholdCount);
+ } else {
+ LOG.info("Total pipeline count is {}, pipeline's with at " +
+ "least one datanode reported threshold count is {}",
+ totalPipelineCount, thresholdCount);
+ }
+
+ getSafeModeMetrics().setNumPipelinesWithAtleastOneReplicaReportedThreshold(
+ thresholdCount);
}
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
index 206e40b..60dba33 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SCMSafeModeManager.java
@@ -28,6 +28,7 @@ import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
+import org.apache.hadoop.hdds.scm.container.ContainerManagerV2;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
import org.apache.hadoop.hdds.scm.ha.SCMContext;
import org.apache.hadoop.hdds.scm.ha.SCMService.Event;
@@ -109,7 +110,8 @@ public class SCMSafeModeManager implements SafeModeManager {
private final SafeModeMetrics safeModeMetrics;
public SCMSafeModeManager(ConfigurationSource conf,
- List<ContainerInfo> allContainers, PipelineManager pipelineManager,
+ List<ContainerInfo> allContainers,
+ ContainerManagerV2 containerManager, PipelineManager pipelineManager,
EventQueue eventQueue, SCMServiceManager serviceManager,
SCMContext scmContext) {
this.config = conf;
@@ -125,7 +127,7 @@ public class SCMSafeModeManager implements SafeModeManager {
this.safeModeMetrics = SafeModeMetrics.create();
ContainerSafeModeRule containerSafeModeRule =
new ContainerSafeModeRule(CONT_EXIT_RULE, eventQueue, config,
- allContainers, this);
+ allContainers, containerManager, this);
DataNodeSafeModeRule dataNodeSafeModeRule =
new DataNodeSafeModeRule(DN_EXIT_RULE, eventQueue, config, this);
exitRules.put(CONT_EXIT_RULE, containerSafeModeRule);
@@ -194,11 +196,13 @@ public class SCMSafeModeManager implements SafeModeManager {
EventPublisher eventQueue) {
if (exitRules.get(ruleName) != null) {
- validatedRules.add(ruleName);
+ boolean added = validatedRules.add(ruleName);
if (preCheckRules.contains(ruleName)) {
validatedPreCheckRules.add(ruleName);
}
- LOG.info("{} rule is successfully validated", ruleName);
+ if (added) {
+ LOG.info("{} rule is successfully validated", ruleName);
+ }
} else {
// This should never happen
LOG.error("No Such Exit rule {}", ruleName);
@@ -253,6 +257,35 @@ public class SCMSafeModeManager implements SafeModeManager {
emitSafeModeStatus();
}
+ /**
+ * Refresh Rule state.
+ */
+ public void refresh() {
+ if (inSafeMode.get()) {
+ exitRules.values().forEach(rule -> {
+ // Refresh the rule irrespective of validate(): at this point
+ // validate() reflects stale state, not the current state, so its
+ // result cannot be relied upon.
+ rule.refresh(true);
+ });
+ }
+ }
+
+ /**
+ * Refresh Rule state and validate rules.
+ */
+ public void refreshAndValidate() {
+ if (inSafeMode.get()) {
+ exitRules.values().forEach(rule -> {
+ rule.refresh(false);
+ if (rule.validate() && inSafeMode.get()) {
+ validateSafeModeExitRules(rule.getRuleName(), eventPublisher);
+ rule.cleanup();
+ }
+ });
+ }
+ }
+
@Override
public boolean getInSafeMode() {
if (!isSafeModeEnabled) {
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java
index 5c553b8..0d68cf6 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/safemode/SafeModeExitRule.java
@@ -116,4 +116,13 @@ public abstract class SafeModeExitRule<T> implements EventHandler<T> {
* @return status text.
*/
abstract String getStatusText();
+
+ /**
+ * Refresh the rule state from current state of SCM.
+ *
+ * @param forceRefresh - refresh the rule irrespective of whether
+ * validate() is true or false.
+ *
+ */
+ protected abstract void refresh(boolean forceRefresh);
}
diff --git a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
index 388dd8e..097de16 100644
--- a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
+++ b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/server/StorageContainerManager.java
@@ -579,7 +579,7 @@ public final class StorageContainerManager extends ServiceRuntimeInfoImpl
scmSafeModeManager = configurator.getScmSafeModeManager();
} else {
scmSafeModeManager = new SCMSafeModeManager(conf,
- containerManager.getContainers(),
+ containerManager.getContainers(), containerManager,
pipelineManager, eventQueue, serviceManager, scmContext);
}
scmDecommissionManager = new NodeDecommissionManager(conf, scmNodeManager,
@@ -1229,7 +1229,11 @@ public final class StorageContainerManager extends ServiceRuntimeInfoImpl
LOG.info(buildRpcServerStartMessage("ScmDatanodeProtocl RPC " +
"server", getDatanodeProtocolServer().getDatanodeRpcAddress()));
}
- getDatanodeProtocolServer().start();
+
+ // If HA is enabled, start datanode protocol server once leader is ready.
+ if (!scmStorageConfig.isSCMHAEnabled()) {
+ getDatanodeProtocolServer().start();
+ }
if (getSecurityProtocolServer() != null) {
getSecurityProtocolServer().start();
persistSCMCertificates();
@@ -1516,7 +1520,6 @@ public final class StorageContainerManager extends ServiceRuntimeInfoImpl
public boolean checkLeader() {
// For NON-HA setup, the node will always be the leader
if (!SCMHAUtils.isSCMHAEnabled(configuration)) {
- Preconditions.checkArgument(scmContext.isLeader());
return true;
} else {
// FOR HA setup, the node has to be the leader and ready to serve
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestBlockManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestBlockManager.java
index 16c06cb..c033831 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestBlockManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/block/TestBlockManager.java
@@ -157,7 +157,7 @@ public class TestBlockManager {
pipelineManager,
scmMetadataStore.getContainerTable());
SCMSafeModeManager safeModeManager = new SCMSafeModeManager(conf,
- containerManager.getContainers(),
+ containerManager.getContainers(), containerManager,
pipelineManager, eventQueue, serviceManager, scmContext) {
@Override
public void emitSafeModeStatus() {
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestSCMContext.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestSCMContext.java
index 9fe2da1..f60c2eb 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestSCMContext.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/ha/TestSCMContext.java
@@ -41,7 +41,9 @@ public class TestSCMContext {
// become leader
scmContext.updateLeaderAndTerm(true, 10);
+ scmContext.setLeaderReady();
assertTrue(scmContext.isLeader());
+ assertTrue(scmContext.isLeaderReady());
try {
assertEquals(scmContext.getTermOfLeader(), 10);
} catch (NotLeaderException e) {
@@ -51,6 +53,7 @@ public class TestSCMContext {
// step down
scmContext.updateLeaderAndTerm(false, 0);
assertFalse(scmContext.isLeader());
+ assertFalse(scmContext.isLeaderReady());
}
@Test
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestPipelineManagerImpl.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestPipelineManagerImpl.java
index a32f222..a0bb44a 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestPipelineManagerImpl.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestPipelineManagerImpl.java
@@ -348,7 +348,7 @@ public class TestPipelineManagerImpl {
public void testPipelineReport() throws Exception {
PipelineManagerV2Impl pipelineManager = createPipelineManager(true);
SCMSafeModeManager scmSafeModeManager =
- new SCMSafeModeManager(conf, new ArrayList<>(), pipelineManager,
+ new SCMSafeModeManager(conf, new ArrayList<>(), null, pipelineManager,
new EventQueue(), serviceManager, scmContext);
Pipeline pipeline = pipelineManager
.createPipeline(new RatisReplicationConfig(ReplicationFactor.THREE));
@@ -465,7 +465,8 @@ public class TestPipelineManagerImpl {
SCMSafeModeManager scmSafeModeManager =
new SCMSafeModeManager(new OzoneConfiguration(), new ArrayList<>(),
- pipelineManager, new EventQueue(), serviceManager, scmContext);
+ null, pipelineManager, new EventQueue(),
+ serviceManager, scmContext);
PipelineReportHandler pipelineReportHandler =
new PipelineReportHandler(scmSafeModeManager, pipelineManager,
SCMContext.emptyContext(), conf);
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestSCMPipelineManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestSCMPipelineManager.java
index f67779f..daf171d 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestSCMPipelineManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/pipeline/TestSCMPipelineManager.java
@@ -222,8 +222,8 @@ public class TestSCMPipelineManager {
mockRatisProvider);
SCMSafeModeManager scmSafeModeManager =
- new SCMSafeModeManager(conf, new ArrayList<>(), pipelineManager,
- eventQueue, new SCMServiceManager(),
+ new SCMSafeModeManager(conf, new ArrayList<>(), null,
+ pipelineManager, eventQueue, new SCMServiceManager(),
SCMContext.emptyContext());
// create a pipeline in allocated state with no dns yet reported
@@ -494,7 +494,7 @@ public class TestSCMPipelineManager {
SCMSafeModeManager scmSafeModeManager =
new SCMSafeModeManager(new OzoneConfiguration(), new ArrayList<>(),
- pipelineManager, eventQueue,
+ null, pipelineManager, eventQueue,
new SCMServiceManager(),
SCMContext.emptyContext());
PipelineReportHandler pipelineReportHandler =
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestHealthyPipelineSafeModeRule.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestHealthyPipelineSafeModeRule.java
index 317b879..14b839d 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestHealthyPipelineSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestHealthyPipelineSafeModeRule.java
@@ -92,7 +92,7 @@ public class TestHealthyPipelineSafeModeRule {
pipelineManager.setPipelineProvider(HddsProtos.ReplicationType.RATIS,
mockRatisProvider);
SCMSafeModeManager scmSafeModeManager = new SCMSafeModeManager(
- config, containers, pipelineManager, eventQueue,
+ config, containers, null, pipelineManager, eventQueue,
serviceManager, scmContext);
HealthyPipelineSafeModeRule healthyPipelineSafeModeRule =
@@ -172,7 +172,7 @@ public class TestHealthyPipelineSafeModeRule {
MockRatisPipelineProvider.markPipelineHealthy(pipeline3);
SCMSafeModeManager scmSafeModeManager = new SCMSafeModeManager(
- config, containers, pipelineManager, eventQueue,
+ config, containers, null, pipelineManager, eventQueue,
serviceManager, scmContext);
HealthyPipelineSafeModeRule healthyPipelineSafeModeRule =
@@ -270,7 +270,7 @@ public class TestHealthyPipelineSafeModeRule {
MockRatisPipelineProvider.markPipelineHealthy(pipeline3);
SCMSafeModeManager scmSafeModeManager = new SCMSafeModeManager(
- config, containers, pipelineManager, eventQueue,
+ config, containers, null, pipelineManager, eventQueue,
serviceManager, scmContext);
HealthyPipelineSafeModeRule healthyPipelineSafeModeRule =
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestOneReplicaPipelineSafeModeRule.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestOneReplicaPipelineSafeModeRule.java
index 11042fa..e91a505 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestOneReplicaPipelineSafeModeRule.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestOneReplicaPipelineSafeModeRule.java
@@ -111,7 +111,7 @@ public class TestOneReplicaPipelineSafeModeRule {
HddsProtos.ReplicationFactor.ONE);
SCMSafeModeManager scmSafeModeManager =
- new SCMSafeModeManager(ozoneConfiguration, containers,
+ new SCMSafeModeManager(ozoneConfiguration, containers, null,
pipelineManager, eventQueue, serviceManager, scmContext);
rule = scmSafeModeManager.getOneReplicaPipelineSafeModeRule();
diff --git a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
index 0f71ea0..8ac3fcd 100644
--- a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
+++ b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/safemode/TestSCMSafeModeManager.java
@@ -123,7 +123,7 @@ public class TestSCMSafeModeManager {
@Test
public void testSafeModeStateWithNullContainers() {
new SCMSafeModeManager(config, Collections.emptyList(),
- null, queue, serviceManager, scmContext);
+ null, null, queue, serviceManager, scmContext);
}
private void testSafeMode(int numContainers) throws Exception {
@@ -135,7 +135,8 @@ public class TestSCMSafeModeManager {
container.setState(HddsProtos.LifeCycleState.CLOSED);
}
scmSafeModeManager = new SCMSafeModeManager(
- config, containers, null, queue, serviceManager, scmContext);
+ config, containers, null, null, queue,
+ serviceManager, scmContext);
assertTrue(scmSafeModeManager.getInSafeMode());
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
@@ -168,7 +169,8 @@ public class TestSCMSafeModeManager {
container.setState(HddsProtos.LifeCycleState.CLOSED);
}
scmSafeModeManager = new SCMSafeModeManager(
- config, containers, null, queue, serviceManager, scmContext);
+ config, containers, null, null, queue,
+ serviceManager, scmContext);
long cutOff = (long) Math.ceil(numContainers * config.getDouble(
HddsConfigKeys.HDDS_SCM_SAFEMODE_THRESHOLD_PCT,
@@ -264,7 +266,8 @@ public class TestSCMSafeModeManager {
scmContext,
serviceManager);
scmSafeModeManager = new SCMSafeModeManager(
- conf, containers, pipelineManager, queue, serviceManager, scmContext);
+ conf, containers, null, pipelineManager, queue, serviceManager,
+ scmContext);
fail("testFailWithIncorrectValueForHealthyPipelinePercent");
} catch (IllegalArgumentException ex) {
GenericTestUtils.assertExceptionContains("value should be >= 0.0 and <=" +
@@ -289,7 +292,8 @@ public class TestSCMSafeModeManager {
scmContext,
serviceManager);
scmSafeModeManager = new SCMSafeModeManager(
- conf, containers, pipelineManager, queue, serviceManager, scmContext);
+ conf, containers, null, pipelineManager, queue, serviceManager,
+ scmContext);
fail("testFailWithIncorrectValueForOneReplicaPipelinePercent");
} catch (IllegalArgumentException ex) {
GenericTestUtils.assertExceptionContains("value should be >= 0.0 and <=" +
@@ -313,7 +317,8 @@ public class TestSCMSafeModeManager {
scmContext,
serviceManager);
scmSafeModeManager = new SCMSafeModeManager(
- conf, containers, pipelineManager, queue, serviceManager, scmContext);
+ conf, containers, null, pipelineManager, queue, serviceManager,
+ scmContext);
fail("testFailWithIncorrectValueForSafeModePercent");
} catch (IllegalArgumentException ex) {
GenericTestUtils.assertExceptionContains("value should be >= 0.0 and <=" +
@@ -367,7 +372,8 @@ public class TestSCMSafeModeManager {
}
scmSafeModeManager = new SCMSafeModeManager(
- conf, containers, pipelineManager, queue, serviceManager, scmContext);
+ conf, containers, null, pipelineManager, queue, serviceManager,
+ scmContext);
assertTrue(scmSafeModeManager.getInSafeMode());
testContainerThreshold(containers, 1.0);
@@ -478,7 +484,8 @@ public class TestSCMSafeModeManager {
conf.setBoolean(HddsConfigKeys.HDDS_SCM_SAFEMODE_ENABLED, false);
PipelineManager pipelineManager = Mockito.mock(PipelineManager.class);
scmSafeModeManager = new SCMSafeModeManager(
- conf, containers, pipelineManager, queue, serviceManager, scmContext);
+ conf, containers, null, pipelineManager, queue, serviceManager,
+ scmContext);
assertFalse(scmSafeModeManager.getInSafeMode());
}
@@ -510,7 +517,7 @@ public class TestSCMSafeModeManager {
}
scmSafeModeManager = new SCMSafeModeManager(
- config, containers, null, queue, serviceManager, scmContext);
+ config, containers, null, null, queue, serviceManager, scmContext);
assertTrue(scmSafeModeManager.getInSafeMode());
@@ -534,7 +541,8 @@ public class TestSCMSafeModeManager {
OzoneConfiguration conf = new OzoneConfiguration(config);
conf.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE, numOfDns);
scmSafeModeManager = new SCMSafeModeManager(
- conf, containers, null, queue, serviceManager, scmContext);
+ conf, containers, null, null, queue,
+ serviceManager, scmContext);
// Assert SCM is in Safe mode.
assertTrue(scmSafeModeManager.getInSafeMode());
@@ -609,7 +617,7 @@ public class TestSCMSafeModeManager {
MockRatisPipelineProvider.markPipelineHealthy(pipeline);
scmSafeModeManager = new SCMSafeModeManager(
- config, containers, pipelineManager, queue, serviceManager,
+ config, containers, null, pipelineManager, queue, serviceManager,
scmContext);
queue.fireEvent(SCMEvents.NODE_REGISTRATION_CONT_REPORT,
@@ -669,7 +677,8 @@ public class TestSCMSafeModeManager {
SafeModeEventHandler smHandler = new SafeModeEventHandler();
queue.addHandler(SCMEvents.SAFE_MODE_STATUS, smHandler);
scmSafeModeManager = new SCMSafeModeManager(
- config, containers, pipelineManager, queue, serviceManager, scmContext);
+ config, containers, null, pipelineManager, queue, serviceManager,
+ scmContext);
// Assert SCM is in Safe mode.
assertTrue(scmSafeModeManager.getInSafeMode());
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java
index b09859f..e9d6d6a 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/MiniOzoneClusterImpl.java
@@ -682,6 +682,10 @@ public class MiniOzoneClusterImpl implements MiniOzoneCluster {
// default max retry timeout set to 30s
scmClientConfig.setMaxRetryTimeout(30 * 1000);
conf.setFromObject(scmClientConfig);
+ // This way, safe mode exit will happen only when we have at least one
+ // pipeline.
+ conf.setInt(HddsConfigKeys.HDDS_SCM_SAFEMODE_MIN_DATANODE,
+ numOfDatanodes >=3 ? 3 : 1);
configureTrace();
}
@@ -723,6 +727,8 @@ public class MiniOzoneClusterImpl implements MiniOzoneCluster {
scmStore.setScmId(scmId.get());
scmStore.initialize();
if (SCMHAUtils.isSCMHAEnabled(conf)) {
+ scmStore.setSCMHAFlag(true);
+ scmStore.persistCurrentState();
SCMRatisServerImpl.initialize(clusterId, scmId.get(),
SCMHANodeDetails.loadSCMHAConfig(conf).getLocalNodeDetails(), conf);
}
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestSCMInstallSnapshotWithHA.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestSCMInstallSnapshotWithHA.java
index 18f9391..4223ae9 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestSCMInstallSnapshotWithHA.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/scm/TestSCMInstallSnapshotWithHA.java
@@ -93,6 +93,7 @@ public class TestSCMInstallSnapshotWithHA {
scmhaConfiguration.setRatisSnapshotThreshold(SNAPSHOT_THRESHOLD);
conf.setFromObject(scmhaConfiguration);
+
cluster = (MiniOzoneHAClusterImpl) MiniOzoneCluster.newHABuilder(conf)
.setClusterId(clusterId)
.setScmId(scmId)
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@ozone.apache.org
For additional commands, e-mail: commits-help@ozone.apache.org