You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by sa...@apache.org on 2020/09/04 07:11:54 UTC
[hadoop-ozone] branch master updated: HDDS-4186: Adjust RetryPolicy
of SCMConnectionManager for SCM/Recon (#1373)
This is an automated email from the ASF dual-hosted git repository.
sammichen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 642d660 HDDS-4186: Adjust RetryPolicy of SCMConnectionManager for SCM/Recon (#1373)
642d660 is described below
commit 642d6602c417406485be6a09e29c099de88c854b
Author: GlenGeng <gl...@tencent.com>
AuthorDate: Fri Sep 4 15:11:41 2020 +0800
HDDS-4186: Adjust RetryPolicy of SCMConnectionManager for SCM/Recon (#1373)
---
.../main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java | 5 +++++
hadoop-hdds/common/src/main/resources/ozone-default.xml | 10 ++++++++++
.../container/common/statemachine/SCMConnectionManager.java | 9 ++++++---
.../java/org/apache/hadoop/hdds/utils/HddsServerUtil.java | 13 +++++++++++++
hadoop-ozone/dist/src/main/compose/testlib.sh | 8 +++++---
5 files changed, 39 insertions(+), 6 deletions(-)
diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
index 4e624c6..672b440 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
@@ -241,6 +241,11 @@ public final class ScmConfigKeys {
public static final String OZONE_SCM_HEARTBEAT_RPC_TIMEOUT_DEFAULT =
"1s";
+ public static final String OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT =
+ "ozone.scm.heartbeat.rpc-retry-count";
+ public static final int OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT =
+ 15;
+
/**
* Defines how frequently we will log the missing of heartbeat to a specific
* SCM. In the default case we will write a warning message for each 10
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 5770448..f16ff3f 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -978,6 +978,16 @@
</description>
</property>
<property>
+ <name>ozone.scm.heartbeat.rpc-retry-count</name>
+ <value>15</value>
+ <tag>OZONE, MANAGEMENT</tag>
+ <description>
+ Retry count for the RPC from Datanode to SCM or Recon. The interval between
+ retries is fixed at 1s. Make sure that rpc-retry-count *
+ (ozone.scm.heartbeat.rpc-timeout + 1s) is less than hdds.heartbeat.interval.
+ </description>
+ </property>
+ <property>
<name>ozone.scm.heartbeat.thread.interval</name>
<value>3s</value>
<tag>OZONE, MANAGEMENT</tag>
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java
index ebc53c9..c7dd9c6 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java
@@ -46,6 +46,7 @@ import org.apache.hadoop.security.UserGroupInformation;
import static java.util.Collections.unmodifiableList;
import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcTimeOutInMilliseconds;
+import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcRetryCount;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -149,7 +150,8 @@ public class SCMConnectionManager
RPC.getProtocolVersion(StorageContainerDatanodeProtocolPB.class);
RetryPolicy retryPolicy =
- RetryPolicies.retryForeverWithFixedSleep(
+ RetryPolicies.retryUpToMaximumCountWithFixedSleep(
+ getScmRpcRetryCount(conf),
1000, TimeUnit.MILLISECONDS);
StorageContainerDatanodeProtocolPB rpcProxy = RPC.getProtocolProxy(
@@ -193,8 +195,9 @@ public class SCMConnectionManager
RPC.getProtocolVersion(ReconDatanodeProtocolPB.class);
RetryPolicy retryPolicy =
- RetryPolicies.retryUpToMaximumCountWithFixedSleep(10,
- 60000, TimeUnit.MILLISECONDS);
+ RetryPolicies.retryUpToMaximumCountWithFixedSleep(
+ getScmRpcRetryCount(conf),
+ 1000, TimeUnit.MILLISECONDS);
ReconDatanodeProtocolPB rpcProxy = RPC.getProtocolProxy(
ReconDatanodeProtocolPB.class, version,
address, UserGroupInformation.getCurrentUser(), hadoopConfig,
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java
index 8e7f326..13e08a1 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java
@@ -65,6 +65,8 @@ import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_LOG_W
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_TIMEOUT;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_TIMEOUT_DEFAULT;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL_DEFAULT;
import static org.apache.hadoop.hdds.server.ServerUtils.sanitizeUserArgs;
@@ -325,6 +327,17 @@ public final class HddsServerUtil {
}
/**
+ * Maximum retry count of the RPC proxy used by the EndpointStateMachine for SCM and Recon.
+ *
+ * @param conf - Ozone Config
+ * @return - Max retry count.
+ */
+ public static int getScmRpcRetryCount(ConfigurationSource conf) {
+ return conf.getInt(OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT,
+ OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT);
+ }
+
+ /**
* Log Warn interval.
*
* @param conf - Ozone Config
diff --git a/hadoop-ozone/dist/src/main/compose/testlib.sh b/hadoop-ozone/dist/src/main/compose/testlib.sh
index db449b9..b122479 100755
--- a/hadoop-ozone/dist/src/main/compose/testlib.sh
+++ b/hadoop-ozone/dist/src/main/compose/testlib.sh
@@ -60,7 +60,7 @@ find_tests(){
echo $tests
}
-## @description wait until safemode exit (or 180 seconds)
+ ## @description wait until SCM exits safemode (or 240 seconds elapse)
wait_for_safemode_exit(){
# version-dependent
: ${OZONE_SAFEMODE_STATUS_COMMAND:=ozone admin safemode status --verbose}
@@ -68,8 +68,8 @@ wait_for_safemode_exit(){
#Reset the timer
SECONDS=0
- #Don't give it up until 180 seconds
- while [[ $SECONDS -lt 180 ]]; do
+ #Don't give up until 240 seconds have elapsed
+ while [[ $SECONDS -lt 240 ]]; do
#This line checks the safemode status in scm
local command="${OZONE_SAFEMODE_STATUS_COMMAND}"
@@ -79,6 +79,8 @@ wait_for_safemode_exit(){
status=$(docker-compose exec -T scm bash -c "$command")
fi
+ echo "SECONDS: $SECONDS"
+
echo $status
if [[ "$status" ]]; then
if [[ ${status} == "SCM is out of safe mode." ]]; then
---------------------------------------------------------------------
To unsubscribe, e-mail: ozone-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: ozone-commits-help@hadoop.apache.org