You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ozone.apache.org by sa...@apache.org on 2020/09/04 07:11:54 UTC

[hadoop-ozone] branch master updated: HDDS-4186: Adjust RetryPolicy of SCMConnectionManager for SCM/Recon (#1373)

This is an automated email from the ASF dual-hosted git repository.

sammichen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hadoop-ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 642d660  HDDS-4186: Adjust RetryPolicy of SCMConnectionManager for SCM/Recon (#1373)
642d660 is described below

commit 642d6602c417406485be6a09e29c099de88c854b
Author: GlenGeng <gl...@tencent.com>
AuthorDate: Fri Sep 4 15:11:41 2020 +0800

    HDDS-4186: Adjust RetryPolicy of SCMConnectionManager for SCM/Recon (#1373)
---
 .../main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java |  5 +++++
 hadoop-hdds/common/src/main/resources/ozone-default.xml     | 10 ++++++++++
 .../container/common/statemachine/SCMConnectionManager.java |  9 ++++++---
 .../java/org/apache/hadoop/hdds/utils/HddsServerUtil.java   | 13 +++++++++++++
 hadoop-ozone/dist/src/main/compose/testlib.sh               |  8 +++++---
 5 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
index 4e624c6..672b440 100644
--- a/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
+++ b/hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/scm/ScmConfigKeys.java
@@ -241,6 +241,11 @@ public final class ScmConfigKeys {
   public static final String OZONE_SCM_HEARTBEAT_RPC_TIMEOUT_DEFAULT =
       "1s";
 
+  public static final String OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT =
+      "ozone.scm.heartbeat.rpc-retry-count";
+  public static final int OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT =
+      15;
+
   /**
    * Defines how frequently we will log the missing of heartbeat to a specific
    * SCM. In the default case we will write a warning message for each 10
diff --git a/hadoop-hdds/common/src/main/resources/ozone-default.xml b/hadoop-hdds/common/src/main/resources/ozone-default.xml
index 5770448..f16ff3f 100644
--- a/hadoop-hdds/common/src/main/resources/ozone-default.xml
+++ b/hadoop-hdds/common/src/main/resources/ozone-default.xml
@@ -978,6 +978,16 @@
     </description>
   </property>
   <property>
+    <name>ozone.scm.heartbeat.rpc-retry-count</name>
+    <value>15</value>
+    <tag>OZONE, MANAGEMENT</tag>
+    <description>
+      Retry count for the RPC from Datanode to SCM. The retry interval is
+      fixed at 1s (it is not configurable). Make sure
+      rpc-retry-count * (rpc-timeout + retry interval)
+      is less than hdds.heartbeat.interval.
+    </description>
+  </property>
+  <property>
     <name>ozone.scm.heartbeat.thread.interval</name>
     <value>3s</value>
     <tag>OZONE, MANAGEMENT</tag>
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java
index ebc53c9..c7dd9c6 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/statemachine/SCMConnectionManager.java
@@ -46,6 +46,7 @@ import org.apache.hadoop.security.UserGroupInformation;
 
 import static java.util.Collections.unmodifiableList;
 import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcTimeOutInMilliseconds;
+import static org.apache.hadoop.hdds.utils.HddsServerUtil.getScmRpcRetryCount;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -149,7 +150,8 @@ public class SCMConnectionManager
           RPC.getProtocolVersion(StorageContainerDatanodeProtocolPB.class);
 
       RetryPolicy retryPolicy =
-          RetryPolicies.retryForeverWithFixedSleep(
+          RetryPolicies.retryUpToMaximumCountWithFixedSleep(
+              getScmRpcRetryCount(conf),
               1000, TimeUnit.MILLISECONDS);
 
       StorageContainerDatanodeProtocolPB rpcProxy = RPC.getProtocolProxy(
@@ -193,8 +195,9 @@ public class SCMConnectionManager
           RPC.getProtocolVersion(ReconDatanodeProtocolPB.class);
 
       RetryPolicy retryPolicy =
-          RetryPolicies.retryUpToMaximumCountWithFixedSleep(10,
-              60000, TimeUnit.MILLISECONDS);
+          RetryPolicies.retryUpToMaximumCountWithFixedSleep(
+              getScmRpcRetryCount(conf),
+              1000, TimeUnit.MILLISECONDS);
       ReconDatanodeProtocolPB rpcProxy = RPC.getProtocolProxy(
           ReconDatanodeProtocolPB.class, version,
           address, UserGroupInformation.getCurrentUser(), hadoopConfig,
diff --git a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java
index 8e7f326..13e08a1 100644
--- a/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java
+++ b/hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HddsServerUtil.java
@@ -65,6 +65,8 @@ import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_LOG_W
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_PROCESS_INTERVAL;
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_TIMEOUT;
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_TIMEOUT_DEFAULT;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT;
+import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT;
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
 import static org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL_DEFAULT;
 import static org.apache.hadoop.hdds.server.ServerUtils.sanitizeUserArgs;
@@ -325,6 +327,17 @@ public final class HddsServerUtil {
   }
 
   /**
+   * Max retry count of rpcProxy for EndpointStateMachine of SCM.
+   *
+   * @param conf - Ozone Config
+   * @return - Max retry count.
+   */
+  public static int getScmRpcRetryCount(ConfigurationSource conf) {
+    return conf.getInt(OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT,
+        OZONE_SCM_HEARTBEAT_RPC_RETRY_COUNT_DEFAULT);
+  }
+
+  /**
    * Log Warn interval.
    *
    * @param conf - Ozone Config
diff --git a/hadoop-ozone/dist/src/main/compose/testlib.sh b/hadoop-ozone/dist/src/main/compose/testlib.sh
index db449b9..b122479 100755
--- a/hadoop-ozone/dist/src/main/compose/testlib.sh
+++ b/hadoop-ozone/dist/src/main/compose/testlib.sh
@@ -60,7 +60,7 @@ find_tests(){
   echo $tests
 }
 
-## @description wait until safemode exit (or 180 seconds)
+## @description wait until safemode exit (or 240 seconds)
 wait_for_safemode_exit(){
   # version-dependent
   : ${OZONE_SAFEMODE_STATUS_COMMAND:=ozone admin safemode status --verbose}
@@ -68,8 +68,8 @@ wait_for_safemode_exit(){
   #Reset the timer
   SECONDS=0
 
-  #Don't give it up until 180 seconds
-  while [[ $SECONDS -lt 180 ]]; do
+  #Don't give up until 240 seconds
+  while [[ $SECONDS -lt 240 ]]; do
 
      #This line checks the safemode status in scm
      local command="${OZONE_SAFEMODE_STATUS_COMMAND}"
@@ -79,6 +79,8 @@ wait_for_safemode_exit(){
          status=$(docker-compose exec -T scm bash -c "$command")
      fi
 
+     echo "SECONDS: $SECONDS"
+
      echo $status
      if [[ "$status" ]]; then
        if [[ ${status} == "SCM is out of safe mode." ]]; then


---------------------------------------------------------------------
To unsubscribe, e-mail: ozone-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: ozone-commits-help@hadoop.apache.org