You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@ozone.apache.org by GitBox <gi...@apache.org> on 2020/06/09 16:47:23 UTC

[GitHub] [hadoop-ozone] bshashikant commented on a change in pull request #1005: HDDS-3350. Ozone Retry Policy Improvements.

bshashikant commented on a change in pull request #1005:
URL: https://github.com/apache/hadoop-ozone/pull/1005#discussion_r437566605



##########
File path: hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java
##########
@@ -269,23 +282,76 @@ static GrpcTlsConfig createTlsClientConfig(SecurityConfig conf,
     return tlsConfig;
   }
 
-  static RetryPolicy createRetryPolicy(ConfigurationSource conf) {
-    int maxRetryCount =
-        conf.getInt(OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY,
+  public static RetryPolicy createRetryPolicy(ConfigurationSource conf) {
+    ExponentialBackoffRetry exponentialBackoffRetry =
+        createExponentialBackoffPolicy(conf);
+    MultipleLinearRandomRetry multipleLinearRandomRetry =
+        MultipleLinearRandomRetry.parseCommaSeparated(conf.get(
+            OzoneConfigKeys.DFS_RATIS_CLIENT_MULTILINEAR_RANDOM_RETRY_POLICY,
             OzoneConfigKeys.
-                DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_DEFAULT);
-    long retryInterval = conf.getTimeDuration(OzoneConfigKeys.
-        DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY, OzoneConfigKeys.
-        DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_DEFAULT
-        .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
-    TimeDuration sleepDuration =
-        TimeDuration.valueOf(retryInterval, TimeUnit.MILLISECONDS);
-    RetryPolicy retryPolicy = RetryPolicies
-        .retryUpToMaximumCountWithFixedSleep(maxRetryCount, sleepDuration);
-    return retryPolicy;
+                DFS_RATIS_CLIENT_MULTILINEAR_RANDOM_RETRY_POLICY_DEFAULT));
+
+    long writeTimeout = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_WRITE_TIMEOUT, OzoneConfigKeys.
+            DFS_RATIS_CLIENT_REQUEST_WRITE_TIMEOUT_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+    long watchTimeout = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_WATCH_TIMEOUT, OzoneConfigKeys.
+            DFS_RATIS_CLIENT_REQUEST_WATCH_TIMEOUT_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+
+    return RequestTypeDependentRetryPolicy.newBuilder()
+        .setRetryPolicy(RaftProtos.RaftClientRequestProto.TypeCase.WRITE,
+            createExceptionDependentPolicy(exponentialBackoffRetry,
+                multipleLinearRandomRetry, exponentialBackoffRetry))
+        .setRetryPolicy(RaftProtos.RaftClientRequestProto.TypeCase.WATCH,
+            createExceptionDependentPolicy(exponentialBackoffRetry,
+                multipleLinearRandomRetry, RetryPolicies.noRetry()))
+        .setTimeout(RaftProtos.RaftClientRequestProto.TypeCase.WRITE,
+            TimeDuration.valueOf(writeTimeout, TimeUnit.MILLISECONDS))
+        .setTimeout(RaftProtos.RaftClientRequestProto.TypeCase.WATCH,
+            TimeDuration.valueOf(watchTimeout, TimeUnit.MILLISECONDS))
+        .build();
+  }
+
+  private static ExponentialBackoffRetry createExponentialBackoffPolicy(
+      ConfigurationSource conf) {
+    long exponentialBaseSleep = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_BASE_SLEEP,
+        OzoneConfigKeys.DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_BASE_SLEEP_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+    long exponentialMaxSleep = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_MAX_SLEEP,
+        OzoneConfigKeys.
+            DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_MAX_SLEEP_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+    return ExponentialBackoffRetry.newBuilder()
+        .setBaseSleepTime(
+            TimeDuration.valueOf(exponentialBaseSleep, TimeUnit.MILLISECONDS))
+        .setMaxSleepTime(
+            TimeDuration.valueOf(exponentialMaxSleep, TimeUnit.MILLISECONDS))
+        .build();
+  }
+
+  private static ExceptionDependentRetry createExceptionDependentPolicy(
+      ExponentialBackoffRetry exponentialBackoffRetry,
+      MultipleLinearRandomRetry multipleLinearRandomRetry,

Review comment:
       Instead of explicitly adding no retry for specific exception , can we define a static list wof exceptions for which there will be no retry and iterate here. ?
   

##########
File path: hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java
##########
@@ -269,23 +282,76 @@ static GrpcTlsConfig createTlsClientConfig(SecurityConfig conf,
     return tlsConfig;
   }
 
-  static RetryPolicy createRetryPolicy(ConfigurationSource conf) {
-    int maxRetryCount =
-        conf.getInt(OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY,
+  public static RetryPolicy createRetryPolicy(ConfigurationSource conf) {
+    ExponentialBackoffRetry exponentialBackoffRetry =
+        createExponentialBackoffPolicy(conf);
+    MultipleLinearRandomRetry multipleLinearRandomRetry =
+        MultipleLinearRandomRetry.parseCommaSeparated(conf.get(
+            OzoneConfigKeys.DFS_RATIS_CLIENT_MULTILINEAR_RANDOM_RETRY_POLICY,
             OzoneConfigKeys.
-                DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_DEFAULT);
-    long retryInterval = conf.getTimeDuration(OzoneConfigKeys.
-        DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY, OzoneConfigKeys.
-        DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_DEFAULT
-        .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
-    TimeDuration sleepDuration =
-        TimeDuration.valueOf(retryInterval, TimeUnit.MILLISECONDS);
-    RetryPolicy retryPolicy = RetryPolicies
-        .retryUpToMaximumCountWithFixedSleep(maxRetryCount, sleepDuration);
-    return retryPolicy;
+                DFS_RATIS_CLIENT_MULTILINEAR_RANDOM_RETRY_POLICY_DEFAULT));
+
+    long writeTimeout = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_WRITE_TIMEOUT, OzoneConfigKeys.
+            DFS_RATIS_CLIENT_REQUEST_WRITE_TIMEOUT_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+    long watchTimeout = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_WATCH_TIMEOUT, OzoneConfigKeys.
+            DFS_RATIS_CLIENT_REQUEST_WATCH_TIMEOUT_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+
+    return RequestTypeDependentRetryPolicy.newBuilder()
+        .setRetryPolicy(RaftProtos.RaftClientRequestProto.TypeCase.WRITE,
+            createExceptionDependentPolicy(exponentialBackoffRetry,
+                multipleLinearRandomRetry, exponentialBackoffRetry))
+        .setRetryPolicy(RaftProtos.RaftClientRequestProto.TypeCase.WATCH,
+            createExceptionDependentPolicy(exponentialBackoffRetry,
+                multipleLinearRandomRetry, RetryPolicies.noRetry()))
+        .setTimeout(RaftProtos.RaftClientRequestProto.TypeCase.WRITE,
+            TimeDuration.valueOf(writeTimeout, TimeUnit.MILLISECONDS))
+        .setTimeout(RaftProtos.RaftClientRequestProto.TypeCase.WATCH,
+            TimeDuration.valueOf(watchTimeout, TimeUnit.MILLISECONDS))
+        .build();
+  }
+
+  private static ExponentialBackoffRetry createExponentialBackoffPolicy(
+      ConfigurationSource conf) {
+    long exponentialBaseSleep = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_BASE_SLEEP,
+        OzoneConfigKeys.DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_BASE_SLEEP_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+    long exponentialMaxSleep = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_MAX_SLEEP,
+        OzoneConfigKeys.
+            DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_MAX_SLEEP_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+    return ExponentialBackoffRetry.newBuilder()
+        .setBaseSleepTime(
+            TimeDuration.valueOf(exponentialBaseSleep, TimeUnit.MILLISECONDS))
+        .setMaxSleepTime(
+            TimeDuration.valueOf(exponentialMaxSleep, TimeUnit.MILLISECONDS))
+        .build();
+  }
+
+  private static ExceptionDependentRetry createExceptionDependentPolicy(
+      ExponentialBackoffRetry exponentialBackoffRetry,
+      MultipleLinearRandomRetry multipleLinearRandomRetry,

Review comment:
       I think we need to add AlreadyClosedException to the list of no retry here( as this can be generated from ratis server as well). Also, for RfatLogIOException , the policy should be of no retry.

##########
File path: hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java
##########
@@ -269,23 +282,76 @@ static GrpcTlsConfig createTlsClientConfig(SecurityConfig conf,
     return tlsConfig;
   }
 

Review comment:
       can we add a table defining the exception no retry policy relationship for better understanding?

##########
File path: hadoop-hdds/common/src/main/java/org/apache/hadoop/ozone/OzoneConfigKeys.java
##########
@@ -281,15 +281,33 @@
 
   public static final String DFS_CONTAINER_RATIS_DATANODE_STORAGE_DIR =
       "dfs.container.ratis.datanode.storage.dir";
-  public static final String DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY =
-      ScmConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY;
-  public static final int DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_DEFAULT =
-      ScmConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_DEFAULT;
-  public static final String DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY =
-      ScmConfigKeys.DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY;
+

Review comment:
       i guess it would be better to define these configs in RatisClientConfig instead of defining here.

##########
File path: hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java
##########
@@ -269,23 +282,76 @@ static GrpcTlsConfig createTlsClientConfig(SecurityConfig conf,
     return tlsConfig;
   }
 
-  static RetryPolicy createRetryPolicy(ConfigurationSource conf) {
-    int maxRetryCount =
-        conf.getInt(OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_KEY,
+  public static RetryPolicy createRetryPolicy(ConfigurationSource conf) {
+    ExponentialBackoffRetry exponentialBackoffRetry =
+        createExponentialBackoffPolicy(conf);
+    MultipleLinearRandomRetry multipleLinearRandomRetry =
+        MultipleLinearRandomRetry.parseCommaSeparated(conf.get(
+            OzoneConfigKeys.DFS_RATIS_CLIENT_MULTILINEAR_RANDOM_RETRY_POLICY,
             OzoneConfigKeys.
-                DFS_RATIS_CLIENT_REQUEST_MAX_RETRIES_DEFAULT);
-    long retryInterval = conf.getTimeDuration(OzoneConfigKeys.
-        DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_KEY, OzoneConfigKeys.
-        DFS_RATIS_CLIENT_REQUEST_RETRY_INTERVAL_DEFAULT
-        .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
-    TimeDuration sleepDuration =
-        TimeDuration.valueOf(retryInterval, TimeUnit.MILLISECONDS);
-    RetryPolicy retryPolicy = RetryPolicies
-        .retryUpToMaximumCountWithFixedSleep(maxRetryCount, sleepDuration);
-    return retryPolicy;
+                DFS_RATIS_CLIENT_MULTILINEAR_RANDOM_RETRY_POLICY_DEFAULT));
+
+    long writeTimeout = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_WRITE_TIMEOUT, OzoneConfigKeys.
+            DFS_RATIS_CLIENT_REQUEST_WRITE_TIMEOUT_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+    long watchTimeout = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_REQUEST_WATCH_TIMEOUT, OzoneConfigKeys.
+            DFS_RATIS_CLIENT_REQUEST_WATCH_TIMEOUT_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+
+    return RequestTypeDependentRetryPolicy.newBuilder()
+        .setRetryPolicy(RaftProtos.RaftClientRequestProto.TypeCase.WRITE,
+            createExceptionDependentPolicy(exponentialBackoffRetry,
+                multipleLinearRandomRetry, exponentialBackoffRetry))
+        .setRetryPolicy(RaftProtos.RaftClientRequestProto.TypeCase.WATCH,
+            createExceptionDependentPolicy(exponentialBackoffRetry,
+                multipleLinearRandomRetry, RetryPolicies.noRetry()))
+        .setTimeout(RaftProtos.RaftClientRequestProto.TypeCase.WRITE,
+            TimeDuration.valueOf(writeTimeout, TimeUnit.MILLISECONDS))
+        .setTimeout(RaftProtos.RaftClientRequestProto.TypeCase.WATCH,
+            TimeDuration.valueOf(watchTimeout, TimeUnit.MILLISECONDS))
+        .build();
+  }
+
+  private static ExponentialBackoffRetry createExponentialBackoffPolicy(
+      ConfigurationSource conf) {
+    long exponentialBaseSleep = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_BASE_SLEEP,
+        OzoneConfigKeys.DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_BASE_SLEEP_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+    long exponentialMaxSleep = conf.getTimeDuration(
+        OzoneConfigKeys.DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_MAX_SLEEP,
+        OzoneConfigKeys.
+            DFS_RATIS_CLIENT_EXPONENTIAL_BACKOFF_MAX_SLEEP_DEFAULT
+            .toIntExact(TimeUnit.MILLISECONDS), TimeUnit.MILLISECONDS);
+    return ExponentialBackoffRetry.newBuilder()
+        .setBaseSleepTime(
+            TimeDuration.valueOf(exponentialBaseSleep, TimeUnit.MILLISECONDS))
+        .setMaxSleepTime(
+            TimeDuration.valueOf(exponentialMaxSleep, TimeUnit.MILLISECONDS))
+        .build();
+  }
+
+  private static ExceptionDependentRetry createExceptionDependentPolicy(
+      ExponentialBackoffRetry exponentialBackoffRetry,
+      MultipleLinearRandomRetry multipleLinearRandomRetry,

Review comment:
       Instead of explicitly adding no retry for specific exception , can we define a static list of exceptions for which there will be no retry and iterate here. ?
   

##########
File path: hadoop-hdds/common/src/main/java/org/apache/hadoop/hdds/ratis/RatisHelper.java
##########
@@ -269,23 +282,76 @@ static GrpcTlsConfig createTlsClientConfig(SecurityConfig conf,
     return tlsConfig;
   }
 

Review comment:
       can we add a table defining the exception to retry policy relationship for better understanding?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: ozone-issues-unsubscribe@hadoop.apache.org
For additional commands, e-mail: ozone-issues-help@hadoop.apache.org