You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@ozone.apache.org by GitBox <gi...@apache.org> on 2021/05/19 09:58:49 UTC

[GitHub] [ozone] adoroszlai commented on a change in pull request #2266: HDDS-5246. Wait for ever to obtain CA list which is needed during OM/DN startup

adoroszlai commented on a change in pull request #2266:
URL: https://github.com/apache/ozone/pull/2266#discussion_r635084557



##########
File path: hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
##########
@@ -413,47 +418,44 @@ public static File getMetaDir(DBDefinition definition,
     return caCertPemList;
   }
 
+  /**
+   * Retry for ever until CA list matches expected count.
+   * @param task - task to get CA list.
+   * @return CA list.
+   * @throws IOException
+   */
+  private static List<String> getCAListWithRetry(Callable task,

Review comment:
       ```suggestion
     private static List<String> getCAListWithRetry(Callable<List<String>> task,
   ```

##########
File path: hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
##########
@@ -413,47 +418,44 @@ public static File getMetaDir(DBDefinition definition,
     return caCertPemList;
   }
 
+  /**
+   * Retry for ever until CA list matches expected count.
+   * @param task - task to get CA list.
+   * @return CA list.
+   * @throws IOException
+   */
+  private static List<String> getCAListWithRetry(Callable task,
+      long waitDuration) throws IOException {
+    RetryPolicy retryPolicy = RetryPolicies.retryForeverWithFixedSleep(
+        waitDuration, TimeUnit.SECONDS);
+    RetriableTask<List<String>> retriableTask =
+        new RetriableTask(retryPolicy, "getCAList", task);

Review comment:
       ```suggestion
           new RetriableTask<>(retryPolicy, "getCAList", task);
   ```

##########
File path: hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
##########
@@ -413,47 +418,44 @@ public static File getMetaDir(DBDefinition definition,
     return caCertPemList;
   }
 
+  /**
+   * Retry for ever until CA list matches expected count.
+   * @param task - task to get CA list.
+   * @return CA list.
+   * @throws IOException
+   */
+  private static List<String> getCAListWithRetry(Callable task,
+      long waitDuration) throws IOException {
+    RetryPolicy retryPolicy = RetryPolicies.retryForeverWithFixedSleep(
+        waitDuration, TimeUnit.SECONDS);
+    RetriableTask<List<String>> retriableTask =
+        new RetriableTask(retryPolicy, "getCAList", task);
+    try {
+      return retriableTask.call();
+    } catch (Exception ex) {
+      throw new SCMSecurityException("Unable to obtain complete CA " +
+          "list", ex);
+    }
+  }
+
   private static List<String> waitForCACerts(
       final SupplierWithIOException<List<String>> applyFunction,
       int expectedCount) throws IOException {
-    //TODO: make wait time and sleep time configurable if needed.
     // TODO: If SCMs are bootstrapped later, then listCA need to be
     //  refetched if listCA size is less than scm ha config node list size.
     // For now when Client of SCM's are started we compare their node list
     // size and ca list size if it is as expected, we return the ca list.
     boolean caListUpToDate;
-    long waitTime = 5 * 60 * 1000L;
-    long retryTime = 10 * 1000L;
-    long currentTime = Time.monotonicNow();
     List<String> caCertPemList;
-    do {
-      caCertPemList = applyFunction.get();
-      caListUpToDate =
-          caCertPemList.size() == expectedCount ? true : false;
-      if (!caListUpToDate) {
-        LOG.info("Expected CA list size {}, where as received CA List size " +
-            "{}. Retry to fetch CA List after {} seconds", expectedCount,
-            caCertPemList.size(), retryTime);
-        try {
-          Thread.sleep(retryTime);
-        } catch (InterruptedException ex) {
-          Thread.currentThread().interrupt();
-        }
-      }
-    } while (!caListUpToDate &&
-        Time.monotonicNow() - currentTime < waitTime);
-    return caCertPemList;
-  }
-
-
-  private static void checkCertCount(int certCount, int expectedCount)
-      throws SCMSecurityException{
-    if (certCount != expectedCount) {
-      LOG.error("Unable to obtain CA list for SCM cluster, obtained CA list " +
-              "size is {}, where as expected list size is {}",
-          certCount, expectedCount);
-      throw new SCMSecurityException("Unable to obtain complete CA list");
+    caCertPemList = applyFunction.get();
+    caListUpToDate = caCertPemList.size() == expectedCount ? true : false;

Review comment:
       ```suggestion
       caListUpToDate = caCertPemList.size() == expectedCount;
   ```

##########
File path: hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
##########
@@ -413,47 +418,44 @@ public static File getMetaDir(DBDefinition definition,
     return caCertPemList;
   }
 
+  /**
+   * Retry for ever until CA list matches expected count.
+   * @param task - task to get CA list.
+   * @return CA list.
+   * @throws IOException
+   */
+  private static List<String> getCAListWithRetry(Callable task,
+      long waitDuration) throws IOException {
+    RetryPolicy retryPolicy = RetryPolicies.retryForeverWithFixedSleep(
+        waitDuration, TimeUnit.SECONDS);
+    RetriableTask<List<String>> retriableTask =
+        new RetriableTask(retryPolicy, "getCAList", task);
+    try {
+      return retriableTask.call();
+    } catch (Exception ex) {
+      throw new SCMSecurityException("Unable to obtain complete CA " +
+          "list", ex);
+    }
+  }
+
   private static List<String> waitForCACerts(
       final SupplierWithIOException<List<String>> applyFunction,
       int expectedCount) throws IOException {
-    //TODO: make wait time and sleep time configurable if needed.
     // TODO: If SCMs are bootstrapped later, then listCA need to be
     //  refetched if listCA size is less than scm ha config node list size.
     // For now when Client of SCM's are started we compare their node list
     // size and ca list size if it is as expected, we return the ca list.
     boolean caListUpToDate;

Review comment:
       I think we can inline this variable (i.e. remove the variable and put the condition directly in `if (...)`).

##########
File path: hadoop-hdds/common/src/main/resources/ozone-default.xml
##########
@@ -2841,4 +2841,18 @@
     </description>
   </property>
 
+
+  <property>
+    <name>ozone.scm.ca.list.retry.wait.duration</name>
+    <tag>OZONE, SCM, OM, DATANODE</tag>
+    <value>10s</value>
+    <description>SCM client wait duration between each retry to get Scm CA
+      list. OM/Datanode obtain CA list during startup, and wait
+      for the CA List size to be matched with SCM node count size plus
+      1.(Additional one certificate is root CA certificate). If the received

Review comment:
       ```suggestion
         1. (Additional one certificate is root CA certificate). If the received
   ```

##########
File path: hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
##########
@@ -413,47 +418,44 @@ public static File getMetaDir(DBDefinition definition,
     return caCertPemList;
   }
 
+  /**
+   * Retry for ever until CA list matches expected count.
+   * @param task - task to get CA list.
+   * @return CA list.
+   * @throws IOException
+   */
+  private static List<String> getCAListWithRetry(Callable task,
+      long waitDuration) throws IOException {
+    RetryPolicy retryPolicy = RetryPolicies.retryForeverWithFixedSleep(
+        waitDuration, TimeUnit.SECONDS);
+    RetriableTask<List<String>> retriableTask =
+        new RetriableTask(retryPolicy, "getCAList", task);
+    try {
+      return retriableTask.call();
+    } catch (Exception ex) {
+      throw new SCMSecurityException("Unable to obtain complete CA " +
+          "list", ex);
+    }
+  }
+
   private static List<String> waitForCACerts(
       final SupplierWithIOException<List<String>> applyFunction,
       int expectedCount) throws IOException {
-    //TODO: make wait time and sleep time configurable if needed.
     // TODO: If SCMs are bootstrapped later, then listCA need to be
     //  refetched if listCA size is less than scm ha config node list size.
     // For now when Client of SCM's are started we compare their node list
     // size and ca list size if it is as expected, we return the ca list.
     boolean caListUpToDate;
-    long waitTime = 5 * 60 * 1000L;
-    long retryTime = 10 * 1000L;
-    long currentTime = Time.monotonicNow();
     List<String> caCertPemList;
-    do {
-      caCertPemList = applyFunction.get();
-      caListUpToDate =
-          caCertPemList.size() == expectedCount ? true : false;
-      if (!caListUpToDate) {
-        LOG.info("Expected CA list size {}, where as received CA List size " +
-            "{}. Retry to fetch CA List after {} seconds", expectedCount,
-            caCertPemList.size(), retryTime);
-        try {
-          Thread.sleep(retryTime);
-        } catch (InterruptedException ex) {
-          Thread.currentThread().interrupt();
-        }
-      }
-    } while (!caListUpToDate &&
-        Time.monotonicNow() - currentTime < waitTime);
-    return caCertPemList;
-  }
-
-
-  private static void checkCertCount(int certCount, int expectedCount)
-      throws SCMSecurityException{
-    if (certCount != expectedCount) {
-      LOG.error("Unable to obtain CA list for SCM cluster, obtained CA list " +
-              "size is {}, where as expected list size is {}",
-          certCount, expectedCount);
-      throw new SCMSecurityException("Unable to obtain complete CA list");
+    caCertPemList = applyFunction.get();

Review comment:
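       Join the declaration and assignment of `caCertPemList` into a single statement, i.e. `List<String> caCertPemList = applyFunction.get();`.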

##########
File path: hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
##########
@@ -413,47 +418,44 @@ public static File getMetaDir(DBDefinition definition,
     return caCertPemList;
   }
 
+  /**
+   * Retry for ever until CA list matches expected count.
+   * @param task - task to get CA list.
+   * @return CA list.
+   * @throws IOException

Review comment:
       ```suggestion
   ```

##########
File path: hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
##########
@@ -401,10 +407,9 @@ public static File getMetaDir(DBDefinition definition,
         Collection<String> scmNodes = SCMHAUtils.getSCMNodeIds(configuration);
         int expectedCount = scmNodes.size() + 1;
         if (scmNodes.size() > 1) {
-          caCertPemList = waitForCACerts(
+          caCertPemList = getCAListWithRetry(() -> waitForCACerts(
               () -> scmSecurityProtocolClient.listCACertificate(),

Review comment:
       ```suggestion
                 scmSecurityProtocolClient::listCACertificate,
   ```

##########
File path: hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/utils/HAUtils.java
##########
@@ -377,9 +383,9 @@ public static File getMetaDir(DBDefinition definition,
           if (caCertPemList != null && caCertPemList.size() == expectedCount) {
             return caCertPemList;
           }
-          caCertPemList = waitForCACerts(() -> certClient.updateCAList(),
-              expectedCount);
-          checkCertCount(caCertPemList.size(), expectedCount);
+          caCertPemList = getCAListWithRetry(() ->
+              waitForCACerts(() -> certClient.updateCAList(), expectedCount),

Review comment:
       ```suggestion
                 waitForCACerts(certClient::updateCAList, expectedCount),
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@ozone.apache.org
For additional commands, e-mail: issues-help@ozone.apache.org