You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by ed...@apache.org on 2019/06/13 20:59:50 UTC

[accumulo] branch 2.0 updated: Allow master start to block waiting for a number of tservers (#1204)

This is an automated email from the ASF dual-hosted git repository.

edcoleman pushed a commit to branch 2.0
in repository https://gitbox.apache.org/repos/asf/accumulo.git


The following commit(s) were added to refs/heads/2.0 by this push:
     new de9ba68  Allow master start to block waiting for a number of tservers (#1204)
de9ba68 is described below

commit de9ba68633a0d7e0ed55ee98533c82ab0b4bb12b
Author: EdColeman <de...@etcoleman.com>
AuthorDate: Thu Jun 13 16:59:45 2019 -0400

    Allow master start to block waiting for a number of tservers (#1204)
    
    Adds two parameters:
    
     - MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT - sets desired number of tservers
     - MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT - sets maximum time to wait.
    
    This is the same change submitted as pull request #1158 for 1.9.x, merged to
    2.0.  Request #1158 will be available as a patch, but is not expected to be merged
    because of semver requirements.
---
 .../org/apache/accumulo/core/conf/Property.java    | 13 ++-
 .../java/org/apache/accumulo/master/Master.java    | 92 ++++++++++++++++++++++
 2 files changed, 104 insertions(+), 1 deletion(-)

diff --git a/core/src/main/java/org/apache/accumulo/core/conf/Property.java b/core/src/main/java/org/apache/accumulo/core/conf/Property.java
index 4c88785..69f9f63 100644
--- a/core/src/main/java/org/apache/accumulo/core/conf/Property.java
+++ b/core/src/main/java/org/apache/accumulo/core/conf/Property.java
@@ -285,7 +285,18 @@ public enum Property {
   MASTER_METADATA_SUSPENDABLE("master.metadata.suspendable", "false", PropertyType.BOOLEAN,
       "Allow tablets for the " + MetadataTable.NAME
           + " table to be suspended via table.suspend.duration."),
-
+  MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT("master.startup.tserver.avail.min.count", "0",
+      PropertyType.COUNT,
+      "Minimum number of tservers that need to be registered before master will "
+          + "start tablet assignment - checked at master initialization, when master gets lock. "
+          + "When set to 0 or less, no blocking occurs. Default is 0 (disabled) to keep original "
+          + "behaviour. Added with version 2.0"),
+  MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT("master.startup.tserver.avail.max.wait", "0",
+      PropertyType.TIMEDURATION,
+      "Maximum time master will wait for tserver available threshold "
+          + "to be reached before continuing. When set to 0 or less, will block "
+          + "indefinitely. Default is 0 to block indefinitely. Only valid when tserver available "
+          + "threshold is set greater than 0. Added with version 2.0"),
   // properties that are specific to tablet server behavior
   TSERV_PREFIX("tserver.", null, PropertyType.PREFIX,
       "Properties in this category affect the behavior of the tablet servers"),
diff --git a/server/master/src/main/java/org/apache/accumulo/master/Master.java b/server/master/src/main/java/org/apache/accumulo/master/Master.java
index 0efd08d..b5921d6 100644
--- a/server/master/src/main/java/org/apache/accumulo/master/Master.java
+++ b/server/master/src/main/java/org/apache/accumulo/master/Master.java
@@ -77,6 +77,7 @@ import org.apache.accumulo.core.trace.TraceUtil;
 import org.apache.accumulo.core.util.Daemon;
 import org.apache.accumulo.fate.AgeOffStore;
 import org.apache.accumulo.fate.Fate;
+import org.apache.accumulo.fate.util.Retry;
 import org.apache.accumulo.fate.zookeeper.ZooLock;
 import org.apache.accumulo.fate.zookeeper.ZooLock.LockLossReason;
 import org.apache.accumulo.fate.zookeeper.ZooReaderWriter;
@@ -1021,6 +1022,12 @@ public class Master extends AbstractServer
 
     tserverSet.startListeningForTabletServerChanges();
 
+    try {
+      blockForTservers();
+    } catch (InterruptedException ex) {
+      Thread.currentThread().interrupt();
+    }
+
     ZooReaderWriter zReaderWriter = context.getZooReaderWriter();
 
     try {
@@ -1207,6 +1214,91 @@ public class Master extends AbstractServer
     log.info("exiting");
   }
 
+  /**
+   * Allows property configuration to block master start-up waiting for a minimum number of tservers
+   * to register in zookeeper. It also accepts a maximum time to wait - if the time expires, the
+   * start-up will continue with any tservers available. This check is only performed at master
+   * initialization, when the master acquires the lock. The following properties are used to control
+   * the behaviour:
+   * <ul>
+   * <li>MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT - when set to 0 or less, no blocking occurs (default
+   * behaviour) otherwise will block until the number of tservers are available.</li>
+   * <li>MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT - maximum total time to block. When set to 0 or
+   * less, will block indefinitely.</li>
+   * </ul>
+   *
+   * @throws InterruptedException
+   *           if interrupted while blocking, propagated for caller to handle.
+   */
+  private void blockForTservers() throws InterruptedException {
+
+    long waitStart = System.currentTimeMillis();
+
+    long minTserverCount =
+        getConfiguration().getCount(Property.MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT);
+
+    if (minTserverCount <= 0) {
+      log.info(
+          "tserver availability check disabled, continuing with {} servers. To enable, set {}",
+          tserverSet.size(), Property.MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT.getKey());
+      return;
+    }
+
+    long maxWait =
+        getConfiguration().getTimeInMillis(Property.MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT);
+
+    if (maxWait <= 0) {
+      log.info("tserver availability check set to block indefinitely, To change, set {} > 0.",
+          Property.MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT.getKey());
+      maxWait = Long.MAX_VALUE;
+    }
+
+    // honor Retry condition that initial wait < max wait, otherwise use small value to allow thread
+    // yield to happen
+    long initialWait = Math.min(50, maxWait / 2);
+
+    Retry tserverRetry =
+        Retry.builder().infiniteRetries().retryAfter(initialWait, TimeUnit.MILLISECONDS)
+            .incrementBy(15_000, TimeUnit.MILLISECONDS).maxWait(maxWait, TimeUnit.MILLISECONDS)
+            .backOffFactor(1).logInterval(30_000, TimeUnit.MILLISECONDS).createRetry();
+
+    log.info("Checking for tserver availability - need to reach {} servers. Have {}",
+        minTserverCount, tserverSet.size());
+
+    boolean needTservers = tserverSet.size() < minTserverCount;
+
+    // retry is built with infiniteRetries(), so enforce the maxWait deadline explicitly
+    while (needTservers && tserverRetry.canRetry()
+        && System.currentTimeMillis() - waitStart < maxWait) {
+      tserverRetry.waitForNextAttempt();
+      needTservers = tserverSet.size() < minTserverCount;
+
+      // suppress last message once threshold reached.
+      if (needTservers) {
+        log.info(
+            "Blocking for tserver availability - need to reach {} servers. Have {}."
+                + " Time spent blocking {} sec.",
+            minTserverCount, tserverSet.size(),
+            TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - waitStart));
+      }
+    }
+
+    if (tserverSet.size() < minTserverCount) {
+      log.warn(
+          "tserver availability check time expired - continuing. Requested {}, have {} tservers on line. "
+              + "Time waiting {} sec",
+          minTserverCount, tserverSet.size(),
+          TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - waitStart));
+
+    } else {
+      log.info(
+          "tserver availability check completed. Requested {}, have {} tservers on line. "
+              + "Time waiting {} sec",
+          minTserverCount, tserverSet.size(),
+          TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - waitStart));
+    }
+  }
+
   private TServer setupReplication()
       throws UnknownHostException, KeeperException, InterruptedException {
     ServerContext context = getContext();