You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by ed...@apache.org on 2019/06/13 20:59:50 UTC
[accumulo] branch 2.0 updated: Allow master start to block waiting
for a number of tservers (#1204)
This is an automated email from the ASF dual-hosted git repository.
edcoleman pushed a commit to branch 2.0
in repository https://gitbox.apache.org/repos/asf/accumulo.git
The following commit(s) were added to refs/heads/2.0 by this push:
new de9ba68 Allow master start to block waiting for a number of tservers (#1204)
de9ba68 is described below
commit de9ba68633a0d7e0ed55ee98533c82ab0b4bb12b
Author: EdColeman <de...@etcoleman.com>
AuthorDate: Thu Jun 13 16:59:45 2019 -0400
Allow master start to block waiting for a number of tservers (#1204)
Adds two parameters:
- MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT - sets desired number of tservers
- MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT - sets maximum time to wait.
This is the same change submitted as pull request #1158 for 1.9.x, merged to
2.0. Request #1158 will be available as a patch, but is not expected to be merged
because of semver requirements.
---
.../org/apache/accumulo/core/conf/Property.java | 13 ++-
.../java/org/apache/accumulo/master/Master.java | 92 ++++++++++++++++++++++
2 files changed, 104 insertions(+), 1 deletion(-)
diff --git a/core/src/main/java/org/apache/accumulo/core/conf/Property.java b/core/src/main/java/org/apache/accumulo/core/conf/Property.java
index 4c88785..69f9f63 100644
--- a/core/src/main/java/org/apache/accumulo/core/conf/Property.java
+++ b/core/src/main/java/org/apache/accumulo/core/conf/Property.java
@@ -285,7 +285,18 @@ public enum Property {
MASTER_METADATA_SUSPENDABLE("master.metadata.suspendable", "false", PropertyType.BOOLEAN,
"Allow tablets for the " + MetadataTable.NAME
+ " table to be suspended via table.suspend.duration."),
-
+ MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT("master.startup.tserver.avail.min.count", "0",
+ PropertyType.COUNT,
+ "Minimum number of tservers that need to be registered before master will "
+ + "start tablet assignment - checked at master initialization, when master gets lock. "
+ + "When set to 0 or less, no blocking occurs. Default is 0 (disabled) to keep original "
+ + "behaviour. Added with version 2.0"),
+ MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT("master.startup.tserver.avail.max.wait", "0",
+ PropertyType.TIMEDURATION,
+ "Maximum time master will wait for tserver available threshold "
+ + "to be reached before continuing. When set to 0 or less, will block "
+ + "indefinitely. Default is 0 to block indefinitely. Only valid when tserver available "
+ + "threshold is set greater than 0. Added with version 2.0"),
// properties that are specific to tablet server behavior
TSERV_PREFIX("tserver.", null, PropertyType.PREFIX,
"Properties in this category affect the behavior of the tablet servers"),
diff --git a/server/master/src/main/java/org/apache/accumulo/master/Master.java b/server/master/src/main/java/org/apache/accumulo/master/Master.java
index 0efd08d..b5921d6 100644
--- a/server/master/src/main/java/org/apache/accumulo/master/Master.java
+++ b/server/master/src/main/java/org/apache/accumulo/master/Master.java
@@ -77,6 +77,7 @@ import org.apache.accumulo.core.trace.TraceUtil;
import org.apache.accumulo.core.util.Daemon;
import org.apache.accumulo.fate.AgeOffStore;
import org.apache.accumulo.fate.Fate;
+import org.apache.accumulo.fate.util.Retry;
import org.apache.accumulo.fate.zookeeper.ZooLock;
import org.apache.accumulo.fate.zookeeper.ZooLock.LockLossReason;
import org.apache.accumulo.fate.zookeeper.ZooReaderWriter;
@@ -1021,6 +1022,12 @@ public class Master extends AbstractServer
tserverSet.startListeningForTabletServerChanges();
+ try {
+ blockForTservers();
+ } catch (InterruptedException ex) {
+ Thread.currentThread().interrupt();
+ }
+
ZooReaderWriter zReaderWriter = context.getZooReaderWriter();
try {
@@ -1207,6 +1214,91 @@ public class Master extends AbstractServer
log.info("exiting");
}
+ /**
+ * Allows property configuration to block master start-up waiting for a minimum number of tservers
+ * to register in zookeeper. It also accepts a maximum time to wait - if the time expires, the
+ * start-up will continue with any tservers available. This check is only performed at master
+ * initialization, when the master acquires the lock. The following properties are used to control
+ * the behaviour:
+ * <ul>
+ * <li>MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT - when set to 0 or less, no blocking occurs (default
+ * behaviour) otherwise will block until the number of tservers are available.</li>
+ * <li>MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT - maximum time to wait, read in milliseconds. When
+ * set to 0 or less, will block indefinitely.</li>
+ * </ul>
+ *
+ * @throws InterruptedException
+ * if interrupted while blocking, propagated for caller to handle.
+ */
+ private void blockForTservers() throws InterruptedException {
+
+ long waitStart = System.currentTimeMillis();
+
+ long minTserverCount =
+ getConfiguration().getCount(Property.MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT);
+
+ if (minTserverCount <= 0) {
+ log.info(
+ "tserver availability check disabled, continuing with {} servers. " + "To enable, set {}",
+ tserverSet.size(), Property.MASTER_STARTUP_TSERVER_AVAIL_MIN_COUNT.getKey());
+ return;
+ }
+
+ long maxWait =
+ getConfiguration().getTimeInMillis(Property.MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT);
+
+ if (maxWait <= 0) {
+ log.info("tserver availability check set to block indefinitely, To change, set {} > 0.",
+ Property.MASTER_STARTUP_TSERVER_AVAIL_MAX_WAIT.getKey());
+ maxWait = Long.MAX_VALUE;
+ }
+
+ // honor Retry condition that initial wait < max wait, otherwise use small value to allow thread
+ // yield to happen
+ long initialWait = Math.min(50, maxWait / 2);
+
+ Retry tserverRetry =
+ Retry.builder().infiniteRetries().retryAfter(initialWait, TimeUnit.MILLISECONDS)
+ .incrementBy(15_000, TimeUnit.MILLISECONDS).maxWait(maxWait, TimeUnit.MILLISECONDS)
+ .backOffFactor(1).logInterval(30_000, TimeUnit.MILLISECONDS).createRetry();
+
+ log.info("Checking for tserver availability - need to reach {} servers. Have {}",
+ minTserverCount, tserverSet.size());
+
+ boolean needTservers = tserverSet.size() < minTserverCount;
+
+ // The retry is built with infiniteRetries(), so canRetry() is not relied on to end the wait
+ // (NOTE(review): confirm against Retry semantics). The elapsed time is compared with maxWait
+ // explicitly; maxWait is Long.MAX_VALUE when configured to block indefinitely.
+ while (needTservers && tserverRetry.canRetry()
+ && System.currentTimeMillis() - waitStart < maxWait) {
+
+ tserverRetry.waitForNextAttempt();
+
+ needTservers = tserverSet.size() < minTserverCount;
+
+ // suppress last message once threshold reached.
+ if (needTservers) {
+ log.info(
+ "Blocking for tserver availability - need to reach {} servers. Have {}"
+ + " Time spent blocking {} sec.",
+ minTserverCount, tserverSet.size(),
+ TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - waitStart));
+ }
+ }
+
+ // arguments follow the message order: requested count first, then the count actually on line;
+ // the elapsed time is reported in seconds to match the toSeconds conversion.
+ if (tserverSet.size() < minTserverCount) {
+ log.warn(
+ "tserver availability check time expired - continuing. Requested {}, have {} tservers on line. "
+ + "Time waiting {} sec",
+ minTserverCount, tserverSet.size(),
+ TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - waitStart));
+
+ } else {
+ log.info(
+ "tserver availability check completed. Requested {}, have {} tservers on line. "
+ + "Time waiting {} sec",
+ minTserverCount, tserverSet.size(),
+ TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - waitStart));
+ }
+ }
+
private TServer setupReplication()
throws UnknownHostException, KeeperException, InterruptedException {
ServerContext context = getContext();