You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by an...@apache.org on 2018/11/12 21:36:37 UTC
zookeeper git commit: ZOOKEEPER-3071: Add a config parameter to
control transaction log size
Repository: zookeeper
Updated Branches:
refs/heads/master 477fa0724 -> fe25fed93
ZOOKEEPER-3071: Add a config parameter to control transaction log size
Author: Suyog Mapara <su...@fb.com>
Author: Suyog Mapara <su...@devvm30625.prn1.facebook.com>
Reviewers: breed@apache.org, andor@apache.org
Closes #567 from suyogmapara/ZOOKEEPER-3071 and squashes the following commits:
0a2899994 [Suyog Mapara] Addressing comments
9d5765181 [Suyog Mapara] Update documentation per comments.
5201329bc [Suyog Mapara] Updating documentation
6d2b72058 [Suyog Mapara] Addressing comments
9386e2b6c [Suyog Mapara] ZOOKEEPER-3071: Updating comment to include more details about the feature
5a28efd72 [Suyog Mapara] ZOOKEEPER-3071: Addressing comments and fixing incorrect merge.
1aa12f9d3 [Suyog Mapara] ZOOKEEPER-3071: Add a config parameter to control transaction log size
Project: http://git-wip-us.apache.org/repos/asf/zookeeper/repo
Commit: http://git-wip-us.apache.org/repos/asf/zookeeper/commit/fe25fed9
Tree: http://git-wip-us.apache.org/repos/asf/zookeeper/tree/fe25fed9
Diff: http://git-wip-us.apache.org/repos/asf/zookeeper/diff/fe25fed9
Branch: refs/heads/master
Commit: fe25fed9390a159b24f4c4fa31e3a7911f2c3b81
Parents: 477fa07
Author: Suyog Mapara <su...@fb.com>
Authored: Mon Nov 12 13:36:32 2018 -0800
Committer: Andor Molnar <an...@apache.org>
Committed: Mon Nov 12 13:36:32 2018 -0800
----------------------------------------------------------------------
.../main/resources/markdown/zookeeperAdmin.md | 19 +++
.../server/persistence/FileTxnLog.java | 52 ++++++++
.../server/persistence/FileTxnLogTest.java | 128 ++++++++++++++++++-
3 files changed, 197 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/zookeeper/blob/fe25fed9/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
----------------------------------------------------------------------
diff --git a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
index cceaec3..bb7d792 100644
--- a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
+++ b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
@@ -618,6 +618,25 @@ property, when available, is noted below.
reaches a runtime generated random value in the \[snapCount/2+1, snapCount]
range.The default snapCount is 100,000.
+* *txnLogSizeLimitInKb* :
+ (Java system property: **zookeeper.txnLogSizeLimitInKb**)
+ Zookeeper transaction log file can also be controlled more
+ directly using txnLogSizeLimitInKb. Larger txn logs can lead to
+ slower follower syncs when sync is done using transaction log.
+ This is because leader has to scan through the appropriate log
+ file on disk to find the transaction to start sync from.
+ This feature is turned off by this default and snapCount is the
+ only value that limits transaction log size. When enabled
+ Zookeeper will roll the log when either of the limit is hit.
+ Please note that actual log size can exceed this value by the size
+ of the serialized transaction. On the other hand, if this value is
+ set too close to (or smaller than) **preAllocSize**,
+ it can cause Zookeeper to roll the log for every tranasaction. While
+ this is not a correctness issue, this may cause severly degraded
+ performance. To avoid this and to get most out of this feature, it is
+ recommended to set the value to N * **preAllocSize**
+ where N >= 2.
+
* *maxClientCnxns* :
(No Java system property)
Limits the number of concurrent connections (at the socket
http://git-wip-us.apache.org/repos/asf/zookeeper/blob/fe25fed9/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
----------------------------------------------------------------------
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
index 25ef4a0..d95dac8 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
@@ -106,6 +106,21 @@ public class FileTxnLog implements TxnLog {
/** Maximum time we allow for elapsed fsync before WARNing */
private final static long fsyncWarningThresholdMS;
+ /**
+ * This parameter limit the size of each txnlog to a given limit (KB).
+ * It does not affect how often the system will take a snapshot [zookeeper.snapCount]
+ * We roll the txnlog when either of the two limits are reached.
+ * Also since we only roll the logs at transaction boundaries, actual file size can exceed
+ * this limit by the maximum size of a serialized transaction.
+ * The feature is disabled by default (-1)
+ */
+ private static final String txnLogSizeLimitSetting = "zookeeper.txnLogSizeLimitInKb";
+
+ /**
+ * The actual txnlog size limit in bytes.
+ */
+ private static long txnLogSizeLimit = -1;
+
static {
LOG = LoggerFactory.getLogger(FileTxnLog.class);
@@ -114,6 +129,15 @@ public class FileTxnLog implements TxnLog {
if ((fsyncWarningThreshold = Long.getLong(ZOOKEEPER_FSYNC_WARNING_THRESHOLD_MS_PROPERTY)) == null)
fsyncWarningThreshold = Long.getLong(FSYNC_WARNING_THRESHOLD_MS_PROPERTY, 1000);
fsyncWarningThresholdMS = fsyncWarningThreshold;
+
+ Long logSize = Long.getLong(txnLogSizeLimitSetting, -1);
+ if (logSize > 0) {
+ LOG.info("{} = {}", txnLogSizeLimitSetting, logSize);
+
+ // Convert to bytes
+ logSize = logSize * 1024;
+ txnLogSizeLimit = logSize;
+ }
}
long lastZxidSeen;
@@ -161,6 +185,24 @@ public class FileTxnLog implements TxnLog {
}
/**
+ * Set log size limit
+ */
+ public static void setTxnLogSizeLimit(long size) {
+ txnLogSizeLimit = size;
+ }
+
+ /**
+ * Return the current on-disk size of log size. This will be accurate only
+ * after commit() is called. Otherwise, unflushed txns may not be included.
+ */
+ public synchronized long getCurrentLogSize() {
+ if (logFileWrite != null) {
+ return logFileWrite.length();
+ }
+ return 0;
+ }
+
+ /**
* creates a checksum algorithm to be used
* @return the checksum used for this txnlog
*/
@@ -352,6 +394,16 @@ public class FileTxnLog implements TxnLog {
while (streamsToFlush.size() > 1) {
streamsToFlush.removeFirst().close();
}
+
+ // Roll the log file if we exceed the size limit
+ if(txnLogSizeLimit > 0) {
+ long logSize = getCurrentLogSize();
+
+ if (logSize > txnLogSizeLimit) {
+ LOG.debug("Log size limit reached: {}", logSize);
+ rollLog();
+ }
+ }
}
/**
http://git-wip-us.apache.org/repos/asf/zookeeper/blob/fe25fed9/zookeeper-server/src/test/java/org/apache/zookeeper/server/persistence/FileTxnLogTest.java
----------------------------------------------------------------------
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/persistence/FileTxnLogTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/persistence/FileTxnLogTest.java
index 77b7210..e0d34ab 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/persistence/FileTxnLogTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/persistence/FileTxnLogTest.java
@@ -17,9 +17,13 @@
*/
package org.apache.zookeeper.server.persistence;
-import org.apache.zookeeper.ZKTestCase;
-import org.apache.zookeeper.ZooDefs;
+import org.apache.zookeeper.*;
+import org.apache.zookeeper.data.Stat;
+import org.apache.zookeeper.proto.CreateRequest;
+import org.apache.zookeeper.server.ServerCnxnFactory;
import org.apache.zookeeper.server.ServerStats;
+import org.apache.zookeeper.server.ZKDatabase;
+import org.apache.zookeeper.server.ZooKeeperServer;
import org.apache.zookeeper.test.ClientBase;
import org.apache.zookeeper.txn.CreateTxn;
import org.apache.zookeeper.txn.TxnHeader;
@@ -31,6 +35,8 @@ import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Random;
import static org.hamcrest.core.Is.is;
import static org.hamcrest.core.IsEqual.equalTo;
@@ -135,4 +141,122 @@ public class FileTxnLogTest extends ZKTestCase {
Assert.assertEquals((long) i + 1 , serverStats.getFsyncThresholdExceedCount());
}
}
+
+ private static String HOSTPORT = "127.0.0.1:" + PortAssignment.unique();
+ private static final int CONNECTION_TIMEOUT = 3000;
+
+ // Overhead is about 150 bytes for txn created in this test
+ private static final int NODE_SIZE = 1024;
+ private final long PREALLOCATE = 512;
+ private final long LOG_SIZE_LIMIT = 1024 * 4;
+
+ /**
+ * Test that log size get update correctly
+ */
+ @Test
+ public void testGetCurrentLogSize() throws Exception {
+ FileTxnLog.setTxnLogSizeLimit(-1);
+ File tmpDir = ClientBase.createTmpDir();
+ FileTxnLog log = new FileTxnLog(tmpDir);
+ FileTxnLog.setPreallocSize(PREALLOCATE);
+ CreateRequest record = new CreateRequest(null, new byte[NODE_SIZE],
+ ZooDefs.Ids.OPEN_ACL_UNSAFE, 0);
+ int zxid = 1;
+ for (int i = 0; i < 4; i++) {
+ log.append(new TxnHeader(0, 0, zxid++, 0, 0), record);
+ LOG.debug("Current log size: " + log.getCurrentLogSize());
+ }
+ log.commit();
+ LOG.info("Current log size: " + log.getCurrentLogSize());
+ Assert.assertTrue(log.getCurrentLogSize() > (zxid - 1) * NODE_SIZE);
+ for (int i = 0; i < 4; i++) {
+ log.append(new TxnHeader(0, 0, zxid++, 0, 0), record);
+ LOG.debug("Current log size: " + log.getCurrentLogSize());
+ }
+ log.commit();
+ LOG.info("Current log size: " + log.getCurrentLogSize());
+ Assert.assertTrue(log.getCurrentLogSize() > (zxid - 1) * NODE_SIZE);
+ }
+
+ /**
+ * Test that the server can correctly load the data when there are multiple
+ * txnlogs per snapshot
+ */
+ @Test
+ public void testLogSizeLimit() throws Exception {
+ File tmpDir = ClientBase.createTmpDir();
+ ClientBase.setupTestEnv();
+
+ // Need to override preallocate set by setupTestEnv()
+ // We don't need to unset these values since each unit test run in
+ // a separate JVM instance
+ FileTxnLog.setPreallocSize(PREALLOCATE);
+ FileTxnLog.setTxnLogSizeLimit(LOG_SIZE_LIMIT);
+
+ ZooKeeperServer zks = new ZooKeeperServer(tmpDir, tmpDir, 3000);
+ final int PORT = Integer.parseInt(HOSTPORT.split(":")[1]);
+ ServerCnxnFactory f = ServerCnxnFactory.createFactory(PORT, -1);
+ f.startup(zks);
+ Assert.assertTrue("waiting for server being up ",
+ ClientBase.waitForServerUp(HOSTPORT, CONNECTION_TIMEOUT));
+ ZooKeeper zk = new ZooKeeper(HOSTPORT, CONNECTION_TIMEOUT, event -> { });
+
+ // Generate transactions
+ HashSet<Long> zxids = new HashSet<Long>();
+ byte[] bytes = new byte[NODE_SIZE];
+ Random random = new Random();
+ random.nextBytes(bytes);
+
+ // We will create enough txn to generate 3 logs
+ long txnCount = LOG_SIZE_LIMIT / NODE_SIZE / 2 * 5;
+
+ LOG.info("Creating " + txnCount + " txns");
+
+ try {
+ for (long i = 0; i < txnCount; i++) {
+ Stat stat = new Stat();
+ zk.create("/node-" + i, bytes, ZooDefs.Ids.OPEN_ACL_UNSAFE,
+ CreateMode.PERSISTENT);
+ zk.getData("/node-" + i, null, stat);
+ zxids.add(stat.getCzxid());
+ }
+
+ } finally {
+ zk.close();
+ }
+
+ // shutdown
+ f.shutdown();
+ Assert.assertTrue("waiting for server to shutdown",
+ ClientBase.waitForServerDown(HOSTPORT, CONNECTION_TIMEOUT));
+
+ File logDir = new File(tmpDir, FileTxnSnapLog.version + FileTxnSnapLog.VERSION);
+ File[] txnLogs = FileTxnLog.getLogFiles(logDir.listFiles(), 0);
+
+ Assert.assertEquals("Unexpected number of logs", 3, txnLogs.length);
+
+ // Log size should not exceed limit by more than one node size;
+ long threshold = LOG_SIZE_LIMIT + NODE_SIZE;
+ LOG.info(txnLogs[0].getAbsolutePath());
+ Assert.assertTrue(
+ "Exceed log size limit: " + txnLogs[0].length(),
+ threshold > txnLogs[0].length());
+ LOG.info(txnLogs[1].getAbsolutePath());
+ Assert.assertTrue(
+ "Exceed log size limit " + txnLogs[1].length(),
+ threshold > txnLogs[1].length());
+
+ // Start database only
+ zks = new ZooKeeperServer(tmpDir, tmpDir, 3000);
+ zks.startdata();
+
+ ZKDatabase db = zks.getZKDatabase();
+
+ for (long i = 0; i < txnCount; i++) {
+ Stat stat = new Stat();
+ byte[] data = db.getData("/node-" + i, stat, null);
+ Assert.assertArrayEquals("Missmatch data", bytes, data);
+ Assert.assertTrue("Unknown zxid ", zxids.contains(stat.getMzxid()));
+ }
+ }
}