You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by an...@apache.org on 2018/11/12 21:36:37 UTC

zookeeper git commit: ZOOKEEPER-3071: Add a config parameter to control transaction log size

Repository: zookeeper
Updated Branches:
  refs/heads/master 477fa0724 -> fe25fed93


ZOOKEEPER-3071: Add a config parameter to control transaction log size

Author: Suyog Mapara <su...@fb.com>
Author: Suyog Mapara <su...@devvm30625.prn1.facebook.com>

Reviewers: breed@apache.org, andor@apache.org

Closes #567 from suyogmapara/ZOOKEEPER-3071 and squashes the following commits:

0a2899994 [Suyog Mapara] Addressing comments
9d5765181 [Suyog Mapara] Update documentation per comments.
5201329bc [Suyog Mapara] Updating documentation
6d2b72058 [Suyog Mapara] Addressing comments
9386e2b6c [Suyog Mapara] ZOOKEEPER-3071: Updating comment to include more details about the feature
5a28efd72 [Suyog Mapara] ZOOKEEPER-3071: Addressing comments and fixing incorrect merge.
1aa12f9d3 [Suyog Mapara] ZOOKEEPER-3071: Add a config parameter to control transaction log size


Project: http://git-wip-us.apache.org/repos/asf/zookeeper/repo
Commit: http://git-wip-us.apache.org/repos/asf/zookeeper/commit/fe25fed9
Tree: http://git-wip-us.apache.org/repos/asf/zookeeper/tree/fe25fed9
Diff: http://git-wip-us.apache.org/repos/asf/zookeeper/diff/fe25fed9

Branch: refs/heads/master
Commit: fe25fed9390a159b24f4c4fa31e3a7911f2c3b81
Parents: 477fa07
Author: Suyog Mapara <su...@fb.com>
Authored: Mon Nov 12 13:36:32 2018 -0800
Committer: Andor Molnar <an...@apache.org>
Committed: Mon Nov 12 13:36:32 2018 -0800

----------------------------------------------------------------------
 .../main/resources/markdown/zookeeperAdmin.md   |  19 +++
 .../server/persistence/FileTxnLog.java          |  52 ++++++++
 .../server/persistence/FileTxnLogTest.java      | 128 ++++++++++++++++++-
 3 files changed, 197 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/zookeeper/blob/fe25fed9/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
----------------------------------------------------------------------
diff --git a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
index cceaec3..bb7d792 100644
--- a/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
+++ b/zookeeper-docs/src/main/resources/markdown/zookeeperAdmin.md
@@ -618,6 +618,25 @@ property, when available, is noted below.
     reaches a runtime generated random value in the \[snapCount/2+1, snapCount]
     range.The default snapCount is 100,000.
 
+* *txnLogSizeLimitInKb* :
+    (Java system property: **zookeeper.txnLogSizeLimitInKb**)
+    Zookeeper transaction log file can also be controlled more
+    directly using txnLogSizeLimitInKb. Larger txn logs can lead to
+    slower follower syncs when sync is done using transaction log.
+    This is because leader has to scan through the appropriate log
+    file on disk to find the transaction to start sync from.
+    This feature is turned off by this default and snapCount is the
+    only value that limits transaction log size. When enabled
+    Zookeeper will roll the log when either of the limit is hit.
+    Please note that actual log size can exceed this value by the size
+    of the serialized transaction. On the other hand, if this value is
+    set too close to (or smaller than) **preAllocSize**,
+    it can cause Zookeeper to roll the log for every tranasaction. While
+    this is not a correctness issue, this may cause severly degraded
+    performance. To avoid this and to get most out of this feature, it is
+    recommended to set the value to N * **preAllocSize**
+    where N >= 2.
+
 * *maxClientCnxns* :
     (No Java system property)
     Limits the number of concurrent connections (at the socket

http://git-wip-us.apache.org/repos/asf/zookeeper/blob/fe25fed9/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
----------------------------------------------------------------------
diff --git a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
index 25ef4a0..d95dac8 100644
--- a/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
+++ b/zookeeper-server/src/main/java/org/apache/zookeeper/server/persistence/FileTxnLog.java
@@ -106,6 +106,21 @@ public class FileTxnLog implements TxnLog {
     /** Maximum time we allow for elapsed fsync before WARNing */
     private final static long fsyncWarningThresholdMS;
 
+    /**
+     * This parameter limit the size of each txnlog to a given limit (KB).
+     * It does not affect how often the system will take a snapshot [zookeeper.snapCount]
+     * We roll the txnlog when either of the two limits are reached.
+     * Also since we only roll the logs at transaction boundaries, actual file size can exceed
+     * this limit by the maximum size of a serialized transaction.
+     * The feature is disabled by default (-1)
+     */
+    private static final String txnLogSizeLimitSetting = "zookeeper.txnLogSizeLimitInKb";
+
+    /**
+     * The actual txnlog size limit in bytes.
+     */
+    private static long txnLogSizeLimit = -1;
+
     static {
         LOG = LoggerFactory.getLogger(FileTxnLog.class);
 
@@ -114,6 +129,15 @@ public class FileTxnLog implements TxnLog {
         if ((fsyncWarningThreshold = Long.getLong(ZOOKEEPER_FSYNC_WARNING_THRESHOLD_MS_PROPERTY)) == null)
             fsyncWarningThreshold = Long.getLong(FSYNC_WARNING_THRESHOLD_MS_PROPERTY, 1000);
         fsyncWarningThresholdMS = fsyncWarningThreshold;
+
+        Long logSize = Long.getLong(txnLogSizeLimitSetting, -1);
+        if (logSize > 0) {
+            LOG.info("{} = {}", txnLogSizeLimitSetting, logSize);
+
+            // Convert to bytes
+            logSize = logSize * 1024;
+            txnLogSizeLimit = logSize;
+        }
     }
 
     long lastZxidSeen;
@@ -161,6 +185,24 @@ public class FileTxnLog implements TxnLog {
     }
 
     /**
+     * Set log size limit
+     */
+    public static void setTxnLogSizeLimit(long size) {
+        txnLogSizeLimit = size;
+    }
+
+    /**
+     * Return the current on-disk size of log size. This will be accurate only
+     * after commit() is called. Otherwise, unflushed txns may not be included.
+     */
+    public synchronized long getCurrentLogSize() {
+        if (logFileWrite != null) {
+            return logFileWrite.length();
+        }
+        return 0;
+    }
+
+    /**
      * creates a checksum algorithm to be used
      * @return the checksum used for this txnlog
      */
@@ -352,6 +394,16 @@ public class FileTxnLog implements TxnLog {
         while (streamsToFlush.size() > 1) {
             streamsToFlush.removeFirst().close();
         }
+
+        // Roll the log file if we exceed the size limit
+        if(txnLogSizeLimit > 0) {
+            long logSize = getCurrentLogSize();
+
+            if (logSize > txnLogSizeLimit) {
+                LOG.debug("Log size limit reached: {}", logSize);
+                rollLog();
+            }
+        }
     }
 
     /**

http://git-wip-us.apache.org/repos/asf/zookeeper/blob/fe25fed9/zookeeper-server/src/test/java/org/apache/zookeeper/server/persistence/FileTxnLogTest.java
----------------------------------------------------------------------
diff --git a/zookeeper-server/src/test/java/org/apache/zookeeper/server/persistence/FileTxnLogTest.java b/zookeeper-server/src/test/java/org/apache/zookeeper/server/persistence/FileTxnLogTest.java
index 77b7210..e0d34ab 100644
--- a/zookeeper-server/src/test/java/org/apache/zookeeper/server/persistence/FileTxnLogTest.java
+++ b/zookeeper-server/src/test/java/org/apache/zookeeper/server/persistence/FileTxnLogTest.java
@@ -17,9 +17,13 @@
  */
 package org.apache.zookeeper.server.persistence;
 
-import org.apache.zookeeper.ZKTestCase;
-import org.apache.zookeeper.ZooDefs;
+import org.apache.zookeeper.*;
+import org.apache.zookeeper.data.Stat;
+import org.apache.zookeeper.proto.CreateRequest;
+import org.apache.zookeeper.server.ServerCnxnFactory;
 import org.apache.zookeeper.server.ServerStats;
+import org.apache.zookeeper.server.ZKDatabase;
+import org.apache.zookeeper.server.ZooKeeperServer;
 import org.apache.zookeeper.test.ClientBase;
 import org.apache.zookeeper.txn.CreateTxn;
 import org.apache.zookeeper.txn.TxnHeader;
@@ -31,6 +35,8 @@ import org.slf4j.LoggerFactory;
 import java.io.File;
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Random;
 
 import static org.hamcrest.core.Is.is;
 import static org.hamcrest.core.IsEqual.equalTo;
@@ -135,4 +141,122 @@ public class FileTxnLogTest  extends ZKTestCase {
       Assert.assertEquals((long) i + 1 , serverStats.getFsyncThresholdExceedCount());
     }
   }
+
+  private static String HOSTPORT = "127.0.0.1:" + PortAssignment.unique();
+  private static final int CONNECTION_TIMEOUT = 3000;
+
+  // Overhead is about 150 bytes for txn created in this test
+  private static final int NODE_SIZE = 1024;
+  private final long PREALLOCATE = 512;
+  private final long LOG_SIZE_LIMIT = 1024 * 4;
+
+  /**
+   * Test that log size get update correctly
+   */
+  @Test
+  public void testGetCurrentLogSize() throws Exception {
+    FileTxnLog.setTxnLogSizeLimit(-1);
+    File tmpDir = ClientBase.createTmpDir();
+    FileTxnLog log = new FileTxnLog(tmpDir);
+    FileTxnLog.setPreallocSize(PREALLOCATE);
+    CreateRequest record = new CreateRequest(null, new byte[NODE_SIZE],
+            ZooDefs.Ids.OPEN_ACL_UNSAFE, 0);
+    int zxid = 1;
+    for (int i = 0; i < 4; i++) {
+      log.append(new TxnHeader(0, 0, zxid++, 0, 0), record);
+      LOG.debug("Current log size: " + log.getCurrentLogSize());
+    }
+    log.commit();
+    LOG.info("Current log size: " + log.getCurrentLogSize());
+    Assert.assertTrue(log.getCurrentLogSize() > (zxid - 1) * NODE_SIZE);
+    for (int i = 0; i < 4; i++) {
+      log.append(new TxnHeader(0, 0, zxid++, 0, 0), record);
+      LOG.debug("Current log size: " + log.getCurrentLogSize());
+    }
+    log.commit();
+    LOG.info("Current log size: " + log.getCurrentLogSize());
+    Assert.assertTrue(log.getCurrentLogSize() > (zxid - 1) * NODE_SIZE);
+  }
+
+  /**
+   * Test that the server can correctly load the data when there are multiple
+   * txnlogs per snapshot
+   */
+  @Test
+  public void testLogSizeLimit() throws Exception {
+    File tmpDir = ClientBase.createTmpDir();
+    ClientBase.setupTestEnv();
+
+    // Need to override preallocate set by setupTestEnv()
+    // We don't need to unset these values since each unit test run in
+    // a separate JVM instance
+    FileTxnLog.setPreallocSize(PREALLOCATE);
+    FileTxnLog.setTxnLogSizeLimit(LOG_SIZE_LIMIT);
+
+    ZooKeeperServer zks = new ZooKeeperServer(tmpDir, tmpDir, 3000);
+    final int PORT = Integer.parseInt(HOSTPORT.split(":")[1]);
+    ServerCnxnFactory f = ServerCnxnFactory.createFactory(PORT, -1);
+    f.startup(zks);
+    Assert.assertTrue("waiting for server being up ",
+            ClientBase.waitForServerUp(HOSTPORT, CONNECTION_TIMEOUT));
+    ZooKeeper zk = new ZooKeeper(HOSTPORT, CONNECTION_TIMEOUT, event -> { });
+
+    // Generate transactions
+    HashSet<Long> zxids = new HashSet<Long>();
+    byte[] bytes = new byte[NODE_SIZE];
+    Random random = new Random();
+    random.nextBytes(bytes);
+
+    // We will create enough txn to generate 3 logs
+    long txnCount = LOG_SIZE_LIMIT / NODE_SIZE / 2 * 5;
+
+    LOG.info("Creating " + txnCount + " txns");
+
+    try {
+      for (long i = 0; i < txnCount; i++) {
+        Stat stat = new Stat();
+        zk.create("/node-" + i, bytes, ZooDefs.Ids.OPEN_ACL_UNSAFE,
+                CreateMode.PERSISTENT);
+        zk.getData("/node-" + i, null, stat);
+        zxids.add(stat.getCzxid());
+      }
+
+    } finally {
+      zk.close();
+    }
+
+    // shutdown
+    f.shutdown();
+    Assert.assertTrue("waiting for server to shutdown",
+            ClientBase.waitForServerDown(HOSTPORT, CONNECTION_TIMEOUT));
+
+    File logDir = new File(tmpDir, FileTxnSnapLog.version + FileTxnSnapLog.VERSION);
+    File[] txnLogs = FileTxnLog.getLogFiles(logDir.listFiles(), 0);
+
+    Assert.assertEquals("Unexpected number of logs", 3, txnLogs.length);
+
+    // Log size should not exceed limit by more than one node size;
+    long threshold = LOG_SIZE_LIMIT + NODE_SIZE;
+    LOG.info(txnLogs[0].getAbsolutePath());
+    Assert.assertTrue(
+            "Exceed log size limit: " + txnLogs[0].length(),
+            threshold > txnLogs[0].length());
+    LOG.info(txnLogs[1].getAbsolutePath());
+    Assert.assertTrue(
+            "Exceed log size limit " + txnLogs[1].length(),
+            threshold > txnLogs[1].length());
+
+    // Start database only
+    zks = new ZooKeeperServer(tmpDir, tmpDir, 3000);
+    zks.startdata();
+
+    ZKDatabase db = zks.getZKDatabase();
+
+    for (long i = 0; i < txnCount; i++) {
+      Stat stat = new Stat();
+      byte[] data = db.getData("/node-" + i, stat, null);
+      Assert.assertArrayEquals("Missmatch data", bytes, data);
+      Assert.assertTrue("Unknown zxid ", zxids.contains(stat.getMzxid()));
+    }
+  }
 }