You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bookkeeper.apache.org by hs...@apache.org on 2021/04/19 16:42:30 UTC

[bookkeeper] branch master updated: Rocksdb tombstones

This is an automated email from the ASF dual-hosted git repository.

hsaputra pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/bookkeeper.git


The following commit(s) were added to refs/heads/master by this push:
     new 874a38b  Rocksdb tombstones
874a38b is described below

commit 874a38b738773ff873f127a0e63fafd7a4ca4299
Author: mauricebarnum <ma...@gmail.com>
AuthorDate: Mon Apr 19 09:42:22 2021 -0700

    Rocksdb tombstones
    
    ### Motivation
    
    After deleting many ledgers, seeking to the end of the RocksDB metadata can take a long time and trigger timeouts upstream. This change addresses that by improving the seek logic and by compacting out tombstones in situations where many entries have just been deleted. It affects both the entry location index and the ledger metadata index.
    
    
    Reviewers: Andrey Yegorov, Prashant Kumar
    
    This closes #2686 from mauricebarnum/rocksdb-tombstones and squashes the following commits:
    
    c02996853 [Maurice Barnum] Partially compact RocksDB after removing deleted ledgers
    0be68a1d0 [Maurice Barnum] KeyValueStorageRocksDB bounded key iterator: use RocksDB upper bound
    0c5ef8f53 [Maurice Barnum] KeyValueStorageRocksDB.getFloor() - reimplement to avoid two seeks
    114cc73b4 [Maurice Barnum] update rocksdb to 6.16.4
    f489a056d [Surinder Singh] Fix test cases using gradle build (#2689)
    9256dbab3 [Lari Hotari] Add current ip address, long hostname and short hostname to /etc/hosts (#2688)
    f1a1f1219 [Surinder Singh] staging and setting up vote for release candidates (#2681)
    2dd4afecd [Prashant Kumar] ISSUE-2640: BP-43: Gradle integration with RAT plugin (#2683)
    79768fee1 [Jack Vanlightly] ISSUE #2615: Fix for invalid ensemble issue during ledger recovery
    646e59089 [Prashant Kumar] ISSUE #2640: BP-43 integrate gradle javadoc plugin
    d70153f76 [hangc0276] Update documentation with default value for openLedgerRereplicationGracePeriod config option
---
 .../src/main/resources/LICENSE-all.bin.txt         |  6 +--
 .../src/main/resources/LICENSE-server.bin.txt      |  6 +--
 .../bookie/storage/ldb/EntryLocationIndex.java     |  9 +++-
 .../bookie/storage/ldb/KeyValueStorage.java        | 10 +++++
 .../bookie/storage/ldb/KeyValueStorageRocksDB.java | 51 ++++++++++------------
 .../bookie/storage/ldb/LedgerMetadataIndex.java    |  8 +++-
 pom.xml                                            |  2 +-
 7 files changed, 53 insertions(+), 39 deletions(-)

diff --git a/bookkeeper-dist/src/main/resources/LICENSE-all.bin.txt b/bookkeeper-dist/src/main/resources/LICENSE-all.bin.txt
index 7273c24..627ad63 100644
--- a/bookkeeper-dist/src/main/resources/LICENSE-all.bin.txt
+++ b/bookkeeper-dist/src/main/resources/LICENSE-all.bin.txt
@@ -253,7 +253,7 @@ Apache Software License, Version 2.
 - lib/org.eclipse.jetty-jetty-server-9.4.33.v20201020.jar [22]
 - lib/org.eclipse.jetty-jetty-servlet-9.4.33.v20201020.jar [22]
 - lib/org.eclipse.jetty-jetty-util-9.4.33.v20201020.jar [22]
-- lib/org.rocksdb-rocksdbjni-6.10.2.jar [23]
+- lib/org.rocksdb-rocksdbjni-6.16.4.jar [23]
 - lib/com.beust-jcommander-1.78.jar [24]
 - lib/com.yahoo.datasketches-memory-0.8.3.jar [25]
 - lib/com.yahoo.datasketches-sketches-core-0.8.3.jar [25]
@@ -327,7 +327,7 @@ Apache Software License, Version 2.
 [20] Source available at https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=shortlog;h=refs/tags/LANG_3_6
 [21] Source available at https://github.com/apache/zookeeper/tree/release-3.6.2
 [22] Source available at https://github.com/eclipse/jetty.project/tree/jetty-9.4.33.v20201020
-[23] Source available at https://github.com/facebook/rocksdb/tree/v5.13.1
+[23] Source available at https://github.com/facebook/rocksdb/tree/v6.16.4
 [24] Source available at https://github.com/cbeust/jcommander/tree/1.78
 [25] Source available at https://github.com/DataSketches/sketches-core/tree/sketches-0.8.3
 [26] Source available at https://github.com/lz4/lz4-java/tree/1.3.0
@@ -584,7 +584,7 @@ This private header is also used by Apple's open source
     * http://www.opensource.apple.com/source/configd/configd-453.19/dnsinfo/dnsinfo.h
 
 ------------------------------------------------------------------------------------
-lib/org.rocksdb-rocksdbjni-6.10.2.jar is derived from leveldb, which is under the following license.
+lib/org.rocksdb-rocksdbjni-6.16.4.jar is derived from leveldb, which is under the following license.
 
 Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 
diff --git a/bookkeeper-dist/src/main/resources/LICENSE-server.bin.txt b/bookkeeper-dist/src/main/resources/LICENSE-server.bin.txt
index 4311a94..61bf0b8 100644
--- a/bookkeeper-dist/src/main/resources/LICENSE-server.bin.txt
+++ b/bookkeeper-dist/src/main/resources/LICENSE-server.bin.txt
@@ -253,7 +253,7 @@ Apache Software License, Version 2.
 - lib/org.eclipse.jetty-jetty-server-9.4.33.v20201020.jar [22]
 - lib/org.eclipse.jetty-jetty-servlet-9.4.33.v20201020.jar [22]
 - lib/org.eclipse.jetty-jetty-util-9.4.33.v20201020.jar [22]
-- lib/org.rocksdb-rocksdbjni-6.10.2.jar [23]
+- lib/org.rocksdb-rocksdbjni-6.16.4.jar [23]
 - lib/com.beust-jcommander-1.78.jar [24]
 - lib/com.yahoo.datasketches-memory-0.8.3.jar [25]
 - lib/com.yahoo.datasketches-sketches-core-0.8.3.jar [25]
@@ -325,7 +325,7 @@ Apache Software License, Version 2.
 [20] Source available at https://git-wip-us.apache.org/repos/asf?p=commons-lang.git;a=shortlog;h=refs/tags/LANG_3_6
 [21] Source available at https://github.com/apache/zookeeper/tree/release-3.6.2
 [22] Source available at https://github.com/eclipse/jetty.project/tree/jetty-9.4.33.v20201020
-[23] Source available at https://github.com/facebook/rocksdb/tree/v5.13.1
+[23] Source available at https://github.com/facebook/rocksdb/tree/v6.16.4
 [24] Source available at https://github.com/cbeust/jcommander/tree/1.78
 [25] Source available at https://github.com/DataSketches/sketches-core/tree/sketches-0.8.3
 [26] Source available at https://github.com/lz4/lz4-java/tree/1.3.0
@@ -582,7 +582,7 @@ This private header is also used by Apple's open source
     * http://www.opensource.apple.com/source/configd/configd-453.19/dnsinfo/dnsinfo.h
 
 ------------------------------------------------------------------------------------
-lib/org.rocksdb-rocksdbjni-6.10.2.jar is derived from leveldb, which is under the following license.
+lib/org.rocksdb-rocksdbjni-6.16.4.jar is derived from leveldb, which is under the following license.
 
 Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/EntryLocationIndex.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/EntryLocationIndex.java
index 6b01c50..f60b8e4 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/EntryLocationIndex.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/EntryLocationIndex.java
@@ -196,6 +196,7 @@ public class EntryLocationIndex implements Closeable {
         long deletedEntriesInBatch = 0;
 
         Batch batch = locationsDb.newBatch();
+        final byte[] firstDeletedKey = new byte[keyToDelete.array.length];
 
         try {
             for (long ledgerId : ledgersToDelete) {
@@ -238,7 +239,9 @@ public class EntryLocationIndex implements Closeable {
                     }
                     batch.remove(keyToDelete.array);
                     ++deletedEntriesInBatch;
-                    ++deletedEntries;
+                    if (deletedEntries++ == 0) {
+                        System.arraycopy(keyToDelete.array, 0, firstDeletedKey, 0, firstDeletedKey.length);
+                    }
                 }
 
                 if (deletedEntriesInBatch > DELETE_ENTRIES_BATCH_SIZE) {
@@ -251,8 +254,10 @@ public class EntryLocationIndex implements Closeable {
             try {
                 batch.flush();
                 batch.clear();
+                if (deletedEntries != 0) {
+                    locationsDb.compact(firstDeletedKey, keyToDelete.array);
+                }
             } finally {
-
                 firstKeyWrapper.recycle();
                 lastKeyWrapper.recycle();
                 keyToDelete.recycle();
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorage.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorage.java
index aa0119a..27f987c 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorage.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorage.java
@@ -97,6 +97,16 @@ public interface KeyValueStorage extends Closeable {
     void delete(byte[] key) throws IOException;
 
     /**
+     * Compact storage within a specified range.
+     *
+     * @param firstKey
+     *            the first key in the range (included)
+     * @param lastKey
+     *            the last key in the range (not included)
+     */
+    default void compact(byte[] firstKey, byte[] lastKey) throws IOException {}
+
+    /**
      * Get an iterator over to scan sequentially through all the keys in the
      * database.
      *
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorageRocksDB.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorageRocksDB.java
index 6a71009..2856d92 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorageRocksDB.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/KeyValueStorageRocksDB.java
@@ -22,14 +22,11 @@ package org.apache.bookkeeper.bookie.storage.ldb;
 
 import static com.google.common.base.Preconditions.checkState;
 
-import com.google.common.primitives.UnsignedBytes;
-
 //CHECKSTYLE.OFF: IllegalImport
 import io.netty.util.internal.PlatformDependent;
 //CHECKSTYLE.ON: IllegalImport
 
 import java.io.IOException;
-import java.util.Comparator;
 import java.util.Map.Entry;
 import java.util.concurrent.TimeUnit;
 
@@ -47,6 +44,7 @@ import org.rocksdb.ReadOptions;
 import org.rocksdb.RocksDB;
 import org.rocksdb.RocksDBException;
 import org.rocksdb.RocksIterator;
+import org.rocksdb.Slice;
 import org.rocksdb.WriteBatch;
 import org.rocksdb.WriteOptions;
 import org.slf4j.Logger;
@@ -239,31 +237,15 @@ public class KeyValueStorageRocksDB implements KeyValueStorage {
 
     @Override
     public Entry<byte[], byte[]> getFloor(byte[] key) throws IOException {
-        try (RocksIterator iterator = db.newIterator(optionCache)) {
-            // Position the iterator on the record whose key is >= to the supplied key
-            iterator.seek(key);
-
-            if (!iterator.isValid()) {
-                // There are no entries >= key
-                iterator.seekToLast();
-                if (iterator.isValid()) {
-                    return new EntryWrapper(iterator.key(), iterator.value());
-                } else {
-                    // Db is empty
-                    return null;
-                }
-            }
-
-            iterator.prev();
-
-            if (!iterator.isValid()) {
-                // Iterator is on the 1st entry of the db and this entry key is >= to the target
-                // key
-                return null;
-            } else {
+        try (Slice upperBound = new Slice(key);
+                 ReadOptions option = new ReadOptions(optionCache).setIterateUpperBound(upperBound);
+                 RocksIterator iterator = db.newIterator(option)) {
+            iterator.seekToLast();
+            if (iterator.isValid()) {
                 return new EntryWrapper(iterator.key(), iterator.value());
             }
         }
+        return null;
     }
 
     @Override
@@ -290,6 +272,15 @@ public class KeyValueStorageRocksDB implements KeyValueStorage {
     }
 
     @Override
+    public void compact(byte[] firstKey, byte[] lastKey) throws IOException {
+        try {
+            db.compactRange(firstKey, lastKey);
+        } catch (RocksDBException e) {
+            throw new IOException("Error in RocksDB compact", e);
+        }
+    }
+
+    @Override
     public void sync() throws IOException {
         try {
             db.write(optionSync, emptyBatch);
@@ -326,13 +317,15 @@ public class KeyValueStorageRocksDB implements KeyValueStorage {
 
     @Override
     public CloseableIterator<byte[]> keys(byte[] firstKey, byte[] lastKey) {
-        final RocksIterator iterator = db.newIterator(optionCache);
+        final Slice upperBound = new Slice(lastKey);
+        final ReadOptions option = new ReadOptions(optionCache).setIterateUpperBound(upperBound);
+        final RocksIterator iterator = db.newIterator(option);
         iterator.seek(firstKey);
 
         return new CloseableIterator<byte[]>() {
             @Override
             public boolean hasNext() {
-                return iterator.isValid() && ByteComparator.compare(iterator.key(), lastKey) < 0;
+                return iterator.isValid();
             }
 
             @Override
@@ -346,6 +339,8 @@ public class KeyValueStorageRocksDB implements KeyValueStorage {
             @Override
             public void close() {
                 iterator.close();
+                option.close();
+                upperBound.close();
             }
         };
     }
@@ -474,7 +469,5 @@ public class KeyValueStorageRocksDB implements KeyValueStorage {
         }
     }
 
-    private static final Comparator<byte[]> ByteComparator = UnsignedBytes.lexicographicalComparator();
-
     private static final Logger log = LoggerFactory.getLogger(KeyValueStorageRocksDB.class);
 }
diff --git a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/LedgerMetadataIndex.java b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/LedgerMetadataIndex.java
index 517681f..aeec6af 100644
--- a/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/LedgerMetadataIndex.java
+++ b/bookkeeper-server/src/main/java/org/apache/bookkeeper/bookie/storage/ldb/LedgerMetadataIndex.java
@@ -228,13 +228,16 @@ public class LedgerMetadataIndex implements Closeable {
 
     public void removeDeletedLedgers() throws IOException {
         LongWrapper key = LongWrapper.get();
+        final byte[] startKey = new byte[key.array.length];
 
         int deletedLedgers = 0;
         while (!pendingDeletedLedgers.isEmpty()) {
             long ledgerId = pendingDeletedLedgers.poll();
             key.set(ledgerId);
             ledgersDb.delete(key.array);
-            deletedLedgers++;
+            if (deletedLedgers++ == 0) {
+                System.arraycopy(key.array, 0, startKey, 0, startKey.length);
+            }
         }
 
         if (log.isDebugEnabled()) {
@@ -242,6 +245,9 @@ public class LedgerMetadataIndex implements Closeable {
         }
 
         ledgersDb.sync();
+        if (deletedLedgers != 0) {
+            ledgersDb.compact(startKey, key.array);
+        }
         key.recycle();
     }
 
diff --git a/pom.xml b/pom.xml
index daa39d5..de66916 100644
--- a/pom.xml
+++ b/pom.xml
@@ -160,7 +160,7 @@
     <protoc3.version>3.14.0</protoc3.version>
     <protoc-gen-grpc-java.version>${grpc.version}</protoc-gen-grpc-java.version>
     <reflections.version>0.9.11</reflections.version>
-    <rocksdb.version>6.10.2</rocksdb.version>
+    <rocksdb.version>6.16.4</rocksdb.version>
     <shrinkwrap.version>3.0.1</shrinkwrap.version>
     <slf4j.version>1.7.25</slf4j.version>
     <snakeyaml.version>1.19</snakeyaml.version>