Posted to oak-commits@jackrabbit.apache.org by al...@apache.org on 2016/06/03 08:27:59 UTC

svn commit: r1746686 - in /jackrabbit/oak/trunk/oak-segment-tar/src: main/java/org/apache/jackrabbit/oak/segment/ main/java/org/apache/jackrabbit/oak/segment/compaction/ main/java/org/apache/jackrabbit/oak/segment/file/ test/java/org/apache/jackrabbit/...

Author: alexparvulescu
Date: Fri Jun  3 08:27:59 2016
New Revision: 1746686

URL: http://svn.apache.org/viewvc?rev=1746686&view=rev
Log:
OAK-4279 Rework offline compaction
 - introduced flags to control binary content based de-duplication
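
For context, a minimal usage sketch based on the SegmentGCOptions and
FileStore API touched by this commit; the store path and the 100 MB bound
are placeholder values, and the snippet assumes an enclosing method that
declares the checked exceptions:

    import java.io.File;
    import org.apache.jackrabbit.oak.segment.compaction.SegmentGCOptions;
    import org.apache.jackrabbit.oak.segment.file.FileStore;

    // Binary de-duplication only takes effect during offline compaction.
    SegmentGCOptions gcOptions = SegmentGCOptions.DEFAULT
            .setOffline()
            .withBinaryDeduplication()
            // upper bound, in bytes, for the content based checks
            .setBinaryDeduplicationMaxSize(100 * 1024 * 1024);

    FileStore fileStore = FileStore.builder(new File("/path/to/segmentstore"))
            .withGCOptions(gcOptions)
            .build();
    try {
        fileStore.compact();
        fileStore.cleanup();
    } finally {
        fileStore.close();
    }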

Modified:
    jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/Compactor.java
    jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/compaction/SegmentGCOptions.java
    jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/file/FileStore.java
    jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/CompactionAndCleanupIT.java
    jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/CompactorTest.java

Modified: jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/Compactor.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/Compactor.java?rev=1746686&r1=1746685&r2=1746686&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/Compactor.java (original)
+++ jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/Compactor.java Fri Jun  3 08:27:59 2016
@@ -36,6 +36,7 @@ import org.apache.jackrabbit.oak.commons
 import org.apache.jackrabbit.oak.plugins.memory.BinaryPropertyState;
 import org.apache.jackrabbit.oak.plugins.memory.MultiBinaryPropertyState;
 import org.apache.jackrabbit.oak.plugins.memory.PropertyStates;
+import org.apache.jackrabbit.oak.segment.compaction.SegmentGCOptions;
 import org.apache.jackrabbit.oak.spi.blob.BlobStore;
 import org.apache.jackrabbit.oak.spi.state.ApplyDiff;
 import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
@@ -70,6 +71,17 @@ public class Compactor {
     private final ProgressTracker progress = new ProgressTracker();
 
     /**
+     * Enables content based de-duplication of binaries. Involves a fair amount
+     * of I/O when reading/comparing potentially equal blobs.
+     */
+    private final boolean binaryDedup;
+
+    /**
+     * The upper bound for the content based de-duplication checks.
+     */
+    private final long binaryDedupMaxSize;
+
+    /**
      * Map from {@link #getBlobKey(Blob) blob keys} to matching compacted blob
      * record identifiers. Used to de-duplicate copies of the same binary
      * values.
@@ -109,11 +121,13 @@ public class Compactor {
             cacheSize).get();
 
     public Compactor(SegmentReader reader, SegmentWriter writer,
-            BlobStore blobStore, Supplier<Boolean> cancel) {
+            BlobStore blobStore, Supplier<Boolean> cancel, SegmentGCOptions gc) {
         this.reader = reader;
         this.writer = writer;
         this.blobStore = blobStore;
         this.cancel = cancel;
+        this.binaryDedup = gc.isBinaryDeduplication();
+        this.binaryDedupMaxSize = gc.getBinaryDeduplicationMaxSize();
     }
 
     private SegmentNodeBuilder process(NodeState before, NodeState after,
@@ -311,6 +325,8 @@ public class Compactor {
             try {
                 // Check if we've already cloned this specific record
                 RecordId id = sb.getRecordId();
+
+                // TODO verify binary impact on cache
                 RecordId compactedId = cache.get(id);
                 if (compactedId != null) {
                     return new SegmentBlob(blobStore, compactedId);
@@ -331,27 +347,37 @@ public class Compactor {
                     return clone;
                 }
 
-                // alternatively look if the exact same binary has been cloned
-                String key = getBlobKey(blob);
-                List<RecordId> ids = binaries.get(key);
-                if (ids != null) {
-                    for (RecordId duplicateId : ids) {
-                        if (new SegmentBlob(blobStore, duplicateId).equals(sb)) {
-                            cache.put(id, duplicateId);
-                            return new SegmentBlob(blobStore, duplicateId);
+                List<RecordId> ids = null;
+                String key = null;
+                boolean dedup = binaryDedup
+                        && blob.length() <= binaryDedupMaxSize;
+                if (dedup) {
+                    // alternatively look if the exact same binary has been
+                    // cloned
+                    key = getBlobKey(blob);
+                    ids = binaries.get(key);
+                    if (ids != null) {
+                        for (RecordId duplicateId : ids) {
+                            if (new SegmentBlob(blobStore, duplicateId)
+                                    .equals(sb)) {
+                                cache.put(id, duplicateId);
+                                return new SegmentBlob(blobStore, duplicateId);
+                            }
                         }
                     }
                 }
 
                 // if not, clone the large blob and keep track of the result
                 sb = writer.writeBlob(blob);
-
                 cache.put(id, sb.getRecordId());
-                if (ids == null) {
-                    ids = newArrayList();
-                    binaries.put(key, ids);
+
+                if (dedup) {
+                    if (ids == null) {
+                        ids = newArrayList();
+                        binaries.put(key, ids);
+                    }
+                    ids.add(sb.getRecordId());
                 }
-                ids.add(sb.getRecordId());
 
                 return sb;
             } catch (IOException e) {

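In short, the change above gates the content based lookup behind the two new
options. Condensed, the committed logic amounts to the following paraphrase
(names as in Compactor.java above; not a drop-in snippet):

    // Attempt content based de-duplication only when it is enabled and the
    // blob is small enough to be worth reading and comparing.
    boolean dedup = binaryDedup && blob.length() <= binaryDedupMaxSize;
    if (dedup) {
        String key = getBlobKey(blob);           // content hash of the blob
        List<RecordId> ids = binaries.get(key);  // previously compacted candidates
        // compare each candidate byte-wise against sb; on a match, reuse
        // its RecordId instead of rewriting the blob
    }
    // on a miss, or with de-duplication disabled, the blob is rewritten
    // via writer.writeBlob(blob) and, when dedup is on, tracked under key
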
Modified: jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/compaction/SegmentGCOptions.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/compaction/SegmentGCOptions.java?rev=1746686&r1=1746685&r2=1746686&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/compaction/SegmentGCOptions.java (original)
+++ jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/compaction/SegmentGCOptions.java Fri Jun  3 08:27:59 2016
@@ -84,6 +84,13 @@ public class SegmentGCOptions {
 
     private boolean offline = false;
 
+    private boolean ocBinDeduplication = Boolean
+            .getBoolean("oak.segment.compaction.binaryDeduplication");
+
+    private long ocBinMaxSize = Long.getLong(
+            "oak.segment.compaction.binaryDeduplicationMaxSize",
+            100 * 1024 * 1024);
+
     public SegmentGCOptions(boolean paused, int memoryThreshold, int gainThreshold,
                             int retryCount, boolean forceAfterFail, int lockWaitTime) {
         this.paused = paused;
@@ -240,15 +247,22 @@ public class SegmentGCOptions {
 
     @Override
     public String toString() {
-        return getClass().getSimpleName() + "{" +
-                "paused=" + paused +
-                ", memoryThreshold=" + memoryThreshold +
-                ", gainThreshold=" + gainThreshold +
-                ", retryCount=" + retryCount +
-                ", forceAfterFail=" + forceAfterFail +
-                ", lockWaitTime=" + lockWaitTime +
-                ", retainedGenerations=" + retainedGenerations +
-                ", offline=" + offline + "}";
+        if (offline) {
+            return getClass().getSimpleName() + "{" +
+                    "offline=" + offline +
+                    ", retainedGenerations=" + retainedGenerations +
+                    ", ocBinDeduplication=" + ocBinDeduplication +
+                    ", ocBinMaxSize=" + ocBinMaxSize + "}";
+        } else {
+            return getClass().getSimpleName() + "{" +
+                    "paused=" + paused +
+                    ", memoryThreshold=" + memoryThreshold +
+                    ", gainThreshold=" + gainThreshold +
+                    ", retryCount=" + retryCount +
+                    ", forceAfterFail=" + forceAfterFail +
+                    ", lockWaitTime=" + lockWaitTime +
+                    ", retainedGenerations=" + retainedGenerations + "}";
+        }
     }
 
     /**
@@ -279,4 +293,35 @@ public class SegmentGCOptions {
         this.retainedGenerations = 1;
         return this;
     }
+
+    /**
+     * Offline compaction only. Enables content based de-duplication of
+     * binaries. Involves a fair amount of I/O when reading/comparing
+     * potentially equal blobs. Can also be enabled via the
+     * 'oak.segment.compaction.binaryDeduplication' system property.
+     * @return this instance.
+     */
+    public SegmentGCOptions withBinaryDeduplication() {
+        this.ocBinDeduplication = true;
+        return this;
+    }
+
+    public boolean isBinaryDeduplication() {
+        return this.ocBinDeduplication;
+    }
+
+    /**
+     * Offline compaction only. Set the upper bound for the content based
+     * de-duplication checks.
+     * @param binMaxSize the maximum size (in bytes) of binaries eligible for de-duplication
+     * @return this instance
+     */
+    public SegmentGCOptions setBinaryDeduplicationMaxSize(long binMaxSize) {
+        this.ocBinMaxSize = binMaxSize;
+        return this;
+    }
+
+    public long getBinaryDeduplicationMaxSize() {
+        return this.ocBinMaxSize;
+    }
 }

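Both new fields read their defaults from system properties, so the behaviour
can be toggled without code changes. Note that the values are captured when a
SegmentGCOptions instance is constructed, so they should be passed as JVM
flags; the oak-run invocation below is illustrative only:

    # 104857600 bytes = 100 * 1024 * 1024, matching the built-in default
    java -Doak.segment.compaction.binaryDeduplication=true \
         -Doak.segment.compaction.binaryDeduplicationMaxSize=104857600 \
         -jar oak-run.jar compact /path/to/segmentstore
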
Modified: jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/file/FileStore.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/file/FileStore.java?rev=1746686&r1=1746685&r2=1746686&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/file/FileStore.java (original)
+++ jackrabbit/oak/trunk/oak-segment-tar/src/main/java/org/apache/jackrabbit/oak/segment/file/FileStore.java Fri Jun  3 08:27:59 2016
@@ -1141,7 +1141,7 @@ public class FileStore implements Segmen
         if (gcOptions.isOffline()) {
             // Capital C to indicate offline compaction
             SegmentWriter writer = new SegmentWriter(this, segmentReader, blobStore, tracker, bufferWriter);
-            return new Compactor(segmentReader, writer, blobStore, cancel)
+            return new Compactor(segmentReader, writer, blobStore, cancel, gcOptions)
                     .compact(EMPTY_NODE, head, EMPTY_NODE);
         } else {
             return segmentWriter.writeNode(head, bufferWriter, cancel);

Modified: jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/CompactionAndCleanupIT.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/CompactionAndCleanupIT.java?rev=1746686&r1=1746685&r2=1746686&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/CompactionAndCleanupIT.java (original)
+++ jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/CompactionAndCleanupIT.java Fri Jun  3 08:27:59 2016
@@ -256,6 +256,216 @@ public class CompactionAndCleanupIT {
         }
     }
 
+    /**
+     * Create a lot of data nodes (no binaries) and a few checkpoints; verify
+     * that compacting checkpoints will not cause the size to explode.
+     */
+    @Test
+    public void offlineCompactionCps() throws IOException,
+            CommitFailedException {
+        SegmentGCOptions gcOptions = DEFAULT.setOffline();
+        FileStore fileStore = FileStore.builder(getFileStoreFolder())
+                .withMaxFileSize(1)
+                .withGCOptions(gcOptions)
+                .build();
+        SegmentNodeStore nodeStore = SegmentNodeStoreBuilders.builder(fileStore).build();
+        try {
+            // Create ~2MB of data
+            NodeBuilder extra = nodeStore.getRoot().builder();
+            NodeBuilder content = extra.child("content");
+            for (int i = 0; i < 10000; i++) {
+                NodeBuilder c = content.child("c" + i);
+                for (int j = 0; j < 1000; j++) {
+                    c.setProperty("p" + i, "v" + i);
+                }
+            }
+            nodeStore.merge(extra, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+            fileStore.flush();
+            fileStore.compact();
+            fileStore.cleanup();
+            // Compacts to ~548kB
+            long size0 = fileStore.size();
+
+            int cpNo = 4;
+            Set<String> cps = new HashSet<String>();
+            for (int i = 0; i < cpNo; i++) {
+                cps.add(nodeStore.checkpoint(60000));
+            }
+            assertEquals(cpNo, cps.size());
+            for (String cp : cps) {
+                assertTrue(nodeStore.retrieve(cp) != null);
+            }
+
+            long size1 = fileStore.size();
+            assertSize("with checkpoints added", size1, size0, size0 * 11 / 10);
+            fileStore.compact();
+            fileStore.cleanup();
+            long size2 = fileStore.size();
+            assertSize("with checkpoints compacted", size2, size1 * 9/10, size1 * 11 / 10);
+        } finally {
+            fileStore.close();
+        }
+    }
+
+    /**
+     * Create 2 binary nodes with the same content but different references.
+     * Verify the de-duplication capabilities of compaction.
+     */
+    @Test
+    public void offlineCompactionBinC1() throws IOException,
+            CommitFailedException {
+        SegmentGCOptions gcOptions = DEFAULT.setOffline()
+                .withBinaryDeduplication();
+        FileStore fileStore = FileStore.builder(getFileStoreFolder())
+                .withMaxFileSize(1).withGCOptions(gcOptions).build();
+        SegmentNodeStore nodeStore = SegmentNodeStoreBuilders
+                .builder(fileStore).build();
+
+        try {
+            NodeBuilder extra = nodeStore.getRoot().builder();
+            NodeBuilder content = extra.child("content");
+
+            int blobSize = 5 * 1024 * 1024;
+            byte[] data = new byte[blobSize];
+            new Random().nextBytes(data);
+
+            NodeBuilder c1 = content.child("c1");
+            Blob b1 = nodeStore.createBlob(new ByteArrayInputStream(data));
+            c1.setProperty("blob1", b1);
+            NodeBuilder c2 = content.child("c2");
+            Blob b2 = nodeStore.createBlob(new ByteArrayInputStream(data));
+            c2.setProperty("blob2", b2);
+            nodeStore.merge(extra, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+            fileStore.flush();
+
+            int cpNo = 4;
+            Set<String> cps = new HashSet<String>();
+            for (int i = 0; i < cpNo; i++) {
+                cps.add(nodeStore.checkpoint(60000));
+            }
+            assertEquals(cpNo, cps.size());
+            for (String cp : cps) {
+                assertTrue(nodeStore.retrieve(cp) != null);
+            }
+
+            long size1 = fileStore.size();
+            fileStore.compact();
+            fileStore.cleanup();
+            long size2 = fileStore.size();
+            assertSize("with compacted binaries", size2, 0, size1 - blobSize);
+        } finally {
+            fileStore.close();
+        }
+    }
+
+    /**
+     * Create 2 binary nodes with the same content but different references.
+     * Reduce the de-duplication max size to below the binary length and
+     * verify that compaction no longer de-duplicates the binaries.
+     */
+    @Test
+    public void offlineCompactionBinC2() throws IOException,
+            CommitFailedException {
+        int blobSize = 5 * 1024 * 1024;
+
+        SegmentGCOptions gcOptions = DEFAULT.setOffline()
+                .withBinaryDeduplication()
+                .setBinaryDeduplicationMaxSize(blobSize / 2);
+        FileStore fileStore = FileStore.builder(getFileStoreFolder())
+                .withMaxFileSize(1).withGCOptions(gcOptions).build();
+        SegmentNodeStore nodeStore = SegmentNodeStoreBuilders
+                .builder(fileStore).build();
+
+        try {
+            NodeBuilder extra = nodeStore.getRoot().builder();
+            NodeBuilder content = extra.child("content");
+
+            byte[] data = new byte[blobSize];
+            new Random().nextBytes(data);
+
+            NodeBuilder c1 = content.child("c1");
+            Blob b1 = nodeStore.createBlob(new ByteArrayInputStream(data));
+            c1.setProperty("blob1", b1);
+            NodeBuilder c2 = content.child("c2");
+            Blob b2 = nodeStore.createBlob(new ByteArrayInputStream(data));
+            c2.setProperty("blob2", b2);
+            nodeStore.merge(extra, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+            fileStore.flush();
+
+            int cpNo = 4;
+            Set<String> cps = new HashSet<String>();
+            for (int i = 0; i < cpNo; i++) {
+                cps.add(nodeStore.checkpoint(60000));
+            }
+            assertEquals(cpNo, cps.size());
+            for (String cp : cps) {
+                assertTrue(nodeStore.retrieve(cp) != null);
+            }
+
+            long size1 = fileStore.size();
+            fileStore.compact();
+            fileStore.cleanup();
+            long size2 = fileStore.size();
+
+            // not expected to reduce the size too much, as the binaries are
+            // above the threshold
+            assertSize("with compacted binaries", size2, size1 * 9 / 10,
+                    size1 * 11 / 10);
+        } finally {
+            fileStore.close();
+        }
+    }
+
+    /**
+     * Create 2 binary nodes with the same content and the same reference.
+     * Verify the de-duplication capabilities of compaction.
+     */
+    @Test
+    public void offlineCompactionBinR1() throws IOException,
+            CommitFailedException {
+        SegmentGCOptions gcOptions = DEFAULT.setOffline();
+        FileStore fileStore = FileStore.builder(getFileStoreFolder())
+                .withMaxFileSize(1).withGCOptions(gcOptions).build();
+        SegmentNodeStore nodeStore = SegmentNodeStoreBuilders
+                .builder(fileStore).build();
+
+        try {
+            NodeBuilder extra = nodeStore.getRoot().builder();
+            NodeBuilder content = extra.child("content");
+
+            int blobSize = 5 * 1024 * 1024;
+            byte[] data = new byte[blobSize];
+            new Random().nextBytes(data);
+            Blob b = nodeStore.createBlob(new ByteArrayInputStream(data));
+
+            NodeBuilder c1 = content.child("c1");
+            c1.setProperty("blob1", b);
+            NodeBuilder c2 = content.child("c2");
+            c2.setProperty("blob2", b);
+            nodeStore.merge(extra, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+            fileStore.flush();
+
+            int cpNo = 4;
+            Set<String> cps = new HashSet<String>();
+            for (int i = 0; i < cpNo; i++) {
+                cps.add(nodeStore.checkpoint(60000));
+            }
+            assertEquals(cpNo, cps.size());
+            for (String cp : cps) {
+                assertTrue(nodeStore.retrieve(cp) != null);
+            }
+
+            // 5MB, already de-duplicated by the SegmentWriter
+            long size1 = fileStore.size();
+            fileStore.compact();
+            fileStore.cleanup();
+            long size2 = fileStore.size();
+            assertSize("with compacted binaries", size2, 0, size1 * 11 / 10);
+        } finally {
+            fileStore.close();
+        }
+    }
+
     private static void assertSize(String info, long size, long lower, long upper) {
         log.debug("File Store {} size {}, expected in interval [{},{}]",
                 info, size, lower, upper);

Modified: jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/CompactorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/CompactorTest.java?rev=1746686&r1=1746685&r2=1746686&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/CompactorTest.java (original)
+++ jackrabbit/oak/trunk/oak-segment-tar/src/test/java/org/apache/jackrabbit/oak/segment/CompactorTest.java Fri Jun  3 08:27:59 2016
@@ -25,8 +25,10 @@ import static org.junit.Assert.assertFal
 import java.io.IOException;
 
 import com.google.common.base.Suppliers;
+
 import org.apache.jackrabbit.oak.Oak;
 import org.apache.jackrabbit.oak.api.CommitFailedException;
+import org.apache.jackrabbit.oak.segment.compaction.SegmentGCOptions;
 import org.apache.jackrabbit.oak.segment.memory.MemoryStore;
 import org.apache.jackrabbit.oak.spi.commit.CommitInfo;
 import org.apache.jackrabbit.oak.spi.commit.EmptyHook;
@@ -53,7 +55,7 @@ public class CompactorTest {
 
         SegmentWriter writer = SegmentWriters.segmentWriter(memoryStore, LATEST_VERSION, "c", 1);
         Compactor compactor = new Compactor(memoryStore.getReader(), writer,
-                memoryStore.getBlobStore(), Suppliers.ofInstance(false));
+                memoryStore.getBlobStore(), Suppliers.ofInstance(false), SegmentGCOptions.DEFAULT);
         addTestContent(store, 0);
 
         NodeState initial = store.getRoot();
@@ -75,8 +77,9 @@ public class CompactorTest {
 
         NodeStore store = SegmentNodeStoreBuilders.builder(memoryStore).build();
         SegmentWriter writer = SegmentWriters.segmentWriter(memoryStore, LATEST_VERSION, "c", 1);
-        Compactor compactor = new Compactor(memoryStore.getReader(), writer, memoryStore.getBlobStore(),
-                Suppliers.ofInstance(true));
+        Compactor compactor = new Compactor(memoryStore.getReader(), writer,
+                memoryStore.getBlobStore(), Suppliers.ofInstance(true),
+                SegmentGCOptions.DEFAULT);
         SegmentNodeState sns = compactor.compact(store.getRoot(),
                 addChild(store.getRoot(), "b"), store.getRoot());
         assertFalse(sns.hasChildNode("b"));