You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by da...@apache.org on 2023/06/19 17:34:45 UTC

[jackrabbit-oak] branch OAK-10199 updated: OAK-10199 : added check to include oldestId when running detailedGc very first time

This is an automated email from the ASF dual-hosted git repository.

daim pushed a commit to branch OAK-10199
in repository https://gitbox.apache.org/repos/asf/jackrabbit-oak.git


The following commit(s) were added to refs/heads/OAK-10199 by this push:
     new dc7092527e OAK-10199 : added check to include oldestId when running detailedGc very first time
dc7092527e is described below

commit dc7092527edbdd565dca86da055fc45fd06ab2f2
Author: Rishabh Kumar <di...@adobe.com>
AuthorDate: Mon Jun 19 23:04:33 2023 +0530

    OAK-10199 : added check to include oldestId when running detailedGc very first time
---
 .../oak/plugins/document/VersionGCSupport.java     | 20 +++++---
 .../plugins/document/VersionGarbageCollector.java  |  5 +-
 .../document/mongo/MongoVersionGCSupport.java      | 18 ++++---
 .../plugins/document/rdb/RDBVersionGCSupport.java  | 22 +++++----
 .../oak/plugins/document/VersionGCSupportTest.java | 39 ++++++++++++---
 .../document/VersionGarbageCollectorIT.java        | 57 ++++++++++++++++++++++
 6 files changed, 128 insertions(+), 33 deletions(-)

diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java
index db54553061..96fa2bbaea 100644
--- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java
+++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupport.java
@@ -74,21 +74,25 @@ public class VersionGCSupport {
 
     /**
      * Returns documents that have a {@link NodeDocument#MODIFIED_IN_SECS} value
-     * within the given range .The two passed modified timestamps are in milliseconds
+     * within the given range and are greater than given @{@link NodeDocument#ID}.
+     * <p>
+     * The two passed modified timestamps are in milliseconds
      * since the epoch and the implementation will convert them to seconds at
      * the granularity of the {@link NodeDocument#MODIFIED_IN_SECS} field and
      * then perform the comparison.
+     * <p/>
      *
-     * @param fromModified the lower bound modified timestamp (inclusive)
-     * @param toModified the upper bound modified timestamp (exclusive)
-     * @param limit the limit of documents to return
-     * @param fromId the lower bound {@link NodeDocument#ID} (exclusive)
+     * @param fromModified  the lower bound modified timestamp (inclusive)
+     * @param toModified    the upper bound modified timestamp (exclusive)
+     * @param limit         the limit of documents to return
+     * @param fromId        the lower bound {@link NodeDocument#ID}
+     * @param includeFromId boolean indicating whether {@code fromId} is inclusive or not
      * @return matching documents.
      */
     public Iterable<NodeDocument> getModifiedDocs(final long fromModified, final long toModified, final int limit,
-                                                  @NotNull final String fromId) {
+                                                  @NotNull final String fromId, boolean includeFromId) {
         return StreamSupport
-                .stream(getSelectedDocuments(store, MODIFIED_IN_SECS, 1, fromId).spliterator(), false)
+                .stream(getSelectedDocuments(store, MODIFIED_IN_SECS, 1, includeFromId ? "\0"+fromId : fromId).spliterator(), false)
                 .filter(input -> modifiedGreaterThanEquals(input, fromModified) && modifiedLessThan(input, toModified))
                 .sorted((o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2))
                 .limit(limit)
@@ -193,7 +197,7 @@ public class VersionGCSupport {
 
         LOG.info("find oldest modified document");
         try {
-            docs = getModifiedDocs(ts, now, 1, MIN_ID_VALUE);
+            docs = getModifiedDocs(ts, now, 1, MIN_ID_VALUE, false);
             if (docs.iterator().hasNext()) {
                 return docs.iterator().next();
             }
diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java
index f54299e3fd..0c7da0d4fa 100644
--- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java
+++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollector.java
@@ -615,6 +615,7 @@ public class VersionGarbageCollector {
                 throws IOException {
             int docsTraversed = 0;
             boolean foundDoc = true;
+            boolean includeFromId = true;
             long oldestModifiedDocTimeStamp = rec.scopeDetailedGC.fromMs;
             String oldestModifiedDocId = rec.detailedGCId;
             try (DetailedGC gc = new DetailedGC(headRevision, monitor, cancel)) {
@@ -624,7 +625,9 @@ public class VersionGarbageCollector {
                     while (foundDoc && oldestModifiedDocTimeStamp < toModified && docsTraversed <= PROGRESS_BATCH_SIZE) {
                         // set foundDoc to false to allow exiting the while loop
                         foundDoc = false;
-                        Iterable<NodeDocument> itr = versionStore.getModifiedDocs(oldestModifiedDocTimeStamp, toModified, 1000, oldestModifiedDocId);
+                        Iterable<NodeDocument> itr = versionStore.getModifiedDocs(oldestModifiedDocTimeStamp, toModified, 1000, oldestModifiedDocId, includeFromId);
+                        // set includeFromId to false for subsequent queries
+                        includeFromId = false;
                         try {
                             for (NodeDocument doc : itr) {
                                 foundDoc = true;
diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java
index 9896857e36..ca9a8a955b 100644
--- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java
+++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/mongo/MongoVersionGCSupport.java
@@ -131,23 +131,27 @@ public class MongoVersionGCSupport extends VersionGCSupport {
 
     /**
      * Returns documents that have a {@link NodeDocument#MODIFIED_IN_SECS} value
-     * within the given range .The two passed modified timestamps are in milliseconds
+     * within the given range and are greater than given @{@link NodeDocument#ID}.
+     * <p>
+     * The two passed modified timestamps are in milliseconds
      * since the epoch and the implementation will convert them to seconds at
      * the granularity of the {@link NodeDocument#MODIFIED_IN_SECS} field and
      * then perform the comparison.
+     * <p/>
      *
-     * @param fromModified the lower bound modified timestamp (inclusive)
-     * @param toModified the upper bound modified timestamp (exclusive)
-     * @param limit the limit of documents to return
-     * @param fromId the lower bound {@link NodeDocument#ID} (exclusive)
+     * @param fromModified  the lower bound modified timestamp (inclusive)
+     * @param toModified    the upper bound modified timestamp (exclusive)
+     * @param limit         the limit of documents to return
+     * @param fromId        the lower bound {@link NodeDocument#ID}
+     * @param includeFromId boolean indicating whether {@code fromId} is inclusive or not
      * @return matching documents.
      */
     @Override
     public Iterable<NodeDocument> getModifiedDocs(final long fromModified, final long toModified, final int limit,
-                                                  @NotNull final String fromId) {
+                                                  @NotNull final String fromId, boolean includeFromId) {
         // _modified >= fromModified && _modified < toModified && _id > fromId
         final Bson query = and(gte(MODIFIED_IN_SECS, getModifiedInSecs(fromModified)),
-                lt(MODIFIED_IN_SECS, getModifiedInSecs(toModified)), gt(ID, fromId));
+                lt(MODIFIED_IN_SECS, getModifiedInSecs(toModified)), includeFromId ? gte(ID, fromId) :gt(ID, fromId));
         // first sort by _modified and then by _id
         final Bson sort = and(eq(MODIFIED_IN_SECS, 1), eq(ID, 1));
 
diff --git a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/rdb/RDBVersionGCSupport.java b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/rdb/RDBVersionGCSupport.java
index 7006c18683..efce4b8006 100644
--- a/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/rdb/RDBVersionGCSupport.java
+++ b/oak-store-document/src/main/java/org/apache/jackrabbit/oak/plugins/document/rdb/RDBVersionGCSupport.java
@@ -96,27 +96,31 @@ public class RDBVersionGCSupport extends VersionGCSupport {
 
     /**
      * Returns documents that have a {@link NodeDocument#MODIFIED_IN_SECS} value
-     * within the given range .The two passed modified timestamps are in milliseconds
+     * within the given range and are greater than given @{@link NodeDocument#ID}.
+     * <p>
+     * The two passed modified timestamps are in milliseconds
      * since the epoch and the implementation will convert them to seconds at
      * the granularity of the {@link NodeDocument#MODIFIED_IN_SECS} field and
      * then perform the comparison.
+     * <p/>
      *
-     * @param fromModified the lower bound modified timestamp (inclusive)
-     * @param toModified the upper bound modified timestamp (exclusive)
-     * @param limit the limit of documents to return
-     * @param fromId the lower bound {@link NodeDocument#ID} (exclusive)
+     * @param fromModified  the lower bound modified timestamp (inclusive)
+     * @param toModified    the upper bound modified timestamp (exclusive)
+     * @param limit         the limit of documents to return
+     * @param fromId        the lower bound {@link NodeDocument#ID}
+     * @param includeFromId boolean indicating whether {@code fromId} is inclusive or not
      * @return matching documents.
      */
     @Override
     public Iterable<NodeDocument> getModifiedDocs(final long fromModified, final long toModified, final int limit,
-                                                  @NotNull final String fromId) {
+                                                  @NotNull final String fromId, boolean includeFromId) {
         List<QueryCondition> conditions = of(new QueryCondition(MODIFIED_IN_SECS, "<", getModifiedInSecs(toModified)),
                 new QueryCondition(MODIFIED_IN_SECS, ">=", getModifiedInSecs(fromModified)),
-                new QueryCondition(ID, ">", of(fromId)));
+                new QueryCondition(ID, includeFromId ? ">=" : ">", of(fromId)));
         if (MODE == 1) {
             return getIterator(EMPTY_KEY_PATTERN, conditions);
         } else {
-            return store.queryAsIterable(NODES, fromId, null, EMPTY_KEY_PATTERN, conditions, limit, of(MODIFIED_IN_SECS, ID));
+            return store.queryAsIterable(NODES, null, null, EMPTY_KEY_PATTERN, conditions, limit, of(MODIFIED_IN_SECS, ID));
         }
     }
 
@@ -287,7 +291,7 @@ public class RDBVersionGCSupport extends VersionGCSupport {
         LOG.info("getOldestModifiedDoc() <- start");
         Iterable<NodeDocument> modifiedDocs = null;
         try {
-            modifiedDocs = getModifiedDocs(0L, clock.getTime(), 1, MIN_ID_VALUE);
+            modifiedDocs = getModifiedDocs(0L, clock.getTime(), 1, MIN_ID_VALUE, false);
             doc = modifiedDocs.iterator().hasNext() ? modifiedDocs.iterator().next() : NULL;
         } catch (DocumentStoreException ex) {
             LOG.error("getOldestModifiedDoc()", ex);
diff --git a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java
index 0061771383..cff9511a66 100644
--- a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java
+++ b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGCSupportTest.java
@@ -33,6 +33,7 @@ import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 
+import static java.lang.Long.MAX_VALUE;
 import static java.util.Comparator.comparing;
 import static java.util.List.of;
 import static java.util.Optional.ofNullable;
@@ -206,8 +207,8 @@ public class VersionGCSupportTest {
     public void findModifiedDocsWhenModifiedIsDifferent() {
         long secs = 42;
         long offset = SECONDS.toMillis(secs);
-        List<UpdateOp> updateOps = new ArrayList<>(5_001);
-        for (int i = 0; i < 5_001; i++) {
+        List<UpdateOp> updateOps = new ArrayList<>(5_000);
+        for (int i = 0; i < 5_000; i++) {
             Revision r = new Revision(offset + (i * 5), 0, 1);
             String id = getIdFromPath("/x" + i);
             ids.add(id);
@@ -223,9 +224,10 @@ public class VersionGCSupportTest {
         long oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L);
         assertEquals(40L, oldestModifiedDocTs);
         assertEquals("1:/x0", oldestModifiedDocId);
+        boolean includeFromId = true;
 
         for(int i = 0; i < 5; i++) {
-            Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), Long.MAX_VALUE, 1000, oldestModifiedDocId);
+            Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, includeFromId);
             assertTrue(isInOrder(modifiedDocs, (o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2)));
             long count = stream(modifiedDocs.spliterator(), false).count();
             assertEquals(1000, count);
@@ -234,6 +236,7 @@ public class VersionGCSupportTest {
             }
             oldestModifiedDocId = oldestModifiedDoc.getId();
             oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L);
+            includeFromId = false;
         }
     }
 
@@ -249,7 +252,7 @@ public class VersionGCSupportTest {
             setModified(op, r);
             updateOps.add(op);
         }
-        // create 5_000 nodes
+        // create 5_001 nodes
         store.create(NODES, updateOps);
 
         NodeDocument oldestModifiedDoc = gcSupport.getOldestModifiedDoc(SIMPLE);
@@ -257,9 +260,10 @@ public class VersionGCSupportTest {
         long oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L);
         assertEquals(40L, oldestModifiedDocTs);
         assertEquals("1:/x0", oldestModifiedDocId);
+        boolean includeFromId = true;
 
         for(int i = 0; i < 5; i++) {
-            Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), Long.MAX_VALUE, 1000, oldestModifiedDocId);
+            Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, includeFromId);
             assertTrue(isInOrder(modifiedDocs, (o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2)));
             long count = stream(modifiedDocs.spliterator(), false).count();
             assertEquals(1000, count);
@@ -268,7 +272,21 @@ public class VersionGCSupportTest {
             }
             oldestModifiedDocId = oldestModifiedDoc.getId();
             oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L);
+            includeFromId = false;
         }
+
+        // fetch last remaining document now
+        Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, false);
+        assertEquals(1, stream(modifiedDocs.spliterator(), false).count());
+        assertTrue(isInOrder(modifiedDocs, (o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2)));
+        oldestModifiedDoc = modifiedDocs.iterator().next();
+        oldestModifiedDocId = oldestModifiedDoc.getId();
+        oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L);
+
+        // all documents had been fetched, now we won't get any document
+        modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, false);
+        assertEquals(0, stream(modifiedDocs.spliterator(), false).count());
+
     }
 
     @Test
@@ -291,10 +309,10 @@ public class VersionGCSupportTest {
         }
         // create 5_000 nodes
         store.create(NODES, updateOps);
-
+        boolean includeFromId = true;
 
         for(int i = 0; i < 5; i++) {
-            Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), Long.MAX_VALUE, 1000, oldestModifiedDocId);
+            Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, includeFromId);
             assertTrue(isInOrder(modifiedDocs, (o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2)));
             long count = stream(modifiedDocs.spliterator(), false).count();
             assertEquals(1000, count);
@@ -303,7 +321,12 @@ public class VersionGCSupportTest {
             }
             oldestModifiedDocId = oldestModifiedDoc.getId();
             oldestModifiedDocTs = ofNullable(oldestModifiedDoc.getModified()).orElse(0L);
+            includeFromId = false;
         }
+
+        // all documents had been fetched, now we won't get any document
+        Iterable<NodeDocument> modifiedDocs = gcSupport.getModifiedDocs(SECONDS.toMillis(oldestModifiedDocTs), MAX_VALUE, 1000, oldestModifiedDocId, false);
+        assertEquals(0, stream(modifiedDocs.spliterator(), false).count());
     }
 
     private void assertPossiblyDeleted(long fromSeconds, long toSeconds, long num) {
@@ -312,7 +335,7 @@ public class VersionGCSupportTest {
     }
 
     private void assertModified(long fromSeconds, long toSeconds, long num) {
-        Iterable<NodeDocument> docs = gcSupport.getModifiedDocs(SECONDS.toMillis(fromSeconds), SECONDS.toMillis(toSeconds), 10, MIN_ID_VALUE);
+        Iterable<NodeDocument> docs = gcSupport.getModifiedDocs(SECONDS.toMillis(fromSeconds), SECONDS.toMillis(toSeconds), 10, MIN_ID_VALUE, false);
         assertEquals(num, stream(docs.spliterator(), false).count());
         assertTrue(isInOrder(docs, (o1, o2) -> comparing(NodeDocument::getModified).thenComparing(Document::getId).compare(o1, o2)));
     }
diff --git a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java
index caa156a6d4..f6e8554252 100644
--- a/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java
+++ b/oak-store-document/src/test/java/org/apache/jackrabbit/oak/plugins/document/VersionGarbageCollectorIT.java
@@ -34,6 +34,7 @@ import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
 
+import static java.util.concurrent.TimeUnit.SECONDS;
 import static org.apache.commons.lang3.reflect.FieldUtils.writeField;
 import static org.apache.jackrabbit.guava.common.collect.Iterables.filter;
 import static org.apache.jackrabbit.guava.common.collect.Iterables.size;
@@ -83,6 +84,7 @@ import org.apache.jackrabbit.oak.stats.Clock;
 import org.jetbrains.annotations.NotNull;
 import org.junit.After;
 import org.junit.Before;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
@@ -337,6 +339,61 @@ public class VersionGarbageCollectorIT {
         assertEquals(50_000, stats.deletedPropsGCCount);
 
     }
+
+    // Test when we have more than 1000 deleted properties with different revisions
+    @Test
+    @Ignore
+    public void testGCDeletedProps_2() throws Exception {
+        //1. Create nodes with properties
+        NodeBuilder b1 = null;
+        for (int k = 0; k < 50; k ++) {
+            b1 = store.getRoot().builder();
+            // Add property to node & save
+            for (int i = 0; i < 100; i++) {
+                for (int j = 0; j < 10; j++) {
+                    b1.child(k + "z" + i).setProperty("prop" + j, "foo", STRING);
+                }
+            }
+            store.merge(b1, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+            // increase the clock to create new revision for next batch
+            clock.waitUntil(Revision.getCurrentTimestamp() + SECONDS.toMillis(k * 5));
+        }
+
+        // enable the detailed gc flag
+        writeField(gc, "detailedGCEnabled", true, true);
+        long maxAge = 1; //hours
+        long delta = TimeUnit.MINUTES.toMillis(10);
+        //1. Go past GC age and check no GC done as nothing deleted
+        clock.waitUntil(Revision.getCurrentTimestamp() + maxAge);
+        VersionGCStats stats = gc.gc(maxAge, HOURS);
+        assertEquals(0, stats.deletedPropsGCCount);
+
+        //Remove property
+        NodeBuilder b2 = store.getRoot().builder();
+        for (int k = 0; k < 50; k ++) {
+            for (int i = 0; i < 100; i++) {
+                for (int j = 0; j < 10; j++) {
+                    b2.getChildNode(k + "z" + i).removeProperty("prop" + j);
+                }
+            }
+        }
+        store.merge(b2, EmptyHook.INSTANCE, CommitInfo.EMPTY);
+
+        store.runBackgroundOperations();
+
+        //2. Check that a deleted property is not collected before maxAge
+        //Clock cannot move back (it moved forward in #1) so double the maxAge
+        clock.waitUntil(clock.getTime() + delta);
+        stats = gc.gc(maxAge*2, HOURS);
+        assertEquals(0, stats.deletedPropsGCCount);
+
+        //3. Check that deleted property does get collected post maxAge
+        clock.waitUntil(clock.getTime() + HOURS.toMillis(maxAge*2) + delta);
+
+        stats = gc.gc(maxAge*2, HOURS);
+        assertEquals(50_000, stats.deletedPropsGCCount);
+
+    }
     
     private void gcSplitDocsInternal(String subNodeName) throws Exception {
         long maxAge = 1; //hrs