You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2021/10/21 13:23:28 UTC

[lucene] branch main updated: LUCENE-10093: fix conflicting test assert to match how TieredMergePolicy (TMP) works; improv TMP javadocs (#375)

This is an automated email from the ASF dual-hosted git repository.

mikemccand pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git


The following commit(s) were added to refs/heads/main by this push:
     new e3151d6  LUCENE-10093: fix conflicting test assert to match how TieredMergePolicy (TMP) works; improv TMP javadocs (#375)
e3151d6 is described below

commit e3151d6c7dea187ed99d349f1435b38b31aa6dd9
Author: Michael McCandless <mi...@apache.org>
AuthorDate: Thu Oct 21 09:23:17 2021 -0400

    LUCENE-10093: fix conflicting test assert to match how TieredMergePolicy (TMP) works; improv TMP javadocs (#375)
---
 .../java/org/apache/lucene/index/MergePolicy.java  |  15 +-
 .../org/apache/lucene/index/TieredMergePolicy.java |  51 ++++---
 .../apache/lucene/index/TestTieredMergePolicy.java | 162 ++++++++++++++-------
 3 files changed, 151 insertions(+), 77 deletions(-)

diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
index 2d9717a..bead219 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
@@ -447,6 +447,7 @@ public abstract class MergePolicy {
       merges.add(merge);
     }
 
+    // TODO: deprecate me (dir is never used!  and is sometimes difficult to provide!)
     /** Returns a description of the merges in this specification. */
     public String segString(Directory dir) {
       StringBuilder b = new StringBuilder();
@@ -458,6 +459,17 @@ public abstract class MergePolicy {
       return b.toString();
     }
 
+    @Override
+    public String toString() {
+      StringBuilder b = new StringBuilder();
+      b.append("MergeSpec:");
+      final int count = merges.size();
+      for (int i = 0; i < count; i++) {
+        b.append("\n  ").append(1 + i).append(": ").append(merges.get(i).segString());
+      }
+      return b.toString();
+    }
+
     /** Waits if necessary for at most the given time for all merges. */
     boolean await(long timeout, TimeUnit unit) {
       try {
@@ -562,8 +574,7 @@ public abstract class MergePolicy {
    * one thread at a time will call this method.
    *
    * @param segmentInfos the total set of segments in the index
-   * @param maxSegmentCount requested maximum number of segments in the index (currently this is
-   *     always 1)
+   * @param maxSegmentCount requested maximum number of segments in the index
    * @param segmentsToMerge contains the specific SegmentInfo instances that must be merged away.
    *     This may be a subset of all SegmentInfos. If the value is True for a given SegmentInfo,
    *     that means this segment was an original segment present in the to-be-merged index; else, it
diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
index 76863eb..a71a063 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
@@ -51,16 +51,16 @@ import java.util.Set;
  * <p><b>NOTE</b>: This policy always merges by byte size of the segments, always pro-rates by
  * percent deletes
  *
- * <p><b>NOTE</b> Starting with Lucene 7.5, there are several changes:
+ * <p><b>NOTE</b> Starting with Lucene 7.5, if you call {@link IndexWriter#forceMerge(int)} with
+ * this (default) merge policy, if {@link #setMaxMergedSegmentMB} is in conflict with {@code
+ * maxNumSegments} passed to {@link IndexWriter#forceMerge} then {@code maxNumSegments} wins. For
+ * example, if your index has 50 1 GB segments, and you have {@link #setMaxMergedSegmentMB} at 1024
+ * (1 GB), and you call {@code forceMerge(10)}, the two settings are clearly in conflict. {@code
+ * TieredMergePolicy} will choose to break the {@link #setMaxMergedSegmentMB} constraint and try to
+ * merge down to at most ten segments, each up to 5 * 1.25 GB in size (since an extra 25% buffer
+ * increase in the expected segment size is targeted).
  *
- * <p>- findForcedMerges and findForcedDeletesMerges) respect the max segment size by default.
- *
- * <p>- When findforcedmerges is called with maxSegmentCount other than 1, the resulting index is
- * not guaranteed to have &lt;= maxSegmentCount segments. Rather it is on a "best effort" basis.
- * Specifically the theoretical ideal segment size is calculated and a "fudge factor" of 25% is
- * added as the new maxSegmentSize, which is respected.
- *
- * <p>- findForcedDeletesMerges will not produce segments greater than maxSegmentSize.
+ * <p>findForcedDeletesMerges should never produce segments greater than maxSegmentSize.
  *
  * @lucene.experimental
  */
@@ -707,8 +707,8 @@ public class TieredMergePolicy extends MergePolicy {
     final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();
 
     // Trim the list down, remove if we're respecting max segment size and it's not original.
-    // Presumably it's been merged before and
-    //   is close enough to the max segment size we shouldn't add it in again.
+    // Presumably it's been merged before and is close enough to the max segment size we
+    // shouldn't add it in again.
     Iterator<SegmentSizeAndDocs> iter = sortedSizeAndDocs.iterator();
     boolean forceMergeRunning = false;
     while (iter.hasNext()) {
@@ -729,17 +729,20 @@ public class TieredMergePolicy extends MergePolicy {
     long maxMergeBytes = maxMergedSegmentBytes;
 
     // Set the maximum segment size based on how many segments have been specified.
-    if (maxSegmentCount == 1) maxMergeBytes = Long.MAX_VALUE;
-    else if (maxSegmentCount != Integer.MAX_VALUE) {
-      // Fudge this up a bit so we have a better chance of not having to rewrite segments. If we use
-      // the exact size,
-      // it's almost guaranteed that the segments won't fit perfectly and we'll be left with more
-      // segments than
-      // we want and have to re-merge in the code at the bottom of this method.
+    if (maxSegmentCount == 1) {
+      maxMergeBytes = Long.MAX_VALUE;
+    } else if (maxSegmentCount != Integer.MAX_VALUE) {
       maxMergeBytes =
           Math.max(
               (long) (((double) totalMergeBytes / (double) maxSegmentCount)),
               maxMergedSegmentBytes);
+      // Fudge this up a bit so we have a better chance of not having to do a second pass of merging
+      // to get
+      // down to the requested target segment count. If we use the exact size, it's almost
+      // guaranteed
+      // that the segments selected below won't fit perfectly and we'll be left with more segments
+      // than
+      // we want and have to re-merge in the code at the bottom of this method.
       maxMergeBytes = (long) ((double) maxMergeBytes * 1.25);
     }
 
@@ -748,8 +751,8 @@ public class TieredMergePolicy extends MergePolicy {
     while (iter.hasNext()) {
       SegmentSizeAndDocs segSizeDocs = iter.next();
       Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo);
-      if (segSizeDocs.delCount
-          != 0) { // This is forceMerge, all segments with deleted docs should be merged.
+      if (segSizeDocs.delCount != 0) {
+        // This is forceMerge; all segments with deleted docs should be merged.
         if (isOriginal != null && isOriginal) {
           foundDeletes = true;
         }
@@ -770,8 +773,7 @@ public class TieredMergePolicy extends MergePolicy {
       return null;
     }
 
-    // We should never bail if there are segments that have deleted documents, all deleted docs
-    // should be purged.
+    // We only bail if there are no deletions
     if (foundDeletes == false) {
       SegmentCommitInfo infoZero = sortedSizeAndDocs.get(0).segInfo;
       if ((maxSegmentCount != Integer.MAX_VALUE
@@ -794,6 +796,11 @@ public class TieredMergePolicy extends MergePolicy {
 
     final int startingSegmentCount = sortedSizeAndDocs.size();
     if (forceMergeRunning) {
+      // hmm this is a little dangerous -- if a user kicks off a forceMerge, it is taking forever,
+      // lots of
+      // new indexing/segments happened since, and they want to kick off another to ensure those
+      // newly
+      // indexed segments partake in the force merge, they (silently) won't due to this?
       return null;
     }
 
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java
index 863da7f..09c06b3 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java
@@ -17,7 +17,6 @@
 package org.apache.lucene.index;
 
 import java.io.IOException;
-import java.io.UncheckedIOException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
@@ -411,55 +410,132 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
     dir.close();
   }
 
+  /**
+   * Returns how many segments are in the index after applying all merges from the {@code spec} to
+   * an index with {@code startingSegmentCount} segments
+   */
+  private int postMergesSegmentCount(int startingSegmentCount, MergeSpecification spec) {
+    int count = startingSegmentCount;
+    // remove the segments that are merged away
+    for (var merge : spec.merges) {
+      count -= merge.segments.size();
+    }
+
+    // add one for each newly merged segment
+    count += spec.merges.size();
+
+    return count;
+  }
+
+  // verify that all merges in the spec do not create a final merged segment size too much bigger
+  // than the configured maxMergedSegmentSizeMb
+  private static void assertMaxMergedSize(
+      MergeSpecification specification,
+      double maxMergedSegmentSizeMB,
+      double indexTotalSizeInMB,
+      int maxMergedSegmentCount)
+      throws IOException {
+
+    // When the two configs conflict, TMP favors the requested number of segments. I.e., it will
+    // produce
+    // too-large (> maxMergedSegmentMB) merged segments.
+    double maxMBPerSegment = indexTotalSizeInMB / maxMergedSegmentCount;
+
+    for (OneMerge merge : specification.merges) {
+      // compute total size of all segments being merged
+      long mergeTotalSizeInBytes = 0;
+      for (var segment : merge.segments) {
+        mergeTotalSizeInBytes += segment.sizeInBytes();
+      }
+
+      // we allow up to 50% "violation" of the configured maxMergedSegmentSizeMb (why? TMP itself is
+      // only adding a 25% fudge factor):
+      // TODO: drop this fudge factor back down to 25% -- TooMuchFudgeException!
+      long limitBytes =
+          (long) (1024 * 1024 * Math.max(maxMBPerSegment, maxMergedSegmentSizeMB) * 1.5);
+      assertTrue(
+          "mergeTotalSizeInBytes="
+              + mergeTotalSizeInBytes
+              + " limitBytes="
+              + limitBytes
+              + " maxMergedSegmentSizeMb="
+              + maxMergedSegmentSizeMB,
+          mergeTotalSizeInBytes < limitBytes);
+    }
+  }
+
   // LUCENE-8688 reports that force merges merged more segments that necessary to respect
   // maxSegmentCount as a result
   // of LUCENE-7976 so we ensure that it only does the minimum number of merges here.
   public void testForcedMergesUseLeastNumberOfMerges() throws Exception {
-    final TieredMergePolicy tmp = new TieredMergePolicy();
-    final double oneSegmentSize = 1.0D;
-    final double maxSegmentSize = 10 * oneSegmentSize;
-    tmp.setMaxMergedSegmentMB(maxSegmentSize);
+    TieredMergePolicy tmp = new TieredMergePolicy();
+    double oneSegmentSizeMB = 1.0D;
+    double maxMergedSegmentSizeMB = 10 * oneSegmentSizeMB;
+    tmp.setMaxMergedSegmentMB(maxMergedSegmentSizeMB);
+    if (VERBOSE) {
+      System.out.println(
+          String.format(Locale.ROOT, "TEST: maxMergedSegmentSizeMB=%.2f", maxMergedSegmentSizeMB));
+    }
 
+    // create simulated 30 segment index where each segment is 1 MB
     SegmentInfos infos = new SegmentInfos(Version.LATEST.major);
-    for (int j = 0; j < 30; ++j) {
-      infos.add(makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSize, IndexWriter.SOURCE_MERGE));
+    int segmentCount = 30;
+    for (int j = 0; j < segmentCount; j++) {
+      infos.add(
+          makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE));
     }
 
-    final int expectedCount = random().nextInt(10) + 3;
-    final MergeSpecification specification =
+    double indexTotalSizeMB = segmentCount * oneSegmentSizeMB;
+
+    int maxSegmentCountAfterForceMerge = random().nextInt(10) + 3;
+    if (VERBOSE) {
+      System.out.println("TEST: maxSegmentCountAfterForceMerge=" + maxSegmentCountAfterForceMerge);
+    }
+    MergeSpecification specification =
         tmp.findForcedMerges(
             infos,
-            expectedCount,
+            maxSegmentCountAfterForceMerge,
             segmentsToMerge(infos),
             new MockMergeContext(SegmentCommitInfo::getDelCount));
-    assertMaxSize(specification, maxSegmentSize);
-    final int resultingCount =
-        infos.size()
-            + specification.merges.size()
-            - specification.merges.stream().mapToInt(spec -> spec.segments.size()).sum();
-    assertEquals(expectedCount, resultingCount);
-
-    SegmentInfos manySegmentsInfos = new SegmentInfos(Version.LATEST.major);
+    if (VERBOSE) {
+      System.out.println("TEST: specification=" + specification);
+    }
+    assertMaxMergedSize(
+        specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge);
+
+    // verify we achieved exactly the max segment count post-force-merge (which is a bit odd -- the
+    // API only ensures <= segments, not ==)
+    // TODO: change this to a <= equality like the last assert below?
+    assertEquals(
+        maxSegmentCountAfterForceMerge, postMergesSegmentCount(infos.size(), specification));
+
+    // now create many segments index, containing 0.1 MB sized segments
+    infos = new SegmentInfos(Version.LATEST.major);
     final int manySegmentsCount = atLeast(100);
-    for (int j = 0; j < manySegmentsCount; ++j) {
-      manySegmentsInfos.add(
-          makeSegmentCommitInfo("_" + j, 1000, 0, 0.1D, IndexWriter.SOURCE_MERGE));
+    if (VERBOSE) {
+      System.out.println("TEST: manySegmentsCount=" + manySegmentsCount);
+    }
+    oneSegmentSizeMB = 0.1D;
+    for (int j = 0; j < manySegmentsCount; j++) {
+      infos.add(
+          makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE));
     }
 
-    final MergeSpecification specificationManySegments =
+    indexTotalSizeMB = manySegmentsCount * oneSegmentSizeMB;
+
+    specification =
         tmp.findForcedMerges(
-            manySegmentsInfos,
-            expectedCount,
-            segmentsToMerge(manySegmentsInfos),
+            infos,
+            maxSegmentCountAfterForceMerge,
+            segmentsToMerge(infos),
             new MockMergeContext(SegmentCommitInfo::getDelCount));
-    assertMaxSize(specificationManySegments, maxSegmentSize);
-    final int resultingCountManySegments =
-        manySegmentsInfos.size()
-            + specificationManySegments.merges.size()
-            - specificationManySegments.merges.stream()
-                .mapToInt(spec -> spec.segments.size())
-                .sum();
-    assertTrue(resultingCountManySegments >= expectedCount);
+    if (VERBOSE) {
+      System.out.println("TEST: specification=" + specification);
+    }
+    assertMaxMergedSize(
+        specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge);
+    assertTrue(
+        postMergesSegmentCount(infos.size(), specification) >= maxSegmentCountAfterForceMerge);
   }
 
   // Make sure that TieredMergePolicy doesn't do the final merge while there are merges ongoing, but
@@ -493,26 +569,6 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
     return segmentsToMerge;
   }
 
-  private static void assertMaxSize(MergeSpecification specification, double maxSegmentSizeMb) {
-    for (OneMerge merge : specification.merges) {
-      long size =
-          merge.segments.stream()
-              .mapToLong(
-                  s -> {
-                    try {
-                      return s.sizeInBytes();
-                    } catch (IOException e) {
-                      throw new UncheckedIOException(e);
-                    }
-                  })
-              .sum();
-      long limit = (long) (1024 * 1024 * maxSegmentSizeMb * 1.5);
-      assertTrue(
-          "size=" + size + ",limit=" + limit + ",maxSegmentSizeMb=" + maxSegmentSizeMb,
-          size < limit);
-    }
-  }
-
   // Having a segment with very few documents in it can happen because of the random nature of the
   // docs added to the index. For instance, let's say it just happens that the last segment has 3
   // docs in it.