You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2021/10/21 13:23:28 UTC
[lucene] branch main updated: LUCENE-10093: fix conflicting test
assert to match how TieredMergePolicy (TMP) works;
improv TMP javadocs (#375)
This is an automated email from the ASF dual-hosted git repository.
mikemccand pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/lucene.git
The following commit(s) were added to refs/heads/main by this push:
new e3151d6 LUCENE-10093: fix conflicting test assert to match how TieredMergePolicy (TMP) works; improv TMP javadocs (#375)
e3151d6 is described below
commit e3151d6c7dea187ed99d349f1435b38b31aa6dd9
Author: Michael McCandless <mi...@apache.org>
AuthorDate: Thu Oct 21 09:23:17 2021 -0400
LUCENE-10093: fix conflicting test assert to match how TieredMergePolicy (TMP) works; improv TMP javadocs (#375)
---
.../java/org/apache/lucene/index/MergePolicy.java | 15 +-
.../org/apache/lucene/index/TieredMergePolicy.java | 51 ++++---
.../apache/lucene/index/TestTieredMergePolicy.java | 162 ++++++++++++++-------
3 files changed, 151 insertions(+), 77 deletions(-)
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
index 2d9717a..bead219 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
@@ -447,6 +447,7 @@ public abstract class MergePolicy {
merges.add(merge);
}
+ // TODO: deprecate me (dir is never used! and is sometimes difficult to provide!)
/** Returns a description of the merges in this specification. */
public String segString(Directory dir) {
StringBuilder b = new StringBuilder();
@@ -458,6 +459,17 @@ public abstract class MergePolicy {
return b.toString();
}
+ @Override
+ public String toString() {
+ StringBuilder b = new StringBuilder();
+ b.append("MergeSpec:");
+ final int count = merges.size();
+ for (int i = 0; i < count; i++) {
+ b.append("\n ").append(1 + i).append(": ").append(merges.get(i).segString());
+ }
+ return b.toString();
+ }
+
/** Waits if necessary for at most the given time for all merges. */
boolean await(long timeout, TimeUnit unit) {
try {
@@ -562,8 +574,7 @@ public abstract class MergePolicy {
* one thread at a time will call this method.
*
* @param segmentInfos the total set of segments in the index
- * @param maxSegmentCount requested maximum number of segments in the index (currently this is
- * always 1)
+ * @param maxSegmentCount requested maximum number of segments in the index
* @param segmentsToMerge contains the specific SegmentInfo instances that must be merged away.
* This may be a subset of all SegmentInfos. If the value is True for a given SegmentInfo,
* that means this segment was an original segment present in the to-be-merged index; else, it
diff --git a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
index 76863eb..a71a063 100644
--- a/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
+++ b/lucene/core/src/java/org/apache/lucene/index/TieredMergePolicy.java
@@ -51,16 +51,16 @@ import java.util.Set;
* <p><b>NOTE</b>: This policy always merges by byte size of the segments, always pro-rates by
* percent deletes
*
- * <p><b>NOTE</b> Starting with Lucene 7.5, there are several changes:
+ * <p><b>NOTE</b> Starting with Lucene 7.5, if you call {@link IndexWriter#forceMerge(int)} with
+ * this (default) merge policy, if {@link #setMaxMergedSegmentMB} is in conflict with {@code
+ * maxNumSegments} passed to {@link IndexWriter#forceMerge} then {@code maxNumSegments} wins. For
+ * example, if your index has 50 1 GB segments, and you have {@link #setMaxMergedSegmentMB} at 1024
+ * (1 GB), and you call {@code forceMerge(10)}, the two settings are clearly in conflict. {@code
+ * TieredMergePolicy} will choose to break the {@link #setMaxMergedSegmentMB} constraint and try to
+ * merge down to at most ten segments, each up to 5 * 1.25 GB in size (since an extra 25% buffer
+ * increase in the expected segment size is targeted).
*
- * <p>- findForcedMerges and findForcedDeletesMerges) respect the max segment size by default.
- *
- * <p>- When findforcedmerges is called with maxSegmentCount other than 1, the resulting index is
- * not guaranteed to have <= maxSegmentCount segments. Rather it is on a "best effort" basis.
- * Specifically the theoretical ideal segment size is calculated and a "fudge factor" of 25% is
- * added as the new maxSegmentSize, which is respected.
- *
- * <p>- findForcedDeletesMerges will not produce segments greater than maxSegmentSize.
+ * <p>findForcedDeletesMerges should never produce segments greater than maxSegmentSize.
*
* @lucene.experimental
*/
@@ -707,8 +707,8 @@ public class TieredMergePolicy extends MergePolicy {
final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();
// Trim the list down, remove if we're respecting max segment size and it's not original.
- // Presumably it's been merged before and
- // is close enough to the max segment size we shouldn't add it in again.
+ // Presumably it's been merged before and is close enough to the max segment size we
+ // shouldn't add it in again.
Iterator<SegmentSizeAndDocs> iter = sortedSizeAndDocs.iterator();
boolean forceMergeRunning = false;
while (iter.hasNext()) {
@@ -729,17 +729,20 @@ public class TieredMergePolicy extends MergePolicy {
long maxMergeBytes = maxMergedSegmentBytes;
// Set the maximum segment size based on how many segments have been specified.
- if (maxSegmentCount == 1) maxMergeBytes = Long.MAX_VALUE;
- else if (maxSegmentCount != Integer.MAX_VALUE) {
- // Fudge this up a bit so we have a better chance of not having to rewrite segments. If we use
- // the exact size,
- // it's almost guaranteed that the segments won't fit perfectly and we'll be left with more
- // segments than
- // we want and have to re-merge in the code at the bottom of this method.
+ if (maxSegmentCount == 1) {
+ maxMergeBytes = Long.MAX_VALUE;
+ } else if (maxSegmentCount != Integer.MAX_VALUE) {
maxMergeBytes =
Math.max(
(long) (((double) totalMergeBytes / (double) maxSegmentCount)),
maxMergedSegmentBytes);
+ // Fudge this up a bit so we have a better chance of not having to do a second pass of merging
+ // to get
+ // down to the requested target segment count. If we use the exact size, it's almost
+ // guaranteed
+ // that the segments selected below won't fit perfectly and we'll be left with more segments
+ // than
+ // we want and have to re-merge in the code at the bottom of this method.
maxMergeBytes = (long) ((double) maxMergeBytes * 1.25);
}
@@ -748,8 +751,8 @@ public class TieredMergePolicy extends MergePolicy {
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo);
- if (segSizeDocs.delCount
- != 0) { // This is forceMerge, all segments with deleted docs should be merged.
+ if (segSizeDocs.delCount != 0) {
+ // This is forceMerge; all segments with deleted docs should be merged.
if (isOriginal != null && isOriginal) {
foundDeletes = true;
}
@@ -770,8 +773,7 @@ public class TieredMergePolicy extends MergePolicy {
return null;
}
- // We should never bail if there are segments that have deleted documents, all deleted docs
- // should be purged.
+ // We only bail if there are no deletions
if (foundDeletes == false) {
SegmentCommitInfo infoZero = sortedSizeAndDocs.get(0).segInfo;
if ((maxSegmentCount != Integer.MAX_VALUE
@@ -794,6 +796,11 @@ public class TieredMergePolicy extends MergePolicy {
final int startingSegmentCount = sortedSizeAndDocs.size();
if (forceMergeRunning) {
+ // hmm this is a little dangerous -- if a user kicks off a forceMerge, it is taking forever,
+ // lots of
+ // new indexing/segments happened since, and they want to kick off another to ensure those
+ // newly
+ // indexed segments partake in the force merge, they (silently) won't due to this?
return null;
}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java
index 863da7f..09c06b3 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestTieredMergePolicy.java
@@ -17,7 +17,6 @@
package org.apache.lucene.index;
import java.io.IOException;
-import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@@ -411,55 +410,132 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
dir.close();
}
+ /**
+ * Returns how many segments are in the index after applying all merges from the {@code spec} to
+ * an index with {@code startingSegmentCount} segments
+ */
+ private int postMergesSegmentCount(int startingSegmentCount, MergeSpecification spec) {
+ int count = startingSegmentCount;
+ // remove the segments that are merged away
+ for (var merge : spec.merges) {
+ count -= merge.segments.size();
+ }
+
+ // add one for each newly merged segment
+ count += spec.merges.size();
+
+ return count;
+ }
+
+ // verify that all merges in the spec do not create a final merged segment size too much bigger
+ // than the configured maxMergedSegmentSizeMb
+ private static void assertMaxMergedSize(
+ MergeSpecification specification,
+ double maxMergedSegmentSizeMB,
+ double indexTotalSizeInMB,
+ int maxMergedSegmentCount)
+ throws IOException {
+
+ // When the two configs conflict, TMP favors the requested number of segments. I.e., it will
+ // produce
+ // too-large (> maxMergedSegmentMB) merged segments.
+ double maxMBPerSegment = indexTotalSizeInMB / maxMergedSegmentCount;
+
+ for (OneMerge merge : specification.merges) {
+ // compute total size of all segments being merged
+ long mergeTotalSizeInBytes = 0;
+ for (var segment : merge.segments) {
+ mergeTotalSizeInBytes += segment.sizeInBytes();
+ }
+
+ // we allow up to 50% "violation" of the configured maxMergedSegmentSizeMb (why? TMP itself is
+ // only adding a 25% fudge factor):
+ // TODO: drop this fudge factor back down to 25% -- TooMuchFudgeException!
+ long limitBytes =
+ (long) (1024 * 1024 * Math.max(maxMBPerSegment, maxMergedSegmentSizeMB) * 1.5);
+ assertTrue(
+ "mergeTotalSizeInBytes="
+ + mergeTotalSizeInBytes
+ + " limitBytes="
+ + limitBytes
+ + " maxMergedSegmentSizeMb="
+ + maxMergedSegmentSizeMB,
+ mergeTotalSizeInBytes < limitBytes);
+ }
+ }
+
// LUCENE-8688 reports that force merges merged more segments that necessary to respect
// maxSegmentCount as a result
// of LUCENE-7976 so we ensure that it only does the minimum number of merges here.
public void testForcedMergesUseLeastNumberOfMerges() throws Exception {
- final TieredMergePolicy tmp = new TieredMergePolicy();
- final double oneSegmentSize = 1.0D;
- final double maxSegmentSize = 10 * oneSegmentSize;
- tmp.setMaxMergedSegmentMB(maxSegmentSize);
+ TieredMergePolicy tmp = new TieredMergePolicy();
+ double oneSegmentSizeMB = 1.0D;
+ double maxMergedSegmentSizeMB = 10 * oneSegmentSizeMB;
+ tmp.setMaxMergedSegmentMB(maxMergedSegmentSizeMB);
+ if (VERBOSE) {
+ System.out.println(
+ String.format(Locale.ROOT, "TEST: maxMergedSegmentSizeMB=%.2f", maxMergedSegmentSizeMB));
+ }
+ // create simulated 30 segment index where each segment is 1 MB
SegmentInfos infos = new SegmentInfos(Version.LATEST.major);
- for (int j = 0; j < 30; ++j) {
- infos.add(makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSize, IndexWriter.SOURCE_MERGE));
+ int segmentCount = 30;
+ for (int j = 0; j < segmentCount; j++) {
+ infos.add(
+ makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE));
}
- final int expectedCount = random().nextInt(10) + 3;
- final MergeSpecification specification =
+ double indexTotalSizeMB = segmentCount * oneSegmentSizeMB;
+
+ int maxSegmentCountAfterForceMerge = random().nextInt(10) + 3;
+ if (VERBOSE) {
+ System.out.println("TEST: maxSegmentCountAfterForceMerge=" + maxSegmentCountAfterForceMerge);
+ }
+ MergeSpecification specification =
tmp.findForcedMerges(
infos,
- expectedCount,
+ maxSegmentCountAfterForceMerge,
segmentsToMerge(infos),
new MockMergeContext(SegmentCommitInfo::getDelCount));
- assertMaxSize(specification, maxSegmentSize);
- final int resultingCount =
- infos.size()
- + specification.merges.size()
- - specification.merges.stream().mapToInt(spec -> spec.segments.size()).sum();
- assertEquals(expectedCount, resultingCount);
-
- SegmentInfos manySegmentsInfos = new SegmentInfos(Version.LATEST.major);
+ if (VERBOSE) {
+ System.out.println("TEST: specification=" + specification);
+ }
+ assertMaxMergedSize(
+ specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge);
+
+ // verify we achieved exactly the max segment count post-force-merge (which is a bit odd -- the
+ // API only ensures <= segments, not ==)
+ // TODO: change this to a <= inequality like the last assert below?
+ assertEquals(
+ maxSegmentCountAfterForceMerge, postMergesSegmentCount(infos.size(), specification));
+
+ // now create many segments index, containing 0.1 MB sized segments
+ infos = new SegmentInfos(Version.LATEST.major);
final int manySegmentsCount = atLeast(100);
- for (int j = 0; j < manySegmentsCount; ++j) {
- manySegmentsInfos.add(
- makeSegmentCommitInfo("_" + j, 1000, 0, 0.1D, IndexWriter.SOURCE_MERGE));
+ if (VERBOSE) {
+ System.out.println("TEST: manySegmentsCount=" + manySegmentsCount);
+ }
+ oneSegmentSizeMB = 0.1D;
+ for (int j = 0; j < manySegmentsCount; j++) {
+ infos.add(
+ makeSegmentCommitInfo("_" + j, 1000, 0, oneSegmentSizeMB, IndexWriter.SOURCE_MERGE));
}
- final MergeSpecification specificationManySegments =
+ indexTotalSizeMB = manySegmentsCount * oneSegmentSizeMB;
+
+ specification =
tmp.findForcedMerges(
- manySegmentsInfos,
- expectedCount,
- segmentsToMerge(manySegmentsInfos),
+ infos,
+ maxSegmentCountAfterForceMerge,
+ segmentsToMerge(infos),
new MockMergeContext(SegmentCommitInfo::getDelCount));
- assertMaxSize(specificationManySegments, maxSegmentSize);
- final int resultingCountManySegments =
- manySegmentsInfos.size()
- + specificationManySegments.merges.size()
- - specificationManySegments.merges.stream()
- .mapToInt(spec -> spec.segments.size())
- .sum();
- assertTrue(resultingCountManySegments >= expectedCount);
+ if (VERBOSE) {
+ System.out.println("TEST: specification=" + specification);
+ }
+ assertMaxMergedSize(
+ specification, maxMergedSegmentSizeMB, indexTotalSizeMB, maxSegmentCountAfterForceMerge);
+ assertTrue(
+ postMergesSegmentCount(infos.size(), specification) >= maxSegmentCountAfterForceMerge);
}
// Make sure that TieredMergePolicy doesn't do the final merge while there are merges ongoing, but
@@ -493,26 +569,6 @@ public class TestTieredMergePolicy extends BaseMergePolicyTestCase {
return segmentsToMerge;
}
- private static void assertMaxSize(MergeSpecification specification, double maxSegmentSizeMb) {
- for (OneMerge merge : specification.merges) {
- long size =
- merge.segments.stream()
- .mapToLong(
- s -> {
- try {
- return s.sizeInBytes();
- } catch (IOException e) {
- throw new UncheckedIOException(e);
- }
- })
- .sum();
- long limit = (long) (1024 * 1024 * maxSegmentSizeMb * 1.5);
- assertTrue(
- "size=" + size + ",limit=" + limit + ",maxSegmentSizeMb=" + maxSegmentSizeMb,
- size < limit);
- }
- }
-
// Having a segment with very few documents in it can happen because of the random nature of the
// docs added to the index. For instance, let's say it just happens that the last segment has 3
// docs in it.