You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2010/11/24 20:47:43 UTC
svn commit: r1038783 - in /lucene/java/branches/lucene_3_0: ./
contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/
src/java/org/apache/lucene/index/ src/test/org/apache/lucene/index/
Author: mikemccand
Date: Wed Nov 24 19:47:43 2010
New Revision: 1038783
URL: http://svn.apache.org/viewvc?rev=1038783&view=rev
Log:
LUCENE-2773: don't build compound files for large merged segments (by default)
Modified:
lucene/java/branches/lucene_3_0/CHANGES.txt
lucene/java/branches/lucene_3_0/common-build.xml
lucene/java/branches/lucene_3_0/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/index/LogMergePolicy.java
lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
Modified: lucene/java/branches/lucene_3_0/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/CHANGES.txt?rev=1038783&r1=1038782&r2=1038783&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/CHANGES.txt (original)
+++ lucene/java/branches/lucene_3_0/CHANGES.txt Wed Nov 24 19:47:43 2010
@@ -15,6 +15,13 @@ Changes in runtime behavior
worst-case free disk space required during optimize is now 3X the
index size, when compound file is enabled (else 2X). (Mike
McCandless)
+
+* LUCENE-2773: LogMergePolicy accepts a double noCFSRatio (default =
+ 0.1), which means any time a merged segment is greater than 10% of
+ the index size, it will be left in non-compound format even if
+ compound format is on. This change was made to reduce peak
+ transient disk usage during optimize which increased due to
+ LUCENE-2762. (Mike McCandless)
Bug fixes
@@ -108,6 +115,15 @@ Bug fixes
* LUCENE-2216: OpenBitSet.hashCode returned different hash codes for
sets that only differed by trailing zeros. (Dawid Weiss, yonik)
+API Changes
+
+* LUCENE-2773: LogMergePolicy accepts a double noCFSRatio (default =
+ 0.1), which means any time a merged segment is greater than 10% of
+ the index size, it will be left in non-compound format even if
+ compound format is on. This change was made to reduce peak
+ transient disk usage during optimize which increased due to
+ LUCENE-2762. (Mike McCandless)
+
Optimizations
* LUCENE-2556: Improve memory usage after cloning TermAttribute.
Modified: lucene/java/branches/lucene_3_0/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/common-build.xml?rev=1038783&r1=1038782&r2=1038783&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/common-build.xml (original)
+++ lucene/java/branches/lucene_3_0/common-build.xml Wed Nov 24 19:47:43 2010
@@ -42,7 +42,7 @@
<property name="Name" value="Lucene"/>
<property name="dev.version" value="3.0.3-dev"/>
<property name="version" value="${dev.version}"/>
- <property name="compatibility.tag" value="lucene_2_9_back_compat_tests_20101123"/>
+ <property name="compatibility.tag" value="lucene_2_9_back_compat_tests_20101124"/>
<property name="spec.version" value="${version}"/>
<property name="year" value="2000-${current.year}"/>
<property name="final.name" value="lucene-${name}-${version}"/>
Modified: lucene/java/branches/lucene_3_0/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java?rev=1038783&r1=1038782&r2=1038783&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java (original)
+++ lucene/java/branches/lucene_3_0/contrib/benchmark/src/test/org/apache/lucene/benchmark/byTask/TestPerfTasksLogic.java Wed Nov 24 19:47:43 2010
@@ -17,33 +17,34 @@
package org.apache.lucene.benchmark.byTask;
-import java.io.IOException;
-import java.io.StringReader;
+import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
-import java.io.BufferedReader;
-import java.util.List;
+import java.io.IOException;
+import java.io.StringReader;
import java.util.Iterator;
+import java.util.List;
+
+import junit.framework.TestCase;
import org.apache.lucene.benchmark.byTask.feeds.DocData;
import org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException;
import org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource;
import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
-import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
-import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
+import org.apache.lucene.benchmark.byTask.tasks.CountingHighlighterTestTask;
+import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.TermEnum;
-import org.apache.lucene.index.TermDocs;
-import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.LogDocMergePolicy;
+import org.apache.lucene.index.SegmentInfos;
+import org.apache.lucene.index.SerialMergeScheduler;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
-import org.apache.lucene.store.Directory;
import org.apache.lucene.search.FieldCache.StringIndex;
import org.apache.lucene.search.FieldCache;
-
-import junit.framework.TestCase;
+import org.apache.lucene.store.Directory;
/**
* Test very simply that perf tasks - simple algorithms - are doing what they should.
@@ -776,12 +777,9 @@ public class TestPerfTasksLogic extends
ir.close();
// Make sure we have 3 segments:
- final String[] files = benchmark.getRunData().getDirectory().listAll();
- int cfsCount = 0;
- for(int i=0;i<files.length;i++)
- if (files[i].endsWith(".cfs"))
- cfsCount++;
- assertEquals(3, cfsCount);
+ SegmentInfos infos = new SegmentInfos();
+ infos.read(benchmark.getRunData().getDirectory());
+ assertEquals(3, infos.size());
}
/**
Modified: lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=1038783&r1=1038782&r2=1038783&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/index/LogMergePolicy.java (original)
+++ lucene/java/branches/lucene_3_0/src/java/org/apache/lucene/index/LogMergePolicy.java Wed Nov 24 19:47:43 2010
@@ -54,12 +54,19 @@ public abstract class LogMergePolicy ext
* or larger will never be merged. @see setMaxMergeDocs */
public static final int DEFAULT_MAX_MERGE_DOCS = Integer.MAX_VALUE;
+ /** Default noCFSRatio. If a merge's size is more than 10% of
+ * the index, then we disable compound file for it.
+ * @see setNoCFSRatio */
+ public static final double DEFAULT_NO_CFS_RATIO = 0.1;
+
private int mergeFactor = DEFAULT_MERGE_FACTOR;
long minMergeSize;
long maxMergeSize;
int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
+ protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
+
/* TODO 3.0: change this default to true */
protected boolean calibrateSizeByDeletes = false;
@@ -73,6 +80,23 @@ public abstract class LogMergePolicy ext
protected boolean verbose() {
return writer != null && writer.verbose();
}
+
+ /** @see setNoCFSRatio */
+ public double getNoCFSRatio() {
+ return noCFSRatio;
+ }
+
+ /** If a merged segment will be more than this fraction (0.0 to 1.0)
+ * of the total size of the index, leave the segment as
+ * non-compound file even if compound file is enabled.
+ * Set to 1.0 to always use CFS regardless of merge
+ * size. */
+ public void setNoCFSRatio(double noCFSRatio) {
+ if (noCFSRatio < 0.0 || noCFSRatio > 1.0) {
+ throw new IllegalArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
+ }
+ this.noCFSRatio = noCFSRatio;
+ }
private void message(String message) {
if (verbose())
@@ -203,7 +227,7 @@ public abstract class LogMergePolicy ext
return !hasDeletions &&
!info.hasSeparateNorms() &&
info.dir == writer.getDirectory() &&
- info.getUseCompoundFile() == useCompoundFile;
+ (info.getUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
}
/** Returns the merges necessary to optimize the index.
@@ -242,7 +266,7 @@ public abstract class LogMergePolicy ext
// First, enroll all "full" merges (size
// mergeFactor) to potentially be run concurrently:
while (last - maxNumSegments + 1 >= mergeFactor) {
- spec.add(new OneMerge(infos.range(last-mergeFactor, last), useCompoundFile));
+ spec.add(makeOneMerge(infos, infos.range(last-mergeFactor, last)));
last -= mergeFactor;
}
@@ -254,7 +278,7 @@ public abstract class LogMergePolicy ext
// Since we must optimize down to 1 segment, the
// choice is simple:
if (last > 1 || !isOptimized(infos.info(0)))
- spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
+ spec.add(makeOneMerge(infos, infos.range(0, last)));
} else if (last > maxNumSegments) {
// Take care to pick a partial merge that is
@@ -282,7 +306,7 @@ public abstract class LogMergePolicy ext
}
}
- spec.add(new OneMerge(infos.range(bestStart, bestStart+finalMergeSize), useCompoundFile));
+ spec.add(makeOneMerge(infos, infos.range(bestStart, bestStart+finalMergeSize)));
}
}
@@ -322,7 +346,7 @@ public abstract class LogMergePolicy ext
// deletions, so force a merge now:
if (verbose())
message(" add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
- spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i), useCompoundFile));
+ spec.add(makeOneMerge(segmentInfos, segmentInfos.range(firstSegmentWithDeletions, i)));
firstSegmentWithDeletions = i;
}
} else if (firstSegmentWithDeletions != -1) {
@@ -331,7 +355,7 @@ public abstract class LogMergePolicy ext
// mergeFactor segments
if (verbose())
message(" add merge " + firstSegmentWithDeletions + " to " + (i-1) + " inclusive");
- spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, i), useCompoundFile));
+ spec.add(makeOneMerge(segmentInfos, segmentInfos.range(firstSegmentWithDeletions, i)));
firstSegmentWithDeletions = -1;
}
}
@@ -339,7 +363,7 @@ public abstract class LogMergePolicy ext
if (firstSegmentWithDeletions != -1) {
if (verbose())
message(" add merge " + firstSegmentWithDeletions + " to " + (numSegments-1) + " inclusive");
- spec.add(new OneMerge(segmentInfos.range(firstSegmentWithDeletions, numSegments), useCompoundFile));
+ spec.add(makeOneMerge(segmentInfos, segmentInfos.range(firstSegmentWithDeletions, numSegments)));
}
return spec;
@@ -439,7 +463,7 @@ public abstract class LogMergePolicy ext
spec = new MergeSpecification();
if (verbose())
message(" " + start + " to " + end + ": add this merge");
- spec.add(new OneMerge(infos.range(start, end), useCompoundFile));
+ spec.add(makeOneMerge(infos, infos.range(start, end)));
} else if (verbose())
message(" " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");
@@ -453,6 +477,29 @@ public abstract class LogMergePolicy ext
return spec;
}
+ protected OneMerge makeOneMerge(SegmentInfos infos, SegmentInfos infosToMerge) throws IOException {
+ final boolean doCFS;
+ if (!useCompoundFile) {
+ doCFS = false;
+ } else if (noCFSRatio == 1.0) {
+ doCFS = true;
+ } else {
+
+ long totSize = 0;
+ for(SegmentInfo info : infos) {
+ totSize += size(info);
+ }
+ long mergeSize = 0;
+ for(SegmentInfo info : infosToMerge) {
+ mergeSize += size(info);
+ }
+
+ doCFS = mergeSize <= noCFSRatio * totSize;
+ }
+
+ return new OneMerge(infosToMerge, doCFS);
+ }
+
/** <p>Determines the largest segment (measured by
* document count) that may be merged with other segments.
* Small values (e.g., less than 10,000) are best for
Modified: lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java?rev=1038783&r1=1038782&r2=1038783&view=diff
==============================================================================
--- lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java (original)
+++ lucene/java/branches/lucene_3_0/src/test/org/apache/lucene/index/TestIndexWriterMergePolicy.java Wed Nov 24 19:47:43 2010
@@ -244,25 +244,5 @@ public class TestIndexWriterMergePolicy
if (upperBound * mergeFactor <= maxMergeDocs) {
assertTrue(numSegments < mergeFactor);
}
-
- String[] files = writer.getDirectory().listAll();
- int segmentCfsCount = 0;
- for (int i = 0; i < files.length; i++) {
- if (files[i].endsWith(".cfs")) {
- segmentCfsCount++;
- }
- }
- assertEquals("index=" + writer.segString(), segmentCount, segmentCfsCount);
- }
-
- /*
- private void printSegmentDocCounts(IndexWriter writer) {
- int segmentCount = writer.getSegmentCount();
- System.out.println("" + segmentCount + " segments total");
- for (int i = 0; i < segmentCount; i++) {
- System.out.println(" segment " + i + " has " + writer.getDocCount(i)
- + " docs");
- }
}
- */
}