You are viewing a plain text version of this content. The canonical link for it is here.
Posted to java-commits@lucene.apache.org by mi...@apache.org on 2007/11/30 11:09:47 UTC
svn commit: r599766 - in /lucene/java/trunk: ./
src/java/org/apache/lucene/index/ src/test/org/apache/lucene/index/
Author: mikemccand
Date: Fri Nov 30 02:09:45 2007
New Revision: 599766
URL: http://svn.apache.org/viewvc?rev=599766&view=rev
Log:
LUCENE-982: add new method optimize(int maxNumSegments) to IndexWriter
Modified:
lucene/java/trunk/CHANGES.txt
lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java
lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java
lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=599766&r1=599765&r2=599766&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Nov 30 02:09:45 2007
@@ -72,6 +72,11 @@
setData(byte[] data, int offset, int length), getData(), getOffset()
and clone() methods to o.a.l.index.Payload. Also add the field name
as arg to Similarity.scorePayload(). (Michael Busch)
+
+ 9. LUCENE-982: Add IndexWriter.optimize(int maxNumSegments) method to
+ "partially optimize" an index down to maxNumSegments segments.
+ (Mike McCandless)
+
Bug fixes
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?rev=599766&r1=599765&r2=599766&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Fri Nov 30 02:09:45 2007
@@ -1654,14 +1654,37 @@
optimize(true);
}
+ /**
+ * Optimize the index down to <= maxNumSegments. If
+ * maxNumSegments==1 then this is the same as {@link
+ * #optimize()}.
+ * @param maxNumSegments maximum number of segments left
+ * in the index after optimization finishes
+ */
+ public void optimize(int maxNumSegments) throws CorruptIndexException, IOException {
+ optimize(maxNumSegments, true);
+ }
+
/** Just like {@link #optimize()}, except you can specify
* whether the call should block until the optimize
* completes. This is only meaningful with a
* {@link MergeScheduler} that is able to run merges in
* background threads. */
public void optimize(boolean doWait) throws CorruptIndexException, IOException {
+ optimize(1, true);
+ }
+
+ /** Just like {@link #optimize(int)}, except you can
+ * specify whether the call should block until the
+ * optimize completes. This is only meaningful with a
+ * {@link MergeScheduler} that is able to run merges in
+ * background threads. */
+ public void optimize(int maxNumSegments, boolean doWait) throws CorruptIndexException, IOException {
ensureOpen();
+ if (maxNumSegments < 1)
+ throw new IllegalArgumentException("maxNumSegments must be >= 1; got " + maxNumSegments);
+
if (infoStream != null)
message("optimize: index now " + segString());
@@ -1677,15 +1700,21 @@
// Now mark all pending & running merges as optimize
// merge:
Iterator it = pendingMerges.iterator();
- while(it.hasNext())
- ((MergePolicy.OneMerge) it.next()).optimize = true;
+ while(it.hasNext()) {
+ final MergePolicy.OneMerge merge = (MergePolicy.OneMerge) it.next();
+ merge.optimize = true;
+ merge.maxNumSegmentsOptimize = maxNumSegments;
+ }
it = runningMerges.iterator();
- while(it.hasNext())
- ((MergePolicy.OneMerge) it.next()).optimize = true;
+ while(it.hasNext()) {
+ final MergePolicy.OneMerge merge = (MergePolicy.OneMerge) it.next();
+ merge.optimize = true;
+ merge.maxNumSegmentsOptimize = maxNumSegments;
+ }
}
- maybeMerge(true);
+ maybeMerge(maxNumSegments, true);
if (doWait) {
synchronized(this) {
@@ -1748,25 +1777,29 @@
}
private final void maybeMerge(boolean optimize) throws CorruptIndexException, IOException {
- updatePendingMerges(optimize);
+ maybeMerge(1, optimize);
+ }
+
+ private final void maybeMerge(int maxNumSegmentsOptimize, boolean optimize) throws CorruptIndexException, IOException {
+ updatePendingMerges(maxNumSegmentsOptimize, optimize);
mergeScheduler.merge(this);
}
- private synchronized void updatePendingMerges(boolean optimize)
+ private synchronized void updatePendingMerges(int maxNumSegmentsOptimize, boolean optimize)
throws CorruptIndexException, IOException {
+ assert !optimize || maxNumSegmentsOptimize > 0;
final MergePolicy.MergeSpecification spec;
if (optimize) {
- // Currently hardwired to 1, but once we add method to
- // IndexWriter to allow "optimizing to <= N segments"
- // then we will change this.
- final int maxSegmentCount = 1;
- spec = mergePolicy.findMergesForOptimize(segmentInfos, this, maxSegmentCount, segmentsToOptimize);
+ spec = mergePolicy.findMergesForOptimize(segmentInfos, this, maxNumSegmentsOptimize, segmentsToOptimize);
if (spec != null) {
final int numMerges = spec.merges.size();
- for(int i=0;i<numMerges;i++)
- ((MergePolicy.OneMerge) spec.merges.get(i)).optimize = true;
+ for(int i=0;i<numMerges;i++) {
+ final MergePolicy.OneMerge merge = ((MergePolicy.OneMerge) spec.merges.get(i));
+ merge.optimize = true;
+ merge.maxNumSegmentsOptimize = maxNumSegmentsOptimize;
+ }
}
} else
@@ -2737,6 +2770,7 @@
throws CorruptIndexException, IOException {
assert merge.registerDone;
+ assert !merge.optimize || merge.maxNumSegmentsOptimize > 0;
boolean success = false;
@@ -2753,23 +2787,24 @@
success = true;
} finally {
synchronized(this) {
- if (!success && infoStream != null)
- message("hit exception during merge");
+ try {
+ if (!success && infoStream != null)
+ message("hit exception during merge");
- mergeFinish(merge);
+ mergeFinish(merge);
- // This merge (and, generally, any change to the
- // segments) may now enable new merges, so we call
- // merge policy & update pending merges.
- if (success && !merge.isAborted() && !closed && !closing)
- updatePendingMerges(merge.optimize);
-
- runningMerges.remove(merge);
-
- // Optimize may be waiting on the final optimize
- // merge to finish; and finishMerges() may be
- // waiting for all merges to finish:
- notifyAll();
+ // This merge (and, generally, any change to the
+ // segments) may now enable new merges, so we call
+ // merge policy & update pending merges.
+ if (success && !merge.isAborted() && !closed && !closing)
+ updatePendingMerges(merge.maxNumSegmentsOptimize, merge.optimize);
+ } finally {
+ runningMerges.remove(merge);
+ // Optimize may be waiting on the final optimize
+ // merge to finish; and finishMerges() may be
+ // waiting for all merges to finish:
+ notifyAll();
+ }
}
}
}
@@ -2992,8 +3027,7 @@
SegmentInfo si = sourceSegmentsClone.info(i);
IndexReader reader = SegmentReader.get(si, MERGE_READ_BUFFER_SIZE, merge.mergeDocStores); // no need to set deleter (yet)
merger.add(reader);
- if (infoStream != null)
- totDocCount += reader.numDocs();
+ totDocCount += reader.numDocs();
}
if (infoStream != null) {
message("merge: total "+totDocCount+" docs");
@@ -3001,8 +3035,7 @@
mergedDocCount = merge.info.docCount = merger.merge(merge.mergeDocStores);
- if (infoStream != null)
- assert mergedDocCount == totDocCount;
+ assert mergedDocCount == totDocCount;
success = true;
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=599766&r1=599765&r2=599766&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java Fri Nov 30 02:09:45 2007
@@ -169,35 +169,74 @@
* in use may make use of concurrency. */
public MergeSpecification findMergesForOptimize(SegmentInfos infos, IndexWriter writer, int maxNumSegments, Set segmentsToOptimize) throws IOException {
MergeSpecification spec;
-
+
+ assert maxNumSegments > 0;
+
if (!isOptimized(infos, writer, maxNumSegments, segmentsToOptimize)) {
- int numSegments = infos.size();
- while(numSegments > 0) {
- final SegmentInfo info = infos.info(--numSegments);
+ // Find the newest (rightmost) segment that needs to
+ // be optimized (other segments may have been flushed
+ // since optimize started):
+ int last = infos.size();
+ while(last > 0) {
+ final SegmentInfo info = infos.info(--last);
if (segmentsToOptimize.contains(info)) {
- numSegments++;
+ last++;
break;
}
}
- if (numSegments > 0) {
+ if (last > 0) {
spec = new MergeSpecification();
- while (numSegments > 0) {
-
- final int first;
- if (numSegments > mergeFactor)
- first = numSegments-mergeFactor;
- else
- first = 0;
-
- if (numSegments > 1 || !isOptimized(writer, infos.info(0)))
- spec.add(new OneMerge(infos.range(first, numSegments), useCompoundFile));
- numSegments -= mergeFactor;
+ // First, enroll all "full" merges (size
+ // mergeFactor) to potentially be run concurrently:
+ while (last - maxNumSegments + 1 >= mergeFactor) {
+ spec.add(new OneMerge(infos.range(last-mergeFactor, last), useCompoundFile));
+ last -= mergeFactor;
}
+ // Only if there are no full merges pending do we
+ // add a final partial (< mergeFactor segments) merge:
+ if (0 == spec.merges.size()) {
+ if (maxNumSegments == 1) {
+
+ // Since we must optimize down to 1 segment, the
+ // choice is simple:
+ if (last > 1 || !isOptimized(writer, infos.info(0)))
+ spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
+ } else if (last > maxNumSegments) {
+
+ // Take care to pick a partial merge that is
+ // least cost, but does not make the index too
+ // lopsided. If we always just picked the
+ // partial tail then we could produce a highly
+ // lopsided index over time:
+
+ // We must merge this many segments to leave
+ // maxNumSegments in the index (from when
+ // optimize was first kicked off):
+ final int finalMergeSize = last - maxNumSegments + 1;
+
+ // Consider all possible starting points:
+ long bestSize = 0;
+ int bestStart = 0;
+
+ for(int i=0;i<last-finalMergeSize+1;i++) {
+ long sumSize = 0;
+ for(int j=0;j<finalMergeSize;j++)
+ sumSize += size(infos.info(j+i));
+ if (i == 0 || (sumSize < 2*size(infos.info(i-1)) && sumSize < bestSize)) {
+ bestStart = i;
+ bestSize = sumSize;
+ }
+ }
+
+ spec.add(new OneMerge(infos.range(bestStart, bestStart+finalMergeSize), useCompoundFile));
+ }
+ }
+
} else
spec = null;
} else
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java?rev=599766&r1=599765&r2=599766&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java Fri Nov 30 02:09:45 2007
@@ -70,6 +70,7 @@
boolean registerDone; // used by IndexWriter
long mergeGen; // used by IndexWriter
boolean isExternal; // used by IndexWriter
+ int maxNumSegmentsOptimize; // used by IndexWriter
final SegmentInfos segments;
final boolean useCompoundFile;
Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=599766&r1=599765&r2=599766&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Fri Nov 30 02:09:45 2007
@@ -557,6 +557,85 @@
dir.close();
}
+ public void testOptimizeMaxNumSegments() throws IOException {
+
+ MockRAMDirectory dir = new MockRAMDirectory();
+
+ final Document doc = new Document();
+ doc.add(new Field("content", "aaa", Field.Store.YES, Field.Index.TOKENIZED));
+
+ for(int numDocs=38;numDocs<500;numDocs += 38) {
+ IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
+ LogDocMergePolicy ldmp = new LogDocMergePolicy();
+ ldmp.setMinMergeDocs(1);
+ writer.setMergePolicy(ldmp);
+ writer.setMergeFactor(5);
+ writer.setMaxBufferedDocs(2);
+ for(int j=0;j<numDocs;j++)
+ writer.addDocument(doc);
+ writer.close();
+
+ SegmentInfos sis = new SegmentInfos();
+ sis.read(dir);
+ final int segCount = sis.size();
+
+ writer = new IndexWriter(dir, new WhitespaceAnalyzer());
+ writer.setMergePolicy(ldmp);
+ writer.setMergeFactor(5);
+ writer.optimize(3);
+ writer.close();
+
+ sis = new SegmentInfos();
+ sis.read(dir);
+ final int optSegCount = sis.size();
+
+ if (segCount < 3)
+ assertEquals(segCount, optSegCount);
+ else
+ assertEquals(3, optSegCount);
+ }
+ }
+
+ public void testOptimizeMaxNumSegments2() throws IOException {
+ MockRAMDirectory dir = new MockRAMDirectory();
+
+ final Document doc = new Document();
+ doc.add(new Field("content", "aaa", Field.Store.YES, Field.Index.TOKENIZED));
+
+ IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
+ LogDocMergePolicy ldmp = new LogDocMergePolicy();
+ ldmp.setMinMergeDocs(1);
+ writer.setMergePolicy(ldmp);
+ writer.setMergeFactor(4);
+ writer.setMaxBufferedDocs(2);
+
+ for(int iter=0;iter<10;iter++) {
+
+ for(int i=0;i<19;i++)
+ writer.addDocument(doc);
+
+ writer.flush();
+
+ SegmentInfos sis = new SegmentInfos();
+ ((ConcurrentMergeScheduler) writer.getMergeScheduler()).sync();
+ sis.read(dir);
+
+ final int segCount = sis.size();
+
+ writer.optimize(7);
+
+ sis = new SegmentInfos();
+ ((ConcurrentMergeScheduler) writer.getMergeScheduler()).sync();
+ sis.read(dir);
+ final int optSegCount = sis.size();
+
+ if (segCount < 7)
+ assertEquals(segCount, optSegCount);
+ else
+ assertEquals(7, optSegCount);
+ }
+ }
+
/**
* Make sure optimize doesn't use any more than 1X
* starting index size as its temporary free space