Posted to java-commits@lucene.apache.org by mi...@apache.org on 2007/11/30 11:09:47 UTC

svn commit: r599766 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/index/ src/test/org/apache/lucene/index/

Author: mikemccand
Date: Fri Nov 30 02:09:45 2007
New Revision: 599766

URL: http://svn.apache.org/viewvc?rev=599766&view=rev
Log:
LUCENE-982: add new method optimize(int maxNumSegments) to IndexWriter

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java
    lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java

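For reference, a minimal usage sketch of the API this commit adds. The class name, analyzer, document contents, and segment count below are illustrative only; the optimize(int) call itself is what the commit introduces.

    import java.io.IOException;

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class PartialOptimizeExample {
      public static void main(String[] args) throws IOException {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), true);

        Document doc = new Document();
        doc.add(new Field("content", "aaa", Field.Store.YES, Field.Index.TOKENIZED));
        for (int i = 0; i < 200; i++)
          writer.addDocument(doc);

        // New in this commit: merge the index down to at most 3 segments
        // instead of all the way down to 1; optimize(1) is equivalent to
        // the existing optimize().
        writer.optimize(3);
        writer.close();
      }
    }
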
Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=599766&r1=599765&r2=599766&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Fri Nov 30 02:09:45 2007
@@ -72,6 +72,11 @@
     setData(byte[] data, int offset, int length), getData(), getOffset()
     and clone() methods to o.a.l.index.Payload. Also add the field name 
     as arg to Similarity.scorePayload(). (Michael Busch)
+
+ 9. LUCENE-982: Add IndexWriter.optimize(int maxNumSegments) method to
+    "partially optimize" an index down to maxNumSegments segments.
+    (Mike McCandless)
+
     
 Bug fixes
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?rev=599766&r1=599765&r2=599766&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Fri Nov 30 02:09:45 2007
@@ -1654,14 +1654,37 @@
     optimize(true);
   }
 
+  /**
+   * Optimize the index down to <= maxNumSegments segments.  If
+   * maxNumSegments==1 then this is the same as {@link
+   * #optimize()}.
+   * @param maxNumSegments maximum number of segments left
+   * in the index after optimization finishes
+   */
+  public void optimize(int maxNumSegments) throws CorruptIndexException, IOException {
+    optimize(maxNumSegments, true);
+  }
+
   /** Just like {@link #optimize()}, except you can specify
    *  whether the call should block until the optimize
    *  completes.  This is only meaningful with a
    *  {@link MergeScheduler} that is able to run merges in
    *  background threads. */
   public void optimize(boolean doWait) throws CorruptIndexException, IOException {
+    optimize(1, doWait);
+  }
+
+  /** Just like {@link #optimize(int)}, except you can
+   *  specify whether the call should block until the
+   *  optimize completes.  This is only meaningful with a
+   *  {@link MergeScheduler} that is able to run merges in
+   *  background threads. */
+  public void optimize(int maxNumSegments, boolean doWait) throws CorruptIndexException, IOException {
     ensureOpen();
 
+    if (maxNumSegments < 1)
+      throw new IllegalArgumentException("maxNumSegments must be >= 1; got " + maxNumSegments);
+
     if (infoStream != null)
       message("optimize: index now " + segString());
 
@@ -1677,15 +1700,21 @@
       // Now mark all pending & running merges as optimize
       // merge:
       Iterator it = pendingMerges.iterator();
-      while(it.hasNext())
-        ((MergePolicy.OneMerge) it.next()).optimize = true;
+      while(it.hasNext()) {
+        final MergePolicy.OneMerge merge = (MergePolicy.OneMerge) it.next();
+        merge.optimize = true;
+        merge.maxNumSegmentsOptimize = maxNumSegments;
+      }
 
       it = runningMerges.iterator();
-      while(it.hasNext())
-        ((MergePolicy.OneMerge) it.next()).optimize = true;
+      while(it.hasNext()) {
+        final MergePolicy.OneMerge merge = (MergePolicy.OneMerge) it.next();
+        merge.optimize = true;
+        merge.maxNumSegmentsOptimize = maxNumSegments;
+      }
     }
 
-    maybeMerge(true);
+    maybeMerge(maxNumSegments, true);
 
     if (doWait) {
       synchronized(this) {
@@ -1748,25 +1777,29 @@
   }
 
   private final void maybeMerge(boolean optimize) throws CorruptIndexException, IOException {
-    updatePendingMerges(optimize);
+    maybeMerge(1, optimize);
+  }
+
+  private final void maybeMerge(int maxNumSegmentsOptimize, boolean optimize) throws CorruptIndexException, IOException {
+    updatePendingMerges(maxNumSegmentsOptimize, optimize);
     mergeScheduler.merge(this);
   }
 
-  private synchronized void updatePendingMerges(boolean optimize)
+  private synchronized void updatePendingMerges(int maxNumSegmentsOptimize, boolean optimize)
     throws CorruptIndexException, IOException {
+    assert !optimize || maxNumSegmentsOptimize > 0;
 
     final MergePolicy.MergeSpecification spec;
     if (optimize) {
-      // Currently hardwired to 1, but once we add method to
-      // IndexWriter to allow "optimizing to <= N segments"
-      // then we will change this.
-      final int maxSegmentCount = 1;
-      spec = mergePolicy.findMergesForOptimize(segmentInfos, this, maxSegmentCount, segmentsToOptimize);
+      spec = mergePolicy.findMergesForOptimize(segmentInfos, this, maxNumSegmentsOptimize, segmentsToOptimize);
 
       if (spec != null) {
         final int numMerges = spec.merges.size();
-        for(int i=0;i<numMerges;i++)
-          ((MergePolicy.OneMerge) spec.merges.get(i)).optimize = true;
+        for(int i=0;i<numMerges;i++) {
+          final MergePolicy.OneMerge merge = ((MergePolicy.OneMerge) spec.merges.get(i));
+          merge.optimize = true;
+          merge.maxNumSegmentsOptimize = maxNumSegmentsOptimize;
+        }
       }
 
     } else
@@ -2737,6 +2770,7 @@
     throws CorruptIndexException, IOException {
 
     assert merge.registerDone;
+    assert !merge.optimize || merge.maxNumSegmentsOptimize > 0;
 
     boolean success = false;
 
@@ -2753,23 +2787,24 @@
       success = true;
     } finally {
       synchronized(this) {
-        if (!success && infoStream != null)
-          message("hit exception during merge");
+        try {
+          if (!success && infoStream != null)
+            message("hit exception during merge");
 
-        mergeFinish(merge);
+          mergeFinish(merge);
 
-        // This merge (and, generally, any change to the
-        // segments) may now enable new merges, so we call
-        // merge policy & update pending merges.
-        if (success && !merge.isAborted() && !closed && !closing)
-          updatePendingMerges(merge.optimize);
-
-        runningMerges.remove(merge);
-
-        // Optimize may be waiting on the final optimize
-        // merge to finish; and finishMerges() may be
-        // waiting for all merges to finish:
-        notifyAll();
+          // This merge (and, generally, any change to the
+          // segments) may now enable new merges, so we call
+          // merge policy & update pending merges.
+          if (success && !merge.isAborted() && !closed && !closing)
+            updatePendingMerges(merge.maxNumSegmentsOptimize, merge.optimize);
+        } finally {
+          runningMerges.remove(merge);
+          // Optimize may be waiting on the final optimize
+          // merge to finish; and finishMerges() may be
+          // waiting for all merges to finish:
+          notifyAll();
+        }
       }
     }
   }
@@ -2992,8 +3027,7 @@
         SegmentInfo si = sourceSegmentsClone.info(i);
         IndexReader reader = SegmentReader.get(si, MERGE_READ_BUFFER_SIZE, merge.mergeDocStores); // no need to set deleter (yet)
         merger.add(reader);
-        if (infoStream != null)
-          totDocCount += reader.numDocs();
+        totDocCount += reader.numDocs();
       }
       if (infoStream != null) {
         message("merge: total "+totDocCount+" docs");
@@ -3001,8 +3035,7 @@
 
       mergedDocCount = merge.info.docCount = merger.merge(merge.mergeDocStores);
 
-      if (infoStream != null)
-        assert mergedDocCount == totDocCount;
+      assert mergedDocCount == totDocCount;
 
       success = true;
 

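The doWait flag added above only matters when merges run in background threads. A brief sketch of the non-blocking form, assuming a ConcurrentMergeScheduler is installed (the IndexWriter and scheduler APIs here appear in this commit's diffs and tests; the helper class and segment count are hypothetical):

    import java.io.IOException;

    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.index.ConcurrentMergeScheduler;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.Directory;

    public class NonBlockingOptimizeExample {
      // Bring an existing index down to at most 7 segments without
      // blocking the calling thread while the merges run.
      static void optimizeInBackground(Directory dir) throws IOException {
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer());
        // Run merges in background threads so doWait=false is meaningful:
        writer.setMergeScheduler(new ConcurrentMergeScheduler());
        writer.optimize(7, false); // requests the merges; returns without
                                   // waiting for them to finish
        // ... the caller may keep working here while merges proceed ...
        writer.close(); // per the finishMerges() note in the diff above,
                        // close() waits for running merges, so the index
                        // should end up with <= 7 segments
      }
    }
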
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java?rev=599766&r1=599765&r2=599766&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/LogMergePolicy.java Fri Nov 30 02:09:45 2007
@@ -169,35 +169,74 @@
    *  in use may make use of concurrency. */
   public MergeSpecification findMergesForOptimize(SegmentInfos infos, IndexWriter writer, int maxNumSegments, Set segmentsToOptimize) throws IOException {
     MergeSpecification spec;
-    
+
+    assert maxNumSegments > 0;
+
     if (!isOptimized(infos, writer, maxNumSegments, segmentsToOptimize)) {
 
-      int numSegments = infos.size();
-      while(numSegments > 0) {
-        final SegmentInfo info = infos.info(--numSegments);
+      // Find the newest (rightmost) segment that needs to
+      // be optimized (other segments may have been flushed
+      // since optimize started):
+      int last = infos.size();
+      while(last > 0) {
+        final SegmentInfo info = infos.info(--last);
         if (segmentsToOptimize.contains(info)) {
-          numSegments++;
+          last++;
           break;
         }
       }
 
-      if (numSegments > 0) {
+      if (last > 0) {
 
         spec = new MergeSpecification();
-        while (numSegments > 0) {
-        
-          final int first;
-          if (numSegments > mergeFactor)
-            first = numSegments-mergeFactor;
-          else
-            first = 0;
-
-          if (numSegments > 1 || !isOptimized(writer, infos.info(0)))
-            spec.add(new OneMerge(infos.range(first, numSegments), useCompoundFile));
 
-          numSegments -= mergeFactor;
+        // First, enroll all "full" merges (size
+        // mergeFactor) to potentially be run concurrently:
+        while (last - maxNumSegments + 1 >= mergeFactor) {
+          spec.add(new OneMerge(infos.range(last-mergeFactor, last), useCompoundFile));
+          last -= mergeFactor;
         }
 
+        // Only if there are no full merges pending do we
+        // add a final partial (< mergeFactor segments) merge:
+        if (0 == spec.merges.size()) {
+          if (maxNumSegments == 1) {
+
+            // Since we must optimize down to 1 segment, the
+            // choice is simple:
+            if (last > 1 || !isOptimized(writer, infos.info(0)))
+              spec.add(new OneMerge(infos.range(0, last), useCompoundFile));
+          } else if (last > maxNumSegments) {
+
+            // Take care to pick a partial merge that is
+            // least cost, but does not make the index too
+            // lopsided.  If we always just picked the
+            // partial tail then we could produce a highly
+            // lopsided index over time:
+
+            // We must merge this many segments to leave
+            // maxNumSegments in the index (from when
+            // optimize was first kicked off):
+            final int finalMergeSize = last - maxNumSegments + 1;
+
+            // Consider all possible starting points:
+            long bestSize = 0;
+            int bestStart = 0;
+
+            for(int i=0;i<last-finalMergeSize+1;i++) {
+              long sumSize = 0;
+              for(int j=0;j<finalMergeSize;j++)
+                sumSize += size(infos.info(j+i));
+              if (i == 0 || (sumSize < 2*size(infos.info(i-1)) && sumSize < bestSize)) {
+                bestStart = i;
+                bestSize = sumSize;
+              }
+            }
+
+            spec.add(new OneMerge(infos.range(bestStart, bestStart+finalMergeSize), useCompoundFile));
+          }
+        }
+        
       } else
         spec = null;
     } else

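To make the window-selection loop above concrete, here is the same logic restated over a plain array of segment sizes (a standalone sketch; the class, method name, and sample numbers are illustrative, not part of the commit):

    class OptimizeWindowSketch {
      // Pick the start of the contiguous window of finalMergeSize segments
      // whose total size is smallest, skipping any window whose total is at
      // least 2x its immediate left neighbor -- that guard is what keeps
      // repeated partial optimizes from producing a lopsided index.
      static int pickBestStart(long[] size, int last, int maxNumSegments) {
        final int finalMergeSize = last - maxNumSegments + 1;
        long bestSize = 0;
        int bestStart = 0;
        for (int i = 0; i < last - finalMergeSize + 1; i++) {
          long sumSize = 0;
          for (int j = 0; j < finalMergeSize; j++)
            sumSize += size[j + i];
          if (i == 0 || (sumSize < 2 * size[i - 1] && sumSize < bestSize)) {
            bestStart = i;
            bestSize = sumSize;
          }
        }
        return bestStart;
      }
    }

For instance, with sizes {100, 80, 10, 10, 10} and maxNumSegments=4 (so finalMergeSize=2), the candidate windows sum to 180, 90, 20, and 20; the last window is rejected because 20 is not less than 2*10, so the merge runs over the two segments starting at index 2.
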
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java?rev=599766&r1=599765&r2=599766&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/MergePolicy.java Fri Nov 30 02:09:45 2007
@@ -70,6 +70,7 @@
     boolean registerDone;           // used by IndexWriter
     long mergeGen;                  // used by IndexWriter
     boolean isExternal;             // used by IndexWriter
+    int maxNumSegmentsOptimize;     // used by IndexWriter
 
     final SegmentInfos segments;
     final boolean useCompoundFile;

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=599766&r1=599765&r2=599766&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Fri Nov 30 02:09:45 2007
@@ -557,6 +557,85 @@
       dir.close();
     }
 
+    public void testOptimizeMaxNumSegments() throws IOException {
+
+      MockRAMDirectory dir = new MockRAMDirectory();
+
+      final Document doc = new Document();
+      doc.add(new Field("content", "aaa", Field.Store.YES, Field.Index.TOKENIZED));
+
+      for(int numDocs=38;numDocs<500;numDocs += 38) {
+        IndexWriter writer  = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
+        LogDocMergePolicy ldmp = new LogDocMergePolicy();
+        ldmp.setMinMergeDocs(1);
+        writer.setMergePolicy(ldmp);
+        writer.setMergeFactor(5);
+        writer.setMaxBufferedDocs(2);
+        for(int j=0;j<numDocs;j++)
+          writer.addDocument(doc);
+        writer.close();
+
+        SegmentInfos sis = new SegmentInfos();
+        sis.read(dir);
+        final int segCount = sis.size();
+
+        writer  = new IndexWriter(dir, new WhitespaceAnalyzer());
+        writer.setMergePolicy(ldmp);
+        writer.setMergeFactor(5);
+        writer.optimize(3);
+        writer.close();
+
+        sis = new SegmentInfos();
+        sis.read(dir);
+        final int optSegCount = sis.size();
+
+        if (segCount < 3)
+          assertEquals(segCount, optSegCount);
+        else
+          assertEquals(3, optSegCount);
+      }
+    }
+
+    public void testOptimizeMaxNumSegments2() throws IOException {
+      MockRAMDirectory dir = new MockRAMDirectory();
+
+      final Document doc = new Document();
+      doc.add(new Field("content", "aaa", Field.Store.YES, Field.Index.TOKENIZED));
+
+      IndexWriter writer  = new IndexWriter(dir, new WhitespaceAnalyzer(), true);
+      LogDocMergePolicy ldmp = new LogDocMergePolicy();
+      ldmp.setMinMergeDocs(1);
+      writer.setMergePolicy(ldmp);
+      writer.setMergeFactor(4);
+      writer.setMaxBufferedDocs(2);
+
+      for(int iter=0;iter<10;iter++) {
+
+        for(int i=0;i<19;i++)
+          writer.addDocument(doc);
+
+        writer.flush();
+
+        SegmentInfos sis = new SegmentInfos();
+        ((ConcurrentMergeScheduler) writer.getMergeScheduler()).sync();
+        sis.read(dir);
+
+        final int segCount = sis.size();
+
+        writer.optimize(7);
+
+        sis = new SegmentInfos();
+        ((ConcurrentMergeScheduler) writer.getMergeScheduler()).sync();
+        sis.read(dir);
+        final int optSegCount = sis.size();
+
+        if (segCount < 7)
+          assertEquals(segCount, optSegCount);
+        else
+          assertEquals(7, optSegCount);
+      }
+    }
+
     /**
      * Make sure optimize doesn't use any more than 1X
      * starting index size as its temporary free space