Posted to commits@lucenenet.apache.org by ar...@apache.org on 2008/06/25 04:52:24 UTC
svn commit: r671404 [6/10] -
/incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/KeepOnlyLastCommitDeletionPolicy.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/KeepOnlyLastCommitDeletionPolicy.cs?rev=671404&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/KeepOnlyLastCommitDeletionPolicy.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/KeepOnlyLastCommitDeletionPolicy.cs Tue Jun 24 19:52:22 2008
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Index
+{
+
+	/// <summary> This {@link IndexDeletionPolicy} implementation
+ /// keeps only the most recent commit and immediately removes
+ /// all prior commits after a new commit is done. This is
+ /// the default deletion policy.
+ /// </summary>
+
+ public sealed class KeepOnlyLastCommitDeletionPolicy : IndexDeletionPolicy
+ {
+
+ /// <summary> Deletes all commits except the most recent one.</summary>
+ public void OnInit(System.Collections.IList commits)
+ {
+ // Note that commits.size() should normally be 1:
+ OnCommit(commits);
+ }
+
+ /// <summary> Deletes all commits except the most recent one.</summary>
+ public void OnCommit(System.Collections.IList commits)
+ {
+ // Note that commits.size() should normally be 2 (if not
+ // called by onInit above):
+ int size = commits.Count;
+ for (int i = 0; i < size - 1; i++)
+ {
+ ((IndexCommitPoint) commits[i]).Delete();
+ }
+ }
+ }
+}
\ No newline at end of file
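For illustration only (not part of this commit), a minimal sketch of how a deletion policy is installed on a writer. The FSDirectory factory and the IndexWriter overload shown are assumptions based on the corresponding Java Lucene 2.3 API; the port may differ.

using Lucene.Net.Analysis.Standard;
using Lucene.Net.Index;
using Lucene.Net.Store;

class DeletionPolicyExample
{
    static void Main()
    {
        Directory dir = FSDirectory.GetDirectory("index");

        // KeepOnlyLastCommitDeletionPolicy is already the default, so passing
        // it explicitly is equivalent to omitting the deletionPolicy argument.
        IndexDeletionPolicy policy = new KeepOnlyLastCommitDeletionPolicy();

        // Assumed overload: IndexWriter(Directory, bool autoCommit, Analyzer,
        // bool create, IndexDeletionPolicy), mirroring Java Lucene 2.3.
        IndexWriter writer = new IndexWriter(dir, true, new StandardAnalyzer(), true, policy);
        writer.Close();
    }
}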
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/LogByteSizeMergePolicy.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/LogByteSizeMergePolicy.cs?rev=671404&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/LogByteSizeMergePolicy.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/LogByteSizeMergePolicy.cs Tue Jun 24 19:52:22 2008
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Index
+{
+
+ /// <summary>This is a {@link LogMergePolicy} that measures size of a
+ /// segment as the total byte size of the segment's files.
+ /// </summary>
+ public class LogByteSizeMergePolicy : LogMergePolicy
+ {
+
+ /// <seealso cref="setMinMergeMB">
+ /// </seealso>
+ public const double DEFAULT_MIN_MERGE_MB = 1.6;
+
+		/// <summary>Default maximum segment size. A segment of this size or larger will never be merged.</summary>
+ /// <seealso cref="setMaxMergeMB">
+ /// </seealso>
+ public static readonly double DEFAULT_MAX_MERGE_MB = (double) System.Int64.MaxValue;
+
+ public LogByteSizeMergePolicy():base()
+ {
+ minMergeSize = (long) (DEFAULT_MIN_MERGE_MB * 1024 * 1024);
+ maxMergeSize = (long) (DEFAULT_MAX_MERGE_MB * 1024 * 1024);
+ }
+ protected internal override long Size(SegmentInfo info)
+ {
+ return info.SizeInBytes();
+ }
+
+ /// <summary><p>Determines the largest segment (measured by total
+ /// byte size of the segment's files, in MB) that may be
+ /// merged with other segments. Small values (e.g., less
+ /// than 50 MB) are best for interactive indexing, as this
+ /// limits the length of pauses while indexing to a few
+ /// seconds. Larger values are best for batched indexing
+ /// and speedier searches.</p>
+ ///
+ /// <p>Note that {@link #setMaxMergeDocs} is also
+ /// used to check whether a segment is too large for
+		/// merging (a segment exceeding either limit is not merged).</p>
+ /// </summary>
+ public virtual void SetMaxMergeMB(double mb)
+ {
+ maxMergeSize = (long) (mb * 1024 * 1024);
+ }
+
+		/// <summary>Returns the largest segment (measured by total byte
+ /// size of the segment's files, in MB) that may be merged
+ /// with other segments.
+ /// </summary>
+ /// <seealso cref="setMaxMergeMB">
+ /// </seealso>
+ public virtual double GetMaxMergeMB()
+ {
+ return ((double) maxMergeSize) / 1024 / 1024;
+ }
+
+ /// <summary>Sets the minimum size for the lowest level segments.
+ /// Any segments below this size are considered to be on
+ /// the same level (even if they vary drastically in size)
+ /// and will be merged whenever there are mergeFactor of
+ /// them. This effectively truncates the "long tail" of
+ /// small segments that would otherwise be created into a
+ /// single level. If you set this too large, it could
+ /// greatly increase the merging cost during indexing (if
+ /// you flush many small segments).
+ /// </summary>
+ public virtual void SetMinMergeMB(double mb)
+ {
+ minMergeSize = (long) (mb * 1024 * 1024);
+ }
+
+ /// <summary>Get the minimum size for a segment to remain
+ /// un-merged.
+ /// </summary>
+ /// <seealso cref="setMinMergeMB *">
+ /// </seealso>
+ public virtual double GetMinMergeMB()
+ {
+ return ((double) minMergeSize) / 1024 / 1024;
+ }
+ }
+}
\ No newline at end of file
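For illustration only (not part of this commit), a small configuration sketch for byte-size-based merging. The IndexWriter.SetMergePolicy setter is assumed to be ported as in Java Lucene 2.3; the threshold values are arbitrary examples.

using Lucene.Net.Index;

static class MergePolicyConfig
{
    // Favor fewer, larger merges for batch indexing by raising the
    // byte-size thresholds and the merge factor.
    public static void Configure(IndexWriter writer)
    {
        LogByteSizeMergePolicy policy = new LogByteSizeMergePolicy();
        policy.SetMinMergeMB(10.0);   // segments under 10 MB share the lowest level
        policy.SetMaxMergeMB(500.0);  // segments over 500 MB are never merged
        policy.SetMergeFactor(20);    // merge 20 segments at a time
        writer.SetMergePolicy(policy);   // assumed setter, as in Java Lucene 2.3
    }
}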
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/LogDocMergePolicy.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/LogDocMergePolicy.cs?rev=671404&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/LogDocMergePolicy.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/LogDocMergePolicy.cs Tue Jun 24 19:52:22 2008
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Index
+{
+
+ /// <summary>This is a {@link LogMergePolicy} that measures size of a
+ /// segment as the number of documents (not taking deletions
+ /// into account).
+ /// </summary>
+
+ public class LogDocMergePolicy : LogMergePolicy
+ {
+
+ /// <seealso cref="setMinMergeDocs">
+ /// </seealso>
+ public const int DEFAULT_MIN_MERGE_DOCS = 1000;
+
+ public LogDocMergePolicy():base()
+ {
+ minMergeSize = DEFAULT_MIN_MERGE_DOCS;
+
+ // maxMergeSize is never used by LogDocMergePolicy; set
+ // it to Long.MAX_VALUE to disable it
+ maxMergeSize = System.Int64.MaxValue;
+ }
+ protected internal override long Size(SegmentInfo info)
+ {
+ return info.docCount;
+ }
+
+ /// <summary>Sets the minimum size for the lowest level segments.
+ /// Any segments below this size are considered to be on
+ /// the same level (even if they vary drastically in size)
+ /// and will be merged whenever there are mergeFactor of
+ /// them. This effectively truncates the "long tail" of
+ /// small segments that would otherwise be created into a
+ /// single level. If you set this too large, it could
+ /// greatly increase the merging cost during indexing (if
+ /// you flush many small segments).
+ /// </summary>
+ public virtual void SetMinMergeDocs(int minMergeDocs)
+ {
+ minMergeSize = minMergeDocs;
+ }
+
+ /// <summary>Get the minimum size for a segment to remain
+ /// un-merged.
+ /// </summary>
+ /// <seealso cref="setMinMergeDocs *">
+ /// </seealso>
+ public virtual int GetMinMergeDocs()
+ {
+ return (int) minMergeSize;
+ }
+ }
+}
\ No newline at end of file
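For illustration only (not part of this commit), the doc-count counterpart of the previous sketch; IndexWriter.SetMergePolicy is again assumed to be ported as in Java Lucene 2.3.

using Lucene.Net.Index;

static class DocCountMergeConfig
{
    // Size segments by document count instead of bytes. With these settings,
    // any segment under 1000 docs sits on the lowest level, and ten such
    // segments are merged into one of roughly 10,000 docs.
    public static void Configure(IndexWriter writer)
    {
        LogDocMergePolicy policy = new LogDocMergePolicy();
        policy.SetMinMergeDocs(1000);
        policy.SetMergeFactor(10);
        writer.SetMergePolicy(policy);   // assumed setter, as in Java Lucene 2.3
    }
}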
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/LogMergePolicy.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/LogMergePolicy.cs?rev=671404&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/LogMergePolicy.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/LogMergePolicy.cs Tue Jun 24 19:52:22 2008
@@ -0,0 +1,442 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using Directory = Lucene.Net.Store.Directory;
+
+namespace Lucene.Net.Index
+{
+
+ /// <summary><p>This class implements a {@link MergePolicy} that tries
+ /// to merge segments into levels of exponentially
+ /// increasing size, where each level has < mergeFactor
+	/// segments in it.  Whenever a given level has mergeFactor
+ /// segments or more in it, they will be merged.</p>
+ ///
+ /// <p>This class is abstract and requires a subclass to
+ /// define the {@link #size} method which specifies how a
+ /// segment's size is determined. {@link LogDocMergePolicy}
+ /// is one subclass that measures size by document count in
+ /// the segment. {@link LogByteSizeMergePolicy} is another
+ /// subclass that measures size as the total byte size of the
+ /// file(s) for the segment.</p>
+ /// </summary>
+
+ public abstract class LogMergePolicy : MergePolicy
+ {
+
+ /// <summary>Defines the allowed range of log(size) for each
+ /// level. A level is computed by taking the max segment
+		/// log size, minus LEVEL_LOG_SPAN, and finding all
+ /// segments falling within that range.
+ /// </summary>
+ public const double LEVEL_LOG_SPAN = 0.75;
+
+ /// <summary>Default merge factor, which is how many segments are
+ /// merged at a time
+ /// </summary>
+ public const int DEFAULT_MERGE_FACTOR = 10;
+
+		/// <summary>Default maximum segment size. A segment of this size or larger will never be merged.</summary>
+ /// <seealso cref="setMaxMergeDocs">
+ /// </seealso>
+ public static readonly int DEFAULT_MAX_MERGE_DOCS = System.Int32.MaxValue;
+
+ private int mergeFactor = DEFAULT_MERGE_FACTOR;
+
+ internal long minMergeSize;
+ internal long maxMergeSize;
+ internal int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
+
+ private bool useCompoundFile = true;
+ private bool useCompoundDocStore = true;
+ private IndexWriter writer;
+
+ private void Message(System.String message)
+ {
+ if (writer != null)
+ writer.Message("LMP: " + message);
+ }
+
+ /// <summary><p>Returns the number of segments that are merged at
+		/// once; this value also controls the total number of segments
+ /// allowed to accumulate in the index.</p>
+ /// </summary>
+ public virtual int GetMergeFactor()
+ {
+ return mergeFactor;
+ }
+
+ /// <summary>Determines how often segment indices are merged by
+ /// addDocument(). With smaller values, less RAM is used
+ /// while indexing, and searches on unoptimized indices are
+ /// faster, but indexing speed is slower. With larger
+ /// values, more RAM is used during indexing, and while
+ /// searches on unoptimized indices are slower, indexing is
+ /// faster. Thus larger values (> 10) are best for batch
+ /// index creation, and smaller values (< 10) for indices
+ /// that are interactively maintained.
+ /// </summary>
+ public virtual void SetMergeFactor(int mergeFactor)
+ {
+ if (mergeFactor < 2)
+ throw new System.ArgumentException("mergeFactor cannot be less than 2");
+ this.mergeFactor = mergeFactor;
+ }
+
+ // Javadoc inherited
+ public override bool UseCompoundFile(SegmentInfos infos, SegmentInfo info)
+ {
+ return useCompoundFile;
+ }
+
+ /// <summary>Sets whether compound file format should be used for
+ /// newly flushed and newly merged segments.
+ /// </summary>
+ public virtual void SetUseCompoundFile(bool useCompoundFile)
+ {
+ this.useCompoundFile = useCompoundFile;
+ }
+
+		/// <summary>Returns true if newly flushed and newly merged segments
+		/// should use the compound file format.
+		/// </summary>
+		/// <seealso cref="setUseCompoundFile">
+		/// </seealso>
+ public virtual bool GetUseCompoundFile()
+ {
+ return useCompoundFile;
+ }
+
+ // Javadoc inherited
+ public override bool UseCompoundDocStore(SegmentInfos infos)
+ {
+ return useCompoundDocStore;
+ }
+
+ /// <summary>Sets whether compound file format should be used for
+ /// newly flushed and newly merged doc store
+ /// segment files (term vectors and stored fields).
+ /// </summary>
+ public virtual void SetUseCompoundDocStore(bool useCompoundDocStore)
+ {
+ this.useCompoundDocStore = useCompoundDocStore;
+ }
+
+		/// <summary>Returns true if newly flushed and newly merged doc
+		/// store segment files (term vectors and stored fields)
+		/// should use the compound file format.
+		/// </summary>
+		/// <seealso cref="setUseCompoundDocStore">
+		/// </seealso>
+ public virtual bool GetUseCompoundDocStore()
+ {
+ return useCompoundDocStore;
+ }
+
+ public override void Close()
+ {
+ }
+
+ abstract protected internal long Size(SegmentInfo info);
+
+ private bool IsOptimized(SegmentInfos infos, IndexWriter writer, int maxNumSegments, System.Collections.Hashtable segmentsToOptimize)
+ {
+ int numSegments = infos.Count;
+ int numToOptimize = 0;
+ SegmentInfo optimizeInfo = null;
+ for (int i = 0; i < numSegments && numToOptimize <= maxNumSegments; i++)
+ {
+ SegmentInfo info = infos.Info(i);
+ if (segmentsToOptimize.Contains(info))
+ {
+ numToOptimize++;
+ optimizeInfo = info;
+ }
+ }
+
+ return numToOptimize <= maxNumSegments && (numToOptimize != 1 || IsOptimized(writer, optimizeInfo));
+ }
+
+		/// <summary>Returns true if this single info is optimized (has no
+		/// pending norms or deletes, is in the same dir as the
+		/// writer, and matches the current compound file setting).
+ /// </summary>
+ private bool IsOptimized(IndexWriter writer, SegmentInfo info)
+ {
+ return !info.HasDeletions() && !info.HasSeparateNorms() && info.dir == writer.GetDirectory() && info.GetUseCompoundFile() == useCompoundFile;
+ }
+
+ /// <summary>Returns the merges necessary to optimize the index.
+ /// This merge policy defines "optimized" to mean only one
+ /// segment in the index, where that segment has no
+ /// deletions pending nor separate norms, and it is in
+ /// compound file format if the current useCompoundFile
+ /// setting is true. This method returns multiple merges
+ /// (mergeFactor at a time) so the {@link MergeScheduler}
+ /// in use may make use of concurrency.
+ /// </summary>
+ public override MergeSpecification FindMergesForOptimize(SegmentInfos infos, IndexWriter writer, int maxNumSegments, System.Collections.Hashtable segmentsToOptimize)
+ {
+ MergeSpecification spec;
+
+ System.Diagnostics.Debug.Assert(maxNumSegments > 0);
+
+ if (!IsOptimized(infos, writer, maxNumSegments, segmentsToOptimize))
+ {
+
+ // Find the newest (rightmost) segment that needs to
+ // be optimized (other segments may have been flushed
+ // since optimize started):
+ int last = infos.Count;
+ while (last > 0)
+ {
+ SegmentInfo info = infos.Info(--last);
+ if (segmentsToOptimize.Contains(info))
+ {
+ last++;
+ break;
+ }
+ }
+
+ if (last > 0)
+ {
+
+ spec = new MergeSpecification();
+
+ // First, enroll all "full" merges (size
+ // mergeFactor) to potentially be run concurrently:
+ while (last - maxNumSegments + 1 >= mergeFactor)
+ {
+ spec.Add(new OneMerge(infos.Range(last - mergeFactor, last), useCompoundFile));
+ last -= mergeFactor;
+ }
+
+ // Only if there are no full merges pending do we
+ // add a final partial (< mergeFactor segments) merge:
+ if (0 == spec.merges.Count)
+ {
+ if (maxNumSegments == 1)
+ {
+
+ // Since we must optimize down to 1 segment, the
+ // choice is simple:
+ if (last > 1 || !IsOptimized(writer, infos.Info(0)))
+ spec.Add(new OneMerge(infos.Range(0, last), useCompoundFile));
+ }
+ else if (last > maxNumSegments)
+ {
+
+ // Take care to pick a partial merge that is
+ // least cost, but does not make the index too
+ // lopsided. If we always just picked the
+ // partial tail then we could produce a highly
+ // lopsided index over time:
+
+ // We must merge this many segments to leave
+ // maxNumSegments in the index (from when
+ // optimize was first kicked off):
+ int finalMergeSize = last - maxNumSegments + 1;
+
+ // Consider all possible starting points:
+ long bestSize = 0;
+ int bestStart = 0;
+
+ for (int i = 0; i < last - finalMergeSize + 1; i++)
+ {
+ long sumSize = 0;
+ for (int j = 0; j < finalMergeSize; j++)
+ sumSize += Size(infos.Info(j + i));
+ if (i == 0 || (sumSize < 2 * Size(infos.Info(i - 1)) && sumSize < bestSize))
+ {
+ bestStart = i;
+ bestSize = sumSize;
+ }
+ }
+
+ spec.Add(new OneMerge(infos.Range(bestStart, bestStart + finalMergeSize), useCompoundFile));
+ }
+ }
+ }
+ else
+ spec = null;
+ }
+ else
+ spec = null;
+
+ return spec;
+ }
+
+ /// <summary>Checks if any merges are now necessary and returns a
+ /// {@link MergePolicy.MergeSpecification} if so. A merge
+ /// is necessary when there are more than {@link
+ /// #setMergeFactor} segments at a given level. When
+ /// multiple levels have too many segments, this method
+ /// will return multiple merges, allowing the {@link
+ /// MergeScheduler} to use concurrency.
+ /// </summary>
+ public override MergeSpecification FindMerges(SegmentInfos infos, IndexWriter writer)
+ {
+
+ int numSegments = infos.Count;
+ this.writer = writer;
+ Message("findMerges: " + numSegments + " segments");
+
+ // Compute levels, which is just log (base mergeFactor)
+ // of the size of each segment
+ float[] levels = new float[numSegments];
+ float norm = (float) System.Math.Log(mergeFactor);
+
+ Directory directory = writer.GetDirectory();
+
+ for (int i = 0; i < numSegments; i++)
+ {
+ SegmentInfo info = infos.Info(i);
+ long size = Size(info);
+
+ // Refuse to import a segment that's too large
+ if (info.docCount > maxMergeDocs && info.dir != directory)
+ throw new System.ArgumentException("Segment is too large (" + info.docCount + " docs vs max docs " + maxMergeDocs + ")");
+
+ if (size >= maxMergeSize && info.dir != directory)
+ throw new System.ArgumentException("Segment is too large (" + size + " vs max size " + maxMergeSize + ")");
+
+ // Floor tiny segments
+ if (size < 1)
+ size = 1;
+ levels[i] = (float) System.Math.Log(size) / norm;
+ }
+
+ float levelFloor;
+ if (minMergeSize <= 0)
+ levelFloor = (float) 0.0;
+ else
+ {
+ levelFloor = (float) (System.Math.Log(minMergeSize) / norm);
+ }
+
+ // Now, we quantize the log values into levels. The
+ // first level is any segment whose log size is within
+			// LEVEL_LOG_SPAN of the max size, or that has such a
+			// segment "to the right".  Then, we find the max of all
+ // other segments and use that to define the next level
+ // segment, etc.
+
+ MergeSpecification spec = null;
+
+ int start = 0;
+ while (start < numSegments)
+ {
+
+ // Find max level of all segments not already
+ // quantized.
+ float maxLevel = levels[start];
+ for (int i = 1 + start; i < numSegments; i++)
+ {
+ float level = levels[i];
+ if (level > maxLevel)
+ maxLevel = level;
+ }
+
+ // Now search backwards for the rightmost segment that
+ // falls into this level:
+ float levelBottom;
+ if (maxLevel < levelFloor)
+ // All remaining segments fall into the min level
+ levelBottom = - 1.0F;
+ else
+ {
+ levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);
+
+ // Force a boundary at the level floor
+ if (levelBottom < levelFloor && maxLevel >= levelFloor)
+ levelBottom = levelFloor;
+ }
+
+ int upto = numSegments - 1;
+ while (upto >= start)
+ {
+ if (levels[upto] >= levelBottom)
+ {
+ break;
+ }
+ upto--;
+ }
+ Message(" level " + levelBottom + " to " + maxLevel + ": " + (1 + upto - start) + " segments");
+
+ // Finally, record all merges that are viable at this level:
+ int end = start + mergeFactor;
+ while (end <= 1 + upto)
+ {
+ bool anyTooLarge = false;
+ for (int i = start; i < end; i++)
+ {
+ SegmentInfo info = infos.Info(i);
+ anyTooLarge |= (Size(info) >= maxMergeSize || info.docCount >= maxMergeDocs);
+ }
+
+ if (!anyTooLarge)
+ {
+ if (spec == null)
+ spec = new MergeSpecification();
+ Message(" " + start + " to " + end + ": add this merge");
+ spec.Add(new OneMerge(infos.Range(start, end), useCompoundFile));
+ }
+ else
+ Message(" " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");
+
+ start = end;
+ end = start + mergeFactor;
+ }
+
+ start = 1 + upto;
+ }
+
+ return spec;
+ }
+
+ /// <summary><p>Determines the largest segment (measured by
+ /// document count) that may be merged with other segments.
+ /// Small values (e.g., less than 10,000) are best for
+ /// interactive indexing, as this limits the length of
+ /// pauses while indexing to a few seconds. Larger values
+ /// are best for batched indexing and speedier
+ /// searches.</p>
+ ///
+ /// <p>The default value is {@link Integer#MAX_VALUE}.</p>
+ ///
+ /// <p>The default merge policy ({@link
+ /// LogByteSizeMergePolicy}) also allows you to set this
+ /// limit by net size (in MB) of the segment, using {@link
+ /// LogByteSizeMergePolicy#setMaxMergeMB}.</p>
+ /// </summary>
+ public virtual void SetMaxMergeDocs(int maxMergeDocs)
+ {
+ this.maxMergeDocs = maxMergeDocs;
+ }
+
+ /// <summary>Returns the largest segment (measured by document
+ /// count) that may be merged with other segments.
+ /// </summary>
+ /// <seealso cref="setMaxMergeDocs">
+ /// </seealso>
+ public virtual int GetMaxMergeDocs()
+ {
+ return maxMergeDocs;
+ }
+ }
+}
\ No newline at end of file
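To make the level quantization in FindMerges concrete, here is a small standalone sketch (illustration only, not part of this commit) that reproduces the log(size)/log(mergeFactor) computation on a handful of hypothetical segment sizes.

using System;

class LevelDemo
{
    static void Main()
    {
        int mergeFactor = 10;
        double levelLogSpan = 0.75;                 // LEVEL_LOG_SPAN
        long[] sizes = { 5000000, 4800000, 600000, 550000, 90000 };

        double norm = Math.Log(mergeFactor);
        double[] levels = new double[sizes.Length];
        for (int i = 0; i < sizes.Length; i++)
            levels[i] = Math.Log(Math.Max(sizes[i], 1)) / norm;

        // Segments whose level is within LEVEL_LOG_SPAN of the maximum level
        // are quantized into the same (top) level and become merge candidates
        // together once mergeFactor of them accumulate.
        double maxLevel = levels[0];
        foreach (double l in levels)
            if (l > maxLevel) maxLevel = l;
        double levelBottom = maxLevel - levelLogSpan;

        for (int i = 0; i < sizes.Length; i++)
            Console.WriteLine("size={0,8} level={1:F2} inTopLevel={2}",
                              sizes[i], levels[i], levels[i] >= levelBottom);
    }
}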
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MergePolicy.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/MergePolicy.cs?rev=671404&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MergePolicy.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MergePolicy.cs Tue Jun 24 19:52:22 2008
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using Directory = Lucene.Net.Store.Directory;
+
+namespace Lucene.Net.Index
+{
+
+ /// <summary> <p>Expert: a MergePolicy determines the sequence of
+ /// primitive merge operations to be used for overall merge
+ /// and optimize operations.</p>
+ ///
+ /// <p>Whenever the segments in an index have been altered by
+ /// {@link IndexWriter}, either the addition of a newly
+ /// flushed segment, addition of many segments from
+ /// addIndexes* calls, or a previous merge that may now need
+ /// to cascade, {@link IndexWriter} invokes {@link
+ /// #findMerges} to give the MergePolicy a chance to pick
+ /// merges that are now required. This method returns a
+ /// {@link MergeSpecification} instance describing the set of
+ /// merges that should be done, or null if no merges are
+ /// necessary. When IndexWriter.optimize is called, it calls
+ /// {@link #findMergesForOptimize} and the MergePolicy should
+ /// then return the necessary merges.</p>
+ ///
+ /// <p>Note that the policy can return more than one merge at
+ /// a time. In this case, if the writer is using {@link
+ /// SerialMergeScheduler}, the merges will be run
+ /// sequentially but if it is using {@link
+ /// ConcurrentMergeScheduler} they will be run concurrently.</p>
+ ///
+ /// <p>The default MergePolicy is {@link
+ /// LogByteSizeMergePolicy}.</p>
+ /// <p><b>NOTE:</b> This API is new and still experimental
+ /// (subject to change suddenly in the next release)</p>
+ /// </summary>
+
+ public abstract class MergePolicy
+ {
+
+ /// <summary>OneMerge provides the information necessary to perform
+ /// an individual primitive merge operation, resulting in
+ /// a single new segment. The merge spec includes the
+ /// subset of segments to be merged as well as whether the
+ /// new segment should use the compound file format.
+ /// </summary>
+
+ public class OneMerge
+ {
+
+ internal SegmentInfo info; // used by IndexWriter
+ internal bool mergeDocStores; // used by IndexWriter
+ internal bool optimize; // used by IndexWriter
+ internal SegmentInfos segmentsClone; // used by IndexWriter
+ internal bool increfDone; // used by IndexWriter
+ internal bool registerDone; // used by IndexWriter
+ internal long mergeGen; // used by IndexWriter
+ internal bool isExternal; // used by IndexWriter
+ internal int maxNumSegmentsOptimize; // used by IndexWriter
+
+ internal SegmentInfos segments;
+ internal bool useCompoundFile;
+ internal bool aborted;
+ internal System.Exception error;
+
+ public OneMerge(SegmentInfos segments, bool useCompoundFile)
+ {
+ if (0 == segments.Count)
+ throw new System.SystemException("segments must include at least one segment");
+ this.segments = segments;
+ this.useCompoundFile = useCompoundFile;
+ }
+
+ /// <summary>Record that an exception occurred while executing
+ /// this merge
+ /// </summary>
+ internal virtual void SetException(System.Exception error)
+ {
+ lock (this)
+ {
+ this.error = error;
+ }
+ }
+
+ /// <summary>Retrieve previous exception set by {@link
+ /// #setException}.
+ /// </summary>
+ internal virtual System.Exception GetException()
+ {
+ lock (this)
+ {
+ return error;
+ }
+ }
+
+ /// <summary>Mark this merge as aborted. If this is called
+ /// before the merge is committed then the merge will
+ /// not be committed.
+ /// </summary>
+ internal virtual void Abort()
+ {
+ lock (this)
+ {
+ aborted = true;
+ }
+ }
+
+ /// <summary>Returns true if this merge was aborted. </summary>
+ internal virtual bool IsAborted()
+ {
+ lock (this)
+ {
+ return aborted;
+ }
+ }
+
+ internal virtual void CheckAborted(Directory dir)
+ {
+ lock (this)
+ {
+ if (aborted)
+ throw new MergeAbortedException("merge is aborted: " + SegString(dir));
+ }
+ }
+
+ internal virtual System.String SegString(Directory dir)
+ {
+ System.Text.StringBuilder b = new System.Text.StringBuilder();
+ int numSegments = segments.Count;
+ for (int i = 0; i < numSegments; i++)
+ {
+ if (i > 0)
+ b.Append(" ");
+ b.Append(segments.Info(i).SegString(dir));
+ }
+ if (info != null)
+ b.Append(" into ").Append(info.name);
+ if (optimize)
+ b.Append(" [optimize]");
+ return b.ToString();
+ }
+ }
+
+ /// <summary> A MergeSpecification instance provides the information
+ /// necessary to perform multiple merges. It simply
+ /// contains a list of {@link OneMerge} instances.
+ /// </summary>
+
+ public class MergeSpecification
+ {
+
+ /// <summary> The subset of segments to be included in the primitive merge.</summary>
+
+ public System.Collections.IList merges = new System.Collections.ArrayList();
+
+ public virtual void Add(OneMerge merge)
+ {
+ merges.Add(merge);
+ }
+
+ public virtual System.String SegString(Directory dir)
+ {
+ System.Text.StringBuilder b = new System.Text.StringBuilder();
+ b.Append("MergeSpec:\n");
+ int count = merges.Count;
+ for (int i = 0; i < count; i++)
+ b.Append(" ").Append(1 + i).Append(": ").Append(((OneMerge) merges[i]).SegString(dir));
+ return b.ToString();
+ }
+ }
+
+ /// <summary>Exception thrown if there are any problems while
+ /// executing a merge.
+ /// </summary>
+ [Serializable]
+ public class MergeException:System.SystemException
+ {
+ public MergeException(System.String message) : base(message)
+ {
+ }
+ public MergeException(System.Exception exc) : base(null, exc)
+ {
+ }
+ }
+
+ [Serializable]
+ public class MergeAbortedException:System.IO.IOException
+ {
+ public MergeAbortedException():base("merge is aborted")
+ {
+ }
+ public MergeAbortedException(System.String message):base(message)
+ {
+ }
+ }
+
+ /// <summary> Determine what set of merge operations are now
+ /// necessary on the index. The IndexWriter calls this
+ /// whenever there is a change to the segments. This call
+ /// is always synchronized on the IndexWriter instance so
+ /// only one thread at a time will call this method.
+ ///
+ /// </summary>
+ /// <param name="segmentInfos">the total set of segments in the index
+ /// </param>
+ /// <param name="writer">IndexWriter instance
+ /// </param>
+ public abstract MergeSpecification FindMerges(SegmentInfos segmentInfos, IndexWriter writer);
+
+ /// <summary> Determine what set of merge operations are necessary in
+ /// order to optimize the index. The IndexWriter calls
+ /// this when its optimize() method is called. This call
+ /// is always synchronized on the IndexWriter instance so
+ /// only one thread at a time will call this method.
+ ///
+ /// </summary>
+ /// <param name="segmentInfos">the total set of segments in the index
+ /// </param>
+ /// <param name="writer">IndexWriter instance
+ /// </param>
+ /// <param name="maxSegmentCount">requested maximum number of
+ /// segments in the index (currently this is always 1)
+ /// </param>
+ /// <param name="segmentsToOptimize">contains the specific
+ /// SegmentInfo instances that must be merged away. This
+ /// may be a subset of all SegmentInfos.
+ /// </param>
+ public abstract MergeSpecification FindMergesForOptimize(SegmentInfos segmentInfos, IndexWriter writer, int maxSegmentCount, System.Collections.Hashtable segmentsToOptimize);
+
+ /// <summary> Release all resources for the policy.</summary>
+ public abstract void Close();
+
+ /// <summary> Returns true if a newly flushed (not from merge)
+ /// segment should use the compound file format.
+ /// </summary>
+ public abstract bool UseCompoundFile(SegmentInfos segments, SegmentInfo newSegment);
+
+ /// <summary> Returns true if the doc store files should use the
+ /// compound file format.
+ /// </summary>
+ public abstract bool UseCompoundDocStore(SegmentInfos segments);
+ }
+}
\ No newline at end of file
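For illustration only (not part of this commit), a deliberately naive MergePolicy subclass showing the contract described above: FindMerges returns a MergeSpecification (or null) and each OneMerge names the segments to fold into a single new segment. It assumes SegmentInfos.Range is accessible outside the assembly, as LogMergePolicy uses it internally; a real policy would balance merge cost the way LogMergePolicy does.

using Lucene.Net.Index;

// Merges all segments whenever there are at least two of them.
public class MergeEverythingPolicy : MergePolicy
{
    public override MergeSpecification FindMerges(SegmentInfos segmentInfos, IndexWriter writer)
    {
        if (segmentInfos.Count < 2)
            return null;                                   // nothing to do

        MergeSpecification spec = new MergeSpecification();
        // One primitive merge covering every segment; accessibility of
        // SegmentInfos.Range outside the core assembly is an assumption.
        spec.Add(new OneMerge(segmentInfos.Range(0, segmentInfos.Count), true));
        return spec;
    }

    public override MergeSpecification FindMergesForOptimize(SegmentInfos segmentInfos,
        IndexWriter writer, int maxSegmentCount, System.Collections.Hashtable segmentsToOptimize)
    {
        return FindMerges(segmentInfos, writer);
    }

    public override bool UseCompoundFile(SegmentInfos segments, SegmentInfo newSegment) { return true; }
    public override bool UseCompoundDocStore(SegmentInfos segments) { return true; }
    public override void Close() { }
}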
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MergeScheduler.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/MergeScheduler.cs?rev=671404&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MergeScheduler.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MergeScheduler.cs Tue Jun 24 19:52:22 2008
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+namespace Lucene.Net.Index
+{
+
+ /// <summary>Expert: {@link IndexWriter} uses an instance
+ /// implementing this interface to execute the merges
+ /// selected by a {@link MergePolicy}. The default
+ /// MergeScheduler is {@link ConcurrentMergeScheduler}.
+ /// <p><b>NOTE:</b> This API is new and still experimental
+ /// (subject to change suddenly in the next release)</p>
+ /// </summary>
+
+ public abstract class MergeScheduler
+ {
+
+ /// <summary>Run the merges provided by {@link IndexWriter#GetNextMerge()}. </summary>
+ public abstract void Merge(IndexWriter writer);
+
+ /// <summary>Close this MergeScheduler. </summary>
+ public abstract void Close();
+ }
+}
\ No newline at end of file
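For illustration only (not part of this commit), switching the writer from the default background scheduler to single-threaded merging; SerialMergeScheduler and the SetMergeScheduler setter are assumed to be ported as in Java Lucene 2.3.

using Lucene.Net.Index;

static class SchedulerConfig
{
    // Run merges synchronously on the indexing thread instead of in the
    // background (useful for debugging or deterministic tests).
    public static void UseSerialMerging(IndexWriter writer)
    {
        writer.SetMergeScheduler(new SerialMergeScheduler());
    }
}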
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MultiLevelSkipListReader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/MultiLevelSkipListReader.cs?rev=671404&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MultiLevelSkipListReader.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MultiLevelSkipListReader.cs Tue Jun 24 19:52:22 2008
@@ -0,0 +1,317 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+using System;
+
+using BufferedIndexInput = Lucene.Net.Store.BufferedIndexInput;
+using IndexInput = Lucene.Net.Store.IndexInput;
+
+namespace Lucene.Net.Index
+{
+
+ /// <summary> This abstract class reads skip lists with multiple levels.
+ ///
+ /// See {@link MultiLevelSkipListWriter} for the information about the encoding
+ /// of the multi level skip lists.
+ ///
+ /// Subclasses must implement the abstract method {@link #ReadSkipData(int, IndexInput)}
+ /// which defines the actual format of the skip data.
+ /// </summary>
+ abstract class MultiLevelSkipListReader
+ {
+ // the maximum number of skip levels possible for this index
+ private int maxNumberOfSkipLevels;
+
+ // number of levels in this skip list
+ private int numberOfSkipLevels;
+
+ // Expert: defines the number of top skip levels to buffer in memory.
+ // Reducing this number results in less memory usage, but possibly
+ // slower performance due to more random I/Os.
+ // Please notice that the space each level occupies is limited by
+		// the skipInterval. The top level cannot contain more than
+		// skipInterval entries, the second highest level cannot contain
+		// more than skipInterval^2 entries, and so forth.
+ private int numberOfLevelsToBuffer = 1;
+
+ private int docCount;
+ private bool haveSkipped;
+
+ private IndexInput[] skipStream; // skipStream for each level
+ private long[] skipPointer; // the start pointer of each skip level
+ private int[] skipInterval; // skipInterval of each level
+ private int[] numSkipped; // number of docs skipped per level
+
+ private int[] skipDoc; // doc id of current skip entry per level
+ private int lastDoc; // doc id of last read skip entry with docId <= target
+ private long[] childPointer; // child pointer of current skip entry per level
+ private long lastChildPointer; // childPointer of last read skip entry with docId <= target
+
+ private bool inputIsBuffered;
+
+ public MultiLevelSkipListReader(IndexInput skipStream, int maxSkipLevels, int skipInterval)
+ {
+ this.skipStream = new IndexInput[maxSkipLevels];
+ this.skipPointer = new long[maxSkipLevels];
+ this.childPointer = new long[maxSkipLevels];
+ this.numSkipped = new int[maxSkipLevels];
+ this.maxNumberOfSkipLevels = maxSkipLevels;
+ this.skipInterval = new int[maxSkipLevels];
+ this.skipStream[0] = skipStream;
+ this.inputIsBuffered = (skipStream is BufferedIndexInput);
+ this.skipInterval[0] = skipInterval;
+ for (int i = 1; i < maxSkipLevels; i++)
+ {
+ // cache skip intervals
+ this.skipInterval[i] = this.skipInterval[i - 1] * skipInterval;
+ }
+ skipDoc = new int[maxSkipLevels];
+ }
+
+
+ /// <summary>Returns the id of the doc to which the last call of {@link #SkipTo(int)}
+ /// has skipped.
+ /// </summary>
+ internal virtual int GetDoc()
+ {
+ return lastDoc;
+ }
+
+
+ /// <summary>Skips entries to the first beyond the current whose document number is
+ /// greater than or equal to <i>target</i>. Returns the current doc count.
+ /// </summary>
+ internal virtual int SkipTo(int target)
+ {
+ if (!haveSkipped)
+ {
+ // first time, load skip levels
+ LoadSkipLevels();
+ haveSkipped = true;
+ }
+
+ // walk up the levels until highest level is found that has a skip
+ // for this target
+ int level = 0;
+ while (level < numberOfSkipLevels - 1 && target > skipDoc[level + 1])
+ {
+ level++;
+ }
+
+ while (level >= 0)
+ {
+ if (target > skipDoc[level])
+ {
+ if (!LoadNextSkip(level))
+ {
+ continue;
+ }
+ }
+ else
+ {
+ // no more skips on this level, go down one level
+ if (level > 0 && lastChildPointer > skipStream[level - 1].GetFilePointer())
+ {
+ SeekChild(level - 1);
+ }
+ level--;
+ }
+ }
+
+ return numSkipped[0] - skipInterval[0] - 1;
+ }
+
+ private bool LoadNextSkip(int level)
+ {
+ // we have to skip, the target document is greater than the current
+ // skip list entry
+ SetLastSkipData(level);
+
+ numSkipped[level] += skipInterval[level];
+
+ if (numSkipped[level] > docCount)
+ {
+ // this skip list is exhausted
+ skipDoc[level] = System.Int32.MaxValue;
+ if (numberOfSkipLevels > level)
+ numberOfSkipLevels = level;
+ return false;
+ }
+
+ // read next skip entry
+ skipDoc[level] += ReadSkipData(level, skipStream[level]);
+
+ if (level != 0)
+ {
+ // read the child pointer if we are not on the leaf level
+ childPointer[level] = skipStream[level].ReadVLong() + skipPointer[level - 1];
+ }
+
+ return true;
+ }
+
+ /// <summary>Seeks the skip entry on the given level </summary>
+ protected internal virtual void SeekChild(int level)
+ {
+ skipStream[level].Seek(lastChildPointer);
+ numSkipped[level] = numSkipped[level + 1] - skipInterval[level + 1];
+ skipDoc[level] = lastDoc;
+ if (level > 0)
+ {
+ childPointer[level] = skipStream[level].ReadVLong() + skipPointer[level - 1];
+ }
+ }
+
+ internal virtual void Close()
+ {
+ for (int i = 1; i < skipStream.Length; i++)
+ {
+ if (skipStream[i] != null)
+ {
+ skipStream[i].Close();
+ }
+ }
+ }
+
+ /// <summary>initializes the reader </summary>
+ internal virtual void Init(long skipPointer, int df)
+ {
+ this.skipPointer[0] = skipPointer;
+ this.docCount = df;
+ Array.Clear(skipDoc, 0, skipDoc.Length);
+ Array.Clear(numSkipped, 0, numSkipped.Length);
+ Array.Clear(childPointer, 0, childPointer.Length);
+
+ haveSkipped = false;
+ for (int i = 1; i < numberOfSkipLevels; i++)
+ {
+ skipStream[i] = null;
+ }
+ }
+
+ /// <summary>Loads the skip levels </summary>
+ private void LoadSkipLevels()
+ {
+ numberOfSkipLevels = docCount == 0 ? 0 :(int) System.Math.Floor(System.Math.Log(docCount) / System.Math.Log(skipInterval[0]));
+ if (numberOfSkipLevels > maxNumberOfSkipLevels)
+ {
+ numberOfSkipLevels = maxNumberOfSkipLevels;
+ }
+
+ skipStream[0].Seek(skipPointer[0]);
+
+ int toBuffer = numberOfLevelsToBuffer;
+
+ for (int i = numberOfSkipLevels - 1; i > 0; i--)
+ {
+ // the length of the current level
+ long length = skipStream[0].ReadVLong();
+
+ // the start pointer of the current level
+ skipPointer[i] = skipStream[0].GetFilePointer();
+ if (toBuffer > 0)
+ {
+ // buffer this level
+ skipStream[i] = new SkipBuffer(skipStream[0], (int) length);
+ toBuffer--;
+ }
+ else
+ {
+ // clone this stream, it is already at the start of the current level
+ skipStream[i] = (IndexInput) skipStream[0].Clone();
+ if (inputIsBuffered && length < BufferedIndexInput.BUFFER_SIZE)
+ {
+ ((BufferedIndexInput) skipStream[i]).SetBufferSize((int) length);
+ }
+
+ // move base stream beyond the current level
+ skipStream[0].Seek(skipStream[0].GetFilePointer() + length);
+ }
+ }
+
+ // use base stream for the lowest level
+ skipPointer[0] = skipStream[0].GetFilePointer();
+ }
+
+ /// <summary> Subclasses must implement the actual skip data encoding in this method.
+ ///
+ /// </summary>
+ /// <param name="level">the level skip data shall be read from
+ /// </param>
+ /// <param name="skipStream">the skip stream to read from
+ /// </param>
+ protected internal abstract int ReadSkipData(int level, IndexInput skipStream);
+
+ /// <summary>Copies the values of the last read skip entry on this level </summary>
+ protected internal virtual void SetLastSkipData(int level)
+ {
+ lastDoc = skipDoc[level];
+ lastChildPointer = childPointer[level];
+ }
+
+
+ /// <summary>used to buffer the top skip levels </summary>
+ private sealed class SkipBuffer : IndexInput
+ {
+ private byte[] data;
+ private long pointer;
+ private int pos;
+
+ internal SkipBuffer(IndexInput input, int length)
+ {
+ data = new byte[length];
+ pointer = input.GetFilePointer();
+ input.ReadBytes(data, 0, length);
+ }
+
+ public override void Close()
+ {
+ data = null;
+ }
+
+ public override long GetFilePointer()
+ {
+ return pointer + pos;
+ }
+
+ public override long Length()
+ {
+ return data.Length;
+ }
+
+ public override byte ReadByte()
+ {
+ return data[pos++];
+ }
+
+ public override void ReadBytes(byte[] b, int offset, int len)
+ {
+ Array.Copy(data, pos, b, offset, len);
+ pos += len;
+ }
+
+ public override void Seek(long pos)
+ {
+ this.pos = (int) (pos - pointer);
+ }
+
+ //override public System.Object Clone() // {{Aroush-2.3.1}} Do we need this?
+ //{
+ // return null;
+ //}
+ }
+ }
+}
\ No newline at end of file
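A small standalone sketch (illustration only, not part of this commit) of the level arithmetic the reader relies on: the level count computed in LoadSkipLevels and the approximate number of entries per level, floor(df / skipInterval^(i + 1)). The skipInterval and document frequency used here are arbitrary example values.

using System;

class SkipLevelMath
{
    static void Main()
    {
        int skipInterval = 16;                // hypothetical posting skip interval
        int docFreq = 100000;                 // hypothetical document frequency

        // Same formula LoadSkipLevels uses to size the reader:
        int levels = docFreq == 0
            ? 0
            : (int) Math.Floor(Math.Log(docFreq) / Math.Log(skipInterval));
        Console.WriteLine("number of skip levels: " + levels);

        // Entries per level: floor(df / skipInterval^(i + 1))
        for (int i = 0; i < levels; i++)
            Console.WriteLine("level {0}: ~{1} entries",
                              i, docFreq / (long) Math.Pow(skipInterval, i + 1));
    }
}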
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MultiLevelSkipListWriter.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/MultiLevelSkipListWriter.cs?rev=671404&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MultiLevelSkipListWriter.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MultiLevelSkipListWriter.cs Tue Jun 24 19:52:22 2008
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+
+using IndexOutput = Lucene.Net.Store.IndexOutput;
+using RAMOutputStream = Lucene.Net.Store.RAMOutputStream;
+
+namespace Lucene.Net.Index
+{
+
+ /// <summary> This abstract class writes skip lists with multiple levels.
+ ///
+ /// Example for skipInterval = 3:
+ /// c (skip level 2)
+ /// c c c (skip level 1)
+ /// x x x x x x x x x x (skip level 0)
+ /// d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d d (posting list)
+ /// 3 6 9 12 15 18 21 24 27 30 (df)
+ ///
+ /// d - document
+ /// x - skip data
+ /// c - skip data with child pointer
+ ///
+ /// Skip level i contains every skipInterval-th entry from skip level i-1.
+	/// Therefore the number of entries on level i is: floor(df / (skipInterval ^ (i + 1))).
+ ///
+ /// Each skip entry on a level i>0 contains a pointer to the corresponding skip entry in list i-1.
+	/// This guarantees a logarithmic amount of skips to find the target document.
+ ///
+ /// While this class takes care of writing the different skip levels,
+ /// subclasses must define the actual format of the skip data.
+ ///
+ /// </summary>
+ abstract class MultiLevelSkipListWriter
+ {
+ // number of levels in this skip list
+ private int numberOfSkipLevels;
+
+ // the skip interval in the list with level = 0
+ private int skipInterval;
+
+ // for every skip level a different buffer is used
+ private RAMOutputStream[] skipBuffer;
+
+ protected internal MultiLevelSkipListWriter(int skipInterval, int maxSkipLevels, int df)
+ {
+ this.skipInterval = skipInterval;
+
+ // calculate the maximum number of skip levels for this document frequency
+ numberOfSkipLevels = df == 0 ? 0 : (int) System.Math.Floor(System.Math.Log(df) / System.Math.Log(skipInterval));
+
+ // make sure it does not exceed maxSkipLevels
+ if (numberOfSkipLevels > maxSkipLevels)
+ {
+ numberOfSkipLevels = maxSkipLevels;
+ }
+ }
+
+ protected internal virtual void init()
+ {
+ skipBuffer = new RAMOutputStream[numberOfSkipLevels];
+ for (int i = 0; i < numberOfSkipLevels; i++)
+ {
+ skipBuffer[i] = new RAMOutputStream();
+ }
+ }
+
+ protected internal virtual void ResetSkip()
+ {
+ // creates new buffers or empties the existing ones
+ if (skipBuffer == null)
+ {
+ init();
+ }
+ else
+ {
+ for (int i = 0; i < skipBuffer.Length; i++)
+ {
+ skipBuffer[i].Reset();
+ }
+ }
+ }
+
+ /// <summary> Subclasses must implement the actual skip data encoding in this method.
+ ///
+ /// </summary>
+ /// <param name="level">the level skip data shall be writting for
+ /// </param>
+ /// <param name="skipBuffer">the skip buffer to write to
+ /// </param>
+ protected internal abstract void WriteSkipData(int level, IndexOutput skipBuffer);
+
+ /// <summary> Writes the current skip data to the buffers. The current document frequency determines
+		/// the max level the skip data is written to.
+ ///
+ /// </summary>
+ /// <param name="df">the current document frequency
+ /// </param>
+ /// <throws> IOException </throws>
+ internal virtual void BufferSkip(int df)
+ {
+ int numLevels;
+
+ // determine max level
+ for (numLevels = 0; (df % skipInterval) == 0 && numLevels < numberOfSkipLevels; df /= skipInterval)
+ {
+ numLevels++;
+ }
+
+ long childPointer = 0;
+
+ for (int level = 0; level < numLevels; level++)
+ {
+ WriteSkipData(level, skipBuffer[level]);
+
+ long newChildPointer = skipBuffer[level].GetFilePointer();
+
+ if (level != 0)
+ {
+ // store child pointers for all levels except the lowest
+ skipBuffer[level].WriteVLong(childPointer);
+ }
+
+ //remember the childPointer for the next level
+ childPointer = newChildPointer;
+ }
+ }
+
+ /// <summary> Writes the buffered skip lists to the given output.
+ ///
+ /// </summary>
+ /// <param name="output">the IndexOutput the skip lists shall be written to
+ /// </param>
+		/// <returns> the pointer where the skip list starts
+ /// </returns>
+ internal virtual long WriteSkip(IndexOutput output)
+ {
+ long skipPointer = output.GetFilePointer();
+ if (skipBuffer == null || skipBuffer.Length == 0)
+ return skipPointer;
+
+ for (int level = numberOfSkipLevels - 1; level > 0; level--)
+ {
+ long length = skipBuffer[level].GetFilePointer();
+ if (length > 0)
+ {
+ output.WriteVLong(length);
+ skipBuffer[level].WriteTo(output);
+ }
+ }
+ skipBuffer[0].WriteTo(output);
+
+ return skipPointer;
+ }
+ }
+}
\ No newline at end of file
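A small standalone sketch (illustration only, not part of this commit) that reproduces BufferSkip's level-selection loop for the diagram above: with skipInterval = 3, a skip entry is written on every level i for which the current document frequency is a multiple of skipInterval^(i + 1).

using System;

class BufferSkipDemo
{
    static void Main()
    {
        int skipInterval = 3;                 // same value as the diagram above
        int maxLevels = 10;

        for (int df = 3; df <= 30; df += 3)
        {
            // Same loop shape as BufferSkip: count how many levels receive
            // an entry for this document frequency.
            int numLevels = 0;
            for (int d = df; d % skipInterval == 0 && numLevels < maxLevels; d /= skipInterval)
                numLevels++;
            Console.WriteLine("df={0,2}: entries written on levels 0..{1}", df, numLevels - 1);
        }
    }
}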
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MultiReader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/MultiReader.cs?rev=671404&r1=671403&r2=671404&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MultiReader.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/MultiReader.cs Tue Jun 24 19:52:22 2008
@@ -16,9 +16,12 @@
*/
using System;
+
using Document = Lucene.Net.Documents.Document;
using FieldSelector = Lucene.Net.Documents.FieldSelector;
-using Directory = Lucene.Net.Store.Directory;
+using MultiTermDocs = Lucene.Net.Index.MultiSegmentReader.MultiTermDocs;
+using MultiTermEnum = Lucene.Net.Index.MultiSegmentReader.MultiTermEnum;
+using MultiTermPositions = Lucene.Net.Index.MultiSegmentReader.MultiTermPositions;
namespace Lucene.Net.Index
{
@@ -26,12 +29,13 @@
/// <summary>An IndexReader which reads multiple indexes, appending their content.
///
/// </summary>
- /// <version> $Id: MultiReader.java 499176 2007-01-23 22:54:40Z dnaber $
+ /// <version> $Id: MultiReader.java 596004 2007-11-17 21:34:23Z buschmi $
/// </version>
public class MultiReader : IndexReader
{
- private IndexReader[] subReaders;
+ protected internal IndexReader[] subReaders;
private int[] starts; // 1st docno for each segment
+ private bool[] decrefOnClose; // remember which subreaders to decRef on close
private System.Collections.Hashtable normsCache = System.Collections.Hashtable.Synchronized(new System.Collections.Hashtable());
private int maxDoc = 0;
private int numDocs = - 1;
@@ -45,55 +49,185 @@
/// <param name="subReaders">set of (sub)readers
/// </param>
/// <throws> IOException </throws>
- public MultiReader(IndexReader[] subReaders):base(subReaders.Length == 0?null:subReaders[0].Directory())
+ public MultiReader(IndexReader[] subReaders)
{
- Initialize(subReaders);
+ Initialize(subReaders, true);
}
- /// <summary>Construct reading the named set of readers. </summary>
- public MultiReader(Directory directory, SegmentInfos sis, bool closeDirectory, IndexReader[] subReaders) : base(directory, sis, closeDirectory)
+ /// <summary> <p>Construct a MultiReader aggregating the named set of (sub)readers.
+ /// Directory locking for delete, undeleteAll, and setNorm operations is
+ /// left to the subreaders. </p>
+ /// </summary>
+ /// <param name="closeSubReaders">indicates whether the subreaders should be closed
+ /// when this MultiReader is closed
+ /// </param>
+ /// <param name="subReaders">set of (sub)readers
+ /// </param>
+ /// <throws> IOException </throws>
+ public MultiReader(IndexReader[] subReaders, bool closeSubReaders)
{
- Initialize(subReaders);
+ Initialize(subReaders, closeSubReaders);
}
- private void Initialize(IndexReader[] subReaders)
+ private void Initialize(IndexReader[] subReaders, bool closeSubReaders)
{
this.subReaders = subReaders;
starts = new int[subReaders.Length + 1]; // build starts array
+ decrefOnClose = new bool[subReaders.Length];
for (int i = 0; i < subReaders.Length; i++)
{
starts[i] = maxDoc;
maxDoc += subReaders[i].MaxDoc(); // compute maxDocs
+ if (!closeSubReaders)
+ {
+ subReaders[i].IncRef();
+ decrefOnClose[i] = true;
+ }
+ else
+ {
+ decrefOnClose[i] = false;
+ }
+
if (subReaders[i].HasDeletions())
hasDeletions = true;
}
starts[subReaders.Length] = maxDoc;
}
-
- /// <summary>Return an array of term frequency vectors for the specified document.
- /// The array contains a vector for each vectorized field in the document.
- /// Each vector vector contains term numbers and frequencies for all terms
- /// in a given vectorized field.
- /// If no such fields existed, the method returns null.
+ /// <summary> Tries to reopen the subreaders.
+ /// <br>
+ /// If one or more subreaders could be re-opened (i. e. subReader.reopen()
+ /// returned a new instance != subReader), then a new MultiReader instance
+ /// is returned, otherwise this instance is returned.
+ /// <p>
+ /// A re-opened instance might share one or more subreaders with the old
+ /// instance. Index modification operations result in undefined behavior
+ /// when performed before the old instance is closed.
+ /// (see {@link IndexReader#Reopen()}).
+ /// <p>
+ /// If subreaders are shared, then the reference count of those
+ /// readers is increased to ensure that the subreaders remain open
+ /// until the last referring reader is closed.
+ ///
/// </summary>
+ /// <throws> CorruptIndexException if the index is corrupt </throws>
+ /// <throws> IOException if there is a low-level IO error </throws>
+ public override IndexReader Reopen()
+ {
+ EnsureOpen();
+
+ bool reopened = false;
+ IndexReader[] newSubReaders = new IndexReader[subReaders.Length];
+ bool[] newDecrefOnClose = new bool[subReaders.Length];
+
+ bool success = false;
+ try
+ {
+ for (int i = 0; i < subReaders.Length; i++)
+ {
+ newSubReaders[i] = subReaders[i].Reopen();
+ // if at least one of the subreaders was updated we remember that
+ // and return a new MultiReader
+ if (newSubReaders[i] != subReaders[i])
+ {
+ reopened = true;
+ // this is a new subreader instance, so on close() we don't
+ // decRef but close it
+ newDecrefOnClose[i] = false;
+ }
+ }
+
+ if (reopened)
+ {
+ for (int i = 0; i < subReaders.Length; i++)
+ {
+ if (newSubReaders[i] == subReaders[i])
+ {
+ newSubReaders[i].IncRef();
+ newDecrefOnClose[i] = true;
+ }
+ }
+
+ MultiReader mr = new MultiReader(newSubReaders);
+ mr.decrefOnClose = newDecrefOnClose;
+ success = true;
+ return mr;
+ }
+ else
+ {
+ success = true;
+ return this;
+ }
+ }
+ finally
+ {
+ if (!success && reopened)
+ {
+ for (int i = 0; i < newSubReaders.Length; i++)
+ {
+ if (newSubReaders[i] != null)
+ {
+ try
+ {
+ if (newDecrefOnClose[i])
+ {
+ newSubReaders[i].DecRef();
+ }
+ else
+ {
+ newSubReaders[i].Close();
+ }
+ }
+ catch (System.IO.IOException ignore)
+ {
+ // keep going - we want to clean up as much as possible
+ }
+ }
+ }
+ }
+ }
+ }
+
public override TermFreqVector[] GetTermFreqVectors(int n)
{
+ EnsureOpen();
int i = ReaderIndex(n); // find segment num
return subReaders[i].GetTermFreqVectors(n - starts[i]); // dispatch to segment
}
public override TermFreqVector GetTermFreqVector(int n, System.String field)
{
+ EnsureOpen();
int i = ReaderIndex(n); // find segment num
return subReaders[i].GetTermFreqVector(n - starts[i], field);
}
+
+ public override void GetTermFreqVector(int docNumber, System.String field, TermVectorMapper mapper)
+ {
+ EnsureOpen();
+ int i = ReaderIndex(docNumber); // find segment num
+ subReaders[i].GetTermFreqVector(docNumber - starts[i], field, mapper);
+ }
+
+ public override void GetTermFreqVector(int docNumber, TermVectorMapper mapper)
+ {
+ EnsureOpen();
+ int i = ReaderIndex(docNumber); // find segment num
+ subReaders[i].GetTermFreqVector(docNumber - starts[i], mapper);
+ }
+
+ public override bool IsOptimized()
+ {
+ return false;
+ }
+
public override int NumDocs()
{
lock (this)
{
+ // Don't call ensureOpen() here (it could affect performance)
if (numDocs == - 1)
{
// check cache
@@ -108,23 +242,28 @@
public override int MaxDoc()
{
+ // Don't call ensureOpen() here (it could affect performance)
return maxDoc;
}
+ // inherit javadoc
public override Document Document(int n, FieldSelector fieldSelector)
{
+ EnsureOpen();
int i = ReaderIndex(n); // find segment num
return subReaders[i].Document(n - starts[i], fieldSelector); // dispatch to segment reader
}
public override bool IsDeleted(int n)
{
+ // Don't call ensureOpen() here (it could affect performance)
int i = ReaderIndex(n); // find segment num
return subReaders[i].IsDeleted(n - starts[i]); // dispatch to segment reader
}
public override bool HasDeletions()
{
+ // Don't call ensureOpen() here (it could affect performance)
return hasDeletions;
}
@@ -140,6 +279,7 @@
{
for (int i = 0; i < subReaders.Length; i++)
subReaders[i].UndeleteAll();
+
hasDeletions = false;
numDocs = - 1; // invalidate cache
}
@@ -147,32 +287,12 @@
private int ReaderIndex(int n)
{
// find reader for doc n:
- int lo = 0; // search starts array
- int hi = subReaders.Length - 1; // for first element less
-
- while (hi >= lo)
- {
- int mid = (lo + hi) >> 1;
- int midValue = starts[mid];
- if (n < midValue)
- hi = mid - 1;
- else if (n > midValue)
- lo = mid + 1;
- else
- {
- // found a match
- while (mid + 1 < subReaders.Length && starts[mid + 1] == midValue)
- {
- mid++; // scan to last match
- }
- return mid;
- }
- }
- return hi;
+ return MultiSegmentReader.ReaderIndex(n, this.starts, this.subReaders.Length);
}
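Conceptually, starts[i] holds the composite doc number at which subReaders[i] begins, so this lookup finds the last start that is <= n; the subreader-local doc number is then n - starts[i]. A hypothetical linear-scan equivalent (ReaderIndexSketch is an invented name; the committed code binary-searches the same array via MultiSegmentReader.ReaderIndex):

    // Illustrative only: same mapping as ReaderIndex, without the binary search.
    private int ReaderIndexSketch(int n)
    {
        for (int i = subReaders.Length - 1; i >= 0; i--)
        {
            if (n >= starts[i])
                return i; // doc n lives in subReaders[i] at local doc n - starts[i]
        }
        return 0; // not reached for valid doc numbers, since starts[0] == 0
    }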
public override bool HasNorms(System.String field)
{
+ EnsureOpen();
for (int i = 0; i < subReaders.Length; i++)
{
if (subReaders[i].HasNorms(field))
@@ -182,7 +302,7 @@
}
private byte[] ones;
- private byte[] fakeNorms()
+ private byte[] FakeNorms()
{
if (ones == null)
ones = SegmentReader.CreateFakeNorms(MaxDoc());
@@ -193,11 +313,12 @@
{
lock (this)
{
+ EnsureOpen();
byte[] bytes = (byte[]) normsCache[field];
if (bytes != null)
return bytes; // cache hit
if (!HasNorms(field))
- return fakeNorms();
+ return FakeNorms();
bytes = new byte[MaxDoc()];
for (int i = 0; i < subReaders.Length; i++)
@@ -211,15 +332,16 @@
{
lock (this)
{
+ EnsureOpen();
byte[] bytes = (byte[]) normsCache[field];
if (bytes == null && !HasNorms(field))
- bytes = fakeNorms();
+ bytes = FakeNorms();
if (bytes != null)
- // cache hit
+ // cache hit
Array.Copy(bytes, 0, result, offset, MaxDoc());
for (int i = 0; i < subReaders.Length; i++)
- // read from segments
+ // read from segments
subReaders[i].Norms(field, result, offset + starts[i]);
}
}
@@ -233,16 +355,19 @@
public override TermEnum Terms()
{
+ EnsureOpen();
return new MultiTermEnum(subReaders, starts, null);
}
public override TermEnum Terms(Term term)
{
+ EnsureOpen();
return new MultiTermEnum(subReaders, starts, term);
}
public override int DocFreq(Term t)
{
+ EnsureOpen();
int total = 0; // sum freqs in segments
for (int i = 0; i < subReaders.Length; i++)
total += subReaders[i].DocFreq(t);
@@ -251,313 +376,72 @@
public override TermDocs TermDocs()
{
+ EnsureOpen();
return new MultiTermDocs(subReaders, starts);
}
public override TermPositions TermPositions()
{
+ EnsureOpen();
return new MultiTermPositions(subReaders, starts);
}
- protected internal override void SetDeleter(IndexFileDeleter deleter)
- {
- // Share deleter to our SegmentReaders:
- this.deleter = deleter;
- for (int i = 0; i < subReaders.Length; i++)
- subReaders[i].SetDeleter(deleter);
- }
-
protected internal override void DoCommit()
{
for (int i = 0; i < subReaders.Length; i++)
subReaders[i].Commit();
}
- internal override void StartCommit()
- {
- base.StartCommit();
- for (int i = 0; i < subReaders.Length; i++)
- {
- subReaders[i].StartCommit();
- }
- }
-
- internal override void RollbackCommit()
- {
- base.RollbackCommit();
- for (int i = 0; i < subReaders.Length; i++)
- {
- subReaders[i].RollbackCommit();
- }
- }
-
protected internal override void DoClose()
{
lock (this)
{
for (int i = 0; i < subReaders.Length; i++)
- subReaders[i].Close();
- }
- }
-
- /// <seealso cref="IndexReader#GetFieldNames(IndexReader.FieldOption)">
- /// </seealso>
- public override System.Collections.ICollection GetFieldNames(IndexReader.FieldOption fieldNames)
- {
- // maintain a unique set of field names
- System.Collections.Hashtable fieldSet = new System.Collections.Hashtable();
- for (int i = 0; i < subReaders.Length; i++)
- {
- IndexReader reader = subReaders[i];
- System.Collections.ICollection names = reader.GetFieldNames(fieldNames);
- for (System.Collections.IEnumerator iterator = names.GetEnumerator(); iterator.MoveNext(); )
- {
- System.Collections.DictionaryEntry fi = (System.Collections.DictionaryEntry) iterator.Current;
- System.String s = fi.Key.ToString();
- if (fieldSet.ContainsKey(s) == false)
- {
- fieldSet.Add(s, s);
- }
- }
- }
- return fieldSet;
- }
- }
-
- class MultiTermEnum : TermEnum
- {
- private SegmentMergeQueue queue;
-
- private Term term;
- private int docFreq;
-
- public MultiTermEnum(IndexReader[] readers, int[] starts, Term t)
- {
- queue = new SegmentMergeQueue(readers.Length);
- for (int i = 0; i < readers.Length; i++)
- {
- IndexReader reader = readers[i];
- TermEnum termEnum;
-
- if (t != null)
- {
- termEnum = reader.Terms(t);
- }
- else
- termEnum = reader.Terms();
-
- SegmentMergeInfo smi = new SegmentMergeInfo(starts[i], termEnum, reader);
- if (t == null?smi.Next():termEnum.Term() != null)
- queue.Put(smi);
- // initialize queue
- else
- smi.Close();
- }
-
- if (t != null && queue.Size() > 0)
- {
- Next();
- }
- }
-
- public override bool Next()
- {
- SegmentMergeInfo top = (SegmentMergeInfo) queue.Top();
- if (top == null)
- {
- term = null;
- return false;
- }
-
- term = top.term;
- docFreq = 0;
-
- while (top != null && term.CompareTo(top.term) == 0)
- {
- queue.Pop();
- docFreq += top.termEnum.DocFreq(); // increment freq
- if (top.Next())
- queue.Put(top);
- // restore queue
- else
- top.Close(); // done with a segment
- top = (SegmentMergeInfo) queue.Top();
- }
- return true;
- }
-
- public override Term Term()
- {
- return term;
- }
-
- public override int DocFreq()
- {
- return docFreq;
- }
-
- public override void Close()
- {
- queue.Close();
- }
- }
-
- class MultiTermDocs : TermDocs
- {
- protected internal IndexReader[] readers;
- protected internal int[] starts;
- protected internal Term term;
-
- protected internal int base_Renamed = 0;
- protected internal int pointer = 0;
-
- private TermDocs[] readerTermDocs;
- protected internal TermDocs current; // == readerTermDocs[pointer]
-
- public MultiTermDocs(IndexReader[] r, int[] s)
- {
- readers = r;
- starts = s;
-
- readerTermDocs = new TermDocs[r.Length];
- }
-
- public virtual int Doc()
- {
- return base_Renamed + current.Doc();
- }
- public virtual int Freq()
- {
- return current.Freq();
- }
-
- public virtual void Seek(Term term)
- {
- this.term = term;
- this.base_Renamed = 0;
- this.pointer = 0;
- this.current = null;
- }
-
- public virtual void Seek(TermEnum termEnum)
- {
- Seek(termEnum.Term());
- }
-
- public virtual bool Next()
- {
- for (; ; )
- {
- if (current != null && current.Next())
- {
- return true;
- }
- else if (pointer < readers.Length)
- {
- base_Renamed = starts[pointer];
- current = TermDocs(pointer++);
- }
- else
- {
- return false;
- }
- }
- }
-
- /// <summary>Optimized implementation. </summary>
- public virtual int Read(int[] docs, int[] freqs)
- {
- while (true)
- {
- while (current == null)
{
- if (pointer < readers.Length)
+ if (decrefOnClose[i])
{
- // try next segment
- base_Renamed = starts[pointer];
- current = TermDocs(pointer++);
+ subReaders[i].DecRef();
}
else
{
- return 0;
+ subReaders[i].Close();
}
}
- int end = current.Read(docs, freqs);
- if (end == 0)
- {
- // none left in segment
- current = null;
- }
- else
- {
- // got some
- int b = base_Renamed; // adjust doc numbers
- for (int i = 0; i < end; i++)
- docs[i] += b;
- return end;
- }
- }
- }
-
- /* A Possible future optimization could skip entire segments */
- public virtual bool SkipTo(int target)
- {
- for (; ; )
- {
- if (current != null && current.SkipTo(target - base_Renamed))
- {
- return true;
- }
- else if (pointer < readers.Length)
- {
- base_Renamed = starts[pointer];
- current = TermDocs(pointer++);
- }
- else
- return false;
}
}
- private TermDocs TermDocs(int i)
- {
- if (term == null)
- return null;
- TermDocs result = readerTermDocs[i];
- if (result == null)
- result = readerTermDocs[i] = TermDocs(readers[i]);
- result.Seek(term);
- return result;
- }
-
- protected internal virtual TermDocs TermDocs(IndexReader reader)
+ public override System.Collections.ICollection GetFieldNames(IndexReader.FieldOption fieldNames)
{
- return reader.TermDocs();
+ EnsureOpen();
+ return MultiSegmentReader.GetFieldNames(fieldNames, this.subReaders);
}
- public virtual void Close()
+ /// <summary> Checks recursively if all subreaders are up to date. </summary>
+ public override bool IsCurrent()
{
- for (int i = 0; i < readerTermDocs.Length; i++)
+ for (int i = 0; i < subReaders.Length; i++)
{
- if (readerTermDocs[i] != null)
- readerTermDocs[i].Close();
+ if (!subReaders[i].IsCurrent())
+ {
+ return false;
+ }
}
- }
- }
-
- class MultiTermPositions : MultiTermDocs, TermPositions
- {
- public MultiTermPositions(IndexReader[] r, int[] s):base(r, s)
- {
+
+ // all subreaders are up to date
+ return true;
}
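Because IsCurrent() reports false as soon as any single subreader is stale, it can serve as a cheap guard in front of the Reopen() call sketched earlier, assuming multi is an open MultiReader over indexes that other writers may still be committing to:

    // Refresh only when at least one underlying index has new commits.
    if (!multi.IsCurrent())
    {
        IndexReader refreshed = multi.Reopen();
        if (refreshed != multi)
        {
            multi.Close();
            multi = refreshed;
        }
    }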
- protected internal override TermDocs TermDocs(IndexReader reader)
+ /// <summary>Not implemented.</summary>
+ /// <throws> NotSupportedException </throws>
+ public override long GetVersion()
{
- return (TermDocs) reader.TermPositions();
+ throw new System.NotSupportedException("MultiReader does not support this method.");
}
- public virtual int NextPosition()
+ // for testing
+ internal virtual IndexReader[] GetSubReaders()
{
- return ((TermPositions) current).NextPosition();
+ return subReaders;
}
}
}
\ No newline at end of file