Posted to commits@lucenenet.apache.org by do...@apache.org on 2009/07/29 20:04:24 UTC
svn commit: r798995 [12/35] - in /incubator/lucene.net/trunk/C#/src:
Lucene.Net/ Lucene.Net/Analysis/ Lucene.Net/Analysis/Standard/
Lucene.Net/Document/ Lucene.Net/Index/ Lucene.Net/QueryParser/
Lucene.Net/Search/ Lucene.Net/Search/Function/ Lucene.Net...
Added: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/ReusableStringReader.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/ReusableStringReader.cs?rev=798995&view=auto
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/ReusableStringReader.cs (added)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/ReusableStringReader.cs Wed Jul 29 18:04:12 2009
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace Lucene.Net.Index
+{
+ /// <summary>
+ /// Used by DocumentsWriter to implement a StringReader
+ /// that can be reset to a new string; we use this when
+ /// tokenizing the string value from a Field.
+ /// </summary>
+ internal sealed class ReusableStringReader : System.IO.TextReader
+ {
+ int upto;
+ int left;
+ string s;
+
+ internal void Init(string s)
+ {
+ this.s = s;
+ left = s.Length;
+ this.upto = 0;
+ }
+
+ public int Read(char[] c)
+ {
+ return Read(c, 0, c.Length);
+ }
+
+ public override int Read(char[] c, int off, int len)
+ {
+ if (left > len)
+ {
+ SupportClass.TextSupport.GetCharsFromString(s, upto, upto + len, c, off);
+ upto += len;
+ left -= len;
+ return len;
+ }
+ else if (0 == left)
+ {
+ return -1;
+ }
+ else
+ {
+ SupportClass.TextSupport.GetCharsFromString(s, upto, upto + left, c, off);
+ int r = left;
+ left = 0;
+ upto = s.Length;
+ return r;
+ }
+ }
+
+ public override void Close() { }
+ }
+}
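
A note on usage: the reuse pattern the new class enables is simple. Init() points the reader at the next string, so a single instance can be drained once per field value without allocating a new StringReader each time. A minimal sketch of a caller (hypothetical; the class is internal, and DocumentsWriter is its real consumer):

    ReusableStringReader reader = new ReusableStringReader();
    char[] buffer = new char[256];
    foreach (string value in new string[] { "first field", "second field" })
    {
        reader.Init(value);  // reset to the next string; no new reader allocation
        int read;
        // note: unlike the usual TextReader contract, this Read returns -1
        // (not 0) once the string is exhausted
        while ((read = reader.Read(buffer, 0, buffer.Length)) != -1)
        {
            // hand buffer[0..read) to the tokenizer
        }
    }
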
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentInfo.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/SegmentInfo.cs?rev=798995&r1=798994&r2=798995&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentInfo.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentInfo.cs Wed Jul 29 18:04:12 2009
@@ -15,8 +15,9 @@
* limitations under the License.
*/
-using System;
+using System.Collections.Generic;
+using BitVector = Lucene.Net.Util.BitVector;
using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;
using IndexInput = Lucene.Net.Store.IndexInput;
@@ -32,7 +33,7 @@
internal const int CHECK_DIR = 0; // e.g. must check dir to see if there are norms/deletions
internal const int WITHOUT_GEN = 0; // a file name that has no GEN in it.
- public System.String name; // unique name in dir
+ public string name; // unique name in dir
public int docCount; // number of docs in seg
public Directory dir; // where segment resides
@@ -63,18 +64,23 @@
// and true for newly created merged segments (both
// compound and non compound).
- private System.Collections.IList files; // cached list of files that this segment uses
+ private List<string> files; // cached list of files that this segment uses
// in the Directory
internal long sizeInBytes = - 1; // total byte size of all of our files (computed on demand)
private int docStoreOffset; // if this segment shares stored fields & vectors, this
// offset is where in that file this segment's docs begin
- private System.String docStoreSegment; // name used to derive fields/vectors file we share with
+ private string docStoreSegment; // name used to derive fields/vectors file we share with
// other segments
private bool docStoreIsCompoundFile; // whether doc store files are stored in compound file (*.cfx)
+
+ private int delCount; // How many deleted docs in this segment, or -1 if not yet known
+ // (if it's an older index)
+
+ private bool hasProx; // True if this segment has any fields with omitTf==false
- public SegmentInfo(System.String name, int docCount, Directory dir)
+ public SegmentInfo(string name, int docCount, Directory dir)
{
this.name = name;
this.docCount = docCount;
@@ -86,13 +92,17 @@
docStoreOffset = - 1;
docStoreSegment = name;
docStoreIsCompoundFile = false;
+ delCount = 0;
+ hasProx = true;
}
- public SegmentInfo(System.String name, int docCount, Directory dir, bool isCompoundFile, bool hasSingleNormFile) : this(name, docCount, dir, isCompoundFile, hasSingleNormFile, - 1, null, false)
+ public SegmentInfo(string name, int docCount, Directory dir, bool isCompoundFile, bool hasSingleNormFile)
+ : this(name, docCount, dir, isCompoundFile, hasSingleNormFile, - 1, null, false, true)
{
}
- public SegmentInfo(System.String name, int docCount, Directory dir, bool isCompoundFile, bool hasSingleNormFile, int docStoreOffset, System.String docStoreSegment, bool docStoreIsCompoundFile) : this(name, docCount, dir)
+ public SegmentInfo(string name, int docCount, Directory dir, bool isCompoundFile, bool hasSingleNormFile, int docStoreOffset, string docStoreSegment, bool docStoreIsCompoundFile, bool hasProx)
+ : this(name, docCount, dir)
{
this.isCompoundFile = (sbyte) (isCompoundFile ? YES : NO);
this.hasSingleNormFile = hasSingleNormFile;
@@ -100,7 +110,9 @@
this.docStoreOffset = docStoreOffset;
this.docStoreSegment = docStoreSegment;
this.docStoreIsCompoundFile = docStoreIsCompoundFile;
- System.Diagnostics.Debug.Assert(docStoreOffset == - 1 || docStoreSegment != null);
+ this.hasProx = hasProx;
+ delCount = 0;
+ System.Diagnostics.Debug.Assert(docStoreOffset == - 1 || docStoreSegment != null);
}
/// <summary> Copy everything from src SegmentInfo into our instance.</summary>
@@ -121,10 +133,11 @@
else
{
normGen = new long[src.normGen.Length];
- Array.Copy(src.normGen, 0, normGen, 0, src.normGen.Length);
+ System.Array.Copy(src.normGen, 0, normGen, 0, src.normGen.Length);
}
isCompoundFile = src.isCompoundFile;
hasSingleNormFile = src.hasSingleNormFile;
+ delCount = src.delCount;
}
/// <summary> Construct a new SegmentInfo instance by reading a
@@ -188,6 +201,19 @@
}
isCompoundFile = (sbyte) input.ReadByte();
preLockless = (isCompoundFile == CHECK_DIR);
+ if (format <= SegmentInfos.FORMAT_DEL_COUNT)
+ {
+ delCount = input.ReadInt();
+ System.Diagnostics.Debug.Assert(delCount <= docCount);
+ }
+ else
+ {
+ delCount = -1;
+ }
+ if (format <= SegmentInfos.FORMAT_HAS_PROX)
+ hasProx = input.ReadByte() == 1;
+ else
+ hasProx = true;
}
else
{
@@ -199,6 +225,8 @@
docStoreOffset = - 1;
docStoreIsCompoundFile = false;
docStoreSegment = null;
+ delCount = -1;
+ hasProx = true;
}
}
@@ -231,16 +259,16 @@
/// <summary>Returns total size in bytes of all of the files used by
/// this segment.
/// </summary>
- internal long SizeInBytes()
+ public /* changed for zoie 1.3.0: internal */ long SizeInBytes()
{
if (sizeInBytes == - 1)
{
- System.Collections.IList files = Files();
+ List<string> files = Files();
int size = files.Count;
sizeInBytes = 0;
for (int i = 0; i < size; i++)
{
- System.String fileName = (System.String) files[i];
+ string fileName = files[i];
// We don't count bytes used by a shared doc store
// against this segment:
if (docStoreOffset == - 1 || !IndexFileNames.IsDocStoreFile(fileName))
@@ -249,8 +277,8 @@
}
return sizeInBytes;
}
-
- internal bool HasDeletions()
+
+ public /* changed for zoie 1.3.0: internal */ bool HasDeletions()
{
// Cases:
//
@@ -300,11 +328,12 @@
ClearFiles();
}
- public System.Object Clone()
+ public object Clone()
{
SegmentInfo si = new SegmentInfo(name, docCount, dir);
si.isCompoundFile = isCompoundFile;
- si.delGen = delGen;
+ si.delGen = delGen;
+ si.delCount = delCount;
si.preLockless = preLockless;
si.hasSingleNormFile = hasSingleNormFile;
if (normGen != null)
@@ -323,7 +352,7 @@
return si;
}
- internal System.String GetDelFileName()
+ internal string GetDelFileName()
{
if (delGen == NO)
{
@@ -333,9 +362,28 @@
}
else
{
- // If delGen is CHECK_DIR, it's the pre-lockless-commit file format
- return IndexFileNames.FileNameFromGeneration(name, "." + IndexFileNames.DELETES_EXTENSION, delGen);
- }
+ string retVal = null;
+ string current = IndexFileNames.FileNameFromGeneration(name, "." + IndexFileNames.DELETES_EXTENSION, delGen);
+ if (this.dir.FileExists(current))
+ {
+ retVal = current;
+ }
+ else
+ {
+ string backwards = (name + "_" + System.Convert.ToString(delGen, 16) + "." + IndexFileNames.DELETES_EXTENSION);
+ if (this.dir.FileExists(backwards))
+ {
+ // we are dealing with the old name
+ retVal = backwards;
+ }
+ else
+ {
+ // neither file exists; one will be created, so use the new name
+ retVal = current;
+ }
+ }
+ return retVal;
+ }
}
/// <summary> Returns true if this field for this segment has saved a separate norms file (_<segment>_N.sX).
@@ -348,7 +396,7 @@
if ((normGen == null && preLockless) || (normGen != null && normGen[fieldNumber] == CHECK_DIR))
{
// Must fallback to directory file exists check:
- System.String fileName = name + ".s" + fieldNumber;
+ string fileName = name + ".s" + fieldNumber;
return dir.FileExists(fileName);
}
else if (normGen == null || normGen[fieldNumber] == NO)
@@ -362,7 +410,7 @@
}
/// <summary> Returns true if any fields in this segment have separate norms.</summary>
- internal bool HasSeparateNorms()
+ public /* changed for zoie 1.3.0: internal */ bool HasSeparateNorms()
{
if (normGen == null)
{
@@ -377,13 +425,13 @@
// This means this segment was saved with pre-LOCKLESS
// code. So we must fallback to the original
// directory list check:
- System.String[] result = dir.List();
+ string[] result = dir.List();
if (result == null)
{
throw new System.IO.IOException("cannot read directory " + dir + ": list() returned null");
}
- System.String pattern;
+ string pattern;
pattern = name + ".s";
int patternLength = pattern.Length;
for (int i = 0; i < result.Length; i++)
@@ -447,9 +495,9 @@
/// </summary>
/// <param name="number">field index
/// </param>
- internal System.String GetNormFileName(int number)
+ internal string GetNormFileName(int number)
{
- System.String prefix;
+ string prefix;
long gen;
if (normGen == null)
@@ -502,7 +550,7 @@
/// <summary> Returns true if this segment is stored as a compound
/// file; else, false.
/// </summary>
- internal bool GetUseCompoundFile()
+ public /* changed for zoie 1.3.0: internal */ bool GetUseCompoundFile()
{
if (isCompoundFile == NO)
{
@@ -517,7 +565,29 @@
return dir.FileExists(name + "." + IndexFileNames.COMPOUND_FILE_EXTENSION);
}
}
-
+
+ public /* changed for zoie 1.3.0: internal */ int GetDelCount()
+ {
+ if (delCount == -1)
+ {
+ if (HasDeletions())
+ {
+ string delFileName = GetDelFileName();
+ delCount = new BitVector(dir, delFileName).Count();
+ }
+ else
+ delCount = 0;
+ }
+ System.Diagnostics.Debug.Assert(delCount <= docCount);
+ return delCount;
+ }
+
+ internal void SetDelCount(int delCount)
+ {
+ this.delCount = delCount;
+ System.Diagnostics.Debug.Assert(delCount <= docCount);
+ }
+
internal int GetDocStoreOffset()
{
return docStoreOffset;
@@ -534,7 +604,7 @@
ClearFiles();
}
- internal System.String GetDocStoreSegment()
+ internal string GetDocStoreSegment()
{
return docStoreSegment;
}
@@ -571,10 +641,24 @@
output.WriteLong(normGen[j]);
}
}
- output.WriteByte((byte) isCompoundFile);
+ output.WriteByte((byte)isCompoundFile);
+ output.WriteInt(delCount);
+ output.WriteByte((byte)(hasProx ? 1 : 0));
}
+
+ internal void SetHasProx(bool hasProx)
+ {
+ this.hasProx = hasProx;
+ ClearFiles();
+ }
+
+ internal bool GetHasProx()
+ {
+ return hasProx;
+ }
+
- private void AddIfExists(System.Collections.IList files, System.String fileName)
+ private void AddIfExists(System.Collections.Generic.List<string> files, string fileName)
{
if (dir.FileExists(fileName))
files.Add(fileName);
@@ -586,7 +670,7 @@
* modify it.
*/
- public System.Collections.IList Files()
+ public List<string> Files()
{
if (files != null)
@@ -595,7 +679,7 @@
return files;
}
- files = new System.Collections.ArrayList();
+ files = new List<string>();
bool useCompoundFile = GetUseCompoundFile();
@@ -605,7 +689,7 @@
}
else
{
- System.String[] exts = IndexFileNames.NON_STORE_INDEX_EXTENSIONS;
+ string[] exts = IndexFileNames.NON_STORE_INDEX_EXTENSIONS;
for (int i = 0; i < exts.Length; i++)
AddIfExists(files, name + "." + exts[i]);
}
@@ -621,7 +705,7 @@
}
else
{
- System.String[] exts = IndexFileNames.STORE_INDEX_EXTENSIONS;
+ string[] exts = IndexFileNames.STORE_INDEX_EXTENSIONS;
for (int i = 0; i < exts.Length; i++)
AddIfExists(files, docStoreSegment + "." + exts[i]);
}
@@ -630,12 +714,12 @@
{
// We are not sharing, and, these files were not
// included in the compound file
- System.String[] exts = IndexFileNames.STORE_INDEX_EXTENSIONS;
+ string[] exts = IndexFileNames.STORE_INDEX_EXTENSIONS;
for (int i = 0; i < exts.Length; i++)
AddIfExists(files, name + "." + exts[i]);
}
-
- System.String delFileName = IndexFileNames.FileNameFromGeneration(name, "." + IndexFileNames.DELETES_EXTENSION, delGen);
+ string delFileName = this.GetDelFileName();
+
if (delFileName != null && (delGen >= YES || dir.FileExists(delFileName)))
{
files.Add(delFileName);
@@ -658,7 +742,7 @@
// in the non compound file case:
if (!hasSingleNormFile && !useCompoundFile)
{
- System.String fileName = name + "." + IndexFileNames.PLAIN_NORMS_EXTENSION + i;
+ string fileName = name + "." + IndexFileNames.PLAIN_NORMS_EXTENSION + i;
if (dir.FileExists(fileName))
{
files.Add(fileName);
@@ -668,7 +752,7 @@
else if (CHECK_DIR == gen)
{
// Pre-2.1: we have to check file existence
- System.String fileName = null;
+ string fileName = null;
if (useCompoundFile)
{
fileName = name + "." + IndexFileNames.SEPARATE_NORMS_EXTENSION + i;
@@ -688,20 +772,20 @@
{
// Pre-2.1: we have to scan the dir to find all
// matching _X.sN/_X.fN files for our segment:
- System.String prefix;
+ string prefix;
if (useCompoundFile)
prefix = name + "." + IndexFileNames.SEPARATE_NORMS_EXTENSION;
else
prefix = name + "." + IndexFileNames.PLAIN_NORMS_EXTENSION;
int prefixLength = prefix.Length;
- System.String[] allFiles = dir.List();
+ string[] allFiles = dir.List();
if (allFiles == null)
{
throw new System.IO.IOException("cannot read directory " + dir + ": list() returned null");
}
for (int i = 0; i < allFiles.Length; i++)
{
- System.String fileName = allFiles[i];
+ string fileName = allFiles[i];
if (fileName.Length > prefixLength && System.Char.IsDigit(fileName[prefixLength]) && fileName.StartsWith(prefix))
{
files.Add(fileName);
@@ -720,9 +804,9 @@
}
/// <summary>Used for debugging </summary>
- public System.String SegString(Directory dir)
+ public string SegString(Directory dir)
{
- System.String cfs;
+ string cfs;
try
{
if (GetUseCompoundFile())
@@ -730,12 +814,12 @@
else
cfs = "C";
}
- catch (System.IO.IOException ioe)
+ catch (System.IO.IOException)
{
cfs = "?";
}
- System.String docStore;
+ string docStore;
if (docStoreOffset != - 1)
docStore = "->" + docStoreSegment;
@@ -748,14 +832,14 @@
/// <summary>We consider another SegmentInfo instance equal if it
/// has the same dir and same name.
/// </summary>
- public override bool Equals(System.Object obj)
+ public override bool Equals(object obj)
{
SegmentInfo other;
try
{
other = (SegmentInfo) obj;
}
- catch (System.InvalidCastException cce)
+ catch (System.InvalidCastException)
{
return false;
}
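
With delCount now persisted per segment, callers can compute live document counts without opening the deletions BitVector on every query. A minimal sketch of the new accounting (hypothetical caller; it uses only members shown in the diff above, given a SegmentInfos named segmentInfos):

    SegmentInfo info = segmentInfos.Info(0);
    // GetDelCount() is lazy: for older indexes (delCount == -1) it counts
    // the bits in the .del file once, then caches and asserts <= docCount.
    int deleted = info.GetDelCount();
    int numDocs = info.docCount - deleted; // live docs, no IndexReader needed
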
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentInfos.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/SegmentInfos.cs?rev=798995&r1=798994&r2=798995&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentInfos.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentInfos.cs Wed Jul 29 18:04:12 2009
@@ -17,6 +17,8 @@
using System;
+using ChecksumIndexInput = Lucene.Net.Store.ChecksumIndexInput;
+using ChecksumIndexOutput = Lucene.Net.Store.ChecksumIndexOutput;
using Directory = Lucene.Net.Store.Directory;
using IndexInput = Lucene.Net.Store.IndexInput;
using IndexOutput = Lucene.Net.Store.IndexOutput;
@@ -27,39 +29,12 @@
[Serializable]
sealed public class SegmentInfos : System.Collections.ArrayList
{
- private class AnonymousClassFindSegmentsFile : FindSegmentsFile
- {
- private void InitBlock(SegmentInfos enclosingInstance)
- {
- this.enclosingInstance = enclosingInstance;
- }
- private SegmentInfos enclosingInstance;
- public SegmentInfos Enclosing_Instance
- {
- get
- {
- return enclosingInstance;
- }
-
- }
- internal AnonymousClassFindSegmentsFile(SegmentInfos enclosingInstance, Lucene.Net.Store.Directory Param1) : base(Param1)
- {
- InitBlock(enclosingInstance);
- }
-
- protected internal override System.Object DoBody(System.String segmentFileName)
- {
- Enclosing_Instance.Read(directory, segmentFileName);
- return null;
- }
- }
-
private class AnonymousClassFindSegmentsFile1 : FindSegmentsFile
{
internal AnonymousClassFindSegmentsFile1(Lucene.Net.Store.Directory Param1) : base(Param1)
{
}
- protected internal override System.Object DoBody(System.String segmentFileName)
+ protected internal override object DoBody(System.String segmentFileName)
{
IndexInput input = directory.OpenInput(segmentFileName);
@@ -118,9 +93,22 @@
/// vectors and stored fields file.
/// </summary>
public const int FORMAT_SHARED_DOC_STORE = - 4;
-
- /* This must always point to the most recent file format. */
- private static readonly int CURRENT_FORMAT = FORMAT_SHARED_DOC_STORE;
+
+ /// <summary> This format adds a checksum at the end of the file to
+ /// ensure all bytes were successfully written.</summary>
+ public const int FORMAT_CHECKSUM = -5;
+
+ /// <summary> This format adds the deletion count for each segment.
+ /// This way IndexWriter can efficiently report numDocs().</summary>
+ public const int FORMAT_DEL_COUNT = -6;
+
+ /// <summary> This format adds the boolean hasProx to record whether any
+ /// fields in the segment store prox information (i.e., have
+ /// omitTf==false)</summary>
+ public const int FORMAT_HAS_PROX = -7;
+
+ /* This must always point to the most recent file format. */
+ public static readonly int CURRENT_FORMAT = FORMAT_HAS_PROX;
public int counter = 0; // used to name new segments
/// <summary> counts how often the index has been changed by adding or deleting docs.
@@ -269,7 +257,7 @@
// Clear any previous segments:
Clear();
- IndexInput input = directory.OpenInput(segmentFileName);
+ ChecksumIndexInput input = new ChecksumIndexInput(directory.OpenInput(segmentFileName));
generation = GenerationFromSegmentsFileName(segmentFileName);
@@ -308,6 +296,14 @@
else
version = input.ReadLong(); // read version
}
+
+ if (format <= FORMAT_CHECKSUM)
+ {
+ long checksumNow = input.GetChecksum();
+ long checksumThen = input.ReadLong();
+ if (checksumNow != checksumThen)
+ throw new CorruptIndexException("checksum mismatch in segments file");
+ }
success = true;
}
finally
@@ -334,98 +330,121 @@
new AnonymousClassFindSegmentsFile(this, directory).Run();
}
-
- public void Write(Directory directory)
- {
-
- System.String segmentFileName = GetNextSegmentFileName();
-
- // Always advance the generation on write:
- if (generation == - 1)
- {
- generation = 1;
- }
- else
- {
- generation++;
- }
-
- IndexOutput output = directory.CreateOutput(segmentFileName);
-
- bool success = false;
-
- try
- {
- output.WriteInt(CURRENT_FORMAT); // write FORMAT
- output.WriteLong(++version); // every write changes
- // the index
- output.WriteInt(counter); // write counter
- output.WriteInt(Count); // write infos
- for (int i = 0; i < Count; i++)
- {
- Info(i).Write(output);
- }
- }
- finally
- {
- try
- {
- output.Close();
- success = true;
- }
- finally
- {
- if (!success)
- {
- // Try not to leave a truncated segments_N file in
- // the index:
- directory.DeleteFile(segmentFileName);
- }
- }
- }
-
- try
- {
- output = directory.CreateOutput(IndexFileNames.SEGMENTS_GEN);
- try
- {
- output.WriteInt(FORMAT_LOCKLESS);
- output.WriteLong(generation);
- output.WriteLong(generation);
- }
- finally
- {
- output.Close();
- }
- }
- catch (System.IO.IOException e)
- {
- // It's OK if we fail to write this file since it's
- // used only as one of the retry fallbacks.
- }
-
- lastGeneration = generation;
- }
+
+ private class AnonymousClassFindSegmentsFile : FindSegmentsFile
+ {
+ private void InitBlock(SegmentInfos enclosingInstance)
+ {
+ this.enclosingInstance = enclosingInstance;
+ }
+ private SegmentInfos enclosingInstance;
+ public SegmentInfos Enclosing_Instance
+ {
+ get
+ {
+ return enclosingInstance;
+ }
+
+ }
+ internal AnonymousClassFindSegmentsFile(SegmentInfos enclosingInstance, Lucene.Net.Store.Directory Param1)
+ : base(Param1)
+ {
+ InitBlock(enclosingInstance);
+ }
+
+ protected internal override object DoBody(System.String segmentFileName)
+ {
+ Enclosing_Instance.Read(directory, segmentFileName);
+ return null;
+ }
+ }
+
+ // only non-null after PrepareCommit has been called and before FinishCommit is called
+ internal ChecksumIndexOutput pendingOutput;
+
+ private void Write(Directory directory)
+ {
+
+ System.String segmentFileName = GetNextSegmentFileName();
+
+ // Always advance the generation on write:
+ if (generation == -1)
+ {
+ generation = 1;
+ }
+ else
+ {
+ generation++;
+ }
+
+ ChecksumIndexOutput output = new ChecksumIndexOutput(directory.CreateOutput(segmentFileName));
+
+ bool success = false;
+
+ try
+ {
+ output.WriteInt(CURRENT_FORMAT); // write FORMAT
+ output.WriteLong(++version); // every write changes
+ // the index
+ output.WriteInt(counter); // write counter
+ output.WriteInt(Count); // write infos
+ for (int i = 0; i < Count; i++)
+ {
+ Info(i).Write(output);
+ }
+ output.PrepareCommit();
+ success = true;
+ pendingOutput = output;
+ }
+ finally
+ {
+ if (!success)
+ {
+ // we hit an exception above; try to close the file but suppress any exception:
+ try
+ {
+ output.Close();
+ }
+ catch (System.Exception)
+ {
+ // suppress so we keep throwing the original exception
+ }
+ try
+ {
+ // try not to leave a truncated segments_N file in the index
+ directory.DeleteFile(segmentFileName);
+ }
+ catch (System.Exception)
+ {
+ // suppress so we keep throwing the original exception
+ }
+ }
+ }
+ }
+
/// <summary> Returns a copy of this instance, also copying each
/// SegmentInfo.
/// </summary>
- public override System.Object Clone()
+ public override object Clone()
{
- SegmentInfos si = new SegmentInfos();
- for (int i = 0; i < base.Count; i++)
- {
- si.Add(((SegmentInfo) base[i]).Clone());
- }
- si.generation = this.generation;
- si.lastGeneration = this.lastGeneration;
- return si;
+ SegmentInfos si = new SegmentInfos();
+ for (int i = 0; i < base.Count; i++)
+ {
+ si.Add(((SegmentInfo)base[i]).Clone());
+ }
+ si.counter = this.counter;
+ si.version = this.version;
+ si.generation = this.generation;
+ si.lastGeneration = this.lastGeneration;
+ return si;
}
- private SegmentInfos(SegmentInfos si) : base(si)
- {
- }
+ //private SegmentInfos(SegmentInfos si)
+ // : base(si)
+ //{
+ //}
public SegmentInfos()
{
@@ -556,7 +575,7 @@
this.directory = directory;
}
- public System.Object Run()
+ public object Run()
{
System.String segmentFileName = null;
long lastGen = - 1;
@@ -657,7 +676,7 @@
}
}
}
- catch (System.IO.IOException err2)
+ catch (System.IO.IOException)
{
// will retry
}
@@ -670,7 +689,7 @@
{
System.Threading.Thread.Sleep(new System.TimeSpan((System.Int64) 10000 * Lucene.Net.Index.SegmentInfos.defaultGenFileRetryPauseMsec));
}
- catch (System.Threading.ThreadInterruptedException e)
+ catch (System.Threading.ThreadInterruptedException)
{
// will retry
}
@@ -739,7 +758,7 @@
retry = true;
}
}
- else
+ else if (0 == method)
{
// Segment file has advanced since our last loop, so
// reset retry:
@@ -752,7 +771,7 @@
try
{
- System.Object v = DoBody(segmentFileName);
+ object v = DoBody(segmentFileName);
if (exc != null)
{
Lucene.Net.Index.SegmentInfos.Message("success on " + segmentFileName);
@@ -786,53 +805,205 @@
else
{
bool tmpBool;
- if (System.IO.File.Exists(new System.IO.FileInfo(fileDirectory.FullName + System.IO.Path.DirectorySeparatorChar + prevSegmentFileName).FullName))
+ if (System.IO.File.Exists(new System.IO.FileInfo(fileDirectory.FullName + "\\" + prevSegmentFileName).FullName))
tmpBool = true;
else
- tmpBool = System.IO.Directory.Exists(new System.IO.FileInfo(fileDirectory.FullName + System.IO.Path.DirectorySeparatorChar + prevSegmentFileName).FullName);
+ tmpBool = System.IO.Directory.Exists(new System.IO.FileInfo(fileDirectory.FullName + "\\" + prevSegmentFileName).FullName);
prevExists = tmpBool;
}
if (prevExists)
- {
- Lucene.Net.Index.SegmentInfos.Message("fallback to prior segment file '" + prevSegmentFileName + "'");
- try
- {
- System.Object v = DoBody(prevSegmentFileName);
- if (exc != null)
- {
- Lucene.Net.Index.SegmentInfos.Message("success on fallback " + prevSegmentFileName);
- }
- return v;
- }
- catch (System.IO.IOException err2)
- {
- Lucene.Net.Index.SegmentInfos.Message("secondary Exception on '" + prevSegmentFileName + "': " + err2 + "'; will retry");
- }
- }
- }
- }
- }
- }
-
- /// <summary> Subclass must implement this. The assumption is an
- /// IOException will be thrown if something goes wrong
- /// during the processing that could have been caused by
- /// a writer committing.
- /// </summary>
- protected internal abstract System.Object DoBody(System.String segmentFileName);
- }
-
- /// <summary> Returns a new SegmentInfos containg the SegmentInfo
- /// instances in the specified range first (inclusive) to
- /// last (exclusive), so total number of segments returned
- /// is last-first.
- /// </summary>
- public SegmentInfos Range(int first, int last)
- {
- SegmentInfos infos = new SegmentInfos();
- infos.AddRange((System.Collections.IList) ((System.Collections.ArrayList) this).GetRange(first, last - first));
- return infos;
- }
- }
+ {
+ Lucene.Net.Index.SegmentInfos.Message("fallback to prior segment file '" + prevSegmentFileName + "'");
+ try
+ {
+ object v = DoBody(prevSegmentFileName);
+ if (exc != null)
+ {
+ Lucene.Net.Index.SegmentInfos.Message("success on fallback " + prevSegmentFileName);
+ }
+ return v;
+ }
+ catch (System.IO.IOException err2)
+ {
+ Lucene.Net.Index.SegmentInfos.Message("secondary Exception on '" + prevSegmentFileName + "': " + err2 + "'; will retry");
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /// <summary> Subclass must implement this. The assumption is that an
+ /// IOException will be thrown if something goes wrong
+ /// during the processing that could have been caused by
+ /// a writer committing.
+ /// </summary>
+ protected internal abstract object DoBody(System.String segmentFileName);
+ }
+
+ /// <summary> Returns a new SegmentInfos containing the SegmentInfo
+ /// instances in the specified range first (inclusive) to
+ /// last (exclusive), so total number of segments returned
+ /// is last-first.
+ /// </summary>
+ public SegmentInfos Range(int first, int last)
+ {
+ SegmentInfos infos = new SegmentInfos();
+ infos.AddRange((System.Collections.IList)((System.Collections.ArrayList)this).GetRange(first, last - first));
+ return infos;
+ }
+
+ // carry over generation numbers from another SegmentInfos
+ internal void UpdateGeneration(SegmentInfos other)
+ {
+ lastGeneration = other.lastGeneration;
+ generation = other.generation;
+ version = other.version;
+ }
+
+ public void RollbackCommit(Directory dir)
+ {
+ if (pendingOutput != null)
+ {
+ try
+ {
+ pendingOutput.Close();
+ }
+ catch (System.Exception)
+ {
+ // Suppress so we keep throwing the original exception
+ // in our caller
+ }
+
+ // Must carefully compute fileName from "generation"
+ // since lastGeneration isn't incremented:
+ try
+ {
+ String segmentFileName = IndexFileNames.FileNameFromGeneration(IndexFileNames.SEGMENTS, "", generation);
+ dir.DeleteFile(segmentFileName);
+ }
+ catch (System.Exception)
+ {
+ // Suppress so we keep throwing the original exception
+ // in our caller
+ }
+ pendingOutput = null;
+ }
+ }
+
+ /// <summary> Call this to start a commit. This writes the new
+ /// segments file, but writes an invalid checksum at the
+ /// end, so that it is not visible to readers. Once this
+ /// is called you must call <see cref="FinishCommit"/> to complete
+ /// the commit or <see cref="RollbackCommit"/> to abort it.</summary>
+ public void PrepareCommit(Directory dir)
+ {
+ if (pendingOutput != null)
+ throw new System.Exception("prepareCommit was already called");
+ Write(dir);
+ }
+
+ public void FinishCommit(Directory dir)
+ {
+ if (pendingOutput == null)
+ throw new System.Exception("prepareCommit was not called");
+ bool success = false;
+ try
+ {
+ pendingOutput.FinishCommit();
+ pendingOutput.Close();
+ pendingOutput = null;
+ success = true;
+ }
+ finally
+ {
+ if (!success)
+ RollbackCommit(dir);
+ }
+
+ // NOTE: if we crash here, we have left a segments_N
+ // file in the directory in a possibly corrupt state (if
+ // some bytes made it to stable storage and others
+ // didn't). But, the segments_N file includes checksum
+ // at the end, which should catch this case. So when a
+ // reader tries to read it, it will throw a
+ // CorruptIndexException, which should cause the retry
+ // logic in SegmentInfos to kick in and load the last
+ // good (previous) segments_N-1 file.
+
+ String fileName = IndexFileNames.FileNameFromGeneration(IndexFileNames.SEGMENTS, "", generation);
+ success = false;
+ try
+ {
+ dir.Sync(fileName);
+ success = true;
+ }
+ finally
+ {
+ if (!success)
+ {
+ try
+ {
+ dir.DeleteFile(fileName);
+ }
+ catch (System.Exception)
+ {
+ // Suppress so we keep throwing the original exception
+ }
+ }
+ }
+
+ lastGeneration = generation;
+
+ try
+ {
+ IndexOutput genOutput = dir.CreateOutput(IndexFileNames.SEGMENTS_GEN);
+ try
+ {
+ genOutput.WriteInt(FORMAT_LOCKLESS);
+ genOutput.WriteLong(generation);
+ genOutput.WriteLong(generation);
+ }
+ finally
+ {
+ genOutput.Close();
+ }
+ }
+ catch (System.Exception)
+ {
+ // It's OK if we fail to write this file since it's
+ // used only as one of the retry fallbacks.
+ }
+ }
+
+ /// <summary> Writes and syncs to the Directory dir, taking care to
+ /// remove the segments file on exception.</summary>
+ public void Commit(Directory dir)
+ {
+ PrepareCommit(dir);
+ FinishCommit(dir);
+ }
+
+ internal string SegString(Directory directory)
+ {
+ lock (this)
+ {
+ System.Text.StringBuilder buffer = new System.Text.StringBuilder();
+ int count = Count;
+ for (int i = 0; i < count; i++)
+ {
+ if (i > 0)
+ {
+ buffer.Append(' ');
+ }
+ SegmentInfo info = Info(i);
+ buffer.Append(info.SegString(directory));
+ if (info.dir != directory)
+ buffer.Append("**");
+ }
+ return buffer.ToString();
+ }
+ }
+ }
}
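
Taken together, the commit path is now two-phase: PrepareCommit writes segments_N but leaves the trailing checksum invalid so readers ignore the file, and FinishCommit seals the checksum, syncs, publishes the generation, and rewrites segments.gen; Commit is the convenience form that does both back to back. A minimal sketch of how a caller drives the new API (hypothetical; IndexWriter is the real consumer, and the FSDirectory path is illustrative only):

    Lucene.Net.Store.Directory dir = Lucene.Net.Store.FSDirectory.GetDirectory("index");
    SegmentInfos infos = new SegmentInfos();
    // ... populate infos with SegmentInfo entries ...
    infos.PrepareCommit(dir);      // writes segments_N; checksum not yet valid
    try
    {
        // any other preparatory work that may still fail goes here
        infos.FinishCommit(dir);   // seals the checksum, syncs, writes segments.gen
    }
    catch (System.Exception)
    {
        infos.RollbackCommit(dir); // closes and deletes the pending segments_N
        throw;
    }

Since RollbackCommit is a no-op once pendingOutput has been cleared, the catch block above is safe even when FinishCommit has already rolled back internally.
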
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentMergeQueue.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/SegmentMergeQueue.cs?rev=798995&r1=798994&r2=798995&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentMergeQueue.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentMergeQueue.cs Wed Jul 29 18:04:12 2009
@@ -29,7 +29,7 @@
Initialize(size);
}
- public override bool LessThan(System.Object a, System.Object b)
+ public override bool LessThan(object a, object b)
{
SegmentMergeInfo stiA = (SegmentMergeInfo) a;
SegmentMergeInfo stiB = (SegmentMergeInfo) b;
Modified: incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentMerger.cs
URL: http://svn.apache.org/viewvc/incubator/lucene.net/trunk/C%23/src/Lucene.Net/Index/SegmentMerger.cs?rev=798995&r1=798994&r2=798995&view=diff
==============================================================================
--- incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentMerger.cs (original)
+++ incubator/lucene.net/trunk/C#/src/Lucene.Net/Index/SegmentMerger.cs Wed Jul 29 18:04:12 2009
@@ -15,11 +15,12 @@
* limitations under the License.
*/
-using System;
+using System.Collections.Generic;
+using Document = Lucene.Net.Documents.Document;
+using Directory = Lucene.Net.Store.Directory;
using FieldSelector = Lucene.Net.Documents.FieldSelector;
using FieldSelectorResult = Lucene.Net.Documents.FieldSelectorResult;
-using Directory = Lucene.Net.Store.Directory;
using IndexInput = Lucene.Net.Store.IndexInput;
using IndexOutput = Lucene.Net.Store.IndexOutput;
@@ -40,31 +41,6 @@
/// </seealso>
public sealed class SegmentMerger
{
- [Serializable]
- private class AnonymousClassFieldSelector : FieldSelector
- {
- public AnonymousClassFieldSelector(SegmentMerger enclosingInstance)
- {
- InitBlock(enclosingInstance);
- }
- private void InitBlock(SegmentMerger enclosingInstance)
- {
- this.enclosingInstance = enclosingInstance;
- }
- private SegmentMerger enclosingInstance;
- public SegmentMerger Enclosing_Instance
- {
- get
- {
- return enclosingInstance;
- }
-
- }
- public FieldSelectorResult Accept(System.String fieldName)
- {
- return FieldSelectorResult.LOAD_FOR_MERGE;
- }
- }
private void InitBlock()
{
termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
@@ -118,6 +94,11 @@
checkAbort = new CheckAbort(merge, directory);
termIndexInterval = writer.GetTermIndexInterval();
}
+
+ internal bool HasProx()
+ {
+ return fieldInfos.HasProx();
+ }
/// <summary> Add an IndexReader to the collection of readers that are to be merged</summary>
/// <param name="reader">
@@ -203,6 +184,10 @@
for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.Length; i++)
{
System.String ext = IndexFileNames.COMPOUND_EXTENSIONS[i];
+
+ if (ext.Equals(IndexFileNames.PROX_EXTENSION) && !HasProx())
+ continue;
+
if (mergeDocStores || (!ext.Equals(IndexFileNames.FIELDS_EXTENSION) && !ext.Equals(IndexFileNames.FIELDS_INDEX_EXTENSION)))
files.Add(segment + "." + ext);
}
@@ -240,17 +225,52 @@
return files;
}
- private void AddIndexed(IndexReader reader, FieldInfos fieldInfos, System.Collections.ICollection names, bool storeTermVectors, bool storePositionWithTermVector, bool storeOffsetWithTermVector, bool storePayloads)
+ private void AddIndexed(IndexReader reader, FieldInfos fieldInfos, ICollection<string> names, bool storeTermVectors, bool storePositionWithTermVector, bool storeOffsetWithTermVector, bool storePayloads, bool omitTf)
{
- System.Collections.IEnumerator i = names.GetEnumerator();
+ IEnumerator<string> i = names.GetEnumerator();
while (i.MoveNext())
{
- System.String field = (System.String) i.Current;
- fieldInfos.Add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.HasNorms(field), storePayloads);
+ string field = i.Current;
+ fieldInfos.Add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.HasNorms(field), storePayloads, omitTf);
}
}
-
- /// <summary> </summary>
+
+ private SegmentReader[] matchingSegmentReaders;
+ private int[] rawDocLengths;
+ private int[] rawDocLengths2;
+
+ private void SetMatchingSegmentReaders()
+ {
+ // if the i'th reader is a SegmentReader and has
+ // identical fieldName->number mapping, then this
+ // array will be non-null at position i:
+ matchingSegmentReaders = new SegmentReader[readers.Count];
+
+ // if this reader is a SegmentReader, and all of its
+ // fieldName->number mappings match the "merged"
+ // FieldInfos, then we can do a bulk copy of the
+ // stored fields
+ for (int i = 0; i < readers.Count; i++)
+ {
+ IndexReader reader = (IndexReader)readers[i];
+ if (reader is SegmentReader)
+ {
+ SegmentReader segmentReader = (SegmentReader)reader;
+ bool same = true;
+ FieldInfos segmentFieldInfos = segmentReader.GetFieldInfos();
+ for (int j = 0; same && j < segmentFieldInfos.Size(); j++)
+ same = fieldInfos.FieldName(j).Equals(segmentFieldInfos.FieldName(j));
+ if (same)
+ matchingSegmentReaders[i] = segmentReader;
+ }
+ }
+
+ // used for bulk-reading raw bytes for stored fields
+ rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
+ rawDocLengths2 = new int[MAX_RAW_MERGE_DOCS];
+ }
+
+ /// <summary> </summary>
/// <returns> The number of documents in all of the readers
/// </returns>
/// <throws> CorruptIndexException if the index is corrupt </throws>
@@ -283,167 +303,272 @@
for (int j = 0; j < segmentReader.GetFieldInfos().Size(); j++)
{
FieldInfo fi = segmentReader.GetFieldInfos().FieldInfo(j);
- fieldInfos.Add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.HasNorms(fi.name), fi.storePayloads);
+ fieldInfos.Add(fi.name, fi.isIndexed, fi.storeTermVector, fi.storePositionWithTermVector, fi.storeOffsetWithTermVector, !reader.HasNorms(fi.name), fi.storePayloads, fi.omitTf);
}
}
else
{
- AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
- AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
- AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
- AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
- AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
- AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
+ AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false, false);
+ AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false, false);
+ AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false, false);
+ AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false, false);
+ AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.OMIT_TF), false, false, false, false, true);
+ AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true, false);
+ AddIndexed(reader, fieldInfos, reader.GetFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false, false);
fieldInfos.Add(reader.GetFieldNames(IndexReader.FieldOption.UNINDEXED), false);
}
}
fieldInfos.Write(directory, segment + ".fnm");
int docCount = 0;
-
+
+ SetMatchingSegmentReaders();
+
if (mergeDocStores)
{
-
- // If the i'th reader is a SegmentReader and has
- // identical fieldName -> number mapping, then this
- // array will be non-null at position i:
- SegmentReader[] matchingSegmentReaders = new SegmentReader[readers.Count];
-
- // If this reader is a SegmentReader, and all of its
- // field name -> number mappings match the "merged"
- // FieldInfos, then we can do a bulk copy of the
- // stored fields:
- for (int i = 0; i < readers.Count; i++)
- {
- IndexReader reader = (IndexReader) readers[i];
- if (reader is SegmentReader)
- {
- SegmentReader segmentReader = (SegmentReader) reader;
- bool same = true;
- FieldInfos segmentFieldInfos = segmentReader.GetFieldInfos();
- for (int j = 0; same && j < segmentFieldInfos.Size(); j++)
- same = fieldInfos.FieldName(j).Equals(segmentFieldInfos.FieldName(j));
- if (same)
- {
- matchingSegmentReaders[i] = segmentReader;
- }
- }
- }
-
- // Used for bulk-reading raw bytes for stored fields
- int[] rawDocLengths = new int[MAX_RAW_MERGE_DOCS];
-
// for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
// in merge mode, we use this FieldSelector
FieldSelector fieldSelectorMerge = new AnonymousClassFieldSelector(this);
// merge field values
FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
-
- try
- {
- for (int i = 0; i < readers.Count; i++)
- {
- IndexReader reader = (IndexReader) readers[i];
- SegmentReader matchingSegmentReader = matchingSegmentReaders[i];
- FieldsReader matchingFieldsReader;
- if (matchingSegmentReader != null)
- matchingFieldsReader = matchingSegmentReader.GetFieldsReader();
- else
- matchingFieldsReader = null;
- int maxDoc = reader.MaxDoc();
- for (int j = 0; j < maxDoc; )
- {
- if (!reader.IsDeleted(j))
- {
- // skip deleted docs
- if (matchingSegmentReader != null)
- {
- // We can optimize this case (doing a bulk
- // byte copy) since the field numbers are
- // identical
- int start = j;
- int numDocs = 0;
- do
- {
- j++;
- numDocs++;
- }
- while (j < maxDoc && !matchingSegmentReader.IsDeleted(j) && numDocs < MAX_RAW_MERGE_DOCS);
-
- IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs);
- fieldsWriter.AddRawDocuments(stream, rawDocLengths, numDocs);
- docCount += numDocs;
- if (checkAbort != null)
- checkAbort.Work(300 * numDocs);
- }
- else
- {
- fieldsWriter.AddDocument(reader.Document(j, fieldSelectorMerge));
- j++;
- docCount++;
- if (checkAbort != null)
- checkAbort.Work(300);
- }
- }
- else
- j++;
- }
- }
- }
- finally
- {
- fieldsWriter.Close();
- }
- System.Diagnostics.Debug.Assert(docCount*8 == directory.FileLength(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION),
- "after MergeFields: fdx size mismatch: " + docCount + " docs vs " +
- directory.FileLength(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION) +
- " length in bytes of " + segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION);
- }
- // If we are skipping the doc stores, that means there
- // are no deletions in any of these segments, so we
- // just sum numDocs() of each segment to get total docCount
- else
- for (int i = 0; i < readers.Count; i++)
- docCount += ((IndexReader) readers[i]).NumDocs();
-
- return docCount;
- }
-
- /// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
- /// <throws> IOException </throws>
- private void MergeVectors()
- {
- TermVectorsWriter termVectorsWriter = new TermVectorsWriter(directory, segment, fieldInfos);
-
- try
- {
- for (int r = 0; r < readers.Count; r++)
- {
- IndexReader reader = (IndexReader) readers[r];
- int maxDoc = reader.MaxDoc();
- for (int docNum = 0; docNum < maxDoc; docNum++)
- {
- // skip deleted docs
- if (reader.IsDeleted(docNum))
- continue;
- termVectorsWriter.AddAllDocVectors(reader.GetTermFreqVectors(docNum));
- if (checkAbort != null)
- checkAbort.Work(300);
- }
- }
- }
- finally
- {
- termVectorsWriter.Close();
- }
+ try
+ {
+ for (int i = 0; i < readers.Count; i++)
+ {
+ IndexReader reader = (IndexReader)readers[i];
+ SegmentReader matchingSegmentReader = matchingSegmentReaders[i];
+ FieldsReader matchingFieldsReader;
+ bool hasMatchingReader;
+ if (matchingSegmentReader != null)
+ {
+ FieldsReader fieldsReader = matchingSegmentReader.GetFieldsReader();
+ if (fieldsReader != null && !fieldsReader.CanReadRawDocs())
+ {
+ matchingFieldsReader = null;
+ hasMatchingReader = false;
+ }
+ else
+ {
+ matchingFieldsReader = fieldsReader;
+ hasMatchingReader = true;
+ }
+ }
+ else
+ {
+ hasMatchingReader = false;
+ matchingFieldsReader = null;
+ }
+ int maxDoc = reader.MaxDoc();
+ bool hasDeletions = reader.HasDeletions();
+ for (int j = 0; j < maxDoc; )
+ {
+ if (!hasDeletions || !reader.IsDeleted(j))
+ { // skip deleted docs
+ if (hasMatchingReader)
+ {
+ // We can optimize this case (doing a bulk
+ // byte copy) since the field numbers are
+ // identical
+ int start = j;
+ int numDocs = 0;
+ do
+ {
+ j++;
+ numDocs++;
+ if (j >= maxDoc)
+ break;
+ if (hasDeletions && matchingSegmentReader.IsDeleted(j))
+ {
+ j++;
+ break;
+ }
+ } while (numDocs < MAX_RAW_MERGE_DOCS);
- System.Diagnostics.Debug.Assert(4 + mergedDocs * 8 == directory.FileLength(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION),
- "after MergeVectors: tvx size mismatch: " + mergedDocs + " docs vs " +
- directory.FileLength(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION) +
- " length in bytes of " + segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
- }
+ IndexInput stream = matchingFieldsReader.RawDocs(rawDocLengths, start, numDocs);
+ fieldsWriter.AddRawDocuments(stream, rawDocLengths, numDocs);
+ docCount += numDocs;
+ if (checkAbort != null)
+ checkAbort.Work(300 * numDocs);
+ }
+ else
+ {
+ // NOTE: it's very important to first assign
+ // to doc then pass it to
+ // termVectorsWriter.addAllDocVectors; see
+ // LUCENE-1282
+ Document doc = reader.Document(j, fieldSelectorMerge);
+ fieldsWriter.AddDocument(doc);
+ j++;
+ docCount++;
+ if (checkAbort != null)
+ checkAbort.Work(300);
+ }
+ }
+ else
+ j++;
+ }
+ }
+ }
+ finally
+ {
+ fieldsWriter.Close();
+ }
+
+ long fdxFileLength = directory.FileLength(segment + "." + IndexFileNames.FIELDS_INDEX_EXTENSION);
+
+ // {{dougsale-2.4.0}
+ // this shouldn't be a problem for us - if it is,
+ // then it's not a JRE bug...
+ //if (4+docCount*8 != fdxFileLength)
+ // // This is most likely a bug in Sun JRE 1.6.0_04/_05;
+ // // we detect that the bug has struck, here, and
+ // // throw an exception to prevent the corruption from
+ // // entering the index. See LUCENE-1282 for
+ // // details.
+ // throw new RuntimeException("mergeFields produced an invalid result: docCount is " + docCount + " but fdx file size is " + fdxFileLength + "; now aborting this merge to prevent index corruption");
+
+ }
+ else
+ // If we are skipping the doc stores, that means there
+ // are no deletions in any of these segments, so we
+ // just sum numDocs() of each segment to get total docCount
+ for (int i = 0; i < readers.Count; i++)
+ docCount += ((IndexReader)readers[i]).NumDocs();
+
+ return docCount;
+ }
+
+ [System.Serializable]
+ private class AnonymousClassFieldSelector : FieldSelector
+ {
+ public AnonymousClassFieldSelector(SegmentMerger enclosingInstance)
+ {
+ InitBlock(enclosingInstance);
+ }
+ private void InitBlock(SegmentMerger enclosingInstance)
+ {
+ this.enclosingInstance = enclosingInstance;
+ }
+ private SegmentMerger enclosingInstance;
+ public SegmentMerger Enclosing_Instance
+ {
+ get
+ {
+ return enclosingInstance;
+ }
+
+ }
+ public FieldSelectorResult Accept(System.String fieldName)
+ {
+ return FieldSelectorResult.LOAD_FOR_MERGE;
+ }
+ }
+
+ /// <summary> Merge the TermVectors from each of the segments into the new one.</summary>
+ /// <throws> IOException </throws>
+ private void MergeVectors()
+ {
+ TermVectorsWriter termVectorsWriter =
+ new TermVectorsWriter(directory, segment, fieldInfos);
+
+ try
+ {
+ for (int r = 0; r < readers.Count; r++)
+ {
+ SegmentReader matchingSegmentReader = matchingSegmentReaders[r];
+ TermVectorsReader matchingVectorsReader;
+ bool hasMatchingReader;
+ if (matchingSegmentReader != null)
+ {
+ matchingVectorsReader = matchingSegmentReader.termVectorsReaderOrig;
+
+ // If the TV* files are an older format then they
+ // cannot read raw docs:
+ if (matchingVectorsReader != null && !matchingVectorsReader.CanReadRawDocs())
+ {
+ matchingVectorsReader = null;
+ hasMatchingReader = false;
+ }
+ else
+ hasMatchingReader = matchingVectorsReader != null;
+
+ }
+ else
+ {
+ hasMatchingReader = false;
+ matchingVectorsReader = null;
+ }
+ IndexReader reader = (IndexReader)readers[r];
+ bool hasDeletions = reader.HasDeletions();
+ int maxDoc = reader.MaxDoc();
+ for (int docNum = 0; docNum < maxDoc; )
+ {
+ // skip deleted docs
+ if (!hasDeletions || !reader.IsDeleted(docNum))
+ {
+ if (hasMatchingReader)
+ {
+ // We can optimize this case (doing a bulk
+ // byte copy) since the field numbers are
+ // identical
+ int start = docNum;
+ int numDocs = 0;
+ do
+ {
+ docNum++;
+ numDocs++;
+ if (docNum >= maxDoc)
+ break;
+ if (hasDeletions && matchingSegmentReader.IsDeleted(docNum))
+ {
+ docNum++;
+ break;
+ }
+ } while (numDocs < MAX_RAW_MERGE_DOCS);
+
+ matchingVectorsReader.RawDocs(rawDocLengths, rawDocLengths2, start, numDocs);
+ termVectorsWriter.AddRawDocuments(matchingVectorsReader, rawDocLengths, rawDocLengths2, numDocs);
+ if (checkAbort != null)
+ checkAbort.Work(300 * numDocs);
+ }
+ else
+ {
+ // NOTE: it's very important to first assign
+ // to vectors then pass it to
+ // termVectorsWriter.addAllDocVectors; see
+ // LUCENE-1282
+ TermFreqVector[] vectors = reader.GetTermFreqVectors(docNum);
+ termVectorsWriter.AddAllDocVectors(vectors);
+ docNum++;
+ if (checkAbort != null)
+ checkAbort.Work(300);
+ }
+ }
+ else
+ docNum++;
+ }
+ }
+ }
+ finally
+ {
+ termVectorsWriter.Close();
+ }
+
+ long tvxSize = directory.FileLength(segment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
+
+ // {{dougsale-2.4.0}
+ // this shouldn't be a problem for us - if it is,
+ // then it's not a JRE bug
+ //if (4 + mergedDocs * 16 != tvxSize)
+ // // This is most likely a bug in Sun JRE 1.6.0_04/_05;
+ // // we detect that the bug has struck, here, and
+ // // throw an exception to prevent the corruption from
+ // // entering the index. See LUCENE-1282 for
+ // // details.
+ // throw new RuntimeException("mergeVectors produced an invalid result: mergedDocs is " + mergedDocs + " but tvx size is " + tvxSize + "; now aborting this merge to prevent index corruption");
+ }
private IndexOutput freqOutput = null;
private IndexOutput proxOutput = null;
@@ -458,8 +583,9 @@
try
{
freqOutput = directory.CreateOutput(segment + ".frq");
- proxOutput = directory.CreateOutput(segment + ".prx");
- termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
+ if (HasProx())
+ proxOutput = directory.CreateOutput(segment + ".prx");
+ termInfosWriter = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
skipInterval = termInfosWriter.skipInterval;
maxSkipLevels = termInfosWriter.maxSkipLevels;
skipListWriter = new DefaultSkipListWriter(skipInterval, maxSkipLevels, mergedDocs, freqOutput, proxOutput);
@@ -483,15 +609,28 @@
private void MergeTermInfos()
{
int base_Renamed = 0;
- for (int i = 0; i < readers.Count; i++)
+ int readerCount = readers.Count;
+ for (int i = 0; i < readerCount; i++)
{
IndexReader reader = (IndexReader) readers[i];
TermEnum termEnum = reader.Terms();
SegmentMergeInfo smi = new SegmentMergeInfo(base_Renamed, termEnum, reader);
- base_Renamed += reader.NumDocs();
+
+ int[] docMap = smi.GetDocMap();
+ if (docMap != null)
+ {
+ if (docMaps == null)
+ {
+ docMaps = new int[readerCount][];
+ delCounts = new int[readerCount];
+ }
+ docMaps[i] = docMap;
+ delCounts[i] = smi.reader.MaxDoc() - smi.reader.NumDocs();
+ }
+
+ base_Renamed += reader.NumDocs();
if (smi.Next())
- queue.Put(smi);
- // initialize queue
+ queue.Put(smi); // initialize queue
else
smi.Close();
}
@@ -544,9 +683,22 @@
private int MergeTermInfo(SegmentMergeInfo[] smis, int n)
{
long freqPointer = freqOutput.GetFilePointer();
- long proxPointer = proxOutput.GetFilePointer();
-
- int df = AppendPostings(smis, n); // append posting data
+ long proxPointer;
+ if (proxOutput != null)
+ proxPointer = proxOutput.GetFilePointer();
+ else
+ proxPointer = 0;
+
+ int df;
+ if (fieldInfos.FieldInfo(smis[0].term.field).omitTf)
+ {
+ // append posting data
+ df = AppendPostingsNoTf(smis, n);
+ }
+ else
+ {
+ df = AppendPostings(smis, n);
+ }
long skipPointer = skipListWriter.WriteSkip(freqOutput);
@@ -560,7 +712,17 @@
return df;
}
- private byte[] payloadBuffer = null;
+ private byte[] payloadBuffer;
+ private int[][] docMaps;
+ internal int[][] GetDocMaps()
+ {
+ return docMaps;
+ }
+ private int[] delCounts;
+ internal int[] GetDelCounts()
+ {
+ return delCounts;
+ }
/// <summary>Process postings from multiple segments all positioned on the
/// same term. Writes out merged entries into freqOutput and
@@ -622,7 +784,7 @@
freqOutput.WriteVInt(freq); // write frequency in doc
}
- /** See {@link DocumentWriter#writePostings(Posting[], String) for
+ /** See {@link DocumentWriter#writePostings(Posting[], String)} for
* documentation about the encoding of positions and payloads
*/
int lastPosition = 0; // write position deltas
@@ -663,8 +825,56 @@
}
return df;
}
-
- private void MergeNorms()
+
+ /// <summary>
+ /// Process postings from multiple segments without tf, all positioned on the same term.
+ /// Writes out merged entries only into freqOutput; proxOutput is not written.
+ /// </summary>
+ /// <param name="smis">array of segments positioned on the same term</param>
+ /// <param name="n">number of cells in the array actually occupied</param>
+ /// <returns>number of documents containing the term</returns>
+ private int AppendPostingsNoTf(SegmentMergeInfo[] smis, int n)
+ {
+ int lastDoc = 0;
+ int df = 0; // number of docs w/ term
+ skipListWriter.ResetSkip();
+ int lastPayloadLength = -1; // ensures that we write the first length
+ for (int i = 0; i < n; i++)
+ {
+ SegmentMergeInfo smi = smis[i];
+ TermPositions postings = smi.GetPositions();
+ System.Diagnostics.Debug.Assert(postings != null);
+ int base_Renamed = smi.base_Renamed;
+ int[] docMap = smi.GetDocMap();
+ postings.Seek(smi.termEnum);
+ while (postings.Next())
+ {
+ int doc = postings.Doc();
+ if (docMap != null)
+ doc = docMap[doc]; // map around deletions
+ doc += base_Renamed; // convert to merged space
+
+ if (doc < 0 || (df > 0 && doc <= lastDoc))
+ throw new CorruptIndexException("docs out of order (" + doc +
+ " <= " + lastDoc + " )");
+
+ df++;
+
+ if ((df % skipInterval) == 0)
+ {
+ skipListWriter.SetSkipData(lastDoc, false, lastPayloadLength);
+ skipListWriter.BufferSkip(df);
+ }
+
+ int docCode = (doc - lastDoc);
+ lastDoc = doc;
+ freqOutput.WriteVInt(docCode); // write doc & freq=1
+ }
+ }
+ return df;
+ }
+
+ private void MergeNorms()
{
byte[] normBuffer = null;
IndexOutput output = null;