You are viewing a plain-text version of this content. The hyperlink to the canonical version was not preserved in this copy.
Posted to commits@lucenenet.apache.org by ni...@apache.org on 2021/12/05 07:00:38 UTC
[lucenenet] 02/05: Lucene.Net.Codecs.SimpleText.SimpleTextFieldsReader: Re-ported to rule out any logic problems that were introduced by rearranging the statements
This is an automated email from the ASF dual-hosted git repository.
nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit c7fbc2da14d797bc65970e4a0b1aed212adb34a3
Author: Shad Storhaug <sh...@shadstorhaug.com>
AuthorDate: Fri Dec 3 19:48:31 2021 +0700
Lucene.Net.Codecs.SimpleText.SimpleTextFieldsReader: Re-ported to rule out any logic problems that were introduced by rearranging the statements
---
.../SimpleText/SimpleTextFieldsReader.cs | 622 +++++++++++----------
1 file changed, 315 insertions(+), 307 deletions(-)
diff --git a/src/Lucene.Net.Codecs/SimpleText/SimpleTextFieldsReader.cs b/src/Lucene.Net.Codecs/SimpleText/SimpleTextFieldsReader.cs
index ab7d01f..0763a37 100644
--- a/src/Lucene.Net.Codecs/SimpleText/SimpleTextFieldsReader.cs
+++ b/src/Lucene.Net.Codecs/SimpleText/SimpleTextFieldsReader.cs
@@ -53,40 +53,35 @@ namespace Lucene.Net.Codecs.SimpleText
internal class SimpleTextFieldsReader : FieldsProducer
{
- private readonly IDictionary<string, long?> _fields;
- private readonly IndexInput _input;
- private readonly FieldInfos _fieldInfos;
- private readonly int _maxDoc;
- private readonly IDictionary<string, SimpleTextTerms> _termsCache = new Dictionary<string, SimpleTextTerms>();
+ private readonly JCG.SortedDictionary<string, long?> fields;
+ private readonly IndexInput input;
+ private readonly FieldInfos fieldInfos;
+ private readonly int maxDoc;
public SimpleTextFieldsReader(SegmentReadState state)
{
- _maxDoc = state.SegmentInfo.DocCount;
- _fieldInfos = state.FieldInfos;
- _input =
- state.Directory.OpenInput(
- SimpleTextPostingsFormat.GetPostingsFileName(state.SegmentInfo.Name, state.SegmentSuffix),
- state.Context);
+ this.maxDoc = state.SegmentInfo.DocCount;
+ fieldInfos = state.FieldInfos;
+ input = state.Directory.OpenInput(SimpleTextPostingsFormat.GetPostingsFileName(state.SegmentInfo.Name, state.SegmentSuffix), state.Context);
bool success = false;
try
{
- _fields = ReadFields((IndexInput)_input.Clone());
+ fields = ReadFields((IndexInput)input.Clone());
success = true;
}
finally
{
if (!success)
{
- IOUtils.DisposeWhileHandlingException();
+ IOUtils.DisposeWhileHandlingException(this);
}
}
}
- private IDictionary<string, long?> ReadFields(IndexInput @in)
+ private static JCG.SortedDictionary<string, long?> ReadFields(IndexInput @in) // LUCENENET specific - marked static
{
ChecksumIndexInput input = new BufferedChecksumIndexInput(@in);
- var scratch = new BytesRef(10);
-
+ BytesRef scratch = new BytesRef(10);
// LUCENENET specific: Use StringComparer.Ordinal to get the same ordering as Java
var fields = new JCG.SortedDictionary<string, long?>(StringComparer.Ordinal);
@@ -98,11 +93,9 @@ namespace Lucene.Net.Codecs.SimpleText
SimpleTextUtil.CheckFooter(input);
return fields;
}
-
- if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD))
{
- var fieldName = Encoding.UTF8.GetString(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FIELD.Length,
- scratch.Length - SimpleTextFieldsWriter.FIELD.Length);
+ string fieldName = Encoding.UTF8.GetString(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FIELD.Length, scratch.Length - SimpleTextFieldsWriter.FIELD.Length);
fields[fieldName] = input.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
}
}
@@ -110,109 +103,128 @@ namespace Lucene.Net.Codecs.SimpleText
private class SimpleTextTermsEnum : TermsEnum
{
- private readonly SimpleTextFieldsReader _outerInstance;
+ private readonly SimpleTextFieldsReader outerInstance;
- private readonly IndexOptions _indexOptions;
- private int _docFreq;
- private long _totalTermFreq;
- private long _docsStart;
-
- private readonly BytesRefFSTEnum<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair> _fstEnum;
+ private readonly IndexOptions indexOptions;
+ private int docFreq;
+ private long totalTermFreq;
+ private long docsStart;
+ //private bool ended; // LUCENENET: Never read
+ private readonly BytesRefFSTEnum<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair> fstEnum;
- public SimpleTextTermsEnum(SimpleTextFieldsReader outerInstance,
- FST<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair> fst, IndexOptions indexOptions)
+ public SimpleTextTermsEnum(SimpleTextFieldsReader outerInstance, FST<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair> fst, IndexOptions indexOptions)
{
- _outerInstance = outerInstance;
- _indexOptions = indexOptions;
- _fstEnum = new BytesRefFSTEnum<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair>(fst);
+ this.outerInstance = outerInstance;
+ this.indexOptions = indexOptions;
+ fstEnum = new BytesRefFSTEnum<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair>(fst);
}
public override bool SeekExact(BytesRef text)
{
-
- var result = _fstEnum.SeekExact(text);
-
- if (result == null) return false;
-
- var pair1 = result.Output;
- var pair2 = pair1.Output2;
- _docsStart = pair1.Output1.Value;
- _docFreq = (int) pair2.Output1;
- _totalTermFreq = pair2.Output2.Value;
- return true;
+ var result = fstEnum.SeekExact(text);
+ if (result != null)
+ {
+ var pair1 = result.Output;
+ var pair2 = pair1.Output2;
+ docsStart = pair1.Output1.Value;
+ docFreq = (int)pair2.Output1;
+ totalTermFreq = pair2.Output2.Value;
+ return true;
+ }
+ else
+ {
+ return false;
+ }
}
public override SeekStatus SeekCeil(BytesRef text)
{
- var result = _fstEnum.SeekCeil(text);
+ //System.out.println("seek to text=" + text.utf8ToString());
+ var result = fstEnum.SeekCeil(text);
if (result == null)
+ {
+ //System.out.println(" end");
return SeekStatus.END;
-
- var pair1 = result.Output;
- var pair2 = pair1.Output2;
- _docsStart = pair1.Output1.Value;
- _docFreq = (int) pair2.Output1;
- _totalTermFreq = pair2.Output2.Value;
-
- return result.Input.Equals(text) ? SeekStatus.FOUND : SeekStatus.NOT_FOUND;
-
+ }
+ else
+ {
+ //System.out.println(" got text=" + term.utf8ToString());
+ var pair1 = result.Output;
+ var pair2 = pair1.Output2;
+ docsStart = pair1.Output1.Value;
+ docFreq = (int)pair2.Output1;
+ totalTermFreq = pair2.Output2.Value;
+
+ if (result.Input.Equals(text))
+ {
+ //System.out.println(" match docsStart=" + docsStart);
+ return SeekStatus.FOUND;
+ }
+ else
+ {
+ //System.out.println(" not match docsStart=" + docsStart);
+ return SeekStatus.NOT_FOUND;
+ }
+ }
}
public override bool MoveNext()
{
//if (Debugging.AssertsEnabled) Debugging.Assert(!ended); // LUCENENET: Ended field is never set, so this can never fail
- if (!_fstEnum.MoveNext()) return false;
-
- var pair1 = _fstEnum.Current.Output;
- var pair2 = pair1.Output2;
- _docsStart = pair1.Output1.Value;
- _docFreq = (int)pair2.Output1;
- _totalTermFreq = pair2.Output2.Value;
- return _fstEnum.Current.Input != null;
+ if (fstEnum.MoveNext())
+ {
+ var pair1 = fstEnum.Current.Output;
+ var pair2 = pair1.Output2;
+ docsStart = pair1.Output1.Value;
+ docFreq = (int)pair2.Output1;
+ totalTermFreq = pair2.Output2.Value;
+ return fstEnum.Current.Input != null;
+ }
+ else
+ {
+ return false;
+ }
}
[Obsolete("Use MoveNext() and Term instead. This method will be removed in 4.8.0 release candidate."), System.ComponentModel.EditorBrowsable(System.ComponentModel.EditorBrowsableState.Never)]
public override BytesRef Next()
{
if (MoveNext())
- return _fstEnum.Current.Input;
+ return fstEnum.Current.Input;
return null;
}
- public override BytesRef Term => _fstEnum.Current.Input;
+ public override BytesRef Term => fstEnum.Current.Input;
public override long Ord => throw UnsupportedOperationException.Create();
- public override void SeekExact(long ord)
- {
- throw UnsupportedOperationException.Create();
- }
+ public override void SeekExact(long ord) => throw UnsupportedOperationException.Create();
- public override int DocFreq => _docFreq;
+ public override int DocFreq => docFreq;
- public override long TotalTermFreq => _indexOptions == IndexOptions.DOCS_ONLY ? -1 : _totalTermFreq;
+ public override long TotalTermFreq => indexOptions == IndexOptions.DOCS_ONLY ? -1 : totalTermFreq;
public override DocsEnum Docs(IBits liveDocs, DocsEnum reuse, DocsFlags flags)
{
- if (reuse is null || !(reuse is SimpleTextDocsEnum docsEnum) || !docsEnum.CanReuse(_outerInstance._input))
- docsEnum = new SimpleTextDocsEnum(_outerInstance);
-
- return docsEnum.Reset(_docsStart, liveDocs, _indexOptions == IndexOptions.DOCS_ONLY, _docFreq);
+ if (reuse is null || !(reuse is SimpleTextDocsEnum docsEnum) || !docsEnum.CanReuse(outerInstance.input))
+ docsEnum = new SimpleTextDocsEnum(outerInstance);
+
+ return docsEnum.Reset(docsStart, liveDocs, indexOptions == IndexOptions.DOCS_ONLY, docFreq);
}
public override DocsAndPositionsEnum DocsAndPositions(IBits liveDocs, DocsAndPositionsEnum reuse, DocsAndPositionsFlags flags)
{
// LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
- if (IndexOptionsComparer.Default.Compare(_indexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0)
+ if (IndexOptionsComparer.Default.Compare(indexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) < 0)
{
// Positions were not indexed
return null;
}
- if (reuse is null || !(reuse is SimpleTextDocsAndPositionsEnum docsAndPositionsEnum) || !docsAndPositionsEnum.CanReuse(_outerInstance._input))
- docsAndPositionsEnum = new SimpleTextDocsAndPositionsEnum(_outerInstance);
+ if (reuse is null || !(reuse is SimpleTextDocsAndPositionsEnum docsAndPositionsEnum) || !docsAndPositionsEnum.CanReuse(outerInstance.input))
+ docsAndPositionsEnum = new SimpleTextDocsAndPositionsEnum(outerInstance);
- return docsAndPositionsEnum.Reset(_docsStart, liveDocs, _indexOptions, _docFreq);
+ return docsAndPositionsEnum.Reset(docsStart, liveDocs, indexOptions, docFreq);
}
public override IComparer<BytesRef> Comparer => BytesRef.UTF8SortedAsUnicodeComparer;
@@ -220,236 +232,232 @@ namespace Lucene.Net.Codecs.SimpleText
private class SimpleTextDocsEnum : DocsEnum
{
- private readonly IndexInput _inStart;
- private readonly IndexInput _in;
- private bool _omitTf;
- private int _docId = -1;
- private int _tf;
- private IBits _liveDocs;
- private readonly BytesRef _scratch = new BytesRef(10);
- private readonly CharsRef _scratchUtf16 = new CharsRef(10);
- private int _cost;
+ private readonly IndexInput inStart;
+ private readonly IndexInput input;
+ private bool omitTF;
+ private int docID = -1;
+ private int tf;
+ private IBits liveDocs;
+ private readonly BytesRef scratch = new BytesRef(10);
+ private readonly CharsRef scratchUTF16 = new CharsRef(10);
+ private int cost;
public SimpleTextDocsEnum(SimpleTextFieldsReader outerInstance)
{
- _inStart = outerInstance._input;
- _in = (IndexInput) _inStart.Clone();
+ this.inStart = outerInstance.input;
+ this.input = (IndexInput)this.inStart.Clone();
}
public virtual bool CanReuse(IndexInput @in)
{
- return @in == _inStart;
+ return @in == inStart;
}
- public virtual SimpleTextDocsEnum Reset(long fp, IBits liveDocs, bool omitTf, int docFreq)
+ public virtual SimpleTextDocsEnum Reset(long fp, IBits liveDocs, bool omitTF, int docFreq)
{
- _liveDocs = liveDocs;
- _in.Seek(fp);
- _omitTf = omitTf;
- _docId = -1;
- _tf = 1;
- _cost = docFreq;
+ this.liveDocs = liveDocs;
+ input.Seek(fp);
+ this.omitTF = omitTF;
+ docID = -1;
+ tf = 1;
+ cost = docFreq;
return this;
}
- public override int DocID => _docId;
+ public override int DocID => docID;
- public override int Freq => _tf;
+ public override int Freq => tf;
public override int NextDoc()
{
- if (_docId == NO_MORE_DOCS)
+ if (docID == NO_MORE_DOCS)
{
- return _docId;
+ return docID;
}
bool first = true;
int termFreq = 0;
while (true)
{
- long lineStart = _in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
- SimpleTextUtil.ReadLine(_in, _scratch);
- if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
+ long lineStart = input.Position;
+ SimpleTextUtil.ReadLine(input, scratch);
+ if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.DOC))
{
- if (!first && (_liveDocs == null || _liveDocs.Get(_docId)))
+ if (!first && (liveDocs == null || liveDocs.Get(docID)))
{
- _in.Seek(lineStart);
- if (!_omitTf)
+ input.Seek(lineStart);
+ if (!omitTF)
{
- _tf = termFreq;
+ tf = termFreq;
}
- return _docId;
+ return docID;
}
- UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length,
- _scratchUtf16);
- _docId = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
+ UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.DOC.Length, scratch.Length - SimpleTextFieldsWriter.DOC.Length, scratchUTF16);
+ docID = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
termFreq = 0;
first = false;
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FREQ))
{
- UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
- _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
- termFreq = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
+ UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, scratch.Length - SimpleTextFieldsWriter.FREQ.Length, scratchUTF16);
+ termFreq = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.POS))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.POS))
{
// skip termFreq++;
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.START_OFFSET))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.START_OFFSET))
{
// skip
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END_OFFSET))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.END_OFFSET))
{
// skip
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.PAYLOAD))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.PAYLOAD))
{
// skip
}
else
{
if (Debugging.AssertsEnabled) Debugging.Assert(
- StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD) ||
- StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END), "scratch={0}", new BytesRefFormatter(_scratch, BytesRefFormat.UTF8));
-
- if (!first && (_liveDocs == null || _liveDocs.Get(_docId)))
+ StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.TERM)
+ || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD)
+ || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.END),
+ "scratch={0}", new BytesRefFormatter(scratch, BytesRefFormat.UTF8));
+ if (!first && (liveDocs == null || liveDocs.Get(docID)))
{
- _in.Seek(lineStart);
- if (!_omitTf)
+ input.Seek(lineStart);
+ if (!omitTF)
{
- _tf = termFreq;
+ tf = termFreq;
}
- return _docId;
+ return docID;
}
- return _docId = NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
}
+
public override int Advance(int target)
{
// Naive -- better to index skip data
return SlowAdvance(target);
}
- public override long GetCost()
- {
- return _cost;
- }
+ public override long GetCost() => cost;
}
private class SimpleTextDocsAndPositionsEnum : DocsAndPositionsEnum
{
- private readonly IndexInput _inStart;
- private readonly IndexInput _in;
- private int _docId = -1;
- private int _tf;
- private IBits _liveDocs;
- private readonly BytesRef _scratch = new BytesRef(10);
- private readonly BytesRef _scratch2 = new BytesRef(10);
- private readonly CharsRef _scratchUtf16 = new CharsRef(10);
- private readonly CharsRef _scratchUtf162 = new CharsRef(10);
- private BytesRef _payload;
- private long _nextDocStart;
- private bool _readOffsets;
- private bool _readPositions;
- private int _startOffset;
- private int _endOffset;
- private int _cost;
+ private readonly IndexInput inStart;
+ private readonly IndexInput input;
+ private int docID = -1;
+ private int tf;
+ private IBits liveDocs;
+ private readonly BytesRef scratch = new BytesRef(10);
+ private readonly BytesRef scratch2 = new BytesRef(10);
+ private readonly CharsRef scratchUTF16 = new CharsRef(10);
+ private readonly CharsRef scratchUTF16_2 = new CharsRef(10);
+ private BytesRef payload;
+ private long nextDocStart;
+ private bool readOffsets;
+ private bool readPositions;
+ private int startOffset;
+ private int endOffset;
+ private int cost;
public SimpleTextDocsAndPositionsEnum(SimpleTextFieldsReader outerInstance)
{
- _inStart = outerInstance._input;
- _in = (IndexInput) _inStart.Clone();
+ this.inStart = outerInstance.input;
+ this.input = (IndexInput)inStart.Clone();
}
public virtual bool CanReuse(IndexInput @in)
{
- return @in == _inStart;
+ return @in == inStart;
}
public virtual SimpleTextDocsAndPositionsEnum Reset(long fp, IBits liveDocs, IndexOptions indexOptions, int docFreq)
{
- _liveDocs = liveDocs;
- _nextDocStart = fp;
- _docId = -1;
+ this.liveDocs = liveDocs;
+ nextDocStart = fp;
+ docID = -1;
// LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
- _readPositions = IndexOptionsComparer.Default.Compare(indexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
- _readOffsets = IndexOptionsComparer.Default.Compare(indexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
-
- if (!_readOffsets)
+ readPositions = IndexOptionsComparer.Default.Compare(indexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ readOffsets = IndexOptionsComparer.Default.Compare(indexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ if (!readOffsets)
{
- _startOffset = -1;
- _endOffset = -1;
+ startOffset = -1;
+ endOffset = -1;
}
- _cost = docFreq;
+ cost = docFreq;
return this;
}
- public override int DocID => _docId;
+ public override int DocID => docID;
+
+ public override int Freq => tf;
- public override int Freq => _tf;
public override int NextDoc()
{
bool first = true;
- _in.Seek(_nextDocStart);
+ input.Seek(nextDocStart);
long posStart = 0;
while (true)
{
- long lineStart = _in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
- SimpleTextUtil.ReadLine(_in, _scratch);
+ long lineStart = input.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
+ SimpleTextUtil.ReadLine(input, scratch);
//System.out.println("NEXT DOC: " + scratch.utf8ToString());
- if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
+ if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.DOC))
{
- if (!first && (_liveDocs == null || _liveDocs.Get(_docId)))
+ if (!first && (liveDocs == null || liveDocs.Get(docID)))
{
- _nextDocStart = lineStart;
- _in.Seek(posStart);
- return _docId;
+ nextDocStart = lineStart;
+ input.Seek(posStart);
+ return docID;
}
- UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length,
- _scratchUtf16);
- _docId = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
- _tf = 0;
+ UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.DOC.Length, scratch.Length - SimpleTextFieldsWriter.DOC.Length, scratchUTF16);
+ docID = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
+ tf = 0;
first = false;
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FREQ))
{
- UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
- _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
- _tf = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
- posStart = _in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
+ UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, scratch.Length - SimpleTextFieldsWriter.FREQ.Length, scratchUTF16);
+ tf = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
+ posStart = input.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.POS))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.POS))
{
// skip
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.START_OFFSET))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.START_OFFSET))
{
// skip
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END_OFFSET))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.END_OFFSET))
{
// skip
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.PAYLOAD))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.PAYLOAD))
{
// skip
}
else
{
- if (Debugging.AssertsEnabled) Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD) ||
- StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END));
-
- if (!first && (_liveDocs == null || _liveDocs.Get(_docId)))
+ if (Debugging.AssertsEnabled) Debugging.Assert(
+ StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.TERM)
+ || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD)
+ || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.END));
+ if (!first && (liveDocs == null || liveDocs.Get(docID)))
{
- _nextDocStart = lineStart;
- _in.Seek(posStart);
- return _docId;
+ nextDocStart = lineStart;
+ input.Seek(posStart);
+ return docID;
}
- return _docId = NO_MORE_DOCS;
+ return docID = NO_MORE_DOCS;
}
}
}
@@ -463,70 +471,61 @@ namespace Lucene.Net.Codecs.SimpleText
public override int NextPosition()
{
int pos;
- if (_readPositions)
+ if (readPositions)
{
- SimpleTextUtil.ReadLine(_in, _scratch);
+ SimpleTextUtil.ReadLine(input, scratch);
// LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called
- if (Debugging.AssertsEnabled) Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.POS), "got line={0}", new BytesRefFormatter(_scratch, BytesRefFormat.UTF8));
- UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.POS.Length, _scratch.Length - SimpleTextFieldsWriter.POS.Length,
- _scratchUtf162);
- pos = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
+ if (Debugging.AssertsEnabled) Debugging.Assert(StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.POS), "got line={0}", new BytesRefFormatter(scratch, BytesRefFormat.UTF8));
+ UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.POS.Length, scratch.Length - SimpleTextFieldsWriter.POS.Length, scratchUTF16_2);
+ pos = ArrayUtil.ParseInt32(scratchUTF16_2.Chars, 0, scratchUTF16_2.Length);
}
else
{
pos = -1;
}
- if (_readOffsets)
+ if (readOffsets)
{
- SimpleTextUtil.ReadLine(_in, _scratch);
+ SimpleTextUtil.ReadLine(input, scratch);
// LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called
- if (Debugging.AssertsEnabled) Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.START_OFFSET), "got line={0}", new BytesRefFormatter(_scratch, BytesRefFormat.UTF8));
- UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.START_OFFSET.Length,
- _scratch.Length - SimpleTextFieldsWriter.START_OFFSET.Length, _scratchUtf162);
- _startOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
- SimpleTextUtil.ReadLine(_in, _scratch);
+ if (Debugging.AssertsEnabled) Debugging.Assert(StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.START_OFFSET), "got line={0}", new BytesRefFormatter(scratch, BytesRefFormat.UTF8));
+ UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.START_OFFSET.Length, scratch.Length - SimpleTextFieldsWriter.START_OFFSET.Length, scratchUTF16_2);
+ startOffset = ArrayUtil.ParseInt32(scratchUTF16_2.Chars, 0, scratchUTF16_2.Length);
+ SimpleTextUtil.ReadLine(input, scratch);
// LUCENENET specific - use wrapper BytesRefFormatter struct to defer building the string unless string.Format() is called
- if (Debugging.AssertsEnabled) Debugging.Assert(StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.END_OFFSET), "got line={0}", new BytesRefFormatter(_scratch, BytesRefFormat.UTF8));
- UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.END_OFFSET.Length,
- _scratch.Length - SimpleTextFieldsWriter.END_OFFSET.Length, _scratchUtf162);
- _endOffset = ArrayUtil.ParseInt32(_scratchUtf162.Chars, 0, _scratchUtf162.Length);
+ if (Debugging.AssertsEnabled) Debugging.Assert(StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.END_OFFSET), "got line={0}", new BytesRefFormatter(scratch, BytesRefFormat.UTF8));
+ UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.END_OFFSET.Length, scratch.Length - SimpleTextFieldsWriter.END_OFFSET.Length, scratchUTF16_2);
+ endOffset = ArrayUtil.ParseInt32(scratchUTF16_2.Chars, 0, scratchUTF16_2.Length);
}
- long fp = _in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
- SimpleTextUtil.ReadLine(_in, _scratch);
- if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.PAYLOAD))
+ long fp = input.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
+ SimpleTextUtil.ReadLine(input, scratch);
+ if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.PAYLOAD))
{
- int len = _scratch.Length - SimpleTextFieldsWriter.PAYLOAD.Length;
- if (_scratch2.Bytes.Length < len)
+ int len = scratch.Length - SimpleTextFieldsWriter.PAYLOAD.Length;
+ if (scratch2.Bytes.Length < len)
{
- _scratch2.Grow(len);
+ scratch2.Grow(len);
}
- Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.PAYLOAD.Length, _scratch2.Bytes, 0, len);
- _scratch2.Length = len;
- _payload = _scratch2;
+ System.Array.Copy(scratch.Bytes, SimpleTextFieldsWriter.PAYLOAD.Length, scratch2.Bytes, 0, len);
+ scratch2.Length = len;
+ payload = scratch2;
}
else
{
- _payload = null;
- _in.Seek(fp);
+ payload = null;
+ input.Seek(fp);
}
return pos;
}
- public override int StartOffset => _startOffset;
+ public override int StartOffset => startOffset;
- public override int EndOffset => _endOffset;
+ public override int EndOffset => endOffset;
- public override BytesRef GetPayload()
- {
- return _payload;
- }
+ public override BytesRef GetPayload() => payload;
- public override long GetCost()
- {
- return _cost;
- }
+ public override long GetCost() => cost;
}
internal class TermData
@@ -543,154 +542,163 @@ namespace Lucene.Net.Codecs.SimpleText
private class SimpleTextTerms : Terms
{
- private readonly SimpleTextFieldsReader _outerInstance;
-
- private readonly long _termsStart;
- private readonly FieldInfo _fieldInfo;
- private readonly int _maxDoc;
- private long _sumTotalTermFreq;
- private long _sumDocFreq;
- private int _docCount;
- private FST<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair> _fst;
- private int _termCount;
- private readonly BytesRef _scratch = new BytesRef(10);
- private readonly CharsRef _scratchUtf16 = new CharsRef(10);
-
- public SimpleTextTerms(SimpleTextFieldsReader outerInstance, string field, long termsStart, int maxDoc)
- {
- _outerInstance = outerInstance;
- _maxDoc = maxDoc;
- _termsStart = termsStart;
- _fieldInfo = outerInstance._fieldInfos.FieldInfo(field);
+ private readonly SimpleTextFieldsReader outerInstance;
+
+ private readonly long termsStart;
+ private readonly FieldInfo fieldInfo;
+ private readonly int maxDoc;
+ private long sumTotalTermFreq;
+ private long sumDocFreq;
+ private int docCount;
+ private FST<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair> fst;
+ private int termCount;
+ private readonly BytesRef scratch = new BytesRef(10);
+ private readonly CharsRef scratchUTF16 = new CharsRef(10);
+
+ public SimpleTextTerms(SimpleTextFieldsReader outerInstance, String field, long termsStart, int maxDoc)
+ {
+ this.outerInstance = outerInstance;
+ this.maxDoc = maxDoc;
+ this.termsStart = termsStart;
+ fieldInfo = outerInstance.fieldInfos.FieldInfo(field);
LoadTerms();
}
private void LoadTerms()
{
- var posIntOutputs = PositiveInt32Outputs.Singleton;
- var outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
- var outputs = new PairOutputs<long?, PairOutputs<long?,long?>.Pair>(posIntOutputs, outputsInner);
-
- // honestly, wtf kind of generic mess is this.
- var b = new Builder<PairOutputs<long?, PairOutputs<long?,long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
- var input = (IndexInput) _outerInstance._input.Clone();
- input.Seek(_termsStart);
-
- var lastTerm = new BytesRef(10);
+ PositiveInt32Outputs posIntOutputs = PositiveInt32Outputs.Singleton;
+ Builder<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair> b;
+ PairOutputs<long?, long?> outputsInner = new PairOutputs<long?, long?>(posIntOutputs, posIntOutputs);
+ PairOutputs<long?, PairOutputs<long?, long?>.Pair> outputs = new PairOutputs<long?, PairOutputs<long?, long?>.Pair>(posIntOutputs,
+ outputsInner);
+ b = new Builder<PairOutputs<long?, PairOutputs<long?, long?>.Pair>.Pair>(FST.INPUT_TYPE.BYTE1, outputs);
+ IndexInput @in = (IndexInput)outerInstance.input.Clone();
+ @in.Seek(termsStart);
+ BytesRef lastTerm = new BytesRef(10);
long lastDocsStart = -1;
int docFreq = 0;
long totalTermFreq = 0;
- var visitedDocs = new FixedBitSet(_maxDoc);
-
- var scratchIntsRef = new Int32sRef();
+ FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
+ Int32sRef scratchIntsRef = new Int32sRef();
while (true)
{
- SimpleTextUtil.ReadLine(input, _scratch);
- if (_scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FIELD))
+ SimpleTextUtil.ReadLine(@in, scratch);
+ if (scratch.Equals(SimpleTextFieldsWriter.END) || StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FIELD))
{
if (lastDocsStart != -1)
{
b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
- outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
- _sumTotalTermFreq += totalTermFreq;
+ outputs.NewPair(lastDocsStart,
+ outputsInner.NewPair((long)docFreq, totalTermFreq)));
+ sumTotalTermFreq += totalTermFreq;
}
break;
}
-
- if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.DOC))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.DOC))
{
docFreq++;
- _sumDocFreq++;
- UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.DOC.Length, _scratch.Length - SimpleTextFieldsWriter.DOC.Length,
- _scratchUtf16);
- int docId = ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
- visitedDocs.Set(docId);
+ sumDocFreq++;
+ UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.DOC.Length, scratch.Length - SimpleTextFieldsWriter.DOC.Length, scratchUTF16);
+ int docID = ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
+ visitedDocs.Set(docID);
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.FREQ))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.FREQ))
{
- UnicodeUtil.UTF8toUTF16(_scratch.Bytes, _scratch.Offset + SimpleTextFieldsWriter.FREQ.Length,
- _scratch.Length - SimpleTextFieldsWriter.FREQ.Length, _scratchUtf16);
- totalTermFreq += ArrayUtil.ParseInt32(_scratchUtf16.Chars, 0, _scratchUtf16.Length);
+ UnicodeUtil.UTF8toUTF16(scratch.Bytes, scratch.Offset + SimpleTextFieldsWriter.FREQ.Length, scratch.Length - SimpleTextFieldsWriter.FREQ.Length, scratchUTF16);
+ totalTermFreq += ArrayUtil.ParseInt32(scratchUTF16.Chars, 0, scratchUTF16.Length);
}
- else if (StringHelper.StartsWith(_scratch, SimpleTextFieldsWriter.TERM))
+ else if (StringHelper.StartsWith(scratch, SimpleTextFieldsWriter.TERM))
{
if (lastDocsStart != -1)
{
- b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef),
- outputs.NewPair(lastDocsStart, outputsInner.NewPair(docFreq, totalTermFreq)));
+ b.Add(Util.ToInt32sRef(lastTerm, scratchIntsRef), outputs.NewPair(lastDocsStart,
+ outputsInner.NewPair((long)docFreq, totalTermFreq)));
}
- lastDocsStart = input.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
- int len = _scratch.Length - SimpleTextFieldsWriter.TERM.Length;
+ lastDocsStart = @in.Position; // LUCENENET specific: Renamed from getFilePointer() to match FileStream
+ int len = scratch.Length - SimpleTextFieldsWriter.TERM.Length;
if (len > lastTerm.Length)
{
lastTerm.Grow(len);
}
- Array.Copy(_scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
+ System.Array.Copy(scratch.Bytes, SimpleTextFieldsWriter.TERM.Length, lastTerm.Bytes, 0, len);
lastTerm.Length = len;
docFreq = 0;
- _sumTotalTermFreq += totalTermFreq;
+ sumTotalTermFreq += totalTermFreq;
totalTermFreq = 0;
- _termCount++;
+ termCount++;
}
}
- _docCount = visitedDocs.Cardinality;
- _fst = b.Finish();
-
+ docCount = visitedDocs.Cardinality;
+ fst = b.Finish();
+ /*
+ PrintStream ps = new PrintStream("out.dot");
+ fst.toDot(ps);
+ ps.close();
+ System.out.println("SAVED out.dot");
+ */
+ //System.out.println("FST " + fst.sizeInBytes());
}
/// <summary>Returns approximate RAM bytes used.</summary>
public virtual long RamBytesUsed()
{
- return (_fst != null) ? _fst.GetSizeInBytes() : 0;
+ return (fst != null) ? fst.GetSizeInBytes() : 0;
}
public override TermsEnum GetEnumerator()
{
- return (_fst != null)
- ? new SimpleTextTermsEnum(_outerInstance, _fst, _fieldInfo.IndexOptions)
- : TermsEnum.EMPTY;
+ if (fst != null)
+ {
+ return new SimpleTextTermsEnum(outerInstance, fst, fieldInfo.IndexOptions);
+ }
+ else
+ {
+ return TermsEnum.EMPTY;
+ }
}
public override IComparer<BytesRef> Comparer => BytesRef.UTF8SortedAsUnicodeComparer;
- public override long Count => _termCount;
+ public override long Count => (long)termCount;
- public override long SumTotalTermFreq => _fieldInfo.IndexOptions == IndexOptions.DOCS_ONLY ? - 1 : _sumTotalTermFreq;
+ public override long SumTotalTermFreq => fieldInfo.IndexOptions == IndexOptions.DOCS_ONLY ? -1 : sumTotalTermFreq;
- public override long SumDocFreq => _sumDocFreq;
+ public override long SumDocFreq => sumDocFreq;
- public override int DocCount => _docCount;
+ public override int DocCount => docCount;
// LUCENENET specific - to avoid boxing, changed from CompareTo() to IndexOptionsComparer.Compare()
- public override bool HasFreqs => IndexOptionsComparer.Default.Compare(_fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS) >= 0;
+ public override bool HasFreqs => IndexOptionsComparer.Default.Compare(fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS) >= 0;
- public override bool HasOffsets => IndexOptionsComparer.Default.Compare(_fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
+ public override bool HasOffsets => IndexOptionsComparer.Default.Compare(fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0;
- public override bool HasPositions => IndexOptionsComparer.Default.Compare(_fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
+ public override bool HasPositions => IndexOptionsComparer.Default.Compare(fieldInfo.IndexOptions, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0;
- public override bool HasPayloads => _fieldInfo.HasPayloads;
+ public override bool HasPayloads => fieldInfo.HasPayloads;
}
public override IEnumerator<string> GetEnumerator()
{
- return _fields.Keys.GetEnumerator();
+ return fields.Keys.GetEnumerator();
}
+ private readonly IDictionary<string, SimpleTextTerms> termsCache = new Dictionary<string, SimpleTextTerms>();
+
public override Terms GetTerms(string field)
{
UninterruptableMonitor.Enter(this);
try
{
- if (!_termsCache.TryGetValue(field, out SimpleTextTerms terms) || terms == null)
+ if (!termsCache.TryGetValue(field, out SimpleTextTerms terms) || terms == null)
{
- if (!_fields.TryGetValue(field, out long? fp) || !fp.HasValue)
+ if (!fields.TryGetValue(field, out long? fp) || !fp.HasValue)
{
return null;
}
else
{
- terms = new SimpleTextTerms(this, field, fp.Value, _maxDoc);
- _termsCache[field] = terms;
+ terms = new SimpleTextTerms(this, field, fp.Value, maxDoc);
+ termsCache[field] = terms;
}
}
@@ -708,14 +716,14 @@ namespace Lucene.Net.Codecs.SimpleText
{
if (disposing)
{
- _input.Dispose();
+ input?.Dispose();
}
}
public override long RamBytesUsed()
{
long sizeInBytes = 0;
- foreach (SimpleTextTerms simpleTextTerms in _termsCache.Values)
+ foreach (SimpleTextTerms simpleTextTerms in termsCache.Values)
{
sizeInBytes += (simpleTextTerms != null) ? simpleTextTerms.RamBytesUsed() : 0;
}