You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by pn...@apache.org on 2014/11/08 23:55:03 UTC
[2/5] lucenenet git commit: Updating Memory Codec
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0cc0e7ec/src/Lucene.Net.Codecs/Memory/FSTOrdPostingsFormat.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTOrdPostingsFormat.cs b/src/Lucene.Net.Codecs/Memory/FSTOrdPostingsFormat.cs
index 5723f48..71b9a78 100644
--- a/src/Lucene.Net.Codecs/Memory/FSTOrdPostingsFormat.cs
+++ b/src/Lucene.Net.Codecs/Memory/FSTOrdPostingsFormat.cs
@@ -1,88 +1,81 @@
-namespace org.apache.lucene.codecs.memory
-{
-
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
- using Lucene41PostingsWriter = org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
- using Lucene41PostingsReader = org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
- using IndexOptions = org.apache.lucene.index.FieldInfo.IndexOptions;
- using SegmentReadState = org.apache.lucene.index.SegmentReadState;
- using SegmentWriteState = org.apache.lucene.index.SegmentWriteState;
- using IOUtils = org.apache.lucene.util.IOUtils;
-
- /// <summary>
- /// FSTOrd term dict + Lucene41PBF
- /// </summary>
+namespace Lucene.Net.Codecs.Memory
+{
+ using Lucene41PostingsWriter = Lucene41.Lucene41PostingsWriter;
+ using Lucene41PostingsReader = Lucene41.Lucene41PostingsReader;
+ using SegmentReadState = Index.SegmentReadState;
+ using SegmentWriteState = Index.SegmentWriteState;
+ using IOUtils = Util.IOUtils;
- public sealed class FSTOrdPostingsFormat : PostingsFormat
- {
- public FSTOrdPostingsFormat() : base("FSTOrd41")
- {
- }
+ /// <summary>
+ /// FSTOrd term dict + Lucene41PBF
+ /// </summary>
- public override string ToString()
- {
- return Name;
- }
+ public sealed class FSTOrdPostingsFormat : PostingsFormat
+ {
+ public FSTOrdPostingsFormat() : base("FSTOrd41")
+ {
+ }
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public org.apache.lucene.codecs.FieldsConsumer fieldsConsumer(org.apache.lucene.index.SegmentWriteState state) throws java.io.IOException
- public override FieldsConsumer fieldsConsumer(SegmentWriteState state)
- {
- PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
+ public override string ToString()
+ {
+ return Name;
+ }
- bool success = false;
- try
- {
- FieldsConsumer ret = new FSTOrdTermsWriter(state, postingsWriter);
- success = true;
- return ret;
- }
- finally
- {
- if (!success)
- {
- IOUtils.closeWhileHandlingException(postingsWriter);
- }
- }
- }
+ public override FieldsConsumer FieldsConsumer(SegmentWriteState state)
+ {
+ PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public org.apache.lucene.codecs.FieldsProducer fieldsProducer(org.apache.lucene.index.SegmentReadState state) throws java.io.IOException
- public override FieldsProducer fieldsProducer(SegmentReadState state)
- {
- PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
- bool success = false;
- try
- {
- FieldsProducer ret = new FSTOrdTermsReader(state, postingsReader);
- success = true;
- return ret;
- }
- finally
- {
- if (!success)
- {
- IOUtils.closeWhileHandlingException(postingsReader);
- }
- }
- }
- }
+ bool success = false;
+ try
+ {
+ FieldsConsumer ret = new FSTOrdTermsWriter(state, postingsWriter);
+ success = true;
+ return ret;
+ }
+ finally
+ {
+ if (!success)
+ {
+ IOUtils.CloseWhileHandlingException(postingsWriter);
+ }
+ }
+ }
+ public override FieldsProducer FieldsProducer(SegmentReadState state)
+ {
+ PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.Directory, state.FieldInfos,
+ state.SegmentInfo, state.Context, state.SegmentSuffix);
+ bool success = false;
+ try
+ {
+ FieldsProducer ret = new FSTOrdTermsReader(state, postingsReader);
+ success = true;
+ return ret;
+ }
+ finally
+ {
+ if (!success)
+ {
+ IOUtils.CloseWhileHandlingException(postingsReader);
+ }
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0cc0e7ec/src/Lucene.Net.Codecs/Memory/FSTOrdTermsWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTOrdTermsWriter.cs b/src/Lucene.Net.Codecs/Memory/FSTOrdTermsWriter.cs
index 51c7278..4b06045 100644
--- a/src/Lucene.Net.Codecs/Memory/FSTOrdTermsWriter.cs
+++ b/src/Lucene.Net.Codecs/Memory/FSTOrdTermsWriter.cs
@@ -1,435 +1,406 @@
-using System;
-using System.Collections.Generic;
-
-namespace org.apache.lucene.codecs.memory
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System.IO;
+using Lucene.Net.Util.Fst;
+
+namespace Lucene.Net.Codecs.Memory
{
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
- using IndexOptions = org.apache.lucene.index.FieldInfo.IndexOptions;
- using FieldInfo = org.apache.lucene.index.FieldInfo;
- using FieldInfos = org.apache.lucene.index.FieldInfos;
- using IndexFileNames = org.apache.lucene.index.IndexFileNames;
- using SegmentWriteState = org.apache.lucene.index.SegmentWriteState;
- using DataOutput = org.apache.lucene.store.DataOutput;
- using IndexOutput = org.apache.lucene.store.IndexOutput;
- using RAMOutputStream = org.apache.lucene.store.RAMOutputStream;
- using ArrayUtil = org.apache.lucene.util.ArrayUtil;
- using BytesRef = org.apache.lucene.util.BytesRef;
- using IOUtils = org.apache.lucene.util.IOUtils;
- using IntsRef = org.apache.lucene.util.IntsRef;
- using Builder = org.apache.lucene.util.fst.Builder;
- using FST = org.apache.lucene.util.fst.FST;
- using PositiveIntOutputs = org.apache.lucene.util.fst.PositiveIntOutputs;
- using Util = org.apache.lucene.util.fst.Util;
-
- /// <summary>
- /// FST-based term dict, using ord as FST output.
- ///
- /// The FST holds the mapping between <term, ord>, and
- /// term's metadata is delta encoded into a single byte block.
- ///
- /// Typically the byte block consists of four parts:
- /// 1. term statistics: docFreq, totalTermFreq;
- /// 2. monotonic long[], e.g. the pointer to the postings list for that term;
- /// 3. generic byte[], e.g. other information customized by postings base.
- /// 4. single-level skip list to speed up metadata decoding by ord.
- ///
- /// <para>
- /// Files:
- /// <ul>
- /// <li><tt>.tix</tt>: <a href="#Termindex">Term Index</a></li>
- /// <li><tt>.tbk</tt>: <a href="#Termblock">Term Block</a></li>
- /// </ul>
- /// </para>
- ///
- /// <a name="Termindex" id="Termindex"></a>
- /// <h3>Term Index</h3>
- /// <para>
- /// The .tix contains a list of FSTs, one for each field.
- /// The FST maps a term to its corresponding order in current field.
- /// </para>
- ///
- /// <ul>
- /// <li>TermIndex(.tix) --> Header, TermFST<sup>NumFields</sup>, Footer</li>
- /// <li>TermFST --> <seealso cref="FST FST<long>"/></li>
- /// <li>Header --> <seealso cref="CodecUtil#writeHeader CodecHeader"/></li>
- /// <li>Footer --> <seealso cref="CodecUtil#writeFooter CodecFooter"/></li>
- /// </ul>
- ///
- /// <para>Notes:</para>
- /// <ul>
- /// <li>
- /// Since terms are already sorted before writing to <a href="#Termblock">Term Block</a>,
- /// their ords can directly used to seek term metadata from term block.
- /// </li>
- /// </ul>
- ///
- /// <a name="Termblock" id="Termblock"></a>
- /// <h3>Term Block</h3>
- /// <para>
- /// The .tbk contains all the statistics and metadata for terms, along with field summary (e.g.
- /// per-field data like number of documents in current field). For each field, there are four blocks:
- /// <ul>
- /// <li>statistics bytes block: contains term statistics; </li>
- /// <li>metadata longs block: delta-encodes monotonic part of metadata; </li>
- /// <li>metadata bytes block: encodes other parts of metadata; </li>
- /// <li>skip block: contains skip data, to speed up metadata seeking and decoding</li>
- /// </ul>
- /// </para>
- ///
- /// <para>File Format:</para>
- /// <ul>
- /// <li>TermBlock(.tbk) --> Header, <i>PostingsHeader</i>, FieldSummary, DirOffset</li>
- /// <li>FieldSummary --> NumFields, <FieldNumber, NumTerms, SumTotalTermFreq?, SumDocFreq,
- /// DocCount, LongsSize, DataBlock > <sup>NumFields</sup>, Footer</li>
- ///
- /// <li>DataBlock --> StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
- /// SkipBlock, StatsBlock, MetaLongsBlock, MetaBytesBlock </li>
- /// <li>SkipBlock --> < StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta,
- /// MetaLongsSkipDelta<sup>LongsSize</sup> ><sup>NumTerms</sup>
- /// <li>StatsBlock --> < DocFreq[Same?], (TotalTermFreq-DocFreq) ? > <sup>NumTerms</sup>
- /// <li>MetaLongsBlock --> < LongDelta<sup>LongsSize</sup>, BytesSize > <sup>NumTerms</sup>
- /// <li>MetaBytesBlock --> Byte <sup>MetaBytesBlockLength</sup>
- /// <li>Header --> <seealso cref="CodecUtil#writeHeader CodecHeader"/></li>
- /// <li>DirOffset --> <seealso cref="DataOutput#writeLong Uint64"/></li>
- /// <li>NumFields, FieldNumber, DocCount, DocFreq, LongsSize,
- /// FieldNumber, DocCount --> <seealso cref="DataOutput#writeVInt VInt"/></li>
- /// <li>NumTerms, SumTotalTermFreq, SumDocFreq, StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
- /// StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta, MetaLongsSkipStart, TotalTermFreq,
- /// LongDelta,--> <seealso cref="DataOutput#writeVLong VLong"/></li>
- /// <li>Footer --> <seealso cref="CodecUtil#writeFooter CodecFooter"/></li>
- /// </ul>
- /// <para>Notes: </para>
- /// <ul>
- /// <li>
- /// The format of PostingsHeader and MetaBytes are customized by the specific postings implementation:
- /// they contain arbitrary per-file data (such as parameters or versioning information), and per-term data
- /// (non-monotonic ones like pulsed postings data).
- /// </li>
- /// <li>
- /// During initialization the reader will load all the blocks into memory. SkipBlock will be decoded, so that during seek
- /// term dict can lookup file pointers directly. StatsFPDelta, MetaLongsSkipFPDelta, etc. are file offset
- /// for every SkipInterval's term. MetaLongsSkipDelta is the difference from previous one, which indicates
- /// the value of preceding metadata longs for every SkipInterval's term.
- /// </li>
- /// <li>
- /// DocFreq is the count of documents which contain the term. TotalTermFreq is the total number of occurrences of the term.
- /// Usually these two values are the same for long tail terms, therefore one bit is stole from DocFreq to check this case,
- /// so that encoding of TotalTermFreq may be omitted.
- /// </li>
- /// </ul>
- ///
- /// @lucene.experimental
- /// </summary>
-
- public class FSTOrdTermsWriter : FieldsConsumer
- {
- internal const string TERMS_INDEX_EXTENSION = "tix";
- internal const string TERMS_BLOCK_EXTENSION = "tbk";
- internal const string TERMS_CODEC_NAME = "FST_ORD_TERMS_DICT";
- public const int TERMS_VERSION_START = 0;
- public const int TERMS_VERSION_CHECKSUM = 1;
- public const int TERMS_VERSION_CURRENT = TERMS_VERSION_CHECKSUM;
- public const int SKIP_INTERVAL = 8;
-
- internal readonly PostingsWriterBase postingsWriter;
- internal readonly FieldInfos fieldInfos;
- internal readonly IList<FieldMetaData> fields = new List<FieldMetaData>();
- internal IndexOutput blockOut = null;
- internal IndexOutput indexOut = null;
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: public FSTOrdTermsWriter(org.apache.lucene.index.SegmentWriteState state, org.apache.lucene.codecs.PostingsWriterBase postingsWriter) throws java.io.IOException
- public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final String termsIndexFileName = org.apache.lucene.index.IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
- string termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final String termsBlockFileName = org.apache.lucene.index.IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION);
- string termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION);
-
- this.postingsWriter = postingsWriter;
- this.fieldInfos = state.fieldInfos;
-
- bool success = false;
- try
- {
- this.indexOut = state.directory.createOutput(termsIndexFileName, state.context);
- this.blockOut = state.directory.createOutput(termsBlockFileName, state.context);
- writeHeader(indexOut);
- writeHeader(blockOut);
- this.postingsWriter.init(blockOut);
- success = true;
- }
- finally
- {
- if (!success)
- {
- IOUtils.closeWhileHandlingException(indexOut, blockOut);
- }
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public org.apache.lucene.codecs.TermsConsumer addField(org.apache.lucene.index.FieldInfo field) throws java.io.IOException
- public override TermsConsumer addField(FieldInfo field)
- {
- return new TermsWriter(this, field);
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void close() throws java.io.IOException
- public override void close()
- {
- if (blockOut != null)
- {
- IOException ioe = null;
- try
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final long blockDirStart = blockOut.getFilePointer();
- long blockDirStart = blockOut.FilePointer;
-
- // write field summary
- blockOut.writeVInt(fields.Count);
- foreach (FieldMetaData field in fields)
- {
- blockOut.writeVInt(field.fieldInfo.number);
- blockOut.writeVLong(field.numTerms);
- if (field.fieldInfo.IndexOptions != IndexOptions.DOCS_ONLY)
- {
- blockOut.writeVLong(field.sumTotalTermFreq);
- }
- blockOut.writeVLong(field.sumDocFreq);
- blockOut.writeVInt(field.docCount);
- blockOut.writeVInt(field.longsSize);
- blockOut.writeVLong(field.statsOut.FilePointer);
- blockOut.writeVLong(field.metaLongsOut.FilePointer);
- blockOut.writeVLong(field.metaBytesOut.FilePointer);
-
- field.skipOut.writeTo(blockOut);
- field.statsOut.writeTo(blockOut);
- field.metaLongsOut.writeTo(blockOut);
- field.metaBytesOut.writeTo(blockOut);
- field.dict.save(indexOut);
- }
- writeTrailer(blockOut, blockDirStart);
- CodecUtil.writeFooter(indexOut);
- CodecUtil.writeFooter(blockOut);
- }
- catch (IOException ioe2)
- {
- ioe = ioe2;
- }
- finally
- {
- IOUtils.closeWhileHandlingException(ioe, blockOut, indexOut, postingsWriter);
- blockOut = null;
- }
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void writeHeader(org.apache.lucene.store.IndexOutput out) throws java.io.IOException
- private void writeHeader(IndexOutput @out)
- {
- CodecUtil.writeHeader(@out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT);
- }
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void writeTrailer(org.apache.lucene.store.IndexOutput out, long dirStart) throws java.io.IOException
- private void writeTrailer(IndexOutput @out, long dirStart)
- {
- @out.writeLong(dirStart);
- }
-
- private class FieldMetaData
- {
- public FieldInfo fieldInfo;
- public long numTerms;
- public long sumTotalTermFreq;
- public long sumDocFreq;
- public int docCount;
- public int longsSize;
- public FST<long?> dict;
-
- // TODO: block encode each part
-
- // vint encode next skip point (fully decoded when reading)
- public RAMOutputStream skipOut;
- // vint encode df, (ttf-df)
- public RAMOutputStream statsOut;
- // vint encode monotonic long[] and length for corresponding byte[]
- public RAMOutputStream metaLongsOut;
- // generic byte[]
- public RAMOutputStream metaBytesOut;
- }
-
- internal sealed class TermsWriter : TermsConsumer
- {
- private readonly FSTOrdTermsWriter outerInstance;
-
- internal readonly Builder<long?> builder;
- internal readonly PositiveIntOutputs outputs;
- internal readonly FieldInfo fieldInfo;
- internal readonly int longsSize;
- internal long numTerms;
-
- internal readonly IntsRef scratchTerm = new IntsRef();
- internal readonly RAMOutputStream statsOut = new RAMOutputStream();
- internal readonly RAMOutputStream metaLongsOut = new RAMOutputStream();
- internal readonly RAMOutputStream metaBytesOut = new RAMOutputStream();
-
- internal readonly RAMOutputStream skipOut = new RAMOutputStream();
- internal long lastBlockStatsFP;
- internal long lastBlockMetaLongsFP;
- internal long lastBlockMetaBytesFP;
- internal long[] lastBlockLongs;
-
- internal long[] lastLongs;
- internal long lastMetaBytesFP;
-
- internal TermsWriter(FSTOrdTermsWriter outerInstance, FieldInfo fieldInfo)
- {
- this.outerInstance = outerInstance;
- this.numTerms = 0;
- this.fieldInfo = fieldInfo;
- this.longsSize = outerInstance.postingsWriter.setField(fieldInfo);
- this.outputs = PositiveIntOutputs.Singleton;
- this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
-
- this.lastBlockStatsFP = 0;
- this.lastBlockMetaLongsFP = 0;
- this.lastBlockMetaBytesFP = 0;
- this.lastBlockLongs = new long[longsSize];
-
- this.lastLongs = new long[longsSize];
- this.lastMetaBytesFP = 0;
- }
-
- public override IComparer<BytesRef> Comparator
- {
- get
- {
- return BytesRef.UTF8SortedAsUnicodeComparator;
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public org.apache.lucene.codecs.PostingsConsumer startTerm(org.apache.lucene.util.BytesRef text) throws java.io.IOException
- public override PostingsConsumer startTerm(BytesRef text)
- {
- outerInstance.postingsWriter.startTerm();
- return outerInstance.postingsWriter;
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void finishTerm(org.apache.lucene.util.BytesRef text, org.apache.lucene.codecs.TermStats stats) throws java.io.IOException
- public override void finishTerm(BytesRef text, TermStats stats)
- {
- if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0)
- {
- bufferSkip();
- }
- // write term meta data into fst
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final long longs[] = new long[longsSize];
- long[] longs = new long[longsSize];
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final long delta = stats.totalTermFreq - stats.docFreq;
- long delta = stats.totalTermFreq - stats.docFreq;
- if (stats.totalTermFreq > 0)
- {
- if (delta == 0)
- {
- statsOut.writeVInt(stats.docFreq << 1 | 1);
- }
- else
- {
- statsOut.writeVInt(stats.docFreq << 1 | 0);
- statsOut.writeVLong(stats.totalTermFreq - stats.docFreq);
- }
- }
- else
- {
- statsOut.writeVInt(stats.docFreq);
- }
- BlockTermState state = outerInstance.postingsWriter.newTermState();
- state.docFreq = stats.docFreq;
- state.totalTermFreq = stats.totalTermFreq;
- outerInstance.postingsWriter.finishTerm(state);
- outerInstance.postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true);
- for (int i = 0; i < longsSize; i++)
- {
- metaLongsOut.writeVLong(longs[i] - lastLongs[i]);
- lastLongs[i] = longs[i];
- }
- metaLongsOut.writeVLong(metaBytesOut.FilePointer - lastMetaBytesFP);
-
- builder.add(Util.toIntsRef(text, scratchTerm), numTerms);
- numTerms++;
-
- lastMetaBytesFP = metaBytesOut.FilePointer;
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws java.io.IOException
- public override void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
- {
- if (numTerms > 0)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final FieldMetaData metadata = new FieldMetaData();
- FieldMetaData metadata = new FieldMetaData();
- metadata.fieldInfo = fieldInfo;
- metadata.numTerms = numTerms;
- metadata.sumTotalTermFreq = sumTotalTermFreq;
- metadata.sumDocFreq = sumDocFreq;
- metadata.docCount = docCount;
- metadata.longsSize = longsSize;
- metadata.skipOut = skipOut;
- metadata.statsOut = statsOut;
- metadata.metaLongsOut = metaLongsOut;
- metadata.metaBytesOut = metaBytesOut;
- metadata.dict = builder.finish();
- outerInstance.fields.Add(metadata);
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void bufferSkip() throws java.io.IOException
- internal void bufferSkip()
- {
- skipOut.writeVLong(statsOut.FilePointer - lastBlockStatsFP);
- skipOut.writeVLong(metaLongsOut.FilePointer - lastBlockMetaLongsFP);
- skipOut.writeVLong(metaBytesOut.FilePointer - lastBlockMetaBytesFP);
- for (int i = 0; i < longsSize; i++)
- {
- skipOut.writeVLong(lastLongs[i] - lastBlockLongs[i]);
- }
- lastBlockStatsFP = statsOut.FilePointer;
- lastBlockMetaLongsFP = metaLongsOut.FilePointer;
- lastBlockMetaBytesFP = metaBytesOut.FilePointer;
- Array.Copy(lastLongs, 0, lastBlockLongs, 0, longsSize);
- }
- }
- }
-
+ using System;
+ using System.Collections.Generic;
+
+ using IndexOptions = Index.FieldInfo.IndexOptions;
+ using FieldInfo = Index.FieldInfo;
+ using FieldInfos = Index.FieldInfos;
+ using IndexFileNames = Index.IndexFileNames;
+ using SegmentWriteState = Index.SegmentWriteState;
+ using DataOutput = Store.DataOutput;
+ using IndexOutput = Store.IndexOutput;
+ using RAMOutputStream = Store.RAMOutputStream;
+ using BytesRef = Util.BytesRef;
+ using IOUtils = Util.IOUtils;
+ using IntsRef = Util.IntsRef;
+ using Builder = Util.Fst.Builder<long>;
+ using FST = FST;
+ using PositiveIntOutputs = Util.Fst.PositiveIntOutputs;
+ using Util = Util.Fst.Util;
+
+ /// <summary>
+ /// FST-based term dict, using ord as FST output.
+ ///
+ /// The FST holds the mapping between &lt;term, ord&gt;, and
+ /// term's metadata is delta encoded into a single byte block.
+ ///
+ /// Typically the byte block consists of four parts:
+ /// 1. term statistics: docFreq, totalTermFreq;
+ /// 2. monotonic long[], e.g. the pointer to the postings list for that term;
+ /// 3. generic byte[], e.g. other information customized by postings base.
+ /// 4. single-level skip list to speed up metadata decoding by ord.
+ ///
+ /// <para>
+ /// Files:
+ /// <ul>
+ /// <li><tt>.tix</tt>: <a href="#Termindex">Term Index</a></li>
+ /// <li><tt>.tbk</tt>: <a href="#Termblock">Term Block</a></li>
+ /// </ul>
+ /// </para>
+ ///
+ /// <a name="Termindex" id="Termindex"></a>
+ /// <h3>Term Index</h3>
+ /// <para>
+ /// The .tix contains a list of FSTs, one for each field.
+ /// The FST maps a term to its corresponding order in current field.
+ /// </para>
+ ///
+ /// <ul>
+ /// <li>TermIndex(.tix) --> Header, TermFST<sup>NumFields</sup>, Footer</li>
+ /// <li>TermFST --> <seealso cref="FST"/></li>
+ /// <li>Header --> <seealso cref="CodecUtil#writeHeader CodecHeader"/></li>
+ /// <li>Footer --> <seealso cref="CodecUtil#writeFooter CodecFooter"/></li>
+ /// </ul>
+ ///
+ /// <para>Notes:</para>
+ /// <ul>
+ /// <li>
+ /// Since terms are already sorted before writing to <a href="#Termblock">Term Block</a>,
+ /// their ords can be used directly to seek term metadata from the term block.
+ /// </li>
+ /// </ul>
+ ///
+ /// <a name="Termblock" id="Termblock"></a>
+ /// <h3>Term Block</h3>
+ /// <para>
+ /// The .tbk contains all the statistics and metadata for terms, along with field summary (e.g.
+ /// per-field data like number of documents in current field). For each field, there are four blocks:
+ /// <ul>
+ /// <li>statistics bytes block: contains term statistics; </li>
+ /// <li>metadata longs block: delta-encodes monotonic part of metadata; </li>
+ /// <li>metadata bytes block: encodes other parts of metadata; </li>
+ /// <li>skip block: contains skip data, to speed up metadata seeking and decoding</li>
+ /// </ul>
+ /// </para>
+ ///
+ /// <para>File Format:</para>
+ /// <ul>
+ /// <li>TermBlock(.tbk) --> Header, <i>PostingsHeader</i>, FieldSummary, DirOffset</li>
+ /// <li>FieldSummary --> NumFields, &lt;FieldNumber, NumTerms, SumTotalTermFreq?, SumDocFreq,
+ /// DocCount, LongsSize, DataBlock &gt; <sup>NumFields</sup>, Footer</li>
+ ///
+ /// <li>DataBlock --> StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
+ /// SkipBlock, StatsBlock, MetaLongsBlock, MetaBytesBlock </li>
+ /// <li>SkipBlock --> &lt; StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta,
+ /// MetaLongsSkipDelta<sup>LongsSize</sup> &gt;<sup>NumTerms</sup></li>
+ /// <li>StatsBlock --> &lt; DocFreq[Same?], (TotalTermFreq-DocFreq) ? &gt; <sup>NumTerms</sup></li>
+ /// <li>MetaLongsBlock --> &lt; LongDelta<sup>LongsSize</sup>, BytesSize &gt; <sup>NumTerms</sup></li>
+ /// <li>MetaBytesBlock --> Byte <sup>MetaBytesBlockLength</sup></li>
+ /// <li>Header --> <seealso cref="CodecUtil#writeHeader CodecHeader"/></li>
+ /// <li>DirOffset --> <seealso cref="DataOutput#writeLong Uint64"/></li>
+ /// <li>NumFields, FieldNumber, DocCount, DocFreq, LongsSize,
+ /// FieldNumber, DocCount --> <seealso cref="DataOutput#writeVInt VInt"/></li>
+ /// <li>NumTerms, SumTotalTermFreq, SumDocFreq, StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
+ /// StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta, MetaLongsSkipStart, TotalTermFreq,
+ /// LongDelta,--> <seealso cref="DataOutput#writeVLong VLong"/></li>
+ /// <li>Footer --> <seealso cref="CodecUtil#writeFooter CodecFooter"/></li>
+ /// </ul>
+ /// <para>Notes: </para>
+ /// <ul>
+ /// <li>
+ /// The formats of PostingsHeader and MetaBytes are customized by the specific postings implementation:
+ /// they contain arbitrary per-file data (such as parameters or versioning information), and per-term data
+ /// (non-monotonic ones like pulsed postings data).
+ /// </li>
+ /// <li>
+ /// During initialization the reader will load all the blocks into memory. SkipBlock will be decoded, so that during seek
+ /// the term dict can look up file pointers directly. StatsFPDelta, MetaLongsSkipFPDelta, etc. are file offsets
+ /// for every SkipInterval's term. MetaLongsSkipDelta is the difference from the previous one, which indicates
+ /// the value of the preceding metadata longs for every SkipInterval's term.
+ /// </li>
+ /// <li>
+ /// DocFreq is the count of documents which contain the term. TotalTermFreq is the total number of occurrences of the term.
+ /// Usually these two values are the same for long tail terms, therefore one bit is stolen from DocFreq to check this case,
+ /// so that encoding of TotalTermFreq may be omitted.
+ /// </li>
+ /// </ul>
+ ///
+ /// @lucene.experimental
+ /// </summary>
+
+ public class FSTOrdTermsWriter : FieldsConsumer
+ {
+ internal const string TERMS_INDEX_EXTENSION = "tix";
+ internal const string TERMS_BLOCK_EXTENSION = "tbk";
+ internal const string TERMS_CODEC_NAME = "FST_ORD_TERMS_DICT";
+ public const int TERMS_VERSION_START = 0;
+ public const int TERMS_VERSION_CHECKSUM = 1;
+ public const int TERMS_VERSION_CURRENT = TERMS_VERSION_CHECKSUM;
+ public const int SKIP_INTERVAL = 8;
+
+ internal readonly PostingsWriterBase postingsWriter;
+ internal readonly FieldInfos fieldInfos;
+ private readonly IList<FieldMetaData> _fields = new List<FieldMetaData>();
+ internal IndexOutput blockOut = null;
+ internal IndexOutput indexOut = null;
+
+ public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter)
+ {
+ var termsIndexFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix,
+ TERMS_INDEX_EXTENSION);
+ var termsBlockFileName = IndexFileNames.SegmentFileName(state.SegmentInfo.Name, state.SegmentSuffix,
+ TERMS_BLOCK_EXTENSION);
+
+ this.postingsWriter = postingsWriter;
+ fieldInfos = state.FieldInfos;
+
+ var success = false;
+ try
+ {
+ indexOut = state.Directory.CreateOutput(termsIndexFileName, state.Context);
+ blockOut = state.Directory.CreateOutput(termsBlockFileName, state.Context);
+ WriteHeader(indexOut);
+ WriteHeader(blockOut);
+ this.postingsWriter.Init(blockOut);
+ success = true;
+ }
+ finally
+ {
+ if (!success)
+ {
+ IOUtils.CloseWhileHandlingException(indexOut, blockOut);
+ }
+ }
+ }
+
+ public override TermsConsumer AddField(FieldInfo field)
+ {
+ return new TermsWriter(this, field);
+ }
+
+ /// <summary>
+ /// Writes the field summary and every buffered per-field stream to the block output,
+ /// saves each field's FST to the index output, appends the trailer and footers, and
+ /// then closes both outputs and the postings writer.
+ /// An IOException raised while writing is passed to IOUtils.CloseWhileHandlingException
+ /// together with the resources to close. Calls made after the first one return immediately.
+ /// </summary>
+ public override void Dispose()
+ {
+     // blockOut is set back to null at the end of the first call.
+     if (blockOut == null)
+     {
+         return;
+     }
+
+     IOException pending = null;
+     try
+     {
+         // Position of the field summary; recorded in the trailer.
+         var summaryStart = blockOut.FilePointer;
+
+         blockOut.WriteVInt(_fields.Count);
+         foreach (var meta in _fields)
+         {
+             blockOut.WriteVInt(meta.FieldInfo.Number);
+             blockOut.WriteVLong(meta.NumTerms);
+             // SumTotalTermFreq is left out for fields indexed with DOCS_ONLY.
+             if (meta.FieldInfo.FieldIndexOptions != IndexOptions.DOCS_ONLY)
+             {
+                 blockOut.WriteVLong(meta.SumTotalTermFreq);
+             }
+             blockOut.WriteVLong(meta.SumDocFreq);
+             blockOut.WriteVInt(meta.DocCount);
+             blockOut.WriteVInt(meta.LongsSize);
+
+             // Sizes of the three streams whose length is recorded up front.
+             blockOut.WriteVLong(meta.StatsOut.FilePointer);
+             blockOut.WriteVLong(meta.MetaLongsOut.FilePointer);
+             blockOut.WriteVLong(meta.MetaBytesOut.FilePointer);
+
+             // Stream contents, followed by the field's FST in the index file.
+             meta.SkipOut.WriteTo(blockOut);
+             meta.StatsOut.WriteTo(blockOut);
+             meta.MetaLongsOut.WriteTo(blockOut);
+             meta.MetaBytesOut.WriteTo(blockOut);
+             meta.Dict.Save(indexOut);
+         }
+
+         WriteTrailer(blockOut, summaryStart);
+         CodecUtil.WriteFooter(indexOut);
+         CodecUtil.WriteFooter(blockOut);
+     }
+     catch (IOException e)
+     {
+         pending = e;
+     }
+     finally
+     {
+         IOUtils.CloseWhileHandlingException(pending, blockOut, indexOut, postingsWriter);
+         blockOut = null;
+     }
+ }
+
+ /// <summary>
+ /// Writes the codec header (TERMS_CODEC_NAME at TERMS_VERSION_CURRENT) to <paramref name="out"/>.
+ /// </summary>
+ private static void WriteHeader(IndexOutput @out)
+ {
+     CodecUtil.WriteHeader(
+         @out,
+         TERMS_CODEC_NAME,
+         TERMS_VERSION_CURRENT);
+ }
+
+ /// <summary>
+ /// Writes the trailer: <paramref name="dirStart"/> as a single long on <paramref name="output"/>.
+ /// Dispose passes the block-file position at which the field summary starts.
+ /// </summary>
+ private static void WriteTrailer(IndexOutput output, long dirStart)
+ {
+     long summaryOffset = dirStart;
+     output.WriteLong(summaryOffset);
+ }
+
+ private class FieldMetaData // per-field summary created by TermsWriter.Finish and serialised by Dispose
+ {
+ public FieldInfo FieldInfo { get; set; }
+ public long NumTerms { get; set; } // number of terms added for the field
+ public long SumTotalTermFreq { get; set; } // only written out when the field is not DOCS_ONLY
+ public long SumDocFreq { get; set; }
+ public int DocCount { get; set; }
+ public int LongsSize { get; set; } // value returned by postingsWriter.SetField for the field
+ public FST<long> Dict { get; set; } // FST mapping each term to its ordinal; saved to the index output
+
+ // TODO: block encode each part
+
+ // skip data: vint-encoded deltas to the next skip point (fully decoded when reading)
+ public RAMOutputStream SkipOut { get; set; }
+ // per-term stats: vint-encoded df and, when it differs from df, (ttf - df)
+ public RAMOutputStream StatsOut { get; set; }
+ // vint-encoded deltas of the monotonic long[] plus the length of the corresponding byte[]
+ public RAMOutputStream MetaLongsOut { get; set; }
+ // generic byte[] metadata written by the postings writer's EncodeTerm
+ public RAMOutputStream MetaBytesOut { get; set; }
+ }
+
+ internal sealed class TermsWriter : TermsConsumer
+ {
+ private readonly FSTOrdTermsWriter _outerInstance; // enclosing writer: supplies postingsWriter and collects the finished field metadata
+
+ private readonly Builder<long> _builder; // builds the FST that maps each term to its ordinal
+ private readonly PositiveIntOutputs _outputs;
+ private readonly FieldInfo _fieldInfo;
+ private readonly int _longsSize; // per-term long count, as returned by postingsWriter.SetField
+ private long _numTerms; // terms finished so far; also the ordinal stored for the next term
+
+ private readonly IntsRef _scratchTerm = new IntsRef(); // reused scratch for Util.ToIntsRef
+ private readonly RAMOutputStream _statsOut = new RAMOutputStream(); // per-term df / ttf stats
+ private readonly RAMOutputStream _metaLongsOut = new RAMOutputStream(); // delta-encoded longs and byte-length deltas
+ private readonly RAMOutputStream _metaBytesOut = new RAMOutputStream(); // bytes written by EncodeTerm
+ private readonly RAMOutputStream _skipOut = new RAMOutputStream(); // skip entries written by BufferSkip
+
+ private long _lastBlockStatsFp; // _statsOut position at the previous skip point
+ private long _lastBlockMetaLongsFp; // _metaLongsOut position at the previous skip point
+ private long _lastBlockMetaBytesFp; // _metaBytesOut position at the previous skip point
+ private readonly long[] _lastBlockLongs; // copy of _lastLongs taken at the previous skip point
+
+ private readonly long[] _lastLongs; // longs of the previous term, base for delta encoding
+ private long _lastMetaBytesFp; // _metaBytesOut position after the previous term
+
+ /// <summary>
+ /// Prepares a writer for one field: registers the field with the postings writer,
+ /// sizes the delta-encoding buffers from the long count it returns, and creates the FST builder.
+ /// </summary>
+ /// <param name="outerInstance">Enclosing writer that owns the postings writer and the field list.</param>
+ /// <param name="fieldInfo">Field whose terms will be written.</param>
+ internal TermsWriter(FSTOrdTermsWriter outerInstance, FieldInfo fieldInfo)
+ {
+     _outerInstance = outerInstance;
+     _numTerms = 0;
+     _fieldInfo = fieldInfo;
+
+     // The value returned by SetField sizes every long[] used for this field.
+     _longsSize = outerInstance.postingsWriter.SetField(fieldInfo);
+
+     _outputs = PositiveIntOutputs.Singleton;
+     _builder = new Builder<long>(FST.INPUT_TYPE.BYTE1, _outputs);
+
+     // State of the previous skip point.
+     _lastBlockLongs = new long[_longsSize];
+     _lastBlockStatsFp = 0;
+     _lastBlockMetaLongsFp = 0;
+     _lastBlockMetaBytesFp = 0;
+
+     // State of the previous term.
+     _lastLongs = new long[_longsSize];
+     _lastMetaBytesFp = 0;
+ }
+
+ /// <summary>
+ /// Comparer reported for this field's terms: BytesRef.UTF8SortedAsUnicodeComparer.
+ /// </summary>
+ public override IComparer<BytesRef> Comparator
+ {
+     get
+     {
+         IComparer<BytesRef> termOrder = BytesRef.UTF8SortedAsUnicodeComparer;
+         return termOrder;
+     }
+ }
+
+ /// <summary>
+ /// Notifies the postings writer that a new term starts and returns it as the consumer
+ /// for that term's postings. <paramref name="text"/> is not used here.
+ /// </summary>
+ public override PostingsConsumer StartTerm(BytesRef text)
+ {
+     var postings = _outerInstance.postingsWriter;
+     postings.StartTerm();
+     return postings;
+ }
+
+
+ /// <summary>
+ /// Finishes one term: buffers a skip entry when a SKIP_INTERVAL boundary is reached,
+ /// writes the term's stats, lets the postings writer encode its metadata, delta-encodes
+ /// that metadata against the previous term, and adds the term to the FST with its ordinal.
+ /// </summary>
+ /// <param name="text">Bytes of the term being finished.</param>
+ /// <param name="stats">DocFreq and TotalTermFreq of the term.</param>
+ public override void FinishTerm(BytesRef text, TermStats stats)
+ {
+     // A skip entry is written before every SKIP_INTERVAL-th term, except the very first term.
+     bool atSkipBoundary = _numTerms > 0 && _numTerms % SKIP_INTERVAL == 0;
+     if (atSkipBoundary)
+     {
+         BufferSkip();
+     }
+
+     // Stats encoding:
+     //   TotalTermFreq <= 0 : DocFreq as-is
+     //   ttf == df          : (DocFreq << 1) | 1, nothing else
+     //   otherwise          : (DocFreq << 1), followed by (ttf - df)
+     long freqDelta = stats.TotalTermFreq - stats.DocFreq;
+     if (stats.TotalTermFreq <= 0)
+     {
+         _statsOut.WriteVInt(stats.DocFreq);
+     }
+     else if (freqDelta == 0)
+     {
+         _statsOut.WriteVInt((stats.DocFreq << 1) | 1);
+     }
+     else
+     {
+         _statsOut.WriteVInt(stats.DocFreq << 1);
+         _statsOut.WriteVLong(freqDelta);
+     }
+
+     // Let the postings writer finish the term and emit its metadata.
+     var postings = _outerInstance.postingsWriter;
+     var termState = postings.NewTermState();
+     termState.DocFreq = stats.DocFreq;
+     termState.TotalTermFreq = stats.TotalTermFreq;
+     postings.FinishTerm(termState);
+
+     var termLongs = new long[_longsSize];
+     postings.EncodeTerm(termLongs, _metaBytesOut, _fieldInfo, termState, true);
+
+     // Longs are stored as deltas against the previous term.
+     for (var i = 0; i < _longsSize; i++)
+     {
+         long current = termLongs[i];
+         _metaLongsOut.WriteVLong(current - _lastLongs[i]);
+         _lastLongs[i] = current;
+     }
+
+     // The growth of the byte stream is this term's metadata byte length.
+     var metaBytesFp = _metaBytesOut.FilePointer;
+     _metaLongsOut.WriteVLong(metaBytesFp - _lastMetaBytesFp);
+
+     // The FST output is the term's ordinal.
+     _builder.Add(Util.ToIntsRef(text, _scratchTerm), _numTerms);
+     _numTerms++;
+
+     _lastMetaBytesFp = metaBytesFp;
+ }
+
+ /// <summary>
+ /// Completes the field. When at least one term was added, finishes the FST and registers
+ /// the field's summary and buffered streams with the enclosing writer; a field without
+ /// terms is not recorded.
+ /// </summary>
+ public override void Finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
+ {
+     if (_numTerms <= 0)
+     {
+         return;
+     }
+
+     var summary = new FieldMetaData();
+     summary.FieldInfo = _fieldInfo;
+     summary.NumTerms = _numTerms;
+     summary.SumTotalTermFreq = sumTotalTermFreq;
+     summary.SumDocFreq = sumDocFreq;
+     summary.DocCount = docCount;
+     summary.LongsSize = _longsSize;
+     summary.SkipOut = _skipOut;
+     summary.StatsOut = _statsOut;
+     summary.MetaLongsOut = _metaLongsOut;
+     summary.MetaBytesOut = _metaBytesOut;
+     summary.Dict = _builder.Finish();
+
+     _outerInstance._fields.Add(summary);
+ }
+
+ /// <summary>
+ /// Appends one skip entry: the distance each buffered stream has advanced since the
+ /// previous skip point, and the change in each long over the same span. The current
+ /// positions and longs then become the new skip-point baseline.
+ /// </summary>
+ internal void BufferSkip()
+ {
+     var statsFp = _statsOut.FilePointer;
+     var metaLongsFp = _metaLongsOut.FilePointer;
+     var metaBytesFp = _metaBytesOut.FilePointer;
+
+     _skipOut.WriteVLong(statsFp - _lastBlockStatsFp);
+     _skipOut.WriteVLong(metaLongsFp - _lastBlockMetaLongsFp);
+     _skipOut.WriteVLong(metaBytesFp - _lastBlockMetaBytesFp);
+     for (var slot = 0; slot < _longsSize; slot++)
+     {
+         _skipOut.WriteVLong(_lastLongs[slot] - _lastBlockLongs[slot]);
+     }
+
+     // Remember this point as the base for the next skip entry.
+     _lastBlockStatsFp = statsFp;
+     _lastBlockMetaLongsFp = metaLongsFp;
+     _lastBlockMetaBytesFp = metaBytesFp;
+     Array.Copy(_lastLongs, 0, _lastBlockLongs, 0, _longsSize);
+ }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0cc0e7ec/src/Lucene.Net.Codecs/Memory/FSTPostingsFormat.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTPostingsFormat.cs b/src/Lucene.Net.Codecs/Memory/FSTPostingsFormat.cs
index f41001b..432fcdd 100644
--- a/src/Lucene.Net.Codecs/Memory/FSTPostingsFormat.cs
+++ b/src/Lucene.Net.Codecs/Memory/FSTPostingsFormat.cs
@@ -1,88 +1,82 @@
-namespace org.apache.lucene.codecs.memory
-{
-
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- using Lucene41PostingsWriter = org.apache.lucene.codecs.lucene41.Lucene41PostingsWriter;
- using Lucene41PostingsReader = org.apache.lucene.codecs.lucene41.Lucene41PostingsReader;
- using IndexOptions = org.apache.lucene.index.FieldInfo.IndexOptions;
- using SegmentReadState = org.apache.lucene.index.SegmentReadState;
- using SegmentWriteState = org.apache.lucene.index.SegmentWriteState;
- using IOUtils = org.apache.lucene.util.IOUtils;
+namespace Lucene.Net.Codecs.Memory
+{
- /// <summary>
- /// FST term dict + Lucene41PBF
- /// </summary>
+ using Lucene41PostingsWriter = Lucene41.Lucene41PostingsWriter;
+ using Lucene41PostingsReader = Lucene41.Lucene41PostingsReader;
+ using SegmentReadState = Index.SegmentReadState;
+ using SegmentWriteState = Index.SegmentWriteState;
+ using IOUtils = Util.IOUtils;
- public sealed class FSTPostingsFormat : PostingsFormat
- {
- public FSTPostingsFormat() : base("FST41")
- {
- }
+ /// <summary>
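+ /// FST term dict + Lucene41PBF (terms go through FSTTermsWriter / FSTTermsReader, postings through Lucene41PostingsWriter / Lucene41PostingsReader)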
+ /// </summary>
- public override string ToString()
- {
- return Name;
- }
+ public sealed class FSTPostingsFormat : PostingsFormat
+ {
+ public FSTPostingsFormat() : base("FST41") // passes the name "FST41" to the PostingsFormat base constructor
+ {
+ }
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public org.apache.lucene.codecs.FieldsConsumer fieldsConsumer(org.apache.lucene.index.SegmentWriteState state) throws java.io.IOException
- public override FieldsConsumer fieldsConsumer(SegmentWriteState state)
- {
- PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state);
+ /// <summary>
+ /// Returns the <c>Name</c> of this postings format as its string form.
+ /// </summary>
+ public override string ToString()
+ {
+     var formatName = Name;
+     return formatName;
+ }
- bool success = false;
- try
- {
- FieldsConsumer ret = new FSTTermsWriter(state, postingsWriter);
- success = true;
- return ret;
- }
- finally
- {
- if (!success)
- {
- IOUtils.closeWhileHandlingException(postingsWriter);
- }
- }
- }
+ public override FieldsConsumer FieldsConsumer(SegmentWriteState state) // wraps a new Lucene41PostingsWriter in an FSTTermsWriter
+ {
+ PostingsWriterBase postingsWriter = new Lucene41PostingsWriter(state); // closed below if building the FSTTermsWriter fails
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public org.apache.lucene.codecs.FieldsProducer fieldsProducer(org.apache.lucene.index.SegmentReadState state) throws java.io.IOException
- public override FieldsProducer fieldsProducer(SegmentReadState state)
- {
- PostingsReaderBase postingsReader = new Lucene41PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);
- bool success = false;
- try
- {
- FieldsProducer ret = new FSTTermsReader(state, postingsReader);
- success = true;
- return ret;
- }
- finally
- {
- if (!success)
- {
- IOUtils.closeWhileHandlingException(postingsReader);
- }
- }
- }
- }
+ bool success = false; // stays false when the FSTTermsWriter constructor throws
+ try
+ {
+ FieldsConsumer ret = new FSTTermsWriter(state, postingsWriter);
+ success = true;
+ return ret;
+ }
+ finally
+ {
+ if (!success)
+ {
+ IOUtils.CloseWhileHandlingException(postingsWriter); // release the postings writer so it is not leaked on failure
+ }
+ }
+ }
+ /// <summary>
+ /// Opens a Lucene41PostingsReader for the segment and wraps it in an FSTTermsReader.
+ /// If constructing the FSTTermsReader throws, the postings reader is closed
+ /// (secondary errors suppressed) before the exception propagates.
+ /// </summary>
+ /// <param name="state">Segment read state supplying directory, field infos, segment info, context and suffix.</param>
+ public override FieldsProducer FieldsProducer(SegmentReadState state)
+ {
+     PostingsReaderBase reader = new Lucene41PostingsReader(
+         state.Directory,
+         state.FieldInfos,
+         state.SegmentInfo,
+         state.Context,
+         state.SegmentSuffix);
+
+     // Flipped to true once the terms reader has been built.
+     bool wrapped = false;
+     try
+     {
+         FieldsProducer producer = new FSTTermsReader(state, reader);
+         wrapped = true;
+         return producer;
+     }
+     finally
+     {
+         if (!wrapped)
+         {
+             IOUtils.CloseWhileHandlingException(reader);
+         }
+     }
+ }
+ }
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0cc0e7ec/src/Lucene.Net.Codecs/Memory/FSTTermsWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/FSTTermsWriter.cs b/src/Lucene.Net.Codecs/Memory/FSTTermsWriter.cs
index 55b0631..0bd16df 100644
--- a/src/Lucene.Net.Codecs/Memory/FSTTermsWriter.cs
+++ b/src/Lucene.Net.Codecs/Memory/FSTTermsWriter.cs
@@ -269,18 +269,18 @@ namespace Lucene.Net.Codecs.Memory
var meta = new FSTTermOutputs.TermData
{
- longs = new long[_longsSize],
- bytes = null,
- docFreq = state.DocFreq = stats.DocFreq,
- totalTermFreq = state.TotalTermFreq = stats.TotalTermFreq
+ LONGS = new long[_longsSize],
+ BYTES = null,
+ DOC_FREQ = state.DocFreq = stats.DocFreq,
+ TOTAL_TERM_FREQ = state.TotalTermFreq = stats.TotalTermFreq
};
_outerInstance._postingsWriter.FinishTerm(state);
- _outerInstance._postingsWriter.EncodeTerm(meta.longs, _metaWriter, _fieldInfo, state, true);
+ _outerInstance._postingsWriter.EncodeTerm(meta.LONGS, _metaWriter, _fieldInfo, state, true);
var bytesSize = (int) _metaWriter.FilePointer;
if (bytesSize > 0)
{
- meta.bytes = new sbyte[bytesSize];
- _metaWriter.WriteTo(meta.bytes, 0);
+ meta.BYTES = new sbyte[bytesSize];
+ _metaWriter.WriteTo(meta.BYTES, 0);
_metaWriter.Reset();
}
_builder.Add(Util.ToIntsRef(text, _scratchTerm), meta);
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0cc0e7ec/src/Lucene.Net.Codecs/Memory/MemoryDocValuesConsumer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Codecs/Memory/MemoryDocValuesConsumer.cs b/src/Lucene.Net.Codecs/Memory/MemoryDocValuesConsumer.cs
index a51df00..4613d28 100644
--- a/src/Lucene.Net.Codecs/Memory/MemoryDocValuesConsumer.cs
+++ b/src/Lucene.Net.Codecs/Memory/MemoryDocValuesConsumer.cs
@@ -1,13 +1,4 @@
-using System;
-using System.Diagnostics;
-using System.Collections.Generic;
-using Lucene.Net.Codecs.Memory;
-using Lucene.Net.Index;
-
-namespace Lucene.Net.Codecs.Memory
-{
-
- /*
+ /*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -24,524 +15,472 @@ namespace Lucene.Net.Codecs.Memory
* limitations under the License.
*/
+using System;
+using System.Collections;
+using System.Diagnostics;
+using System.Collections.Generic;
+using Lucene.Net.Codecs.Memory;
+using Lucene.Net.Index;
+using Lucene.Net.Util.Fst;
+
+namespace Lucene.Net.Codecs.Memory
+{
- using FieldInfo = index.FieldInfo;
- using IndexFileNames = index.IndexFileNames;
- using SegmentWriteState = index.SegmentWriteState;
- using ByteArrayDataOutput = store.ByteArrayDataOutput;
- using IndexOutput = store.IndexOutput;
- using ArrayUtil = util.ArrayUtil;
- using BytesRef = util.BytesRef;
- using IOUtils = util.IOUtils;
- using IntsRef = util.IntsRef;
- using MathUtil = util.MathUtil;
- using Builder = util.fst.Builder;
- using INPUT_TYPE = util.fst.FST.INPUT_TYPE;
- using FST = util.fst.FST;
- using PositiveIntOutputs = util.fst.PositiveIntOutputs;
- using Util = util.fst.Util;
- using BlockPackedWriter = util.packed.BlockPackedWriter;
- using MonotonicBlockPackedWriter = util.packed.MonotonicBlockPackedWriter;
- using FormatAndBits = util.packed.PackedInts.FormatAndBits;
- using PackedInts = util.packed.PackedInts;
-
-//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to .NET:
- import static Lucene.Net.Codecs.Memory.MemoryDocValuesProducer.VERSION_CURRENT;
-//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to .NET:
- import static Lucene.Net.Codecs.Memory.MemoryDocValuesProducer.BLOCK_SIZE;
-//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to .NET:
- import static Lucene.Net.Codecs.Memory.MemoryDocValuesProducer.BYTES;
-//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to .NET:
- import static Lucene.Net.Codecs.Memory.MemoryDocValuesProducer.NUMBER;
-//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to .NET:
- import static Lucene.Net.Codecs.Memory.MemoryDocValuesProducer.FST;
-//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to .NET:
- import static Lucene.Net.Codecs.Memory.MemoryDocValuesProducer.DELTA_COMPRESSED;
-//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to .NET:
- import static Lucene.Net.Codecs.Memory.MemoryDocValuesProducer.GCD_COMPRESSED;
-//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to .NET:
- import static Lucene.Net.Codecs.Memory.MemoryDocValuesProducer.TABLE_COMPRESSED;
-//JAVA TO C# CONVERTER TODO TASK: This Java 'import static' statement cannot be converted to .NET:
- import static Lucene.Net.Codecs.Memory.MemoryDocValuesProducer.UNCOMPRESSED;
-
- /// <summary>
- /// Writer for <seealso cref="MemoryDocValuesFormat"/>
- /// </summary>
- internal class MemoryDocValuesConsumer : DocValuesConsumer
- {
- internal IndexOutput data, meta;
- internal readonly int maxDoc;
- internal readonly float acceptableOverheadRatio;
-
- internal MemoryDocValuesConsumer(SegmentWriteState state, string dataCodec, string dataExtension, string metaCodec, string metaExtension, float acceptableOverheadRatio)
- {
- this.acceptableOverheadRatio = acceptableOverheadRatio;
- maxDoc = state.segmentInfo.DocCount;
- bool success = false;
- try
- {
- string dataName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
- data = state.directory.createOutput(dataName, state.context);
- CodecUtil.writeHeader(data, dataCodec, VERSION_CURRENT);
- string metaName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
- meta = state.directory.createOutput(metaName, state.context);
- CodecUtil.writeHeader(meta, metaCodec, VERSION_CURRENT);
- success = true;
- }
- finally
- {
- if (!success)
- {
- IOUtils.closeWhileHandlingException(this);
- }
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void addNumericField(index.FieldInfo field, Iterable<Number> values) throws java.io.IOException
- public override void addNumericField(FieldInfo field, IEnumerable<Number> values)
- {
- addNumericField(field, values, true);
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: void addNumericField(index.FieldInfo field, Iterable<Number> values, boolean optimizeStorage) throws java.io.IOException
- internal virtual void addNumericField(FieldInfo field, IEnumerable<Number> values, bool optimizeStorage)
- {
- meta.writeVInt(field.number);
- meta.writeByte(NUMBER);
- meta.writeLong(data.FilePointer);
- long minValue = long.MaxValue;
- long maxValue = long.MinValue;
- long gcd = 0;
- bool missing = false;
- // TODO: more efficient?
- HashSet<long?> uniqueValues = null;
- if (optimizeStorage)
- {
- uniqueValues = new HashSet<>();
-
- long count = 0;
- foreach (Number nv in values)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final long v;
- long v;
- if (nv == null)
- {
- v = 0;
- missing = true;
- }
- else
- {
- v = (long)nv;
- }
-
- if (gcd != 1)
- {
- if (v < long.MinValue / 2 || v > long.MaxValue / 2)
- {
- // in that case v - minValue might overflow and make the GCD computation return
- // wrong results. Since these extreme values are unlikely, we just discard
- // GCD computation for them
- gcd = 1;
- } // minValue needs to be set first
- else if (count != 0)
- {
- gcd = MathUtil.gcd(gcd, v - minValue);
- }
- }
-
- minValue = Math.Min(minValue, v);
- maxValue = Math.Max(maxValue, v);
-
- if (uniqueValues != null)
- {
- if (uniqueValues.Add(v))
- {
- if (uniqueValues.Count > 256)
- {
- uniqueValues = null;
- }
- }
- }
-
- ++count;
- }
- Debug.Assert(count == maxDoc);
- }
-
- if (missing)
- {
- long start = data.FilePointer;
- writeMissingBitset(values);
- meta.writeLong(start);
- meta.writeLong(data.FilePointer - start);
- }
- else
- {
- meta.writeLong(-1L);
- }
-
- if (uniqueValues != null)
- {
- // small number of unique values
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int bitsPerValue = util.packed.PackedInts.bitsRequired(uniqueValues.size()-1);
- int bitsPerValue = PackedInts.bitsRequired(uniqueValues.Count - 1);
- FormatAndBits formatAndBits = PackedInts.fastestFormatAndBits(maxDoc, bitsPerValue, acceptableOverheadRatio);
- if (formatAndBits.bitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
- {
- meta.writeByte(UNCOMPRESSED); // uncompressed
- foreach (Number nv in values)
- {
- data.writeByte(nv == null ? 0 : (long)(sbyte) nv);
- }
- }
- else
- {
- meta.writeByte(TABLE_COMPRESSED); // table-compressed
- long?[] decode = uniqueValues.toArray(new long?[uniqueValues.Count]);
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final java.util.HashMap<Long,Integer> encode = new java.util.HashMap<>();
- Dictionary<long?, int?> encode = new Dictionary<long?, int?>();
- data.writeVInt(decode.Length);
- for (int i = 0; i < decode.Length; i++)
- {
- data.writeLong(decode[i]);
- encode[decode[i]] = i;
- }
-
- meta.writeVInt(PackedInts.VERSION_CURRENT);
- data.writeVInt(formatAndBits.format.Id);
- data.writeVInt(formatAndBits.bitsPerValue);
-
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final util.packed.PackedInts.Writer writer = util.packed.PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, util.packed.PackedInts.DEFAULT_BUFFER_SIZE);
- PackedInts.Writer writer = PackedInts.getWriterNoHeader(data, formatAndBits.format, maxDoc, formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
- foreach (Number nv in values)
- {
- writer.add(encode[nv == null ? 0 : (long)nv]);
- }
- writer.finish();
- }
- }
- else if (gcd != 0 && gcd != 1)
- {
- meta.writeByte(GCD_COMPRESSED);
- meta.writeVInt(PackedInts.VERSION_CURRENT);
- data.writeLong(minValue);
- data.writeLong(gcd);
- data.writeVInt(BLOCK_SIZE);
-
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final util.packed.BlockPackedWriter writer = new util.packed.BlockPackedWriter(data, BLOCK_SIZE);
- BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
- foreach (Number nv in values)
- {
- long value = nv == null ? 0 : (long)nv;
- writer.add((value - minValue) / gcd);
- }
- writer.finish();
- }
- else
- {
- meta.writeByte(DELTA_COMPRESSED); // delta-compressed
-
- meta.writeVInt(PackedInts.VERSION_CURRENT);
- data.writeVInt(BLOCK_SIZE);
-
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final util.packed.BlockPackedWriter writer = new util.packed.BlockPackedWriter(data, BLOCK_SIZE);
- BlockPackedWriter writer = new BlockPackedWriter(data, BLOCK_SIZE);
- foreach (Number nv in values)
- {
- writer.add(nv == null ? 0 : (long)nv);
- }
- writer.finish();
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void close() throws java.io.IOException
- public override void close()
- {
- bool success = false;
- try
- {
- if (meta != null)
- {
- meta.writeVInt(-1); // write EOF marker
- CodecUtil.writeFooter(meta); // write checksum
- }
- if (data != null)
- {
- CodecUtil.writeFooter(data);
- }
- success = true;
- }
- finally
- {
- if (success)
- {
- IOUtils.close(data, meta);
- }
- else
- {
- IOUtils.closeWhileHandlingException(data, meta);
- }
- data = meta = null;
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void addBinaryField(index.FieldInfo field, final Iterable<util.BytesRef> values) throws java.io.IOException
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
- public override void addBinaryField(FieldInfo field, IEnumerable<BytesRef> values)
- {
- // write the byte[] data
- meta.writeVInt(field.number);
- meta.writeByte(BYTES);
- int minLength = int.MaxValue;
- int maxLength = int.MinValue;
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final long startFP = data.getFilePointer();
- long startFP = data.FilePointer;
- bool missing = false;
- foreach (BytesRef v in values)
- {
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final int length;
- int length;
- if (v == null)
- {
- length = 0;
- missing = true;
- }
- else
- {
- length = v.length;
- }
- if (length > MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH)
- {
- throw new System.ArgumentException("DocValuesField \"" + field.name + "\" is too large, must be <= " + MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH);
- }
- minLength = Math.Min(minLength, length);
- maxLength = Math.Max(maxLength, length);
- if (v != null)
- {
- data.writeBytes(v.bytes, v.offset, v.length);
- }
- }
- meta.writeLong(startFP);
- meta.writeLong(data.FilePointer - startFP);
- if (missing)
- {
- long start = data.FilePointer;
- writeMissingBitset(values);
- meta.writeLong(start);
- meta.writeLong(data.FilePointer - start);
- }
- else
- {
- meta.writeLong(-1L);
- }
- meta.writeVInt(minLength);
- meta.writeVInt(maxLength);
-
- // if minLength == maxLength, its a fixed-length byte[], we are done (the addresses are implicit)
- // otherwise, we need to record the length fields...
- if (minLength != maxLength)
- {
- meta.writeVInt(PackedInts.VERSION_CURRENT);
- meta.writeVInt(BLOCK_SIZE);
-
-//JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
-//ORIGINAL LINE: final util.packed.MonotonicBlockPackedWriter writer = new util.packed.MonotonicBlockPackedWriter(data, BLOCK_SIZE);
- MonotonicBlockPackedWriter writer = new MonotonicBlockPackedWriter(data, BLOCK_SIZE);
- long addr = 0;
- foreach (BytesRef v in values)
- {
- if (v != null)
- {
- addr += v.length;
- }
- writer.add(addr);
- }
- writer.finish();
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void writeFST(index.FieldInfo field, Iterable<util.BytesRef> values) throws java.io.IOException
- private void writeFST(FieldInfo field, IEnumerable<BytesRef> values)
- {
- meta.writeVInt(field.number);
- meta.writeByte(FST);
- meta.writeLong(data.FilePointer);
- PositiveIntOutputs outputs = PositiveIntOutputs.Singleton;
- Builder<long?> builder = new Builder<long?>(INPUT_TYPE.BYTE1, outputs);
- IntsRef scratch = new IntsRef();
- long ord = 0;
- foreach (BytesRef v in values)
- {
- builder.add(Util.toIntsRef(v, scratch), ord);
- ord++;
- }
- FST<long?> fst = builder.finish();
- if (fst != null)
- {
- fst.save(data);
- }
- meta.writeVLong(ord);
- }
-
- // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
- // but this is very simple, and algorithms only check this for values of 0 anyway (doesnt slow down normal decode)
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: void writeMissingBitset(Iterable<?> values) throws java.io.IOException
- internal virtual void writeMissingBitset<T1>(IEnumerable<T1> values)
- {
- long bits = 0;
- int count = 0;
- foreach (object v in values)
- {
- if (count == 64)
- {
- data.writeLong(bits);
- count = 0;
- bits = 0;
- }
- if (v != null)
- {
- bits |= 1L << (count & 0x3f);
- }
- count++;
- }
- if (count > 0)
- {
- data.writeLong(bits);
- }
- }
-
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void addSortedField(index.FieldInfo field, Iterable<util.BytesRef> values, Iterable<Number> docToOrd) throws java.io.IOException
- public override void addSortedField(FieldInfo field, IEnumerable<BytesRef> values, IEnumerable<Number> docToOrd)
- {
- // write the ordinals as numerics
- addNumericField(field, docToOrd, false);
-
- // write the values as FST
- writeFST(field, values);
- }
-
- // note: this might not be the most efficient... but its fairly simple
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: @Override public void addSortedSetField(index.FieldInfo field, Iterable<util.BytesRef> values, final Iterable<Number> docToOrdCount, final Iterable<Number> ords) throws java.io.IOException
-//JAVA TO C# CONVERTER WARNING: 'final' parameters are not available in .NET:
- public override void addSortedSetField(FieldInfo field, IEnumerable<BytesRef> values, IEnumerable<Number> docToOrdCount, IEnumerable<Number> ords)
- {
- // write the ordinals as a binary field
- addBinaryField(field, new IterableAnonymousInnerClassHelper(this, docToOrdCount, ords));
-
- // write the values as FST
- writeFST(field, values);
- }
-
- private class IterableAnonymousInnerClassHelper : IEnumerable<BytesRef>
- {
- private readonly MemoryDocValuesConsumer outerInstance;
-
- private IEnumerable<Number> docToOrdCount;
- private IEnumerable<Number> ords;
-
- public IterableAnonymousInnerClassHelper(MemoryDocValuesConsumer outerInstance, IEnumerable<Number> docToOrdCount, IEnumerable<Number> ords)
- {
- this.outerInstance = outerInstance;
- this.docToOrdCount = docToOrdCount;
- this.ords = ords;
- }
-
- public virtual IEnumerator<BytesRef> GetEnumerator()
- {
- return new SortedSetIterator(docToOrdCount.GetEnumerator(), ords.GetEnumerator());
- }
- }
-
- // per-document vint-encoded byte[]
- internal class SortedSetIterator : IEnumerator<BytesRef>
- {
- internal sbyte[] buffer = new sbyte[10];
- internal ByteArrayDataOutput @out = new ByteArrayDataOutput();
- internal BytesRef @ref = new BytesRef();
-
- internal readonly IEnumerator<Number> counts;
- internal readonly IEnumerator<Number> ords;
-
- internal SortedSetIterator(IEnumerator<Number> counts, IEnumerator<Number> ords)
- {
- this.counts = counts;
- this.ords = ords;
- }
-
- public override bool hasNext()
- {
+
+ using FieldInfo = Index.FieldInfo;
+ using IndexFileNames = Index.IndexFileNames;
+ using SegmentWriteState = Index.SegmentWriteState;
+ using ByteArrayDataOutput = Store.ByteArrayDataOutput;
+ using IndexOutput = Store.IndexOutput;
+ using ArrayUtil = Util.ArrayUtil;
+ using BytesRef = Util.BytesRef;
+ using IOUtils = Util.IOUtils;
+ using IntsRef = Util.IntsRef;
+ using MathUtil = Util.MathUtil;
+ using Builder = Util.Fst.Builder;
+ using INPUT_TYPE = Util.Fst.FST.INPUT_TYPE;
+ using FST = Util.Fst.FST;
+ using PositiveIntOutputs = Util.Fst.PositiveIntOutputs;
+ using Util = Util.Fst.Util;
+ using BlockPackedWriter = Util.Packed.BlockPackedWriter;
+ using MonotonicBlockPackedWriter = Util.Packed.MonotonicBlockPackedWriter;
+ using FormatAndBits = Util.Packed.PackedInts.FormatAndBits;
+ using PackedInts = Util.Packed.PackedInts;
+
+ /// <summary>
+ /// Writer for <see cref="MemoryDocValuesFormat"/>
+ /// </summary>
+ internal class MemoryDocValuesConsumer : DocValuesConsumer
+ {
+ internal IndexOutput data, meta; // data and metadata outputs, both created in the constructor
+ internal readonly int maxDoc; // state.SegmentInfo.DocCount at construction; AddNumericField asserts it saw this many values
+ internal readonly float acceptableOverheadRatio; // passed to PackedInts.FastestFormatAndBits when choosing a packed format
+
+ /// <summary>
+ /// Creates the data and meta outputs for the segment and writes a codec header
+ /// (at MemoryDocValuesProducer.VERSION_CURRENT) to each. If any step throws, this
+ /// consumer is closed via IOUtils.CloseWhileHandlingException before the exception propagates.
+ /// </summary>
+ /// <param name="state">Segment write state supplying directory, segment name, suffix, context and doc count.</param>
+ /// <param name="dataCodec">Codec name written to the data file header.</param>
+ /// <param name="dataExtension">File extension of the data file.</param>
+ /// <param name="metaCodec">Codec name written to the meta file header.</param>
+ /// <param name="metaExtension">File extension of the meta file.</param>
+ /// <param name="acceptableOverheadRatio">Stored for later use when choosing packed formats.</param>
+ internal MemoryDocValuesConsumer(SegmentWriteState state, string dataCodec, string dataExtension,
+     string metaCodec,
+     string metaExtension, float acceptableOverheadRatio)
+ {
+     this.acceptableOverheadRatio = acceptableOverheadRatio;
+     maxDoc = state.SegmentInfo.DocCount;
+
+     // Flipped to true once both files exist and carry their headers.
+     var opened = false;
+     try
+     {
+         // Data file first ...
+         var dataFileName = IndexFileNames.SegmentFileName(
+             state.SegmentInfo.Name, state.SegmentSuffix, dataExtension);
+         data = state.Directory.CreateOutput(dataFileName, state.Context);
+         CodecUtil.WriteHeader(data, dataCodec, MemoryDocValuesProducer.VERSION_CURRENT);
+
+         // ... then the meta file.
+         var metaFileName = IndexFileNames.SegmentFileName(
+             state.SegmentInfo.Name, state.SegmentSuffix, metaExtension);
+         meta = state.Directory.CreateOutput(metaFileName, state.Context);
+         CodecUtil.WriteHeader(meta, metaCodec, MemoryDocValuesProducer.VERSION_CURRENT);
+
+         opened = true;
+     }
+     finally
+     {
+         if (!opened)
+         {
+             IOUtils.CloseWhileHandlingException(this);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Writes a numeric field, letting the writer pick the most compact encoding
+ /// (delegates to the three-argument overload with storage optimization enabled).
+ /// </summary>
+ public override void AddNumericField(FieldInfo field, IEnumerable<long> values)
+ {
+     const bool optimizeStorage = true;
+     AddNumericField(field, values, optimizeStorage);
+ }
+
+ /// <summary>
+ /// Writes one numeric field. The metadata stream receives the field number, the NUMBER
+ /// type tag, the start pointer into the data stream, the missing-bitset pointer and the
+ /// tag of the chosen encoding; the data stream receives the encoded values.
+ /// </summary>
+ /// <param name="field">field whose number is recorded in the metadata</param>
+ /// <param name="values">one value per document; enumerated more than once</param>
+ /// <param name="optimizeStorage">
+ /// when true, a first pass gathers min/max, the GCD of the deltas and up to 256 distinct
+ /// values so that uncompressed-byte, table or GCD encoding can be selected; when false
+ /// no statistics are gathered and the values are always delta-compressed.
+ /// </param>
+ internal virtual void AddNumericField(FieldInfo field, IEnumerable<long> values, bool optimizeStorage)
+ {
+     meta.WriteVInt(field.Number);
+     meta.WriteByte(MemoryDocValuesProducer.NUMBER);
+     meta.WriteLong(data.FilePointer);
+     long minValue = long.MaxValue;
+     long maxValue = long.MinValue;
+     long gcd = 0;
+     // TODO: more efficient?
+     HashSet<long> uniqueValues = null;
+     if (optimizeStorage)
+     {
+         uniqueValues = new HashSet<long>();
+
+         long count = 0;
+         foreach (long v in values)
+         {
+             if (gcd != 1)
+             {
+                 if (v < long.MinValue/2 || v > long.MaxValue/2)
+                 {
+                     // in that case v - minValue might overflow and make the GCD computation return
+                     // wrong results. Since these extreme values are unlikely, we just discard
+                     // GCD computation for them
+                     gcd = 1;
+                 } // minValue needs to be set first
+                 else if (count != 0)
+                 {
+                     gcd = MathUtil.Gcd(gcd, v - minValue);
+                 }
+             }
+
+             minValue = Math.Min(minValue, v);
+             maxValue = Math.Max(maxValue, v);
+
+             // stop tracking distinct values once there are too many for table compression
+             if (uniqueValues != null && uniqueValues.Add(v) && uniqueValues.Count > 256)
+             {
+                 uniqueValues = null;
+             }
+
+             ++count;
+         }
+         Debug.Assert(count == maxDoc);
+     }
+
+     // The values are non-nullable longs, so a document can never be "missing" here:
+     // always record -1 as the missing-bitset pointer.
+     meta.WriteLong(-1L);
+
+     if (uniqueValues != null)
+     {
+         // small number of unique values
+
+         int bitsPerValue = PackedInts.BitsRequired(uniqueValues.Count - 1);
+         FormatAndBits formatAndBits = PackedInts.FastestFormatAndBits(maxDoc, bitsPerValue,
+             acceptableOverheadRatio);
+         if (formatAndBits.bitsPerValue == 8 && minValue >= sbyte.MinValue && maxValue <= sbyte.MaxValue)
+         {
+             meta.WriteByte(MemoryDocValuesProducer.UNCOMPRESSED); // uncompressed
+             foreach (long v in values)
+             {
+                 // every value fits in a signed byte (checked above); keep its two's-complement bits
+                 data.WriteByte(unchecked((byte) v));
+             }
+         }
+         else
+         {
+             meta.WriteByte(MemoryDocValuesProducer.TABLE_COMPRESSED); // table-compressed
+             var decode = new long[uniqueValues.Count];
+             uniqueValues.CopyTo(decode);
+
+             // value -> index into the decode table
+             var encode = new Dictionary<long, int>(decode.Length);
+             data.WriteVInt(decode.Length);
+             for (int i = 0; i < decode.Length; i++)
+             {
+                 data.WriteLong(decode[i]);
+                 encode[decode[i]] = i;
+             }
+
+             meta.WriteVInt(PackedInts.VERSION_CURRENT);
+             data.WriteVInt(formatAndBits.format.Id);
+             data.WriteVInt(formatAndBits.bitsPerValue);
+
+             PackedInts.Writer writer = PackedInts.GetWriterNoHeader(data, formatAndBits.format, maxDoc,
+                 formatAndBits.bitsPerValue, PackedInts.DEFAULT_BUFFER_SIZE);
+             foreach (long v in values)
+             {
+                 writer.Add(encode[v]);
+             }
+             writer.Finish();
+         }
+     }
+     else if (gcd != 0 && gcd != 1)
+     {
+         meta.WriteByte(MemoryDocValuesProducer.GCD_COMPRESSED);
+         meta.WriteVInt(PackedInts.VERSION_CURRENT);
+         data.WriteLong(minValue);
+         data.WriteLong(gcd);
+         data.WriteVInt(MemoryDocValuesProducer.BLOCK_SIZE);
+
+         var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
+         foreach (long v in values)
+         {
+             writer.Add((v - minValue)/gcd);
+         }
+         writer.Finish();
+     }
+     else
+     {
+         meta.WriteByte(MemoryDocValuesProducer.DELTA_COMPRESSED); // delta-compressed
+
+         meta.WriteVInt(PackedInts.VERSION_CURRENT);
+         data.WriteVInt(MemoryDocValuesProducer.BLOCK_SIZE);
+
+         var writer = new BlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
+         foreach (long v in values)
+         {
+             writer.Add(v);
+         }
+         writer.Finish();
+     }
+ }
+
+ /// <summary>
+ /// Finishes and closes both outputs: writes the -1 EOF marker and a footer to the
+ /// metadata stream and a footer to the data stream, then closes them and clears the fields.
+ /// </summary>
+ /// <param name="disposing">
+ /// true for an explicit dispose, in which case the files are finalized and closed;
+ /// false means nothing is done.
+ /// </param>
+ protected override void Dispose(bool disposing)
+ {
+     // Only act on an explicit dispose. The guard used to be inverted, which made a
+     // normal Dispose() return without writing the footers or closing the outputs.
+     if (!disposing)
+     {
+         return;
+     }
+
+     var success = false;
+     try
+     {
+         if (meta != null)
+         {
+             meta.WriteVInt(-1); // write EOF marker
+             CodecUtil.WriteFooter(meta); // write checksum
+         }
+         if (data != null)
+         {
+             CodecUtil.WriteFooter(data);
+         }
+         success = true;
+     }
+     finally
+     {
+         if (success)
+         {
+             IOUtils.Close(data, meta);
+         }
+         else
+         {
+             // a failure is already propagating; close via the exception-handling variant
+             IOUtils.CloseWhileHandlingException(data, meta);
+         }
+         // null checks above make a second Dispose call skip the writes
+         data = meta = null;
+     }
+ }
+
+ /// <summary>
+ /// Writes a binary field: all non-null values are concatenated into the data stream,
+ /// followed by an optional missing bitset and, when lengths vary, the running end offsets.
+ /// The metadata stream records where each part lives plus the min/max value length.
+ /// </summary>
+ /// <param name="field">field whose number is recorded and whose name appears in the size error</param>
+ /// <param name="values">one entry per document, null meaning "no value"; enumerated more than once</param>
+ /// <exception cref="ArgumentException">a value is longer than MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH</exception>
+ public override void AddBinaryField(FieldInfo field, IEnumerable<BytesRef> values)
+ {
+     meta.WriteVInt(field.Number);
+     meta.WriteByte(MemoryDocValuesProducer.BYTES);
+
+     long payloadStart = data.FilePointer;
+     int shortest = int.MaxValue;
+     int longest = int.MinValue;
+     bool sawNull = false;
+
+     // pass 1: raw bytes, back to back
+     foreach (BytesRef value in values)
+     {
+         int length = 0;
+         if (value != null)
+         {
+             length = value.Length;
+         }
+         else
+         {
+             sawNull = true;
+         }
+
+         if (length > MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH)
+         {
+             throw new ArgumentException("DocValuesField \"" + field.Name + "\" is too large, must be <= " +
+                                         MemoryDocValuesFormat.MAX_BINARY_FIELD_LENGTH);
+         }
+
+         shortest = Math.Min(shortest, length);
+         longest = Math.Max(longest, length);
+
+         if (value != null)
+         {
+             data.WriteBytes(value.Bytes, value.Offset, value.Length);
+         }
+     }
+     meta.WriteLong(payloadStart);
+     meta.WriteLong(data.FilePointer - payloadStart);
+
+     // pass 2 (only if some document had no value): bitset of documents that have one
+     if (!sawNull)
+     {
+         meta.WriteLong(-1L);
+     }
+     else
+     {
+         long bitsetStart = data.FilePointer;
+         WriteMissingBitset(values);
+         meta.WriteLong(bitsetStart);
+         meta.WriteLong(data.FilePointer - bitsetStart);
+     }
+
+     meta.WriteVInt(shortest);
+     meta.WriteVInt(longest);
+
+     // equal min and max length means fixed-length values: addresses are implicit, nothing more to write
+     if (shortest == longest)
+     {
+         return;
+     }
+
+     // pass 3: variable-length values need the cumulative end offset of every document
+     meta.WriteVInt(PackedInts.VERSION_CURRENT);
+     meta.WriteVInt(MemoryDocValuesProducer.BLOCK_SIZE);
+
+     var offsets = new MonotonicBlockPackedWriter(data, MemoryDocValuesProducer.BLOCK_SIZE);
+     long endOffset = 0;
+     foreach (BytesRef value in values)
+     {
+         if (value != null)
+         {
+             endOffset += value.Length;
+         }
+         offsets.Add(endOffset);
+     }
+     offsets.Finish();
+ }
+
+ /// <summary>
+ /// Writes the terms of a sorted / sorted-set field as an FST that maps each term to its
+ /// ordinal (its position in <paramref name="values"/>). The metadata stream gets the field
+ /// number, the FST type tag, the start pointer into the data stream and the term count.
+ /// </summary>
+ /// <param name="field">field whose number is recorded in the metadata</param>
+ /// <param name="values">the terms, enumerated once; ordinals are assigned in enumeration order</param>
+ private void WriteFST(FieldInfo field, IEnumerable<BytesRef> values)
+ {
+     meta.WriteVInt(field.Number);
+     // Use the producer's type-tag constant, like the NUMBER and BYTES writers do;
+     // a bare "FST" resolves to the FST type alias in this file, not to a byte value.
+     meta.WriteByte(MemoryDocValuesProducer.FST);
+     meta.WriteLong(data.FilePointer);
+     PositiveIntOutputs outputs = PositiveIntOutputs.Singleton;
+     var builder = new Builder<long?>(INPUT_TYPE.BYTE1, outputs);
+     var scratch = new IntsRef();
+     long ord = 0;
+     foreach (BytesRef v in values)
+     {
+         builder.Add(Util.ToIntsRef(v, scratch), ord);
+         ord++;
+     }
+     FST<long?> fst = builder.Finish();
+     // Finish() may hand back null (nothing is saved then); the term count is still recorded
+     if (fst != null)
+     {
+         fst.Save(data);
+     }
+     meta.WriteVLong(ord);
+ }
+
+ // TODO: in some cases representing missing with minValue-1 wouldn't take up additional space and so on,
+ // but this is very simple, and algorithms only check this for values of 0 anyway (doesn't slow down normal decode)
+
+ /// <summary>
+ /// Writes a bitset to the data stream with one bit per element of <paramref name="values"/>:
+ /// the bit is set when the element is non-null. Bits are packed 64 to a long, lowest bit
+ /// first, and a final partially filled long is written if any bits remain.
+ /// </summary>
+ internal virtual void WriteMissingBitset<T1>(IEnumerable<T1> values)
+ {
+     long word = 0;
+     int used = 0; // number of bit positions consumed in the current word
+     foreach (object value in values)
+     {
+         // the current word is full: flush it before placing the next bit
+         if (used == 64)
+         {
+             data.WriteLong(word);
+             word = 0;
+             used = 0;
+         }
+
+         bool present = value != null;
+         if (present)
+         {
+             // used is always in [0, 63] here
+             word |= 1L << used;
+         }
+         used++;
+     }
+
+     if (used > 0)
+     {
+         data.WriteLong(word);
+     }
+ }
+
+ /// <summary>
+ /// Writes a sorted field as two parts: the per-document ordinals as a numeric field
+ /// (without storage optimization), then the terms as an FST.
+ /// </summary>
+ public override void AddSortedField(FieldInfo field, IEnumerable<BytesRef> values, IEnumerable<long> docToOrd)
+ {
+     // part 1: ordinals, written as plain numerics
+     AddNumericField(field, docToOrd, optimizeStorage: false);
+
+     // part 2: the term dictionary
+     WriteFST(field, values);
+ }
+
+ /// <summary>
+ /// Writes a sorted-set field as two parts: each document's ordinals packed into one
+ /// binary value, then the terms as an FST.
+ /// </summary>
+ // note: this might not be the most efficient... but it's fairly simple
+ public override void AddSortedSetField(FieldInfo field, IEnumerable<BytesRef> values,
+     IEnumerable<long> docToOrdCount, IEnumerable<long> ords)
+ {
+     // part 1: per-document ordinal lists, exposed as a sequence of byte[] values
+     IEnumerable<BytesRef> encodedOrds = new IterableAnonymousInnerClassHelper(this, docToOrdCount, ords);
+     AddBinaryField(field, encodedOrds);
+
+     // part 2: the term dictionary
+     WriteFST(field, values);
+ }
+
+ /// <summary>
+ /// Sequence of per-document binary values for a sorted-set field. Every enumeration
+ /// starts a fresh <see cref="SortedSetIterator"/> over new enumerators of the two
+ /// underlying sequences, so it can be enumerated repeatedly.
+ /// </summary>
+ private class IterableAnonymousInnerClassHelper : IEnumerable<BytesRef>
+ {
+     private readonly IEnumerable<long> _docToOrdCount;
+     private readonly IEnumerable<long> _ords;
+
+     /// <param name="outerInstance">not used; kept so existing call sites stay valid</param>
+     /// <param name="docToOrdCount">number of ordinals for each document</param>
+     /// <param name="ords">all ordinals, document after document</param>
+     public IterableAnonymousInnerClassHelper(MemoryDocValuesConsumer outerInstance,
+         IEnumerable<long> docToOrdCount, IEnumerable<long> ords)
+     {
+         _docToOrdCount = docToOrdCount;
+         _ords = ords;
+     }
+
+     public IEnumerator<BytesRef> GetEnumerator()
+     {
+         return new SortedSetIterator(_docToOrdCount.GetEnumerator(), _ords.GetEnumerator());
+     }
+
+     // IEnumerable<T> extends the non-generic IEnumerable, whose GetEnumerator must be
+     // implemented as well; forward it to the generic one.
+     System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator()
+     {
+         return GetEnumerator();
+     }
+ }
+
+ // Iterator yielding one vint-encoded byte[] per document: Next() takes a count from 'counts', then EncodeValues() reads that many values from 'ords' and writes them as vlong deltas into a reused buffer.
+ internal class SortedSetIterator : IEnumerator<BytesRef> // NOTE(review): declares IEnumerator<BytesRef> but only defines Java-style HasNext/Next/Remove; no MoveNext/Current/Reset/Dispose is visible here - confirm
+ {
+ internal sbyte[] buffer = new sbyte[10]; // scratch buffer; Next() grows it when count*9 exceeds its length
+ internal ByteArrayDataOutput @out = new ByteArrayDataOutput(); // reset onto 'buffer' at the start of every EncodeValues() call
+ internal BytesRef @ref = new BytesRef(); // single instance, re-pointed at 'buffer' and returned by every Next() call
+
+ internal readonly IEnumerator<long> counts; // per-document number of ordinals (one element consumed per Next() call)
+ internal readonly IEnumerator<long> ords; // the ordinals themselves, consumed 'count' at a time by EncodeValues()
+
+ internal SortedSetIterator(IEnumerator<long> counts, IEnumerator<long> ords)
+ {
+ this.counts = counts;
+ this.ords = ords;
+ }
+
+ public override bool HasNext() // NOTE(review): 'override' has no visible base member to override - confirm
+ {
//JAVA TO C# CONVERTER TODO TASK: Java iterators are only converted within the context of 'while' and 'for' loops:
- return counts.hasNext();
- }
+ return counts.hasNext(); // NOTE(review): Java iterator call left on an IEnumerator<long> - presumably needs porting to MoveNext(); confirm
+ }
- public override BytesRef next()
- {
- if (!hasNext())
- {
- throw new NoSuchElementException();
- }
+ public override BytesRef Next() // encodes the next document's ordinals and returns the shared BytesRef
+ {
+ if (!HasNext())
+ {
+ throw new ArgumentOutOfRangeException(); // stands in for the NoSuchElementException thrown by the removed Java-style code
+ }
//JAVA TO C# CONVERTER TODO TASK: Java iterators are only converted within the context of 'while' and 'for' loops:
- int count = (int)counts.next();
- int maxSize = count * 9; // worst case
- if (maxSize > buffer.Length)
- {
- buffer = ArrayUtil.grow(buffer, maxSize);
- }
-
- try
- {
- encodeValues(count);
- }
- catch (IOException bogus)
- {
- throw new Exception(bogus);
- }
-
- @ref.bytes = buffer;
- @ref.offset = 0;
- @ref.length = @out.Position;
-
- return @ref;
- }
-
- // encodes count values to buffer
-//JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
-//ORIGINAL LINE: private void encodeValues(int count) throws java.io.IOException
- internal virtual void encodeValues(int count)
- {
- @out.reset(buffer);
- long lastOrd = 0;
- for (int i = 0; i < count; i++)
- {
+ int count = (int) counts.next(); // NOTE(review): Java iterator call, see HasNext()
+ int maxSize = count*9; // worst case
+ if (maxSize > buffer.Length)
+ {
+ buffer = ArrayUtil.Grow(buffer, maxSize);
+ }
+
+ EncodeValues(count);
+
+
+ @ref.Bytes = buffer;
+ @ref.Offset = 0;
+ @ref.Length = @out.Position; // presumably the number of bytes EncodeValues() just wrote - confirm against ByteArrayDataOutput
+
+ return @ref;
+ }
+
+ // encodes count values to buffer, each written as a vlong delta from the previous ordinal (the first one relative to 0)
+ internal virtual void EncodeValues(int count)
+ {
+ @out.Reset(buffer);
+ long lastOrd = 0;
+ for (int i = 0; i < count; i++)
+ {
//JAVA TO C# CONVERTER TODO TASK: Java iterators are only converted within the context of 'while' and 'for' loops:
- long ord = (long)ords.next();
- @out.writeVLong(ord - lastOrd);
- lastOrd = ord;
- }
- }
-
- public override void remove()
- {
- throw new System.NotSupportedException();
- }
- }
- }
-
+ long ord = (long) ords.next(); // NOTE(review): Java iterator call, see HasNext()
+ @out.writeVLong(ord - lastOrd); // NOTE(review): lower-case Java method name, unlike the PascalCase Reset() above - confirm
+ lastOrd = ord;
+ }
+ }
+
+ public override void Remove() // removal is not supported by this iterator
+ {
+ throw new NotSupportedException();
+ }
+ }
+ }
}
\ No newline at end of file