You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/25 19:52:05 UTC
[04/16] lucenenet git commit: Move facets into src folder
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/eea269f3/src/Lucene.Net.Facet/Taxonomy/Directory/Consts.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Facet/Taxonomy/Directory/Consts.cs b/src/Lucene.Net.Facet/Taxonomy/Directory/Consts.cs
new file mode 100644
index 0000000..5b69985
--- /dev/null
+++ b/src/Lucene.Net.Facet/Taxonomy/Directory/Consts.cs
@@ -0,0 +1,34 @@
+namespace Lucene.Net.Facet.Taxonomy.Directory
+{
+
+ using BytesRef = Lucene.Net.Util.BytesRef;
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Field names and token text shared by the directory-based taxonomy
+ /// reader and writer.
+ ///
+ /// @lucene.experimental
+ /// </summary>
+ internal abstract class Consts
+ {
+ // Name of the index field holding a category's full path. The reader looks
+ // categories up by term in this field and reads its stored value back.
+ internal const string FULL = "$full_path$";
+ // Name of the index field the writer feeds with its parent token stream.
+ internal const string FIELD_PAYLOADS = "$payloads$";
+ // Text handed to the writer's SinglePositionTokenStream.
+ internal const string PAYLOAD_PARENT = "p";
+ // PAYLOAD_PARENT wrapped once as a BytesRef.
+ internal static readonly BytesRef PAYLOAD_PARENT_BYTES_REF = new BytesRef(PAYLOAD_PARENT);
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/eea269f3/src/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyReader.cs b/src/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyReader.cs
new file mode 100644
index 0000000..a567210
--- /dev/null
+++ b/src/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyReader.cs
@@ -0,0 +1,450 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using Lucene.Net.Store;
+
+namespace Lucene.Net.Facet.Taxonomy.Directory
+{
+ using Document = Lucene.Net.Documents.Document;
+ using Lucene.Net.Facet.Taxonomy;
+ using CorruptIndexException = Lucene.Net.Index.CorruptIndexException; // javadocs
+ using DirectoryReader = Lucene.Net.Index.DirectoryReader;
+ using DocsEnum = Lucene.Net.Index.DocsEnum;
+ using IndexWriter = Lucene.Net.Index.IndexWriter;
+ using MultiFields = Lucene.Net.Index.MultiFields;
+ using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
+ using Directory = Lucene.Net.Store.Directory;
+ using BytesRef = Lucene.Net.Util.BytesRef;
+ using IOUtils = Lucene.Net.Util.IOUtils;
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// A <seealso cref="TaxonomyReader"/> which retrieves stored taxonomy information from a
+ /// <seealso cref="Directory"/>.
+ /// <P>
+ /// Reading from the on-disk index on every method call is too slow, so this
+ /// implementation employs caching: Some methods cache recent requests and their
+ /// results, while other methods prefetch all the data into memory and then
+ /// provide answers directly from in-memory tables. See the documentation of
+ /// individual methods for comments on their performance.
+ ///
+ /// @lucene.experimental
+ /// </summary>
+ public class DirectoryTaxonomyReader : TaxonomyReader, IDisposable
+ {
+
+ /// <summary>
+ /// Mutable holder for a nullable ordinal; this is the value type stored in
+ /// the ordinal cache.
+ /// </summary>
+ public class IntClass
+ {
+ /// <summary>
+ /// The wrapped ordinal, or <c>null</c> when none has been set.
+ /// </summary>
+ public int? IntItem { get; set; }
+ }
+ // Initial maximum size handed to both LRU caches.
+ private const int DEFAULT_CACHE_VALUE = 4000;
+
+ // Writer this reader was opened from, or null for a directory-based reader.
+ private readonly DirectoryTaxonomyWriter taxoWriter;
+ private readonly long taxoEpoch; // used in doOpenIfChanged
+ private readonly DirectoryReader indexReader;
+
+ // TODO: test DoubleBarrelLRUCache and consider using it instead
+ // Both caches may be shared with readers created by DoOpenIfChanged; every
+ // cache operation in this class is performed while locking the cache instance.
+ private LRUHashMap<FacetLabel, IntClass> ordinalCache;
+ private LRUHashMap<int, FacetLabel> categoryCache;
+
+ // Created lazily by InitTaxoArrays; reset to null by DoClose.
+ private volatile TaxonomyIndexArrays taxoArrays;
+
+ /// <summary>
+ /// Creates a reader around an already opened <see cref="DirectoryReader"/>.
+ /// Called only from <see cref="DoOpenIfChanged()"/>. If the taxonomy has been
+ /// recreated, pass <c>null</c> for the caches and the parent/children arrays
+ /// so that nothing is carried over from the previous reader.
+ /// </summary>
+ /// <param name="indexReader"> the reader over the taxonomy index. </param>
+ /// <param name="taxoWriter"> the writer of an NRT reader, otherwise <c>null</c>. </param>
+ /// <param name="ordinalCache"> cache to share, or <c>null</c> to start with a new one. </param>
+ /// <param name="categoryCache"> cache to share, or <c>null</c> to start with a new one. </param>
+ /// <param name="taxoArrays"> existing arrays; when not <c>null</c> they are passed, together
+ /// with <paramref name="indexReader"/>, to a new <see cref="TaxonomyIndexArrays"/>. </param>
+ internal DirectoryTaxonomyReader(DirectoryReader indexReader, DirectoryTaxonomyWriter taxoWriter, LRUHashMap<FacetLabel, IntClass> ordinalCache, LRUHashMap<int, FacetLabel> categoryCache, TaxonomyIndexArrays taxoArrays)
+ {
+     this.indexReader = indexReader;
+     this.taxoWriter = taxoWriter;
+
+     // -1 marks "no writer"; otherwise remember the epoch seen at construction.
+     if (taxoWriter != null)
+     {
+         this.taxoEpoch = taxoWriter.TaxonomyEpoch;
+     }
+     else
+     {
+         this.taxoEpoch = -1;
+     }
+
+     // Reuse the caller's cache instances when given; see the protective
+     // checks in GetOrdinal and GetPath that make sharing safe.
+     this.ordinalCache = ordinalCache ?? new LRUHashMap<FacetLabel, IntClass>(DEFAULT_CACHE_VALUE);
+     this.categoryCache = categoryCache ?? new LRUHashMap<int, FacetLabel>(DEFAULT_CACHE_VALUE);
+
+     if (taxoArrays != null)
+     {
+         this.taxoArrays = new TaxonomyIndexArrays(indexReader, taxoArrays);
+     }
+     else
+     {
+         this.taxoArrays = null;
+     }
+ }
+
+ /// <summary>
+ /// Opens a reader over the taxonomy stored in the given <see cref="Directory"/>.
+ /// The index reader is obtained through <see cref="OpenIndexReader(Directory)"/>.
+ /// </summary>
+ /// <param name="directory">
+ ///          The <see cref="Directory"/> in which the taxonomy resides. </param>
+ /// <exception cref="CorruptIndexException">
+ ///           if the Taxonomy is corrupt. </exception>
+ /// <exception cref="IOException">
+ ///           if another error occurred. </exception>
+ public DirectoryTaxonomyReader(Directory directory)
+ {
+     this.indexReader = OpenIndexReader(directory);
+
+     // A directory-based reader has no writer, hence no writer epoch.
+     this.taxoWriter = null;
+     this.taxoEpoch = -1;
+
+     // Both caches start at the default size; it can be changed after
+     // construction through the CacheSize property.
+     this.ordinalCache = new LRUHashMap<FacetLabel, IntClass>(DEFAULT_CACHE_VALUE);
+     this.categoryCache = new LRUHashMap<int, FacetLabel>(DEFAULT_CACHE_VALUE);
+ }
+
+ /// <summary>
+ /// Opens a <see cref="DirectoryTaxonomyReader"/> over the given
+ /// <see cref="DirectoryTaxonomyWriter"/> (for NRT). The writer's current
+ /// taxonomy epoch is recorded so that <see cref="DoOpenIfChanged()"/> can
+ /// later tell whether the taxonomy was recreated.
+ /// </summary>
+ /// <param name="taxoWriter">
+ ///          The <see cref="DirectoryTaxonomyWriter"/> from which to obtain newly
+ ///          added categories, in real-time. </param>
+ public DirectoryTaxonomyReader(DirectoryTaxonomyWriter taxoWriter)
+ {
+     this.taxoWriter = taxoWriter;
+     this.taxoEpoch = taxoWriter.TaxonomyEpoch;
+     this.indexReader = OpenIndexReader(taxoWriter.InternalIndexWriter);
+
+     // Both caches start at the default size; it can be changed after
+     // construction through the CacheSize property.
+     this.ordinalCache = new LRUHashMap<FacetLabel, IntClass>(DEFAULT_CACHE_VALUE);
+     this.categoryCache = new LRUHashMap<int, FacetLabel>(DEFAULT_CACHE_VALUE);
+ }
+
+ /// <summary>
+ /// Creates <c>taxoArrays</c> from the index reader if it has not been created
+ /// yet. The check and the assignment both happen while holding the lock on
+ /// this instance.
+ /// </summary>
+ private void InitTaxoArrays()
+ {
+     lock (this)
+     {
+         if (taxoArrays != null)
+         {
+             return; // already initialized
+         }
+
+         // Build into a local first and publish afterwards: according to Java
+         // Concurrency in Practice, this might perform better on some JVMs,
+         // because the array initialization doesn't happen on the volatile member.
+         TaxonomyIndexArrays freshArrays = new TaxonomyIndexArrays(indexReader);
+         taxoArrays = freshArrays;
+     }
+ }
+
+ /// <summary>
+ /// Disposes the underlying index reader, then drops this instance's
+ /// references to the taxonomy arrays and to both caches.
+ /// </summary>
+ protected internal override void DoClose()
+ {
+     this.indexReader.Dispose();
+     this.taxoArrays = null;
+
+     // Only the references are released; the caches themselves are not
+     // cleared, as they may be used by other DTR instances.
+     this.ordinalCache = null;
+     this.categoryCache = null;
+ }
+
+ /// <summary>
+ /// Implements the opening of a new <see cref="DirectoryTaxonomyReader"/> instance if
+ /// the taxonomy has changed.
+ ///
+ /// <para>
+ /// <b>NOTE:</b> the returned <see cref="DirectoryTaxonomyReader"/> shares the
+ /// ordinal and category caches with this reader. This is not expected to cause
+ /// any issues, unless the two instances continue to live. The reader
+ /// guarantees that the two instances cannot affect each other in terms of
+ /// correctness of the caches, however if the size of the cache is changed
+ /// through <see cref="CacheSize"/>, it will affect both reader instances.
+ /// </para>
+ /// </summary>
+ /// <returns>
+ /// <c>null</c> when the underlying index has not changed; otherwise a new
+ /// reader, which reuses nothing from this one if the taxonomy was recreated.
+ /// </returns>
+ protected override TaxonomyReader DoOpenIfChanged()
+ {
+     EnsureOpen();
+
+     // This works for both NRT and non-NRT readers (i.e. an NRT reader remains NRT).
+     var r2 = DirectoryReader.OpenIfChanged(indexReader);
+     if (r2 == null)
+     {
+         return null; // no changes, nothing to do
+     }
+
+     // r2 must be closed again unless it is handed over to the new reader.
+     bool success = false;
+     try
+     {
+         // check if the taxonomy was recreated
+         bool recreated;
+         if (taxoWriter == null)
+         {
+             // Not NRT: compare the epochs stored in the commit data. The
+             // dictionary indexer throws when the key is missing, so use
+             // TryGetValue, which leaves the value null for a commit that
+             // carries no epoch.
+             string t1;
+             string t2;
+             indexReader.IndexCommit.UserData.TryGetValue(DirectoryTaxonomyWriter.INDEX_EPOCH, out t1);
+             r2.IndexCommit.UserData.TryGetValue(DirectoryTaxonomyWriter.INDEX_EPOCH, out t2);
+
+             // Two missing epochs count as "same"; otherwise the two epoch
+             // strings are required to be exactly equal.
+             recreated = !string.Equals(t1, t2, StringComparison.Ordinal);
+         }
+         else
+         {
+             // NRT, compare current taxoWriter.epoch() vs the one that was given at construction
+             recreated = taxoEpoch != taxoWriter.TaxonomyEpoch;
+         }
+
+         DirectoryTaxonomyReader newtr;
+         if (recreated)
+         {
+             // if recreated, do not reuse anything from this instance. the information
+             // will be lazily computed by the new instance when needed.
+             newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
+         }
+         else
+         {
+             newtr = new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, taxoArrays);
+         }
+
+         success = true;
+         return newtr;
+     }
+     finally
+     {
+         if (!success)
+         {
+             IOUtils.CloseWhileHandlingException(r2);
+         }
+     }
+ }
+
+ /// <summary>
+ /// Opens the <see cref="DirectoryReader"/> used by this taxonomy reader from
+ /// the given <see cref="Directory"/>.
+ /// </summary>
+ /// <param name="directory"> the directory holding the taxonomy index. </param>
+ protected virtual DirectoryReader OpenIndexReader(Directory directory)
+ {
+     DirectoryReader reader = DirectoryReader.Open(directory);
+     return reader;
+ }
+
+ /// <summary>
+ /// Opens the <see cref="DirectoryReader"/> used by this taxonomy reader from
+ /// the given <see cref="IndexWriter"/>.
+ /// </summary>
+ /// <param name="writer"> the index writer to open the reader from. </param>
+ protected virtual DirectoryReader OpenIndexReader(IndexWriter writer)
+ {
+     DirectoryReader reader = DirectoryReader.Open(writer, false);
+     return reader;
+ }
+
+ /// <summary>
+ /// Expert: returns the underlying <see cref="DirectoryReader"/> instance that is
+ /// used by this <see cref="TaxonomyReader"/>. <c>EnsureOpen()</c> is called
+ /// before the reader is returned.
+ /// </summary>
+ internal virtual DirectoryReader InternalIndexReader
+ {
+     get
+     {
+         EnsureOpen();
+         return this.indexReader;
+     }
+ }
+
+ /// <summary>
+ /// Returns the taxonomy arrays of this reader, creating them through
+ /// <c>InitTaxoArrays()</c> on first use.
+ /// </summary>
+ public override ParallelTaxonomyArrays ParallelTaxonomyArrays
+ {
+     get
+     {
+         EnsureOpen();
+
+         // Unlocked fast check; InitTaxoArrays re-checks under its lock.
+         if (this.taxoArrays == null)
+         {
+             InitTaxoArrays();
+         }
+
+         return this.taxoArrays;
+     }
+ }
+
+ /// <summary>
+ /// Returns the user data of the index commit that the underlying index
+ /// reader exposes.
+ /// </summary>
+ public override IDictionary<string, string> CommitUserData
+ {
+     get
+     {
+         EnsureOpen();
+         return this.indexReader.IndexCommit.UserData;
+     }
+ }
+
+ /// <summary>
+ /// Returns the ordinal of the given category.
+ /// </summary>
+ /// <param name="cp"> the category to look up. </param>
+ /// <returns>
+ /// <c>ROOT_ORDINAL</c> for a label of length zero; the category's ordinal when
+ /// it is found in the cache or in the index; otherwise
+ /// <c>TaxonomyReader.INVALID_ORDINAL</c>.
+ /// </returns>
+ public override int GetOrdinal(FacetLabel cp)
+ {
+     EnsureOpen();
+     if (cp.Length == 0)
+     {
+         return ROOT_ORDINAL;
+     }
+
+     // First try to find the answer in the LRU cache:
+     lock (ordinalCache)
+     {
+         IntClass cached = ordinalCache.Get(cp);
+         if (cached != null && cached.IntItem.HasValue)
+         {
+             int cachedOrdinal = cached.IntItem.Value;
+
+             // Since the cache is shared with DTR instances allocated from
+             // DoOpenIfChanged, the ordinal must be one that this DTR instance
+             // recognizes. If it is not, the category was found in the cache
+             // but is unknown to this reader, so there is no need to search
+             // for the path on disk: it would not be found there either.
+             return cachedOrdinal < indexReader.MaxDoc ? cachedOrdinal : TaxonomyReader.INVALID_ORDINAL;
+         }
+     }
+
+     // Cache miss: fetch the value from disk.
+     BytesRef pathTerm = new BytesRef(FacetsConfig.PathToString(cp.Components, cp.Length));
+     DocsEnum docs = MultiFields.GetTermDocsEnum(indexReader, null, Consts.FULL, pathTerm, 0);
+     if (docs == null || docs.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
+     {
+         // Deliberately not cached: we only store the fact that a category
+         // exists, not its inexistence. The caches are shared with new DTR
+         // instances allocated from DoOpenIfChanged, so caching a miss could
+         // wrongly tell a newer generation that a category does not exist.
+         return TaxonomyReader.INVALID_ORDINAL;
+     }
+
+     int ordinal = docs.DocID();
+     lock (ordinalCache)
+     {
+         ordinalCache.Put(cp, new IntClass { IntItem = ordinal });
+     }
+
+     return ordinal;
+ }
+
+ /// <summary>
+ /// Returns the category label stored for the given ordinal.
+ /// </summary>
+ /// <param name="ordinal"> the ordinal to resolve. </param>
+ /// <returns>
+ /// the label, or <c>null</c> when the ordinal is negative or not smaller than
+ /// the index reader's <c>MaxDoc</c>.
+ /// </returns>
+ public override FacetLabel GetPath(int ordinal)
+ {
+     EnsureOpen();
+
+     // Since the cache is shared with DTR instances allocated from
+     // DoOpenIfChanged, we need to ensure that the ordinal is one that this DTR
+     // instance recognizes. Therefore we do this check up front, before we hit
+     // the cache.
+     if (ordinal < 0 || ordinal >= indexReader.MaxDoc)
+     {
+         return null;
+     }
+
+     // TODO: can we use an int-based hash impl, such as IntToObjectMap,
+     // wrapped as LRU?
+     lock (categoryCache)
+     {
+         var cachedLabel = categoryCache.Get(ordinal, false);
+         if (cachedLabel != null)
+         {
+             return cachedLabel;
+         }
+     }
+
+     // Cache miss: rebuild the label from the stored full-path field and
+     // remember it for the next caller.
+     Document storedDoc = indexReader.Document(ordinal);
+     FacetLabel label = new FacetLabel(FacetsConfig.StringToPath(storedDoc.Get(Consts.FULL)));
+     lock (categoryCache)
+     {
+         categoryCache.Put(ordinal, label);
+     }
+
+     return label;
+ }
+
+ /// <summary>
+ /// Returns the <c>NumDocs</c> value of the underlying index reader.
+ /// </summary>
+ public override int Size
+ {
+     get
+     {
+         EnsureOpen();
+         return this.indexReader.NumDocs;
+     }
+ }
+
+ /// <summary>
+ /// Sets the maximum allowed size of each of the caches used by
+ /// <see cref="GetPath(int)"/> and <see cref="GetOrdinal(FacetLabel)"/>.
+ /// <para>
+ /// Currently, if the given size is smaller than the current size of
+ /// a cache, it will not shrink, and rather will be limited to its current
+ /// size.
+ /// </para>
+ /// </summary>
+ /// <value> the new maximum cache size, in number of entries. </value>
+ public virtual int CacheSize
+ {
+     set
+     {
+         EnsureOpen();
+         int newMaxSize = value;
+
+         // Each cache is resized under its own lock, category cache first.
+         lock (this.categoryCache)
+         {
+             this.categoryCache.MaxSize = newMaxSize;
+         }
+         lock (this.ordinalCache)
+         {
+             this.ordinalCache.MaxSize = newMaxSize;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Returns ordinal -> label mapping, up to the provided
+ /// max ordinal or number of ordinals, whichever is
+ /// smaller.
+ /// </summary>
+ /// <param name="max"> upper bound (exclusive) on the ordinals to print. </param>
+ /// <returns>
+ /// one line per ordinal in the form <c>ordinal: label</c>; an ordinal without
+ /// a label is printed as <c>NULL!! </c> and a label of length zero as
+ /// <c>EMPTY STRING!! </c>.
+ /// </returns>
+ public virtual string ToString(int max)
+ {
+     EnsureOpen();
+     StringBuilder sb = new StringBuilder();
+     int upperl = Math.Min(max, indexReader.MaxDoc);
+     for (int i = 0; i < upperl; i++)
+     {
+         // Exceptions from GetPath simply propagate; the former catch block
+         // only re-threw them and declared a variable it never used.
+         FacetLabel category = this.GetPath(i);
+
+         sb.Append(i).Append(": ");
+         if (category == null)
+         {
+             sb.Append("NULL!! \n");
+         }
+         else if (category.Length == 0)
+         {
+             sb.Append("EMPTY STRING!! \n");
+         }
+         else
+         {
+             sb.Append(category.ToString()).Append("\n");
+         }
+     }
+     return sb.ToString();
+ }
+
+ /// <summary>
+ /// Releases this reader by calling <c>Dispose(true)</c>.
+ /// NOTE(review): Dispose(bool) is not declared in the visible part of this
+ /// class; presumably it is inherited from TaxonomyReader - confirm.
+ /// </summary>
+ public void Dispose()
+ {
+ Dispose(true);
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/eea269f3/src/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyWriter.cs b/src/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyWriter.cs
new file mode 100644
index 0000000..63967ee
--- /dev/null
+++ b/src/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyWriter.cs
@@ -0,0 +1,1202 @@
+using System;
+using System.Collections;
+using System.Collections.Concurrent;
+using System.Diagnostics;
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Store;
+using Lucene.Net.Support;
+
+namespace Lucene.Net.Facet.Taxonomy.Directory
+{
+
+ using TokenStream = Lucene.Net.Analysis.TokenStream;
+ using CharTermAttribute = Lucene.Net.Analysis.Tokenattributes.CharTermAttribute;
+ using PositionIncrementAttribute = Lucene.Net.Analysis.Tokenattributes.PositionIncrementAttribute;
+ using Document = Lucene.Net.Documents.Document;
+ using Field = Lucene.Net.Documents.Field;
+ using FieldType = Lucene.Net.Documents.FieldType;
+ using StringField = Lucene.Net.Documents.StringField;
+ using TextField = Lucene.Net.Documents.TextField;
+ using TaxonomyWriterCache = Lucene.Net.Facet.Taxonomy.WriterCache.TaxonomyWriterCache;
+ using Cl2oTaxonomyWriterCache = Lucene.Net.Facet.Taxonomy.WriterCache.Cl2oTaxonomyWriterCache;
+ using LruTaxonomyWriterCache = Lucene.Net.Facet.Taxonomy.WriterCache.LruTaxonomyWriterCache;
+ using AtomicReader = Lucene.Net.Index.AtomicReader;
+ using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
+ using CorruptIndexException = Lucene.Net.Index.CorruptIndexException; // javadocs
+ using DirectoryReader = Lucene.Net.Index.DirectoryReader;
+ using DocsEnum = Lucene.Net.Index.DocsEnum;
+ using IndexReader = Lucene.Net.Index.IndexReader;
+ using IndexWriter = Lucene.Net.Index.IndexWriter;
+ using OpenMode = Lucene.Net.Index.IndexWriterConfig.OpenMode_e;
+ using IndexWriterConfig = Lucene.Net.Index.IndexWriterConfig;
+ using LogByteSizeMergePolicy = Lucene.Net.Index.LogByteSizeMergePolicy;
+ using ReaderManager = Lucene.Net.Index.ReaderManager;
+ using SegmentInfos = Lucene.Net.Index.SegmentInfos;
+ using Terms = Lucene.Net.Index.Terms;
+ using TermsEnum = Lucene.Net.Index.TermsEnum;
+ using TieredMergePolicy = Lucene.Net.Index.TieredMergePolicy;
+ using AlreadyClosedException = Lucene.Net.Store.AlreadyClosedException;
+ using Directory = Lucene.Net.Store.Directory;
+ using LockObtainFailedException = Lucene.Net.Store.LockObtainFailedException; // javadocs
+ using NativeFSLockFactory = Lucene.Net.Store.NativeFSLockFactory;
+ using SimpleFSLockFactory = Lucene.Net.Store.SimpleFSLockFactory;
+ using BytesRef = Lucene.Net.Util.BytesRef;
+ using Version = Lucene.Net.Util.Version;
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// <seealso cref="TaxonomyWriter"/> which uses a <seealso cref="Directory"/> to store the taxonomy
+ /// information on disk, and keeps an additional in-memory cache of some or all
+ /// categories.
+ /// <para>
+ /// In addition to the permanently-stored information in the <seealso cref="Directory"/>,
+ /// efficiency dictates that we also keep an in-memory cache of <B>recently
+ /// seen</B> or <B>all</B> categories, so that we do not need to go back to disk
+ /// for every category addition to see which ordinal this category already has,
+ /// if any. A <seealso cref="TaxonomyWriterCache"/> object determines the specific caching
+ /// algorithm used.
+ /// </para>
+ /// <para>
+ /// This class offers some hooks for extending classes to control the
+ /// <seealso cref="IndexWriter"/> instance that is used. See <seealso cref="#openIndexWriter"/>.
+ ///
+ /// @lucene.experimental
+ /// </para>
+ /// </summary>
+ public class DirectoryTaxonomyWriter : TaxonomyWriter
+ {
+
+ /// <summary>
+ /// Property name of user commit data that contains the index epoch. The epoch
+ /// changes whenever the taxonomy is recreated (i.e. opened with
+ /// <seealso cref="OpenMode#CREATE"/>).
+ /// <para>
+ /// Applications should not use this property in their commit data because it
+ /// will be overridden by this taxonomy writer.
+ /// </para>
+ /// </summary>
+ public const string INDEX_EPOCH = "index.epoch";
+
+ // Directory the taxonomy is stored in, and the index writer opened on it.
+ private readonly Directory dir;
+ private readonly IndexWriter indexWriter;
+ // In-memory category cache; assigned in the constructor.
+ private readonly TaxonomyWriterCache cache;
+ // Incremented by FindCategory whenever the cache cannot answer a lookup.
+ private readonly AtomicInteger cacheMisses = new AtomicInteger(0);
+
+ // Records the taxonomy index epoch, updated on replaceTaxonomy as well.
+ private long indexEpoch;
+
+ // Token stream and the two reusable fields; the fields are created in the constructor.
+ private SinglePositionTokenStream parentStream = new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
+ private Field parentStreamField;
+ private Field fullPathField;
+ // NOTE(review): presumably the number of cache misses tolerated before the
+ // cache is filled from disk, and whether filling is still wanted; the code
+ // reading these two fields is not visible here - confirm.
+ private int cacheMissesUntilFill = 11;
+ private bool shouldFillCache = true;
+
+ // even though lazily initialized, not volatile so that access to it is
+ // faster. we keep a volatile boolean init instead.
+ private ReaderManager readerManager;
+ private volatile bool initializedReaderManager = false;
+ private volatile bool shouldRefreshReaderManager;
+
+ /// <summary>
+ /// We call the cache "complete" if we know that every category in our
+ /// taxonomy is in the cache. When the cache is <B>not</B> complete, and
+ /// we can't find a category in the cache, we still need to look for it
+ /// in the on-disk index; Therefore when the cache is not complete, we
+ /// need to open a "reader" to the taxonomy index.
+ /// The cache becomes incomplete if it was never filled with the existing
+ /// categories, or if a put() to the cache ever returned true (meaning
+ /// that some of the cached data was cleared).
+ /// </summary>
+ private volatile bool cacheIsComplete;
+ // Set to true by DoClose once the index writer has been disposed.
+ private volatile bool isClosed = false;
+ private volatile TaxonomyIndexArrays taxoArrays;
+ // Initialized in the constructor from the index writer's MaxDoc.
+ private volatile int nextID;
+
+ /// <summary>
+ /// Reads the commit user data from a <see cref="Directory"/> by loading its
+ /// <see cref="SegmentInfos"/>.
+ /// </summary>
+ /// <param name="dir"> the directory to read from. </param>
+ /// <returns> the user data of the segment infos that were read. </returns>
+ private static IDictionary<string, string> ReadCommitData(Directory dir)
+ {
+     SegmentInfos segmentInfos = new SegmentInfos();
+     segmentInfos.Read(dir);
+
+     IDictionary<string, string> userData = segmentInfos.UserData;
+     return userData;
+ }
+
+ /// <summary>
+ /// Forcibly unlocks the taxonomy in the named directory by delegating to
+ /// <see cref="IndexWriter.Unlock(Directory)"/>.
+ /// <para>
+ /// Caution: this should only be used by failure recovery code, when it is
+ /// known that no other process nor thread is in fact currently accessing
+ /// this taxonomy.
+ /// </para>
+ /// <para>
+ /// This method is unnecessary if your <see cref="Directory"/> uses a
+ /// <see cref="NativeFSLockFactory"/> instead of the default
+ /// <see cref="SimpleFSLockFactory"/>. When the "native" lock is used, a lock
+ /// does not stay behind forever when the process using it dies.
+ /// </para>
+ /// </summary>
+ /// <param name="directory"> the directory holding the taxonomy to unlock. </param>
+ public static void Unlock(Directory directory)
+ {
+ IndexWriter.Unlock(directory);
+ }
+
+ /// <summary>
+ /// Construct a Taxonomy writer.
+ /// <para>
+ /// If <see cref="OpenIndexWriter(Directory, IndexWriterConfig)"/> returns
+ /// <c>null</c>, construction stops after the epoch and the reusable fields
+ /// have been set up; no cache is assigned and no root category is added.
+ /// </para>
+ /// </summary>
+ /// <param name="directory">
+ ///    The <see cref="Directory"/> in which to store the taxonomy. Note that
+ ///    the taxonomy is written directly to that directory (not to a
+ ///    subdirectory of it). </param>
+ /// <param name="openMode">
+ ///    Specifies how to open a taxonomy for writing: <code>APPEND</code>
+ ///    means open an existing index for append (failing if the index does
+ ///    not yet exist). <code>CREATE</code> means create a new index (first
+ ///    deleting the old one if it already existed).
+ ///    <code>APPEND_OR_CREATE</code> appends to an existing index if there
+ ///    is one, otherwise it creates a new index. </param>
+ /// <param name="cache">
+ ///    A <see cref="TaxonomyWriterCache"/> implementation which determines
+ ///    the in-memory caching policy. See for example
+ ///    <see cref="LruTaxonomyWriterCache"/> and <see cref="Cl2oTaxonomyWriterCache"/>.
+ ///    If null, <see cref="DefaultTaxonomyWriterCache()"/> is used. </param>
+ /// <exception cref="CorruptIndexException">
+ ///     if the taxonomy is corrupted. </exception>
+ /// <exception cref="LockObtainFailedException">
+ ///     if the taxonomy is locked by another writer. If it is known
+ ///     that no other concurrent writer is active, the lock might
+ ///     have been left around by an old dead process, and should be
+ ///     removed using <see cref="Unlock(Directory)"/>. </exception>
+ /// <exception cref="IOException">
+ ///     if another error occurred. </exception>
+ public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode, TaxonomyWriterCache cache)
+ {
+     dir = directory;
+     IndexWriterConfig config = CreateIndexWriterConfig(openMode);
+     indexWriter = OpenIndexWriter(dir, config);
+
+     // verify (to some extent) that merge policy in effect would preserve category docids
+     if (indexWriter != null)
+     {
+         Debug.Assert(!(indexWriter.Config.MergePolicy is TieredMergePolicy), "for preserving category docids, merging none-adjacent segments is not allowed");
+     }
+
+     // after we opened the writer, and the index is locked, it's safe to check
+     // the commit data and read the index epoch
+     openMode = config.OpenMode ?? OpenMode.CREATE_OR_APPEND;
+
+     // epochStr stays null when there is no index, no commit data, or no
+     // epoch entry in the commit data.
+     string epochStr = null;
+     if (DirectoryReader.IndexExists(directory))
+     {
+         IDictionary<string, string> commitData = ReadCommitData(directory);
+         if (commitData != null)
+         {
+             // single lookup; leaves epochStr null when the key is absent
+             commitData.TryGetValue(INDEX_EPOCH, out epochStr);
+         }
+     }
+
+     // no commit data, or no epoch in it means an old taxonomy, so set its epoch to 1, for lack
+     // of a better value. The stored epoch is parsed as a hexadecimal number.
+     indexEpoch = epochStr == null ? 1 : Convert.ToInt64(epochStr, 16);
+
+     if (openMode == OpenMode.CREATE)
+     {
+         ++indexEpoch;
+     }
+
+     FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+     ft.OmitNorms = true;
+     parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
+     fullPathField = new StringField(Consts.FULL, "", Field.Store.YES);
+
+     if (indexWriter == null)
+     {
+         return;
+     }
+
+     nextID = indexWriter.MaxDoc;
+
+     if (cache == null)
+     {
+         cache = DefaultTaxonomyWriterCache();
+     }
+     this.cache = cache;
+
+     if (nextID == 0)
+     {
+         cacheIsComplete = true;
+         // Make sure that the taxonomy always contain the root category
+         // with category id 0.
+         AddCategory(new FacetLabel());
+     }
+     else
+     {
+         // There are some categories on the disk, which we have not yet
+         // read into the cache, and therefore the cache is incomplete.
+         // We choose not to read all the categories into the cache now,
+         // to avoid terrible performance when a taxonomy index is opened
+         // to add just a single category. We will do it later, after we
+         // notice a few cache misses.
+         cacheIsComplete = false;
+     }
+ }
+
+ /// <summary>
+ /// Open internal index writer, which contains the taxonomy data.
+ /// <para>
+ /// Extensions may provide their own <see cref="IndexWriter"/> implementation or instance.
+ /// </para>
+ /// <para>
+ /// <b>NOTE:</b> the instance this method returns will be closed upon calling
+ /// <see cref="Dispose()"/>.
+ /// </para>
+ /// <para>
+ /// <b>NOTE:</b> the merge policy in effect must not merge none adjacent segments. See
+ /// comment in <see cref="CreateIndexWriterConfig(OpenMode)"/> for the logic behind this.
+ /// </para>
+ /// </summary>
+ /// <seealso cref="CreateIndexWriterConfig(OpenMode)"/>
+ /// <param name="directory">
+ ///          the <see cref="Directory"/> on top of which an <see cref="IndexWriter"/>
+ ///          should be opened. </param>
+ /// <param name="config">
+ ///          configuration for the internal index writer. </param>
+ protected virtual IndexWriter OpenIndexWriter(Directory directory, IndexWriterConfig config)
+ {
+     IndexWriter writer = new IndexWriter(directory, config);
+     return writer;
+ }
+
+ /// <summary>
+ /// Create the <see cref="IndexWriterConfig"/> that would be used for opening the internal index writer.
+ /// <para>
+ /// Extensions can configure the <see cref="IndexWriter"/> as they see fit,
+ /// including setting a merge scheduler or an index deletion policy, a
+ /// different RAM size etc.
+ /// </para>
+ /// <para>
+ /// <b>NOTE:</b> internal docids of the configured index must not be altered.
+ /// For that, categories are never deleted from the taxonomy index.
+ /// In addition, merge policy in effect must not merge none adjacent segments.
+ /// </para>
+ /// </summary>
+ /// <seealso cref="OpenIndexWriter(Directory, IndexWriterConfig)"/>
+ /// <param name="openMode"> see <see cref="OpenMode"/> </param>
+ protected virtual IndexWriterConfig CreateIndexWriterConfig(OpenMode openMode)
+ {
+     // TODO: should we use a more optimized Codec, e.g. Pulsing (or write custom)?
+     // The taxonomy has a unique structure, where each term is associated with one document
+
+     // :Post-Release-Update-Version.LUCENE_XY:
+     var baseConfig = new IndexWriterConfig(Version.LUCENE_48, null);
+     var configWithMode = baseConfig.SetOpenMode(openMode);
+
+     // Make sure we use a MergePolicy which always merges adjacent segments and thus
+     // keeps the doc IDs ordered as well (this is crucial for the taxonomy index).
+     return configWithMode.SetMergePolicy(new LogByteSizeMergePolicy());
+ }
+
+ /// <summary>
+ /// Opens a <see cref="ReaderManager"/> from the internal <see cref="IndexWriter"/>,
+ /// unless one has already been opened. The volatile flag is checked once
+ /// without the lock and once more while holding it.
+ /// </summary>
+ private void InitReaderManager()
+ {
+     if (initializedReaderManager)
+     {
+         return;
+     }
+
+     lock (this)
+     {
+         // verify that the taxo-writer hasn't been closed on us.
+         EnsureOpen();
+         if (initializedReaderManager)
+         {
+             return;
+         }
+
+         readerManager = new ReaderManager(indexWriter, false);
+         shouldRefreshReaderManager = false;
+         // set last, so the flag is only visible once the manager is in place
+         initializedReaderManager = true;
+     }
+ }
+
+ /// <summary>
+ /// Creates a new instance with a default cache as defined by
+ /// <see cref="DefaultTaxonomyWriterCache()"/>.
+ /// </summary>
+ /// <param name="directory"> the <see cref="Directory"/> in which to store the taxonomy. </param>
+ /// <param name="openMode"> how to open the taxonomy; defaults to <c>CREATE_OR_APPEND</c>. </param>
+ public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode = OpenMode.CREATE_OR_APPEND)
+ : this(directory, openMode, DefaultTaxonomyWriterCache())
+ {
+ }
+
+ /// <summary>
+ /// Defines the default <see cref="TaxonomyWriterCache"/> to use in constructors
+ /// which do not specify one.
+ /// <para>
+ /// The current default is <see cref="Cl2oTaxonomyWriterCache"/> constructed
+ /// with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
+ /// cached in memory while building it.
+ /// </para>
+ /// </summary>
+ /// <returns> a newly created cache instance. </returns>
+ public static TaxonomyWriterCache DefaultTaxonomyWriterCache()
+ {
+     TaxonomyWriterCache defaultCache = new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
+     return defaultCache;
+ }
+
+ /// <summary>
+ /// Frees used resources as well as closes the underlying <see cref="IndexWriter"/>,
+ /// which commits whatever changes made to it to the underlying
+ /// <see cref="Directory"/>. Does nothing if the writer is already closed.
+ /// </summary>
+ public void Dispose()
+ {
+     lock (this)
+     {
+         if (isClosed)
+         {
+             return;
+         }
+
+         // commit first, then close the writer and the remaining resources
+         Commit();
+         DoClose();
+     }
+ }
+
+ /// <summary>
+ /// Disposes the index writer, marks this writer as closed and then releases
+ /// the remaining resources through <see cref="CloseResources()"/>.
+ /// </summary>
+ private void DoClose()
+ {
+     this.indexWriter.Dispose();
+     this.isClosed = true;
+     this.CloseResources();
+ }
+
+ /// <summary>
+ /// A hook for extending classes to close additional resources that were used.
+ /// The default implementation disposes the <see cref="ReaderManager"/> (if one
+ /// was opened) and closes the <see cref="TaxonomyWriterCache"/> (if one was set).
+ /// <para>
+ /// <b>NOTE:</b> if you override this method, you should include a
+ /// <code>base.CloseResources()</code> call in your implementation.
+ /// </para>
+ /// </summary>
+ protected virtual void CloseResources()
+ {
+     lock (this)
+     {
+         if (initializedReaderManager)
+         {
+             readerManager.Dispose();
+             readerManager = null;
+             initializedReaderManager = false;
+         }
+
+         if (cache == null)
+         {
+             return; // no cache was ever assigned
+         }
+
+         cache.Close();
+     }
+ }
+
+ /// <summary>
+ /// Look up the given category in the cache and/or the on-disk storage,
+ /// returning the category's ordinal, or a negative number in case the
+ /// category does not yet exist in the taxonomy.
+ /// </summary>
+ /// <param name="categoryPath">The category to look up.</param>
+ /// <returns>
+ /// The value returned by the cache when the category is cached or the cache
+ /// is known to be complete; otherwise the document number found in the
+ /// on-disk index, or <c>-1</c> when no segment contains the category.
+ /// </returns>
+ protected virtual int FindCategory(FacetLabel categoryPath)
+ {
+ lock (this)
+ {
+ // If we can find the category in the cache, or we know the cache is
+ // complete, we can return the response directly from it
+ int res = cache.Get(categoryPath);
+ if (res >= 0 || cacheIsComplete)
+ {
+ return res;
+ }
+
+ cacheMisses.IncrementAndGet();
+ // After a few cache misses, it makes sense to read all the categories
+ // from disk and into the cache. The reason not to do this on the first
+ // cache miss (or even when opening the writer) is that it will
+ // significantly slow down the case when a taxonomy is opened just to
+ // add one category. The idea of only spending a long time on reading
+ // after enough time was spent on cache misses is known as an "online
+ // algorithm".
+ PerhapsFillCache();
+ res = cache.Get(categoryPath);
+ if (res >= 0 || cacheIsComplete)
+ {
+ // if after filling the cache from the info on disk, the category is in it
+ // or the cache is complete, return whatever cache.get returned.
+ return res;
+ }
+
+ // if we get here, it means the category is not in the cache, and it is not
+ // complete, and therefore we must look for the category on disk.
+
+ // We need to get an answer from the on-disk index.
+ InitReaderManager();
+
+ // stays -1 unless one of the segments below contains the category term
+ int doc = -1;
+ DirectoryReader reader = readerManager.Acquire();
+ try
+ {
+ BytesRef catTerm = new BytesRef(FacetsConfig.PathToString(categoryPath.Components, categoryPath.Length));
+ TermsEnum termsEnum = null; // reuse
+ DocsEnum docs = null; // reuse
+ foreach (AtomicReaderContext ctx in reader.Leaves)
+ {
+ Terms terms = ctx.AtomicReader.Terms(Consts.FULL);
+ if (terms != null)
+ {
+ termsEnum = terms.Iterator(termsEnum);
+ if (termsEnum.SeekExact(catTerm))
+ {
+ // liveDocs=null because the taxonomy has no deletes
+ docs = termsEnum.Docs(null, docs, 0); // freqs not required
+ // if the term was found, we know it has exactly one document.
+ // DocBase converts the segment-local doc id into a reader-wide one.
+ doc = docs.NextDoc() + ctx.DocBase;
+ break;
+ }
+ }
+ }
+ }
+ finally
+ {
+ // always hand the acquired reader back, even when the lookup throws
+ readerManager.Release(reader);
+ }
+ // NOTE(review): the comparison is strictly greater than zero, so a hit on
+ // document 0 is returned but not cached here - presumably the root is
+ // cached elsewhere; confirm.
+ if (doc > 0)
+ {
+ AddToCache(categoryPath, doc);
+ }
+ return doc;
+ }
+ }
+
+ /// <summary>
+ /// Returns the ordinal of the given category, adding the category to the
+ /// taxonomy first when it is not found.
+ /// </summary>
+ /// <param name="categoryPath">The category to look up or add.</param>
+ /// <returns>The ordinal of the existing or newly added category.</returns>
+ public virtual int AddCategory(FacetLabel categoryPath)
+ {
+     EnsureOpen();
+
+     // check the cache outside the synchronized block. this results in better
+     // concurrency when categories are there.
+     int ordinal = cache.Get(categoryPath);
+     if (ordinal >= 0)
+     {
+         return ordinal;
+     }
+
+     // the category is not in the cache - following code cannot be executed in parallel.
+     lock (this)
+     {
+         ordinal = FindCategory(categoryPath);
+         if (ordinal >= 0)
+         {
+             return ordinal;
+         }
+
+         // This is a new category: insert it into the index and the cache.
+         // InternalAddCategory() also adds any missing ancestors first, so a
+         // parent is always added to the taxonomy before its child.
+         return InternalAddCategory(categoryPath);
+     }
+ }
+
+ /// <summary>
+ /// Adds a new category to the index (and the cache) and returns the ordinal
+ /// assigned to it.
+ /// <para>
+ /// A missing parent is added first, by recursion, so that a parent is
+ /// always added to the taxonomy before its child.
+ /// </para>
+ /// </summary>
+ private int InternalAddCategory(FacetLabel cp)
+ {
+     // Determine the parent's ordinal; it is stored with the category document
+     // by AddCategoryDocument().
+     int parentOrdinal;
+     if (cp.Length <= 0)
+     {
+         // the empty path has no parent
+         parentOrdinal = TaxonomyReader.INVALID_ORDINAL;
+     }
+     else if (cp.Length == 1)
+     {
+         // single-component paths hang directly off the root
+         parentOrdinal = TaxonomyReader.ROOT_ORDINAL;
+     }
+     else
+     {
+         FacetLabel parentPath = cp.Subpath(cp.Length - 1);
+         parentOrdinal = FindCategory(parentPath);
+         if (parentOrdinal < 0)
+         {
+             // parent is not in the taxonomy yet: add it (and its ancestors) first
+             parentOrdinal = InternalAddCategory(parentPath);
+         }
+     }
+
+     return AddCategoryDocument(cp, parentOrdinal);
+ }
+
+ /// <summary>
+ /// Verifies that this instance wasn't closed, or throws
+ /// <seealso cref="AlreadyClosedException"/> if it is.
+ /// </summary>
+ /// <exception cref="AlreadyClosedException">If <c>isClosed</c> is set.</exception>
+ protected internal void EnsureOpen()
+ {
+ if (isClosed)
+ {
+ throw new AlreadyClosedException("The taxonomy writer has already been closed");
+ }
+ }
+
+ /// <summary>
+ /// Writes one category document to the index, assigns it the next ordinal,
+ /// records its parent in the parent array and adds it to the cache.
+ /// Note that the methods calling AddCategoryDocument() are synchronized, so
+ /// this method is effectively synchronized as well.
+ /// </summary>
+ /// <param name="categoryPath">The category being added.</param>
+ /// <param name="parent">The ordinal of the category's parent.</param>
+ /// <returns>The ordinal assigned to the new category.</returns>
+ private int AddCategoryDocument(FacetLabel categoryPath, int parent)
+ {
+ // Before Lucene 2.9, position increments >=0 were supported, so we
+ // added 1 to parent to allow the parent -1 (the parent of the root).
+ // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
+ // no longer enough, since 0 is not encoded consistently either (see
+ // comment in SinglePositionTokenStream). But because we must be
+ // backward-compatible with existing indexes, we can't just fix what
+ // we write here (e.g., to write parent+2), and need to do a workaround
+ // in the reader (which knows that anyway only category 0 has a parent
+ // -1).
+ parentStream.Set(Math.Max(parent + 1, 1));
+ Document d = new Document();
+ d.Add(parentStreamField);
+
+ fullPathField.StringValue = FacetsConfig.PathToString(categoryPath.Components, categoryPath.Length);
+ d.Add(fullPathField);
+
+ // Note that we do not pass an Analyzer here because the fields that are
+ // added to the Document are untokenized or contain their own TokenStream.
+ // Therefore the IndexWriter's Analyzer has no effect.
+ indexWriter.AddDocument(d);
+ // the new category takes the current nextID, which is then advanced
+ int id = nextID++;
+
+ // added a category document, mark that ReaderManager is not up-to-date
+ shouldRefreshReaderManager = true;
+
+ // also add to the parent array
+ taxoArrays = TaxoArrays.Add(id, parent);
+
+ // NOTE: this line must be executed last, or else the cache gets updated
+ // before the parents array (LUCENE-4596)
+ AddToCache(categoryPath, id);
+
+ return id;
+ }
+
+ /// <summary>
+ /// A token stream that emits at most one token per <c>Set(int)</c> call:
+ /// the fixed word given at construction, with the value passed to
+ /// <c>Set(int)</c> as its position increment.
+ /// </summary>
+ private class SinglePositionTokenStream : TokenStream
+ {
+     internal readonly string word;
+     internal int val;
+     internal bool returned;
+     internal ICharTermAttribute termAtt;
+     internal IPositionIncrementAttribute posIncrAtt;
+
+     public SinglePositionTokenStream(string word)
+     {
+         this.word = word;
+         // nothing to emit until Set() is called
+         returned = true;
+         termAtt = AddAttribute<ICharTermAttribute>();
+         posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
+     }
+
+     /// <summary>
+     /// Stores the value to be emitted as the position increment of the next token.
+     /// Note that when TermPositions.nextPosition() is later used to
+     /// retrieve this value, val-1 will be returned, not val.
+     /// <para>
+     /// IMPORTANT NOTE: Before Lucene 2.9, val&gt;=0 were safe (for val==0,
+     /// the retrieved position would be -1). But starting with Lucene 2.9,
+     /// this unfortunately changed, and only val&gt;0 are safe. val=0 can
+     /// still be used, but don't count on the value you retrieve later
+     /// (it could be 0 or -1, depending on circumstances or versions).
+     /// This change is described in Lucene's JIRA: LUCENE-1542.
+     /// </para>
+     /// </summary>
+     public virtual void Set(int val)
+     {
+         this.val = val;
+         returned = false;
+     }
+
+     /// <summary>
+     /// Emits the single pending token, if any.
+     /// </summary>
+     /// <returns><c>true</c> when a token was produced; <c>false</c> when the
+     /// token for the last <c>Set(int)</c> call was already returned.</returns>
+     public override bool IncrementToken()
+     {
+         if (!returned)
+         {
+             ClearAttributes();
+             posIncrAtt.PositionIncrement = val;
+             termAtt.SetEmpty();
+             termAtt.Append(word);
+             returned = true;
+             return true;
+         }
+
+         return false;
+     }
+ }
+
+ // Stores the (category, ordinal) pair in the cache and reacts to evictions.
+ private void AddToCache(FacetLabel categoryPath, int id)
+ {
+     bool evicted = cache.Put(categoryPath, id);
+     if (!evicted)
+     {
+         return;
+     }
+
+     // cache.Put() returning true means the cache was limited in size, became
+     // full, and parts of it had to be evicted. A relatively-new category that
+     // isn't yet visible to our 'reader' may have been evicted, so the reader
+     // must be refreshed now.
+     RefreshReaderManager();
+     cacheIsComplete = false;
+ }
+
+ // Refreshes the reader manager when it exists and has been marked stale.
+ private void RefreshReaderManager()
+ {
+     lock (this)
+     {
+         // this method is synchronized since it cannot happen concurrently with
+         // addCategoryDocument -- when this method returns, we must know that the
+         // reader manager's state is current. also, it sets shouldRefresh to false,
+         // and this cannot overlap with addCatDoc too.
+         // NOTE: since this method is sync'ed, it can call maybeRefresh, instead of
+         // maybeRefreshBlocking. If ever this is changed, make sure to change the
+         // call too.
+         if (!shouldRefreshReaderManager || !initializedReaderManager)
+         {
+             return;
+         }
+
+         readerManager.MaybeRefresh();
+         shouldRefreshReaderManager = false;
+     }
+ }
+
+ /// <summary>
+ /// Commits the underlying index writer, first storing the current taxonomy
+ /// epoch in the commit data when the stored epoch is missing or different.
+ /// </summary>
+ public virtual void Commit()
+ {
+     lock (this)
+     {
+         EnsureOpen();
+
+         // LUCENE-4972: if we always call setCommitData, we create empty commits
+         string storedEpoch = null;
+         indexWriter.CommitData.TryGetValue(INDEX_EPOCH, out storedEpoch);
+         bool epochIsCurrent = storedEpoch != null && Convert.ToInt64(storedEpoch, 16) == indexEpoch;
+         if (!epochIsCurrent)
+         {
+             indexWriter.CommitData = CombinedCommitData(indexWriter.CommitData);
+         }
+
+         indexWriter.Commit();
+     }
+ }
+
+ /// <summary>
+ /// Builds a new dictionary holding the given user data (if any) plus the
+ /// taxonomy epoch, stored under <c>INDEX_EPOCH</c> as a hexadecimal string.
+ /// </summary>
+ private IDictionary<string, string> CombinedCommitData(IDictionary<string, string> commitData)
+ {
+     string epochHex = Convert.ToString(indexEpoch, 16);
+     IDictionary<string, string> combined = new Dictionary<string, string>();
+     if (commitData != null)
+     {
+         combined.PutAll(commitData);
+     }
+     // written after the user data, so the current epoch replaces any copied value
+     combined[INDEX_EPOCH] = epochHex;
+     return combined;
+ }
+
+ /// <summary>
+ /// The commit user data of the underlying index writer, combined with the
+ /// taxonomy epoch. Both accessors pass the data through
+ /// <c>CombinedCommitData</c>, so the epoch entry is always present.
+ /// </summary>
+ public virtual IDictionary<string, string> CommitData
+ {
+     get
+     {
+         IDictionary<string, string> current = indexWriter.CommitData;
+         return CombinedCommitData(current);
+     }
+     set
+     {
+         IDictionary<string, string> withEpoch = CombinedCommitData(value);
+         indexWriter.CommitData = withEpoch;
+     }
+ }
+
+
+ /// <summary>
+ /// Prepares most of the work needed for a two-phase commit.
+ /// See <seealso cref="IndexWriter.PrepareCommit"/>.
+ /// <para>
+ /// As in <c>Commit()</c>, the taxonomy epoch is stored in the commit data
+ /// first when the stored epoch is missing or different.
+ /// </para>
+ /// </summary>
+ /// <exception cref="AlreadyClosedException">If this writer was already closed.</exception>
+ public virtual void PrepareCommit()
+ {
+     lock (this)
+     {
+         EnsureOpen();
+         // LUCENE-4972: if we always call setCommitData, we create empty commits
+         // Use TryGetValue instead of the indexer: the IDictionary indexer throws
+         // KeyNotFoundException for a missing key, which made the "no epoch stored
+         // yet" case fail instead of reaching the null check below.
+         string epochStr = null;
+         indexWriter.CommitData.TryGetValue(INDEX_EPOCH, out epochStr);
+         if (epochStr == null || Convert.ToInt64(epochStr, 16) != indexEpoch)
+         {
+             indexWriter.CommitData = CombinedCommitData(indexWriter.CommitData);
+         }
+         indexWriter.PrepareCommit();
+     }
+ }
+
+ /// <summary>
+ /// Returns <c>nextID</c>, the ordinal that will be assigned to the next added
+ /// category (presumably equal to the number of categories in the taxonomy).
+ /// </summary>
+ /// <exception cref="AlreadyClosedException">If this writer was already closed.</exception>
+ public virtual int Size
+ {
+ get
+ {
+ EnsureOpen();
+ return nextID;
+ }
+ }
+
+ /// <summary>
+ /// Set the number of cache misses before an attempt is made to read the entire
+ /// taxonomy into the in-memory cache.
+ /// <para>
+ /// This taxonomy writer holds an in-memory cache of recently seen categories
+ /// to speed up operation. On each cache-miss, the on-disk index needs to be
+ /// consulted. When an existing taxonomy is opened, a lot of slow disk reads
+ /// like that are needed until the cache is filled, so it is more efficient to
+ /// read the entire taxonomy into memory at once. We do this complete read
+ /// after a certain number (defined by this property) of cache misses.
+ /// </para>
+ /// <para>
+ /// If the number is set to <c>0</c>, the entire taxonomy is read into the
+ /// cache on first use, without fetching individual categories first.
+ /// </para>
+ /// <para>
+ /// NOTE: it is assumed that this property is set immediately after the
+ /// taxonomy writer has been created.
+ /// </para>
+ /// </summary>
+ /// <exception cref="AlreadyClosedException">If this writer was already closed.</exception>
+ public virtual int CacheMissesUntilFill
+ {
+ set
+ {
+ EnsureOpen();
+ cacheMissesUntilFill = value;
+ }
+ }
+
+ // Reads the categories stored in the on-disk index into the cache, once the
+ // number of cache misses has reached cacheMissesUntilFill. The fill is
+ // attempted at most once per shouldFillCache flag.
+ // we need to guarantee that if several threads call this concurrently, only
+ // one executes it, and after it returns, the cache is updated and is either
+ // complete or not.
+ private void PerhapsFillCache()
+ {
+ lock (this)
+ {
+ if (cacheMisses.Get() < cacheMissesUntilFill)
+ {
+ // not enough misses yet to justify reading the whole taxonomy
+ return;
+ }
+
+ if (!shouldFillCache)
+ {
+ // we already filled the cache once, there's no need to re-fill it
+ return;
+ }
+ shouldFillCache = false;
+
+ InitReaderManager();
+
+ // set when the cache fills up before every category was read
+ bool aborted = false;
+ DirectoryReader reader = readerManager.Acquire();
+ try
+ {
+ TermsEnum termsEnum = null;
+ DocsEnum docsEnum = null;
+ foreach (AtomicReaderContext ctx in reader.Leaves)
+ {
+ Terms terms = ctx.AtomicReader.Terms(Consts.FULL);
+ if (terms != null) // cannot really happen, but be on the safe side
+ {
+ termsEnum = terms.Iterator(termsEnum);
+ while (termsEnum.Next() != null)
+ {
+ if (!cache.Full)
+ {
+ BytesRef t = termsEnum.Term();
+ // Since we guarantee uniqueness of categories, each term has exactly
+ // one document. Also, since we do not allow removing categories (and
+ // hence documents), there are no deletions in the index. Therefore, it
+ // is sufficient to call next(), and then doc(), exactly once with no
+ // 'validation' checks.
+ FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(t.Utf8ToString()));
+ docsEnum = termsEnum.Docs(null, docsEnum, DocsEnum.FLAG_NONE);
+ // DocBase converts the segment-local doc id into a reader-wide one
+ bool res = cache.Put(cp, docsEnum.NextDoc() + ctx.DocBase);
+ Debug.Assert(!res, "entries should not have been evicted from the cache");
+ }
+ else
+ {
+ // the cache is full and the next put() will evict entries from it, therefore abort the iteration.
+ aborted = true;
+ break;
+ }
+ }
+ }
+ if (aborted)
+ {
+ // the break above only left the inner loop; stop visiting segments too
+ break;
+ }
+ }
+ }
+ finally
+ {
+ // always hand the acquired reader back, even when the fill throws
+ readerManager.Release(reader);
+ }
+
+ cacheIsComplete = !aborted;
+ if (cacheIsComplete)
+ {
+ // this lock is nested inside the lock (this) taken at the top of the method
+ lock (this)
+ {
+ // everything is in the cache, so no need to keep readerManager open.
+ // this block is executed in a sync block so that it works well with
+ // initReaderManager called in parallel.
+ readerManager.Dispose();
+ readerManager = null;
+ initializedReaderManager = false;
+ }
+ }
+ }
+ }
+
+ // Returns the taxonomy arrays, building them from the current index reader
+ // on first use. The null check is repeated inside lock (this) so that the
+ // arrays are built only once.
+ private TaxonomyIndexArrays TaxoArrays
+ {
+ get
+ {
+ if (taxoArrays == null)
+ {
+ lock (this)
+ {
+ if (taxoArrays == null)
+ {
+ InitReaderManager();
+ DirectoryReader reader = readerManager.Acquire();
+ try
+ {
+ // according to Java Concurrency, this might perform better on some
+ // JVMs, since the object initialization doesn't happen on the
+ // volatile member.
+ // NOTE(review): the comment above is inherited from the Java source;
+ // whether taxoArrays is declared volatile is not visible in this part
+ // of the file - confirm.
+ TaxonomyIndexArrays tmpArrays = new TaxonomyIndexArrays(reader);
+ taxoArrays = tmpArrays;
+ }
+ finally
+ {
+ readerManager.Release(reader);
+ }
+ }
+ }
+ }
+ return taxoArrays;
+ }
+ }
+
+ /// <summary>
+ /// Returns the parent ordinal recorded for the given category ordinal.
+ /// </summary>
+ /// <param name="ordinal">The ordinal of an existing category; must be smaller than <c>nextID</c>.</param>
+ /// <returns>The entry for <paramref name="ordinal"/> in the parents array.</returns>
+ /// <exception cref="AlreadyClosedException">If this writer was already closed.</exception>
+ /// <exception cref="System.IndexOutOfRangeException">If <paramref name="ordinal"/> is not smaller than <c>nextID</c>.</exception>
+ public virtual int GetParent(int ordinal)
+ {
+ EnsureOpen();
+ // Note: the following if() just enforces that a user can never ask
+ // for the parent of a nonexistent category - even if the parent array
+ // was allocated bigger than it really needs to be.
+ if (ordinal >= nextID)
+ {
+ throw new System.IndexOutOfRangeException("requested ordinal is bigger than the largest ordinal in the taxonomy");
+ }
+
+ int[] parents = TaxoArrays.Parents();
+ Debug.Assert(ordinal < parents.Length, "requested ordinal (" + ordinal + "); parents.length (" + parents.Length + ") !");
+ return parents[ordinal];
+ }
+
+ /// <summary>
+ /// Takes the categories from the given taxonomy directory, and adds the
+ /// missing ones to this taxonomy. Additionally, it fills the given
+ /// <seealso cref="OrdinalMap"/> with a mapping from the original ordinal to the new
+ /// ordinal.
+ /// </summary>
+ /// <param name="taxoDir">The directory holding the taxonomy to merge in.</param>
+ /// <param name="map">Receives the size of the source taxonomy, one mapping per
+ /// source category, and finally a call to <c>AddDone()</c>.</param>
+ /// <exception cref="AlreadyClosedException">If this writer was already closed.</exception>
+ public virtual void AddTaxonomy(Directory taxoDir, OrdinalMap map)
+ {
+     EnsureOpen();
+     DirectoryReader r = DirectoryReader.Open(taxoDir);
+     try
+     {
+         map.Size = r.NumDocs;
+         int @base = 0;
+         TermsEnum te = null; // reuse
+         DocsEnum docs = null; // reuse
+         foreach (AtomicReaderContext ctx in r.Leaves)
+         {
+             AtomicReader ar = ctx.AtomicReader;
+             Terms terms = ar.Terms(Consts.FULL);
+             // Terms() is null-checked everywhere else in this class; a segment
+             // without the field simply contributes no categories.
+             if (terms != null)
+             {
+                 te = terms.Iterator(te);
+                 while (te.Next() != null)
+                 {
+                     FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(te.Term().Utf8ToString()));
+                     int ordinal = AddCategory(cp);
+                     docs = te.Docs(null, docs, DocsEnum.FLAG_NONE);
+                     // @base converts the segment-local doc id into the source ordinal
+                     map.AddMapping(docs.NextDoc() + @base, ordinal);
+                 }
+             }
+             @base += ar.MaxDoc; // no deletions, so we're ok
+         }
+         map.AddDone();
+     }
+     finally
+     {
+         r.Dispose();
+     }
+ }
+
+ /// <summary>
+ /// Mapping from old ordinal to new ordinals, used when merging indexes
+ /// with separate taxonomies.
+ /// <para>
+ /// <c>AddTaxonomy()</c> merges another taxonomy into the given taxonomy
+ /// (this). An OrdinalMap is filled for each of the added taxonomies,
+ /// containing the new ordinal (in the merged taxonomy) of each of the
+ /// categories in the old taxonomy.
+ /// </para>
+ /// <para>
+ /// There exist two implementations of OrdinalMap: MemoryOrdinalMap and
+ /// DiskOrdinalMap. As their names suggest, the former keeps the map in
+ /// memory and the latter in a temporary disk file. Because these maps will
+ /// later be needed one by one (to remap the counting lists), not all at the
+ /// same time, it is recommended to put the first taxonomy's map in memory,
+ /// and all the rest on disk (later to be automatically read into memory one
+ /// by one, when needed).
+ /// </para>
+ /// </summary>
+ public interface OrdinalMap
+ {
+ /// <summary>
+ /// Set the size of the map. This MUST be called before AddMapping().
+ /// It is assumed (but not verified) that AddMapping() will then be
+ /// called exactly 'size' times, with different origOrdinals between 0
+ /// and size-1.
+ /// </summary>
+ int Size { set; }
+
+ /// <summary>
+ /// Record a mapping. </summary>
+ /// <param name="origOrdinal">The ordinal in the taxonomy being merged in.</param>
+ /// <param name="newOrdinal">The ordinal of the same category in the merged taxonomy.</param>
+ void AddMapping(int origOrdinal, int newOrdinal);
+
+ /// <summary>
+ /// Call AddDone() to say that all AddMapping() have been done.
+ /// In some implementations this might free some resources.
+ /// </summary>
+ void AddDone();
+
+ /// <summary>
+ /// Return the map from the taxonomy's original (consecutive) ordinals
+ /// to the new taxonomy's ordinals. If the map has to be read from disk
+ /// and ordered appropriately, it is done when Map is read.
+ /// Map should only be read once, and only when the map is actually
+ /// needed. Reading it will also free all resources that the map might
+ /// be holding (such as temporary disk space), other than the returned int[].
+ /// </summary>
+ int[] Map { get; }
+ }
+
+ /// <summary>
+ /// <seealso cref="OrdinalMap"/> maintained in memory
+ /// </summary>
+ public sealed class MemoryOrdinalMap : OrdinalMap
+ {
+     internal int[] map;
+
+     /// <summary>
+     /// Sole constructor. The map starts out empty.
+     /// </summary>
+     public MemoryOrdinalMap()
+     {
+         map = new int[] { };
+     }
+
+     /// <summary>
+     /// Allocates the map for the given number of ordinals, discarding any
+     /// mappings recorded earlier. Setting it before <c>AddMapping()</c> lets
+     /// every mapping be stored without re-allocating the array.
+     /// </summary>
+     /// <exception cref="ArgumentOutOfRangeException">If the value is negative.</exception>
+     public int Size
+     {
+         set
+         {
+             if (value < 0)
+             {
+                 throw new ArgumentOutOfRangeException("value", "size must not be negative");
+             }
+             map = new int[value];
+         }
+     }
+
+     /// <summary>
+     /// Records that <paramref name="origOrdinal"/> maps to <paramref name="newOrdinal"/>.
+     /// </summary>
+     public void AddMapping(int origOrdinal, int newOrdinal)
+     {
+         if (origOrdinal >= map.Length)
+         {
+             // Fallback for callers that did not set Size (or set it too small):
+             // grow just enough to hold this ordinal.
+             Array.Resize(ref map, origOrdinal + 1);
+         }
+         map[origOrdinal] = newOrdinal;
+     }
+
+     /// <summary>
+     /// Nothing to do for the in-memory implementation.
+     /// </summary>
+     public void AddDone()
+     {
+     }
+
+     /// <summary>
+     /// Returns the backing array itself (not a copy).
+     /// </summary>
+     public int[] Map
+     {
+         get
+         {
+             return map;
+         }
+     }
+ }
+
+ /// <summary>
+ /// <seealso cref="OrdinalMap"/> maintained on file system
+ /// </summary>
+ public sealed class DiskOrdinalMap : OrdinalMap
+ {
+     internal string tmpfile;
+     internal OutputStreamDataOutput @out;
+
+     /// <summary>
+     /// Sole constructor. Opens <paramref name="tmpfile"/> for writing.
+     /// </summary>
+     /// <param name="tmpfile">Path of the temporary file that holds the mappings;
+     /// it is deleted once <c>Map</c> has been read.</param>
+     public DiskOrdinalMap(string tmpfile)
+     {
+         this.tmpfile = tmpfile;
+         // FileMode.Create truncates a pre-existing file, so stale bytes left
+         // by an earlier run can never follow the data written here.
+         var outfs = new FileStream(tmpfile, FileMode.Create, FileAccess.Write);
+         @out = new OutputStreamDataOutput(outfs);
+     }
+
+     /// <summary>
+     /// Appends the (origOrdinal, newOrdinal) pair to the temporary file.
+     /// </summary>
+     public void AddMapping(int origOrdinal, int newOrdinal)
+     {
+         @out.WriteInt(origOrdinal);
+         @out.WriteInt(newOrdinal);
+     }
+
+     /// <summary>
+     /// Writes the size to the temporary file; <c>Map</c> reads it back as the
+     /// first value, so it must be set before any mapping is added.
+     /// </summary>
+     public int Size
+     {
+         set
+         {
+             @out.WriteInt(value);
+         }
+     }
+
+     /// <summary>
+     /// Closes the output, if it is still open. Safe to call more than once.
+     /// </summary>
+     public void AddDone()
+     {
+         if (@out != null)
+         {
+             @out.Dispose();
+             @out = null;
+         }
+     }
+
+     int[] map = null;
+
+     /// <summary>
+     /// Reads the mappings back from the temporary file, deletes the file and
+     /// returns the map. The result is kept, so later reads return the same array.
+     /// </summary>
+     public int[] Map
+     {
+         get
+         {
+             if (map != null)
+             {
+                 return map;
+             }
+             AddDone(); // in case this wasn't previously called
+
+             // FileMode.Open: a missing file is an error and must not be
+             // silently replaced by a newly created empty one.
+             var ifs = new FileStream(tmpfile, FileMode.Open, FileAccess.Read);
+             var @in = new InputStreamDataInput(ifs);
+             int[] loaded;
+             try
+             {
+                 loaded = new int[@in.ReadInt()];
+                 // NOTE: The current code assumes here that the map is complete,
+                 // i.e., every ordinal gets one and exactly one value. Otherwise,
+                 // we may run into an EOF here, or vice versa, not read everything.
+                 for (int i = 0; i < loaded.Length; i++)
+                 {
+                     int origordinal = @in.ReadInt();
+                     int newordinal = @in.ReadInt();
+                     loaded[origordinal] = newordinal;
+                 }
+             }
+             finally
+             {
+                 // release the file handle even when reading fails
+                 @in.Dispose();
+             }
+
+             // Publish only a fully read map, so a failed read is never cached.
+             map = loaded;
+
+             // Delete the temporary file, which is no longer needed.
+             if (File.Exists(tmpfile))
+             {
+                 File.Delete(tmpfile);
+             }
+             return map;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Rollback changes to the taxonomy writer and closes the instance. Following
+ /// this method the instance becomes unusable (calling any of its API methods
+ /// will yield an <seealso cref="AlreadyClosedException"/>).
+ /// </summary>
+ /// <exception cref="AlreadyClosedException">If this writer was already closed.</exception>
+ public virtual void Rollback()
+ {
+ lock (this)
+ {
+ EnsureOpen();
+ // roll the index writer back first, then close without committing
+ indexWriter.Rollback();
+ DoClose();
+ }
+ }
+
+ /// <summary>
+ /// Replaces the current taxonomy with the given one. This method should
+ /// generally be called in conjunction with
+ /// <c>IndexWriter.AddIndexes(Directory[])</c> to replace both the taxonomy
+ /// as well as the search index content.
+ /// </summary>
+ /// <param name="taxoDir">The directory holding the taxonomy that replaces the current one.</param>
+ public virtual void ReplaceTaxonomy(Directory taxoDir)
+ {
+ lock (this)
+ {
+ // replace the taxonomy by doing IW optimized operations
+ indexWriter.DeleteAll();
+ indexWriter.AddIndexes(taxoDir);
+ shouldRefreshReaderManager = true;
+ InitReaderManager(); // ensure that it's initialized
+ RefreshReaderManager();
+ // ordinals continue after the documents that were just added
+ nextID = indexWriter.MaxDoc;
+ taxoArrays = null; // must nullify so that it's re-computed next time it's needed
+
+ // need to clear the cache, so that addCategory won't accidentally return
+ // old categories that are in the cache.
+ cache.Clear();
+ cacheIsComplete = false;
+ shouldFillCache = true;
+ cacheMisses.Set(0);
+
+ // update indexEpoch as a taxonomy replace is just like it has been recreated
+ ++indexEpoch;
+ }
+ }
+
+ /// <summary>
+ /// Returns the <seealso cref="Directory"/> of this taxonomy writer (the <c>dir</c> field).
+ /// </summary>
+ public virtual Directory Directory
+ {
+ get
+ {
+ return dir;
+ }
+ }
+
+ /// <summary>
+ /// Used by <seealso cref="DirectoryTaxonomyReader"/> to support NRT.
+ /// Returns the underlying index writer itself, not a copy.
+ /// <para>
+ /// <b>NOTE:</b> you should not use the obtained <seealso cref="IndexWriter"/> in any
+ /// way, other than opening an IndexReader on it, or otherwise, the taxonomy
+ /// index may become corrupt!
+ /// </para>
+ /// </summary>
+ internal IndexWriter InternalIndexWriter
+ {
+ get
+ {
+ return indexWriter;
+ }
+ }
+
+ /// <summary>
+ /// Expert: returns current index epoch, if this is a
+ /// near-real-time reader. Used by
+ /// <see cref="DirectoryTaxonomyReader"/> to support NRT.
+ ///
+ /// @lucene.internal
+ /// </summary>
+ public long TaxonomyEpoch
+ {
+ get
+ {
+ return indexEpoch;
+ }
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/eea269f3/src/Lucene.Net.Facet/Taxonomy/Directory/TaxonomyIndexArrays.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Facet/Taxonomy/Directory/TaxonomyIndexArrays.cs b/src/Lucene.Net.Facet/Taxonomy/Directory/TaxonomyIndexArrays.cs
new file mode 100644
index 0000000..9a99f4a
--- /dev/null
+++ b/src/Lucene.Net.Facet/Taxonomy/Directory/TaxonomyIndexArrays.cs
@@ -0,0 +1,252 @@
+using System;
+using System.Diagnostics;
+
+namespace Lucene.Net.Facet.Taxonomy.Directory
+{
+
+ using CorruptIndexException = Lucene.Net.Index.CorruptIndexException;
+ using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum;
+ using IndexReader = Lucene.Net.Index.IndexReader;
+ using MultiFields = Lucene.Net.Index.MultiFields;
+ using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
+ using ArrayUtil = Lucene.Net.Util.ArrayUtil;
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// A <seealso cref="ParallelTaxonomyArrays"/> that are initialized from the taxonomy
+ /// index.
+ ///
+ /// @lucene.experimental
+ /// </summary>
+ internal class TaxonomyIndexArrays : ParallelTaxonomyArrays
+ {
+
+ private readonly int[] parents_Renamed;
+
+ // the following two arrays are lazily initialized. note that we only keep a
+ // single boolean member as volatile, instead of declaring the arrays
+ // volatile. the code guarantees that only after the boolean is set to true,
+ // the arrays are returned.
+ private volatile bool initializedChildren = false;
+ private int[] children_Renamed, siblings_Renamed;
+
+ /// <summary>
+ /// Used by <seealso cref="Add(int, int)"/> after the array grew. </summary>
+ private TaxonomyIndexArrays(int[] parents)
+ {
+ this.parents_Renamed = parents;
+ }
+
+ /// <summary>
+ /// Builds the parents array from scratch, with one entry per document of the
+ /// given reader (<c>reader.MaxDoc</c>).
+ /// </summary>
+ public TaxonomyIndexArrays(IndexReader reader)
+ {
+ parents_Renamed = new int[reader.MaxDoc];
+ if (parents_Renamed.Length > 0)
+ {
+ InitParents(reader, 0);
+ // Starting Lucene 2.9, following the change LUCENE-1542, we can
+ // no longer reliably read the parent "-1" (see comment in
+ // DirectoryTaxonomyWriter.SinglePositionTokenStream). We have no way
+ // to fix this in indexing without breaking backward-compatibility
+ // with existing indexes, so what we'll do instead is just
+ // hard-code the parent of ordinal 0 to be -1, and assume (as is
+ // indeed the case) that no other parent can be -1.
+ parents_Renamed[0] = TaxonomyReader.INVALID_ORDINAL;
+ }
+ }
+
+ /// <summary>
+ /// Builds the arrays for the given reader, copying the entries already known
+ /// to <paramref name="copyFrom"/> and reading only the remaining parents from
+ /// the index. Children and siblings are carried over only when
+ /// <paramref name="copyFrom"/> had already initialized them.
+ /// </summary>
+ public TaxonomyIndexArrays(IndexReader reader, TaxonomyIndexArrays copyFrom)
+ {
+ Debug.Assert(copyFrom != null);
+
+ // note that copyParents.length may be equal to reader.maxDoc(). this is not a bug
+ // it may be caused if e.g. the taxonomy segments were merged, and so an updated
+ // NRT reader was obtained, even though nothing was changed. this is not very likely
+ // to happen.
+ int[] copyParents = copyFrom.Parents();
+ this.parents_Renamed = new int[reader.MaxDoc];
+ Array.Copy(copyParents, 0, parents_Renamed, 0, copyParents.Length);
+ InitParents(reader, copyParents.Length);
+
+ if (copyFrom.initializedChildren)
+ {
+ InitChildrenSiblings(copyFrom);
+ }
+ }
+
+ // Allocates and fills the children/siblings arrays once. With a non-null
+ // copyFrom, its arrays are copied and only the entries beyond its parents
+ // array are computed; otherwise everything is computed from the parents.
+ private void InitChildrenSiblings(TaxonomyIndexArrays copyFrom)
+ {
+ lock (this)
+ {
+ if (!initializedChildren) // must do this check !
+ {
+ children_Renamed = new int[parents_Renamed.Length];
+ siblings_Renamed = new int[parents_Renamed.Length];
+ if (copyFrom != null)
+ {
+ // called from the ctor, after we know copyFrom has initialized children/siblings
+ Array.Copy(copyFrom.Children(), 0, children_Renamed, 0, copyFrom.Children().Length);
+ Array.Copy(copyFrom.Siblings(), 0, siblings_Renamed, 0, copyFrom.Siblings().Length);
+ ComputeChildrenSiblings(copyFrom.parents_Renamed.Length);
+ }
+ else
+ {
+ ComputeChildrenSiblings(0);
+ }
+ // set the flag last, once both arrays are fully populated
+ initializedChildren = true;
+ }
+ }
+ }
+
+ // Fills children/siblings for the ordinals from 'first' up to the end of the
+ // parents array. After the second loop, children[p] holds the highest
+ // ordinal whose parent is p, and siblings[i] holds the value children[p]
+ // had just before ordinal i was linked in.
+ private void ComputeChildrenSiblings(int first)
+ {
+ // reset the youngest child of all ordinals. while this should be done only
+ // for the leaves, we don't know up front which are the leaves, so we reset
+ // all of them.
+ for (int i = first; i < parents_Renamed.Length; i++)
+ {
+ children_Renamed[i] = TaxonomyReader.INVALID_ORDINAL;
+ }
+
+ // the root category has no parent, and therefore no siblings
+ if (first == 0)
+ {
+ first = 1;
+ siblings_Renamed[0] = TaxonomyReader.INVALID_ORDINAL;
+ }
+
+ for (int i = first; i < parents_Renamed.Length; i++)
+ {
+ // note that parents[i] is always < i, so the right-hand-side of
+ // the following line is already set when we get here
+ siblings_Renamed[i] = children_Renamed[parents_Renamed[i]];
+ children_Renamed[parents_Renamed[i]] = i;
+ }
+ }
+
+ // Read the parents of the new categories, i.e. the documents from 'first' up
+ // to reader.MaxDoc. Throws CorruptIndexException when a document in that
+ // range has no parent posting.
+ private void InitParents(IndexReader reader, int first)
+ {
+ if (reader.MaxDoc == first)
+ {
+ // nothing new to read
+ return;
+ }
+
+ // it's ok to use MultiFields because we only iterate on one posting list.
+ // breaking it to loop over the leaves() only complicates code for no
+ // apparent gain.
+ DocsAndPositionsEnum positions = MultiFields.GetTermPositionsEnum(reader, null, Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, DocsAndPositionsEnum.FLAG_PAYLOADS);
+
+ // shouldn't really happen, if it does, something's wrong
+ if (positions == null || positions.Advance(first) == DocIdSetIterator.NO_MORE_DOCS)
+ {
+ throw new CorruptIndexException("Missing parent data for category " + first);
+ }
+
+ int num = reader.MaxDoc;
+ for (int i = first; i < num; i++)
+ {
+ if (positions.DocID() == i)
+ {
+ if (positions.Freq() == 0) // shouldn't happen
+ {
+ throw new CorruptIndexException("Missing parent data for category " + i);
+ }
+
+ // the parent ordinal is taken from the position of the posting
+ parents_Renamed[i] = positions.NextPosition();
+
+ if (positions.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
+ {
+ // postings ran out: fine only if this was the last document
+ if (i + 1 < num)
+ {
+ throw new CorruptIndexException("Missing parent data for category " + (i + 1));
+ }
+ break;
+ }
+ } // this shouldn't happen
+ else
+ {
+ throw new CorruptIndexException("Missing parent data for category " + i);
+ }
+ }
+ }
+
+ /// <summary>
+ /// Adds the given ordinal/parent info and returns either a new instance if the
+ /// underlying array had to grow, or this instance otherwise.
+ /// <para>
+ /// <b>NOTE:</b> you should call this method from a thread-safe code.
+ /// </para>
+ /// </summary>
+ internal virtual TaxonomyIndexArrays Add(int ordinal, int parentOrdinal)
+ {
+ if (ordinal >= parents_Renamed.Length)
+ {
+ int[] newarray = ArrayUtil.Grow(parents_Renamed, ordinal + 1);
+ newarray[ordinal] = parentOrdinal;
+ // the new instance is built from the parents only; its children and
+ // siblings arrays start out uninitialized
+ return new TaxonomyIndexArrays(newarray);
+ }
+ parents_Renamed[ordinal] = parentOrdinal;
+ return this;
+ }
+
+ /// <summary>
+ /// Returns the parents array, where <c>parents[i]</c> denotes the parent of
+ /// category ordinal <c>i</c>.
+ /// </summary>
+ public override int[] Parents()
+ {
+ return parents_Renamed;
+ }
+
+ /// <summary>
+ /// Returns the children array, where <c>children[i]</c> denotes the youngest
+ /// child of category ordinal <c>i</c>. The youngest child is defined as the
+ /// category that was added last to the taxonomy as an immediate child of
+ /// <c>i</c>.
+ /// </summary>
+ public override int[] Children()
+ {
+ if (!initializedChildren)
+ {
+ InitChildrenSiblings(null);
+ }
+
+ // the array is guaranteed to be populated
+ return children_Renamed;
+ }
+
+ /// <summary>
+ /// Returns the siblings array, where <c>siblings[i]</c> denotes the sibling
+ /// of category ordinal <c>i</c>. The sibling is defined as the previous
+ /// youngest child of <c>parents[i]</c>.
+ /// </summary>
+ public override int[] Siblings()
+ {
+ if (!initializedChildren)
+ {
+ InitChildrenSiblings(null);
+ }
+
+ // the array is guaranteed to be populated
+ return siblings_Renamed;
+ }
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/eea269f3/src/Lucene.Net.Facet/Taxonomy/DocValuesOrdinalsReader.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Facet/Taxonomy/DocValuesOrdinalsReader.cs b/src/Lucene.Net.Facet/Taxonomy/DocValuesOrdinalsReader.cs
new file mode 100644
index 0000000..3d50275
--- /dev/null
+++ b/src/Lucene.Net.Facet/Taxonomy/DocValuesOrdinalsReader.cs
@@ -0,0 +1,130 @@
+namespace Lucene.Net.Facet.Taxonomy
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
+ using BinaryDocValues = Lucene.Net.Index.BinaryDocValues;
+ using DocValues = Lucene.Net.Index.DocValues;
+ using ArrayUtil = Lucene.Net.Util.ArrayUtil;
+ using BytesRef = Lucene.Net.Util.BytesRef;
+ using IntsRef = Lucene.Net.Util.IntsRef;
+
+    /// <summary>
+    /// Decodes ordinals previously indexed into a <see cref="BinaryDocValues"/> field.
+    /// </summary>
+    public class DocValuesOrdinalsReader : OrdinalsReader
+    {
+        private readonly string field;
+
+        /// <summary>
+        /// Creates a reader over the default index field,
+        /// <see cref="FacetsConfig.DEFAULT_INDEX_FIELD_NAME"/>.
+        /// </summary>
+        public DocValuesOrdinalsReader()
+            : this(FacetsConfig.DEFAULT_INDEX_FIELD_NAME)
+        {
+        }
+
+        /// <summary>
+        /// Creates a reader over the specified indexed field name.
+        /// </summary>
+        /// <param name="field">Name of the binary doc-values field to read ordinals from.</param>
+        public DocValuesOrdinalsReader(string field)
+        {
+            this.field = field;
+        }
+
+        /// <summary>
+        /// Returns a reader that decodes the ordinals stored for each document of
+        /// the given segment.
+        /// </summary>
+        /// <param name="context">The segment to read from.</param>
+        /// <returns>A segment reader backed by the segment's binary doc values for this field.</returns>
+        public override OrdinalsSegmentReader GetReader(AtomicReaderContext context)
+        {
+            // A segment that has no values for the field is served from the shared
+            // empty instance, so the returned reader never holds a null source.
+            BinaryDocValues values = context.AtomicReader.GetBinaryDocValues(field) ?? DocValues.EMPTY_BINARY;
+
+            return new OrdinalsSegmentReaderAnonymousInnerClassHelper(this, values);
+        }
+
+        /// <summary>
+        /// Per-segment reader: fetches a document's raw bytes and delegates the
+        /// decoding to the owning <see cref="DocValuesOrdinalsReader"/>, so that
+        /// subclasses overriding <c>Decode</c> are honored.
+        /// </summary>
+        private class OrdinalsSegmentReaderAnonymousInnerClassHelper : OrdinalsSegmentReader
+        {
+            private readonly DocValuesOrdinalsReader outerInstance;
+
+            private readonly BinaryDocValues values;
+
+            public OrdinalsSegmentReaderAnonymousInnerClassHelper(DocValuesOrdinalsReader outerInstance, BinaryDocValues values)
+            {
+                this.outerInstance = outerInstance;
+                this.values = values;
+            }
+
+            /// <summary>
+            /// Reads the encoded bytes of <paramref name="docID"/> and decodes
+            /// them into <paramref name="ordinals"/>.
+            /// </summary>
+            public override void Get(int docID, IntsRef ordinals)
+            {
+                // A fresh BytesRef is used on every call, so this reader keeps no
+                // mutable state between calls.
+                BytesRef bytes = new BytesRef();
+                values.Get(docID, bytes);
+                outerInstance.Decode(bytes, ordinals);
+            }
+        }
+
+        /// <summary>
+        /// Name of the indexed field this reader decodes ordinals from.
+        /// </summary>
+        public override string IndexFieldName
+        {
+            get
+            {
+                return field;
+            }
+        }
+
+        /// <summary>
+        /// Decodes the ordinals held in <paramref name="buf"/> into
+        /// <paramref name="ordinals"/>. Subclass and override if you change the encoding.
+        /// <para>
+        /// The encoding read here stores each ordinal as the difference from the
+        /// previously decoded ordinal (the first one relative to 0), written as a
+        /// variable-length integer: 7 payload bits per byte, most significant
+        /// group first, with the high bit set on every byte except the last one
+        /// of a value.
+        /// </para>
+        /// </summary>
+        /// <param name="buf">The encoded bytes; only the range starting at its offset and spanning its length is read.</param>
+        /// <param name="ordinals">
+        /// Receives the decoded ordinals. Its offset is reset to 0, its length is
+        /// set to the number of decoded ordinals, and its backing array is
+        /// replaced by a larger one when it cannot hold them.
+        /// </param>
+        protected virtual void Decode(BytesRef buf, IntsRef ordinals)
+        {
+            // Every ordinal takes at least one byte, so buf.Length is an upper
+            // bound on the number of decoded values. Growing once up front, even
+            // if generously, avoids a capacity check for every decoded value.
+            if (ordinals.Ints.Length < buf.Length)
+            {
+                ordinals.Ints = ArrayUtil.Grow(ordinals.Ints, buf.Length);
+            }
+
+            ordinals.Offset = 0;
+            ordinals.Length = 0;
+
+            // The decoding is deliberately inlined here rather than delegated to
+            // a utility method.
+            int upto = buf.Offset + buf.Length;
+            int value = 0;
+            int offset = buf.Offset;
+            int prev = 0;
+            while (offset < upto)
+            {
+                byte b = buf.Bytes[offset++];
+                if ((sbyte)b >= 0)
+                {
+                    // High bit clear: last byte of the current value. Complete
+                    // the delta and add the previous ordinal to recover this one.
+                    ordinals.Ints[ordinals.Length] = ((value << 7) | b) + prev;
+                    value = 0;
+                    prev = ordinals.Ints[ordinals.Length];
+                    ordinals.Length++;
+                }
+                else
+                {
+                    // High bit set: continuation byte, accumulate its low 7 bits.
+                    value = (value << 7) | (b & 0x7F);
+                }
+            }
+        }
+    }
+
+}
\ No newline at end of file