You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/11/25 19:52:11 UTC
[10/16] lucenenet git commit: Move facets into src folder
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/eea269f3/Lucene.Net.Facet/Taxonomy/Directory/Consts.cs
----------------------------------------------------------------------
diff --git a/Lucene.Net.Facet/Taxonomy/Directory/Consts.cs b/Lucene.Net.Facet/Taxonomy/Directory/Consts.cs
deleted file mode 100644
index 5b69985..0000000
--- a/Lucene.Net.Facet/Taxonomy/Directory/Consts.cs
+++ /dev/null
@@ -1,34 +0,0 @@
-namespace Lucene.Net.Facet.Taxonomy.Directory
-{
-
- using BytesRef = Lucene.Net.Util.BytesRef;
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// @lucene.experimental
- /// </summary>
- internal abstract class Consts
- {
- internal const string FULL = "$full_path$";
- internal const string FIELD_PAYLOADS = "$payloads$";
- internal const string PAYLOAD_PARENT = "p";
- internal static readonly BytesRef PAYLOAD_PARENT_BYTES_REF = new BytesRef(PAYLOAD_PARENT);
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/eea269f3/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyReader.cs
----------------------------------------------------------------------
diff --git a/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyReader.cs b/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyReader.cs
deleted file mode 100644
index a567210..0000000
--- a/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyReader.cs
+++ /dev/null
@@ -1,450 +0,0 @@
-using System;
-using System.Collections.Generic;
-using System.IO;
-using System.Text;
-using Lucene.Net.Store;
-
-namespace Lucene.Net.Facet.Taxonomy.Directory
-{
- using Document = Lucene.Net.Documents.Document;
- using Lucene.Net.Facet.Taxonomy;
- using CorruptIndexException = Lucene.Net.Index.CorruptIndexException; // javadocs
- using DirectoryReader = Lucene.Net.Index.DirectoryReader;
- using DocsEnum = Lucene.Net.Index.DocsEnum;
- using IndexWriter = Lucene.Net.Index.IndexWriter;
- using MultiFields = Lucene.Net.Index.MultiFields;
- using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
- using Directory = Lucene.Net.Store.Directory;
- using BytesRef = Lucene.Net.Util.BytesRef;
- using IOUtils = Lucene.Net.Util.IOUtils;
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// A <seealso cref="TaxonomyReader"/> which retrieves stored taxonomy information from a
- /// <seealso cref="Directory"/>.
- /// <P>
- /// Reading from the on-disk index on every method call is too slow, so this
- /// implementation employs caching: Some methods cache recent requests and their
- /// results, while other methods prefetch all the data into memory and then
- /// provide answers directly from in-memory tables. See the documentation of
- /// individual methods for comments on their performance.
- ///
- /// @lucene.experimental
- /// </summary>
- public class DirectoryTaxonomyReader : TaxonomyReader, IDisposable
- {
-
- public class IntClass
- {
- public int? IntItem { get; set; }
- }
- private const int DEFAULT_CACHE_VALUE = 4000;
-
- private readonly DirectoryTaxonomyWriter taxoWriter;
- private readonly long taxoEpoch; // used in doOpenIfChanged
- private readonly DirectoryReader indexReader;
-
- // TODO: test DoubleBarrelLRUCache and consider using it instead
- private LRUHashMap<FacetLabel, IntClass> ordinalCache;
- private LRUHashMap<int, FacetLabel> categoryCache;
-
- private volatile TaxonomyIndexArrays taxoArrays;
-
- /// <summary>
- /// Called only from <seealso cref="#doOpenIfChanged()"/>. If the taxonomy has been
- /// recreated, you should pass {@code null} as the caches and parent/children
- /// arrays.
- /// </summary>
- internal DirectoryTaxonomyReader(DirectoryReader indexReader, DirectoryTaxonomyWriter taxoWriter, LRUHashMap<FacetLabel, IntClass> ordinalCache, LRUHashMap<int, FacetLabel> categoryCache, TaxonomyIndexArrays taxoArrays)
- {
- this.indexReader = indexReader;
- this.taxoWriter = taxoWriter;
- this.taxoEpoch = taxoWriter == null ? -1 : taxoWriter.TaxonomyEpoch;
-
- // use the same instance of the cache, note the protective code in getOrdinal and getPath
- this.ordinalCache = ordinalCache == null ? new LRUHashMap<FacetLabel, IntClass>(DEFAULT_CACHE_VALUE) : ordinalCache;
- this.categoryCache = categoryCache == null ? new LRUHashMap<int, FacetLabel>(DEFAULT_CACHE_VALUE) : categoryCache;
-
- this.taxoArrays = taxoArrays != null ? new TaxonomyIndexArrays(indexReader, taxoArrays) : null;
- }
-
- /// <summary>
- /// Open for reading a taxonomy stored in a given <seealso cref="Directory"/>.
- /// </summary>
- /// <param name="directory">
- /// The <seealso cref="Directory"/> in which the taxonomy resides. </param>
- /// <exception cref="CorruptIndexException">
- /// if the Taxonomy is corrupt. </exception>
- /// <exception cref="IOException">
- /// if another error occurred. </exception>
- public DirectoryTaxonomyReader(Directory directory)
- {
- indexReader = OpenIndexReader(directory);
- taxoWriter = null;
- taxoEpoch = -1;
-
- // These are the default cache sizes; they can be configured after
- // construction with the cache's setMaxSize() method
-
- ordinalCache = new LRUHashMap<FacetLabel, IntClass>(DEFAULT_CACHE_VALUE);
- categoryCache = new LRUHashMap<int, FacetLabel>(DEFAULT_CACHE_VALUE);
- }
-
- /// <summary>
- /// Opens a <seealso cref="DirectoryTaxonomyReader"/> over the given
- /// <seealso cref="DirectoryTaxonomyWriter"/> (for NRT).
- /// </summary>
- /// <param name="taxoWriter">
- /// The <seealso cref="DirectoryTaxonomyWriter"/> from which to obtain newly
- /// added categories, in real-time. </param>
- public DirectoryTaxonomyReader(DirectoryTaxonomyWriter taxoWriter)
- {
- this.taxoWriter = taxoWriter;
- taxoEpoch = taxoWriter.TaxonomyEpoch;
- indexReader = OpenIndexReader(taxoWriter.InternalIndexWriter);
-
- // These are the default cache sizes; they can be configured after
- // construction with the cache's setMaxSize() method
-
- ordinalCache = new LRUHashMap<FacetLabel, IntClass>(DEFAULT_CACHE_VALUE);
- categoryCache = new LRUHashMap<int, FacetLabel>(DEFAULT_CACHE_VALUE);
- }
-
- private void InitTaxoArrays()
- {
- lock (this)
- {
- if (taxoArrays == null)
- {
- // according to Java Concurrency in Practice, this might perform better on
- // some JVMs, because the array initialization doesn't happen on the
- // volatile member.
- TaxonomyIndexArrays tmpArrays = new TaxonomyIndexArrays(indexReader);
- taxoArrays = tmpArrays;
- }
- }
- }
-
- protected internal override void DoClose()
- {
- indexReader.Dispose();
- taxoArrays = null;
- // do not clear() the caches, as they may be used by other DTR instances.
- ordinalCache = null;
- categoryCache = null;
- }
-
- /// <summary>
- /// Implements the opening of a new <seealso cref="DirectoryTaxonomyReader"/> instance if
- /// the taxonomy has changed.
- ///
- /// <para>
- /// <b>NOTE:</b> the returned <seealso cref="DirectoryTaxonomyReader"/> shares the
- /// ordinal and category caches with this reader. This is not expected to cause
- /// any issues, unless the two instances continue to live. The reader
- /// guarantees that the two instances cannot affect each other in terms of
- /// correctness of the caches, however if the size of the cache is changed
- /// through <seealso cref="#setCacheSize(int)"/>, it will affect both reader instances.
- /// </para>
- /// </summary>
- protected override TaxonomyReader DoOpenIfChanged()
- {
- EnsureOpen();
-
- // This works for both NRT and non-NRT readers (i.e. an NRT reader remains NRT).
- var r2 = DirectoryReader.OpenIfChanged(indexReader);
- if (r2 == null)
- {
- return null; // no changes, nothing to do
- }
-
- // check if the taxonomy was recreated
- bool success = false;
- try
- {
- bool recreated = false;
- if (taxoWriter == null)
- {
- // not NRT, check epoch from commit data
- string t1 = indexReader.IndexCommit.UserData[DirectoryTaxonomyWriter.INDEX_EPOCH];
- string t2 = r2.IndexCommit.UserData[DirectoryTaxonomyWriter.INDEX_EPOCH];
- if (t1 == null)
- {
- if (t2 != null)
- {
- recreated = true;
- }
- }
- else if (!t1.Equals(t2))
- {
- // t1 != null and t2 cannot be null b/c DirTaxoWriter always puts the commit data.
- // it's ok to use String.equals because we require the two epoch values to be the same.
- recreated = true;
- }
- }
- else
- {
- // NRT, compare current taxoWriter.epoch() vs the one that was given at construction
- if (taxoEpoch != taxoWriter.TaxonomyEpoch)
- {
- recreated = true;
- }
- }
-
- DirectoryTaxonomyReader newtr;
- if (recreated)
- {
- // if recreated, do not reuse anything from this instace. the information
- // will be lazily computed by the new instance when needed.
- newtr = new DirectoryTaxonomyReader(r2, taxoWriter, null, null, null);
- }
- else
- {
- newtr = new DirectoryTaxonomyReader(r2, taxoWriter, ordinalCache, categoryCache, taxoArrays);
- }
-
- success = true;
- return newtr;
- }
- finally
- {
- if (!success)
- {
- IOUtils.CloseWhileHandlingException(r2);
- }
- }
- }
-
- /// <summary>
- /// Open the <seealso cref="DirectoryReader"/> from this {@link
- /// Directory}.
- /// </summary>
- protected virtual DirectoryReader OpenIndexReader(Directory directory)
- {
- return DirectoryReader.Open(directory);
- }
-
- /// <summary>
- /// Open the <seealso cref="DirectoryReader"/> from this {@link
- /// IndexWriter}.
- /// </summary>
- protected virtual DirectoryReader OpenIndexReader(IndexWriter writer)
- {
- return DirectoryReader.Open(writer, false);
- }
-
- /// <summary>
- /// Expert: returns the underlying <seealso cref="DirectoryReader"/> instance that is
- /// used by this <seealso cref="TaxonomyReader"/>.
- /// </summary>
- internal virtual DirectoryReader InternalIndexReader
- {
- get
- {
- EnsureOpen();
- return indexReader;
- }
- }
-
- public override ParallelTaxonomyArrays ParallelTaxonomyArrays
- {
- get
- {
- EnsureOpen();
- if (taxoArrays == null)
- {
- InitTaxoArrays();
- }
- return taxoArrays;
- }
- }
-
- public override IDictionary<string, string> CommitUserData
- {
- get
- {
- EnsureOpen();
- return indexReader.IndexCommit.UserData;
- }
- }
-
- public override int GetOrdinal(FacetLabel cp)
- {
- EnsureOpen();
- if (cp.Length == 0)
- {
- return ROOT_ORDINAL;
- }
-
- // First try to find the answer in the LRU cache:
- lock (ordinalCache)
- {
- IntClass res = ordinalCache.Get(cp);
- if (res != null && res.IntItem != null)
- {
- if ((int)res.IntItem.Value < indexReader.MaxDoc)
- {
- // Since the cache is shared with DTR instances allocated from
- // doOpenIfChanged, we need to ensure that the ordinal is one that
- // this DTR instance recognizes.
- return (int)res.IntItem.Value;
- }
- else
- {
- // if we get here, it means that the category was found in the cache,
- // but is not recognized by this TR instance. Therefore there's no
- // need to continue search for the path on disk, because we won't find
- // it there too.
- return TaxonomyReader.INVALID_ORDINAL;
- }
- }
- }
-
- // If we're still here, we have a cache miss. We need to fetch the
- // value from disk, and then also put it in the cache:
- int ret = TaxonomyReader.INVALID_ORDINAL;
- DocsEnum docs = MultiFields.GetTermDocsEnum(indexReader, null, Consts.FULL, new BytesRef(FacetsConfig.PathToString(cp.Components, cp.Length)), 0);
- if (docs != null && docs.NextDoc() != DocIdSetIterator.NO_MORE_DOCS)
- {
- ret = docs.DocID();
-
- // we only store the fact that a category exists, not its inexistence.
- // This is required because the caches are shared with new DTR instances
- // that are allocated from doOpenIfChanged. Therefore, if we only store
- // information about found categories, we cannot accidently tell a new
- // generation of DTR that a category does not exist.
- lock (ordinalCache)
- {
- ordinalCache.Put(cp, new IntClass { IntItem = Convert.ToInt32(ret) });
- }
- }
-
- return ret;
- }
-
- public override FacetLabel GetPath(int ordinal)
- {
- EnsureOpen();
-
- // Since the cache is shared with DTR instances allocated from
- // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
- // instance recognizes. Therefore we do this check up front, before we hit
- // the cache.
- if (ordinal < 0 || ordinal >= indexReader.MaxDoc)
- {
- return null;
- }
-
- // TODO: can we use an int-based hash impl, such as IntToObjectMap,
- // wrapped as LRU?
- int catIDInteger = Convert.ToInt32(ordinal);
- lock (categoryCache)
- {
- var res = categoryCache.Get(catIDInteger,false);
- if (res != null)
- {
- return res;
- }
- }
-
- Document doc = indexReader.Document(ordinal);
- FacetLabel ret = new FacetLabel(FacetsConfig.StringToPath(doc.Get(Consts.FULL)));
- lock (categoryCache)
- {
- categoryCache.Put(catIDInteger, ret);
- }
-
- return ret;
- }
-
- public override int Size
- {
- get
- {
- EnsureOpen();
- return indexReader.NumDocs;
- }
- }
-
- /// <summary>
- /// setCacheSize controls the maximum allowed size of each of the caches
- /// used by <seealso cref="#getPath(int)"/> and <seealso cref="#getOrdinal(FacetLabel)"/>.
- /// <P>
- /// Currently, if the given size is smaller than the current size of
- /// a cache, it will not shrink, and rather we be limited to its current
- /// size. </summary>
- /// <param name="size"> the new maximum cache size, in number of entries. </param>
- public virtual int CacheSize
- {
- set
- {
- EnsureOpen();
- lock (categoryCache)
- {
- categoryCache.MaxSize = value;
- }
- lock (ordinalCache)
- {
- ordinalCache.MaxSize = value;
- }
- }
- }
-
- /// <summary>
- /// Returns ordinal -> label mapping, up to the provided
- /// max ordinal or number of ordinals, whichever is
- /// smaller.
- /// </summary>
- public virtual string ToString(int max)
- {
- EnsureOpen();
- StringBuilder sb = new StringBuilder();
- int upperl = Math.Min(max, indexReader.MaxDoc);
- for (int i = 0; i < upperl; i++)
- {
- try
- {
- FacetLabel category = this.GetPath(i);
- if (category == null)
- {
- sb.Append(i + ": NULL!! \n");
- continue;
- }
- if (category.Length == 0)
- {
- sb.Append(i + ": EMPTY STRING!! \n");
- continue;
- }
- sb.Append(i + ": " + category.ToString() + "\n");
- }
- catch (IOException e)
- {
- throw;
- }
- }
- return sb.ToString();
- }
-
- public void Dispose()
- {
- Dispose(true);
- }
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/eea269f3/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyWriter.cs
----------------------------------------------------------------------
diff --git a/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyWriter.cs b/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyWriter.cs
deleted file mode 100644
index 63967ee..0000000
--- a/Lucene.Net.Facet/Taxonomy/Directory/DirectoryTaxonomyWriter.cs
+++ /dev/null
@@ -1,1202 +0,0 @@
-using System;
-using System.Collections;
-using System.Collections.Concurrent;
-using System.Diagnostics;
-using System.Collections.Generic;
-using System.IO;
-using Lucene.Net.Analysis.Tokenattributes;
-using Lucene.Net.Store;
-using Lucene.Net.Support;
-
-namespace Lucene.Net.Facet.Taxonomy.Directory
-{
-
- using TokenStream = Lucene.Net.Analysis.TokenStream;
- using CharTermAttribute = Lucene.Net.Analysis.Tokenattributes.CharTermAttribute;
- using PositionIncrementAttribute = Lucene.Net.Analysis.Tokenattributes.PositionIncrementAttribute;
- using Document = Lucene.Net.Documents.Document;
- using Field = Lucene.Net.Documents.Field;
- using FieldType = Lucene.Net.Documents.FieldType;
- using StringField = Lucene.Net.Documents.StringField;
- using TextField = Lucene.Net.Documents.TextField;
- using TaxonomyWriterCache = Lucene.Net.Facet.Taxonomy.WriterCache.TaxonomyWriterCache;
- using Cl2oTaxonomyWriterCache = Lucene.Net.Facet.Taxonomy.WriterCache.Cl2oTaxonomyWriterCache;
- using LruTaxonomyWriterCache = Lucene.Net.Facet.Taxonomy.WriterCache.LruTaxonomyWriterCache;
- using AtomicReader = Lucene.Net.Index.AtomicReader;
- using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
- using CorruptIndexException = Lucene.Net.Index.CorruptIndexException; // javadocs
- using DirectoryReader = Lucene.Net.Index.DirectoryReader;
- using DocsEnum = Lucene.Net.Index.DocsEnum;
- using IndexReader = Lucene.Net.Index.IndexReader;
- using IndexWriter = Lucene.Net.Index.IndexWriter;
- using OpenMode = Lucene.Net.Index.IndexWriterConfig.OpenMode_e;
- using IndexWriterConfig = Lucene.Net.Index.IndexWriterConfig;
- using LogByteSizeMergePolicy = Lucene.Net.Index.LogByteSizeMergePolicy;
- using ReaderManager = Lucene.Net.Index.ReaderManager;
- using SegmentInfos = Lucene.Net.Index.SegmentInfos;
- using Terms = Lucene.Net.Index.Terms;
- using TermsEnum = Lucene.Net.Index.TermsEnum;
- using TieredMergePolicy = Lucene.Net.Index.TieredMergePolicy;
- using AlreadyClosedException = Lucene.Net.Store.AlreadyClosedException;
- using Directory = Lucene.Net.Store.Directory;
- using LockObtainFailedException = Lucene.Net.Store.LockObtainFailedException; // javadocs
- using NativeFSLockFactory = Lucene.Net.Store.NativeFSLockFactory;
- using SimpleFSLockFactory = Lucene.Net.Store.SimpleFSLockFactory;
- using BytesRef = Lucene.Net.Util.BytesRef;
- using Version = Lucene.Net.Util.Version;
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// <seealso cref="TaxonomyWriter"/> which uses a <seealso cref="Directory"/> to store the taxonomy
- /// information on disk, and keeps an additional in-memory cache of some or all
- /// categories.
- /// <para>
- /// In addition to the permanently-stored information in the <seealso cref="Directory"/>,
- /// efficiency dictates that we also keep an in-memory cache of <B>recently
- /// seen</B> or <B>all</B> categories, so that we do not need to go back to disk
- /// for every category addition to see which ordinal this category already has,
- /// if any. A <seealso cref="TaxonomyWriterCache"/> object determines the specific caching
- /// algorithm used.
- /// </para>
- /// <para>
- /// This class offers some hooks for extending classes to control the
- /// <seealso cref="IndexWriter"/> instance that is used. See <seealso cref="#openIndexWriter"/>.
- ///
- /// @lucene.experimental
- /// </para>
- /// </summary>
- public class DirectoryTaxonomyWriter : TaxonomyWriter
- {
-
- /// <summary>
- /// Property name of user commit data that contains the index epoch. The epoch
- /// changes whenever the taxonomy is recreated (i.e. opened with
- /// <seealso cref="OpenMode#CREATE"/>.
- /// <para>
- /// Applications should not use this property in their commit data because it
- /// will be overridden by this taxonomy writer.
- /// </para>
- /// </summary>
- public const string INDEX_EPOCH = "index.epoch";
-
- private readonly Directory dir;
- private readonly IndexWriter indexWriter;
- private readonly TaxonomyWriterCache cache;
- private readonly AtomicInteger cacheMisses = new AtomicInteger(0);
-
- // Records the taxonomy index epoch, updated on replaceTaxonomy as well.
- private long indexEpoch;
-
- private SinglePositionTokenStream parentStream = new SinglePositionTokenStream(Consts.PAYLOAD_PARENT);
- private Field parentStreamField;
- private Field fullPathField;
- private int cacheMissesUntilFill = 11;
- private bool shouldFillCache = true;
-
- // even though lazily initialized, not volatile so that access to it is
- // faster. we keep a volatile boolean init instead.
- private ReaderManager readerManager;
- private volatile bool initializedReaderManager = false;
- private volatile bool shouldRefreshReaderManager;
-
- /// <summary>
- /// We call the cache "complete" if we know that every category in our
- /// taxonomy is in the cache. When the cache is <B>not</B> complete, and
- /// we can't find a category in the cache, we still need to look for it
- /// in the on-disk index; Therefore when the cache is not complete, we
- /// need to open a "reader" to the taxonomy index.
- /// The cache becomes incomplete if it was never filled with the existing
- /// categories, or if a put() to the cache ever returned true (meaning
- /// that some of the cached data was cleared).
- /// </summary>
- private volatile bool cacheIsComplete;
- private volatile bool isClosed = false;
- private volatile TaxonomyIndexArrays taxoArrays;
- private volatile int nextID;
-
- /// <summary>
- /// Reads the commit data from a Directory. </summary>
- private static IDictionary<string, string> ReadCommitData(Directory dir)
- {
- SegmentInfos infos = new SegmentInfos();
- infos.Read(dir);
- return infos.UserData;
- }
-
- /// <summary>
- /// Forcibly unlocks the taxonomy in the named directory.
- /// <P>
- /// Caution: this should only be used by failure recovery code, when it is
- /// known that no other process nor thread is in fact currently accessing
- /// this taxonomy.
- /// <P>
- /// This method is unnecessary if your <seealso cref="Directory"/> uses a
- /// <seealso cref="NativeFSLockFactory"/> instead of the default
- /// <seealso cref="SimpleFSLockFactory"/>. When the "native" lock is used, a lock
- /// does not stay behind forever when the process using it dies.
- /// </summary>
- public static void Unlock(Directory directory)
- {
- IndexWriter.Unlock(directory);
- }
-
- /// <summary>
- /// Construct a Taxonomy writer.
- /// </summary>
- /// <param name="directory">
- /// The <seealso cref="Directory"/> in which to store the taxonomy. Note that
- /// the taxonomy is written directly to that directory (not to a
- /// subdirectory of it). </param>
- /// <param name="openMode">
- /// Specifies how to open a taxonomy for writing: <code>APPEND</code>
- /// means open an existing index for append (failing if the index does
- /// not yet exist). <code>CREATE</code> means create a new index (first
- /// deleting the old one if it already existed).
- /// <code>APPEND_OR_CREATE</code> appends to an existing index if there
- /// is one, otherwise it creates a new index. </param>
- /// <param name="cache">
- /// A <seealso cref="TaxonomyWriterCache"/> implementation which determines
- /// the in-memory caching policy. See for example
- /// <seealso cref="LruTaxonomyWriterCache"/> and <seealso cref="Cl2oTaxonomyWriterCache"/>.
- /// If null or missing, <seealso cref="#defaultTaxonomyWriterCache()"/> is used. </param>
- /// <exception cref="CorruptIndexException">
- /// if the taxonomy is corrupted. </exception>
- /// <exception cref="LockObtainFailedException">
- /// if the taxonomy is locked by another writer. If it is known
- /// that no other concurrent writer is active, the lock might
- /// have been left around by an old dead process, and should be
- /// removed using <seealso cref="#unlock(Directory)"/>. </exception>
- /// <exception cref="IOException">
- /// if another error occurred. </exception>
- public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode, TaxonomyWriterCache cache)
- {
-
- dir = directory;
- IndexWriterConfig config = CreateIndexWriterConfig(openMode);
- indexWriter = OpenIndexWriter(dir, config);
-
- // verify (to some extent) that merge policy in effect would preserve category docids
- if (indexWriter != null)
- {
- Debug.Assert(!(indexWriter.Config.MergePolicy is TieredMergePolicy), "for preserving category docids, merging none-adjacent segments is not allowed");
- }
-
- // after we opened the writer, and the index is locked, it's safe to check
- // the commit data and read the index epoch
- openMode = config.OpenMode.HasValue ? config.OpenMode.Value : OpenMode.CREATE_OR_APPEND;
- if (!DirectoryReader.IndexExists(directory))
- {
- indexEpoch = 1;
- }
- else
- {
- string epochStr = null;
- IDictionary<string, string> commitData = ReadCommitData(directory);
- if (commitData != null && commitData.ContainsKey(INDEX_EPOCH))
- {
- epochStr = commitData[INDEX_EPOCH];
- }
- // no commit data, or no epoch in it means an old taxonomy, so set its epoch to 1, for lack
- // of a better value.
- indexEpoch = epochStr == null ? 1 : Convert.ToInt64(epochStr, 16);
- }
-
- if (openMode == OpenMode.CREATE)
- {
- ++indexEpoch;
- }
-
- FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
- ft.OmitNorms = true;
- parentStreamField = new Field(Consts.FIELD_PAYLOADS, parentStream, ft);
- fullPathField = new StringField(Consts.FULL, "", Field.Store.YES);
-
- if (indexWriter == null)
- return;
-
- nextID = indexWriter.MaxDoc;
-
- if (cache == null)
- {
- cache = DefaultTaxonomyWriterCache();
- }
- this.cache = cache;
-
- if (nextID == 0)
- {
- cacheIsComplete = true;
- // Make sure that the taxonomy always contain the root category
- // with category id 0.
- AddCategory(new FacetLabel());
- }
- else
- {
- // There are some categories on the disk, which we have not yet
- // read into the cache, and therefore the cache is incomplete.
- // We choose not to read all the categories into the cache now,
- // to avoid terrible performance when a taxonomy index is opened
- // to add just a single category. We will do it later, after we
- // notice a few cache misses.
- cacheIsComplete = false;
- }
- }
-
- /// <summary>
- /// Open internal index writer, which contains the taxonomy data.
- /// <para>
- /// Extensions may provide their own <seealso cref="IndexWriter"/> implementation or instance.
- /// <br><b>NOTE:</b> the instance this method returns will be closed upon calling
- /// to <seealso cref="#close()"/>.
- /// <br><b>NOTE:</b> the merge policy in effect must not merge none adjacent segments. See
- /// comment in <seealso cref="#createIndexWriterConfig(IndexWriterConfig.OpenMode)"/> for the logic behind this.
- ///
- /// </para>
- /// </summary>
- /// <seealso cref= #createIndexWriterConfig(IndexWriterConfig.OpenMode)
- /// </seealso>
- /// <param name="directory">
- /// the <seealso cref="Directory"/> on top of which an <seealso cref="IndexWriter"/>
- /// should be opened. </param>
- /// <param name="config">
- /// configuration for the internal index writer. </param>
- protected virtual IndexWriter OpenIndexWriter(Directory directory, IndexWriterConfig config)
- {
- return new IndexWriter(directory, config);
- }
-
- /// <summary>
- /// Create the <seealso cref="IndexWriterConfig"/> that would be used for opening the internal index writer.
- /// <br>Extensions can configure the <seealso cref="IndexWriter"/> as they see fit,
- /// including setting a <seealso cref="Lucene.Net.index.MergeScheduler merge-scheduler"/>, or
- /// <seealso cref="Lucene.Net.index.IndexDeletionPolicy deletion-policy"/>, different RAM size
- /// etc.<br>
- /// <br><b>NOTE:</b> internal docids of the configured index must not be altered.
- /// For that, categories are never deleted from the taxonomy index.
- /// In addition, merge policy in effect must not merge none adjacent segments.
- /// </summary>
- /// <seealso cref= #openIndexWriter(Directory, IndexWriterConfig)
- /// </seealso>
- /// <param name="openMode"> see <seealso cref="OpenMode"/> </param>
- protected virtual IndexWriterConfig CreateIndexWriterConfig(OpenMode openMode)
- {
- // TODO: should we use a more optimized Codec, e.g. Pulsing (or write custom)?
- // The taxonomy has a unique structure, where each term is associated with one document
-
- // :Post-Release-Update-Version.LUCENE_XY:
- // Make sure we use a MergePolicy which always merges adjacent segments and thus
- // keeps the doc IDs ordered as well (this is crucial for the taxonomy index).
- return (new IndexWriterConfig(Version.LUCENE_48, null)).SetOpenMode(openMode).SetMergePolicy(new LogByteSizeMergePolicy());
- }
-
- /// <summary>
- /// Opens a <seealso cref="ReaderManager"/> from the internal <seealso cref="IndexWriter"/>.
- /// </summary>
- private void InitReaderManager()
- {
- if (!initializedReaderManager)
- {
- lock (this)
- {
- // verify that the taxo-writer hasn't been closed on us.
- EnsureOpen();
- if (!initializedReaderManager)
- {
- readerManager = new ReaderManager(indexWriter, false);
- shouldRefreshReaderManager = false;
- initializedReaderManager = true;
- }
- }
- }
- }
-
- /// <summary>
- /// Creates a new instance with a default cache as defined by
- /// <seealso cref="#defaultTaxonomyWriterCache()"/>.
- /// </summary>
- public DirectoryTaxonomyWriter(Directory directory, OpenMode openMode = OpenMode.CREATE_OR_APPEND)
- : this(directory, openMode, DefaultTaxonomyWriterCache())
- {
- }
-
- /// <summary>
- /// Defines the default <seealso cref="TaxonomyWriterCache"/> to use in constructors
- /// which do not specify one.
- /// <P>
- /// The current default is <seealso cref="Cl2oTaxonomyWriterCache"/> constructed
- /// with the parameters (1024, 0.15f, 3), i.e., the entire taxonomy is
- /// cached in memory while building it.
- /// </summary>
- public static TaxonomyWriterCache DefaultTaxonomyWriterCache()
- {
- return new Cl2oTaxonomyWriterCache(1024, 0.15f, 3);
- }
-
- /// <summary>
- /// Frees used resources as well as closes the underlying <seealso cref="IndexWriter"/>,
- /// which commits whatever changes made to it to the underlying
- /// <seealso cref="Directory"/>.
- /// </summary>
- public void Dispose()
- {
- lock (this)
- {
- if (!isClosed)
- {
- Commit();
- DoClose();
- }
- }
- }
-
- private void DoClose()
- {
- indexWriter.Dispose();
- isClosed = true;
- CloseResources();
- }
-
- /// <summary>
- /// A hook for extending classes to close additional resources that were used.
- /// The default implementation disposes the reader manager (if one was initialized) and
- /// closes the <see cref="TaxonomyWriterCache"/> (if one is set). <br/>
- /// <b>NOTE:</b> if you override this method, you should include a
- /// <c>base.CloseResources()</c> call in your implementation.
- /// </summary>
- protected virtual void CloseResources()
- {
- lock (this)
- {
- if (initializedReaderManager)
- {
- readerManager.Dispose();
- readerManager = null;
- initializedReaderManager = false;
- }
- if (cache != null)
- {
- cache.Close();
- }
- }
- }
-
- /// <summary>
- /// Looks up the given category in the cache and, if needed, in the on-disk index,
- /// returning the category's ordinal, or a negative number in case the
- /// category does not yet exist in the taxonomy.
- /// </summary>
- protected virtual int FindCategory(FacetLabel categoryPath)
- {
- lock (this)
- {
- // If we can find the category in the cache, or we know the cache is
- // complete, we can return the response directly from it
- int res = cache.Get(categoryPath);
- if (res >= 0 || cacheIsComplete)
- {
- return res;
- }
-
- cacheMisses.IncrementAndGet();
- // After a few cache misses, it makes sense to read all the categories
- // from disk and into the cache. The reason not to do this on the first
- // cache miss (or even when opening the writer) is that it will
- // significantly slow down the case when a taxonomy is opened just to
- // add one category. The idea of only spending a long time on reading
- // after enough time was spent on cache misses is known as an "online
- // algorithm".
- PerhapsFillCache();
- res = cache.Get(categoryPath);
- if (res >= 0 || cacheIsComplete)
- {
- // if after filling the cache from the info on disk, the category is in it
- // or the cache is complete, return whatever cache.Get returned.
- return res;
- }
-
- // if we get here, it means the category is not in the cache, and the cache is not
- // complete, and therefore we must look for the category on disk.
-
- // We need to get an answer from the on-disk index.
- InitReaderManager();
-
- int doc = -1; // stays -1 (not found) unless a segment contains the term
- DirectoryReader reader = readerManager.Acquire();
- try
- {
- BytesRef catTerm = new BytesRef(FacetsConfig.PathToString(categoryPath.Components, categoryPath.Length));
- TermsEnum termsEnum = null; // reuse
- DocsEnum docs = null; // reuse
- foreach (AtomicReaderContext ctx in reader.Leaves)
- {
- Terms terms = ctx.AtomicReader.Terms(Consts.FULL);
- if (terms != null)
- {
- termsEnum = terms.Iterator(termsEnum);
- if (termsEnum.SeekExact(catTerm))
- {
- // liveDocs=null because the taxonomy has no deletes
- docs = termsEnum.Docs(null, docs, 0); // freqs not required
- // if the term was found, we know it has exactly one document.
- doc = docs.NextDoc() + ctx.DocBase; // segment-local doc id shifted to a reader-wide id
- break;
- }
- }
- }
- }
- finally
- {
- readerManager.Release(reader);
- }
- if (doc > 0) // NOTE(review): a hit on doc 0 is returned but not added to the cache -- confirm intended
- {
- AddToCache(categoryPath, doc);
- }
- return doc;
- }
- }
-
- /// <summary>
- /// Returns the ordinal of the given category, adding the category (and any
- /// missing ancestors) to the taxonomy first when it does not exist yet.
- /// </summary>
- public virtual int AddCategory(FacetLabel categoryPath)
- {
-     EnsureOpen();
-
-     // Fast path: consult the cache without taking the lock, which gives
-     // better concurrency for categories that are already cached.
-     int ordinal = cache.Get(categoryPath);
-     if (ordinal >= 0)
-     {
-         return ordinal;
-     }
-
-     // Cache miss: everything below must not run in parallel.
-     lock (this)
-     {
-         ordinal = FindCategory(categoryPath);
-         if (ordinal < 0)
-         {
-             // A brand-new category. InternalAddCategory() inserts it into the
-             // index and the cache, recursively adding missing ancestors first so
-             // that a parent is always added to the taxonomy before its child.
-             ordinal = InternalAddCategory(categoryPath);
-         }
-         return ordinal;
-     }
- }
-
- /// <summary>
- /// Adds a new category to the index (and the cache) and returns its new ordinal.
- /// <para>
- /// Missing ancestors are added first, by recursion, so that a parent is
- /// always added to the taxonomy before its child.
- /// </para>
- /// </summary>
- private int InternalAddCategory(FacetLabel cp)
- {
-     // Determine the parent's ordinal. It is later written as a payload rather
-     // than a stored field; payloads can be read into memory in bulk more
-     // efficiently.
-     int parentOrdinal;
-     if (cp.Length == 1)
-     {
-         // top-level category: its parent is the root
-         parentOrdinal = TaxonomyReader.ROOT_ORDINAL;
-     }
-     else if (cp.Length > 1)
-     {
-         FacetLabel parentPath = cp.Subpath(cp.Length - 1);
-         parentOrdinal = FindCategory(parentPath);
-         if (parentOrdinal < 0)
-         {
-             // the parent is missing too: add it (and its own ancestors) first
-             parentOrdinal = InternalAddCategory(parentPath);
-         }
-     }
-     else
-     {
-         // empty path (the root itself) has no parent
-         parentOrdinal = TaxonomyReader.INVALID_ORDINAL;
-     }
-
-     return AddCategoryDocument(cp, parentOrdinal);
- }
-
- /// <summary>
- /// Verifies that this instance has not been closed; throws
- /// <see cref="AlreadyClosedException"/> if it has.
- /// </summary>
- protected internal void EnsureOpen()
- {
- if (isClosed)
- {
- throw new AlreadyClosedException("The taxonomy writer has already been closed");
- }
- }
-
- /// <summary>
- /// Indexes one document for the given category and records its parent. This method
- /// takes no lock itself; it relies on its callers being synchronized.
- /// </summary>
- private int AddCategoryDocument(FacetLabel categoryPath, int parent)
- {
- // Before Lucene 2.9, position increments >=0 were supported, so we
- // added 1 to parent to allow the parent -1 (the parent of the root).
- // Unfortunately, starting with Lucene 2.9, after LUCENE-1542, this is
- // no longer enough, since 0 is not encoded consistently either (see
- // comment in SinglePositionTokenStream). But because we must be
- // backward-compatible with existing indexes, we can't just fix what
- // we write here (e.g., to write parent+2), and need to do a workaround
- // in the reader (which knows that anyway only category 0 has a parent
- // -1).
- parentStream.Set(Math.Max(parent + 1, 1));
- Document d = new Document();
- d.Add(parentStreamField);
-
- fullPathField.StringValue = FacetsConfig.PathToString(categoryPath.Components, categoryPath.Length);
- d.Add(fullPathField);
-
- // Note that we do not pass an Analyzer here because the fields that are
- // added to the Document are untokenized or contain their own TokenStream.
- // Therefore the IndexWriter's Analyzer has no effect.
- indexWriter.AddDocument(d);
- int id = nextID++; // the new category's ordinal
-
- // added a category document, mark that ReaderManager is not up-to-date
- shouldRefreshReaderManager = true;
-
- // also add to the parent array
- taxoArrays = TaxoArrays.Add(id, parent);
-
- // NOTE: this line must be executed last, or else the cache gets updated
- // before the parents array (LUCENE-4596)
- AddToCache(categoryPath, id);
-
- return id;
- }
-
- private class SinglePositionTokenStream : TokenStream // emits one token (the fixed word) whose position increment is the value given to Set()
- {
- internal ICharTermAttribute termAtt;
- internal IPositionIncrementAttribute posIncrAtt;
- internal bool returned; // true once the single token has been emitted (or before Set() is called)
- internal int val;
- internal readonly string word;
-
- public SinglePositionTokenStream(string word)
- {
- termAtt = AddAttribute<ICharTermAttribute>();
- posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
- this.word = word;
- returned = true; // nothing to emit until Set() is called
- }
-
- /// <summary>
- /// Set the value we want to keep, as the position increment.
- /// Note that when TermPositions.nextPosition() is later used to
- /// retrieve this value, val-1 will be returned, not val.
- /// <para/>
- /// IMPORTANT NOTE: Before Lucene 2.9, val>=0 were safe (for val==0,
- /// the retrieved position would be -1). But starting with Lucene 2.9,
- /// this unfortunately changed, and only val>0 are safe. val=0 can
- /// still be used, but don't count on the value you retrieve later
- /// (it could be 0 or -1, depending on circumstances or versions).
- /// This change is described in Lucene's JIRA: LUCENE-1542.
- /// </summary>
- public virtual void Set(int val)
- {
- this.val = val;
- returned = false; // re-arm the stream so the next IncrementToken() emits the token
- }
-
- public override bool IncrementToken()
- {
- if (returned)
- {
- return false;
- }
- ClearAttributes();
- posIncrAtt.PositionIncrement = val;
- termAtt.SetEmpty();
- termAtt.Append(word);
- returned = true;
- return true;
- }
- }
-
- /// <summary>
- /// Puts the category/ordinal pair into the cache. If the put evicted entries,
- /// refreshes the reader manager and marks the cache as incomplete.
- /// </summary>
- private void AddToCache(FacetLabel categoryPath, int id)
- {
-     bool evicted = cache.Put(categoryPath, id);
-     if (!evicted)
-     {
-         return;
-     }
-
-     // cache.Put() returned true: the cache is size-limited, became full, and
-     // parts of it had to be evicted. A relatively new category that is not yet
-     // visible to our reader may have been among them, so the reader must be
-     // refreshed now.
-     RefreshReaderManager();
-     cacheIsComplete = false;
- }
-
- /// <summary>
- /// Refreshes the reader manager if it has been initialized and is flagged as
- /// stale, then clears the stale flag.
- /// </summary>
- private void RefreshReaderManager()
- {
-     // Synchronized because this cannot happen concurrently with
-     // AddCategoryDocument(): when this method returns, the reader manager's
-     // state must be current, and resetting the refresh flag must not overlap
-     // with a category being added.
-     // NOTE: since this method is synchronized, it can call MaybeRefresh instead
-     // of MaybeRefreshBlocking. If that ever changes, change the call too.
-     lock (this)
-     {
-         if (!shouldRefreshReaderManager || !initializedReaderManager)
-         {
-             return;
-         }
-
-         readerManager.MaybeRefresh();
-         shouldRefreshReaderManager = false;
-     }
- }
-
- /// <summary>
- /// Commits pending changes to the underlying index writer, first recording
- /// the current taxonomy epoch in the commit data when the stored epoch is
- /// missing or different.
- /// </summary>
- public virtual void Commit()
- {
-     lock (this)
-     {
-         EnsureOpen();
-
-         // LUCENE-4972: if we always set the commit data, we create empty commits
-         string storedEpoch;
-         indexWriter.CommitData.TryGetValue(INDEX_EPOCH, out storedEpoch);
-         bool epochIsCurrent = storedEpoch != null && Convert.ToInt64(storedEpoch, 16) == indexEpoch;
-         if (!epochIsCurrent)
-         {
-             indexWriter.CommitData = CombinedCommitData(indexWriter.CommitData);
-         }
-
-         indexWriter.Commit();
-     }
- }
-
- /// <summary>
- /// Returns a new dictionary holding the given user data (if any) plus the
- /// taxonomy epoch, written in base 16 under <c>INDEX_EPOCH</c>.
- /// </summary>
- private IDictionary<string, string> CombinedCommitData(IDictionary<string, string> commitData)
- {
-     IDictionary<string, string> combined = new Dictionary<string, string>();
-     if (commitData != null)
-     {
-         combined.PutAll(commitData);
-     }
-
-     // written after the copy, so it replaces any epoch entry in the user data
-     string epochHex = Convert.ToString(indexEpoch, 16);
-     combined[INDEX_EPOCH] = epochHex;
-     return combined;
- }
-
- /// <summary>
- /// The commit user data of the underlying index writer. The taxonomy epoch
- /// is merged into the data on both read and write.
- /// </summary>
- public virtual IDictionary<string, string> CommitData
- {
-     get
-     {
-         IDictionary<string, string> current = indexWriter.CommitData;
-         return CombinedCommitData(current);
-     }
-     set
-     {
-         IDictionary<string, string> withEpoch = CombinedCommitData(value);
-         indexWriter.CommitData = withEpoch;
-     }
- }
-
-
- /// <summary>
- /// Prepares most of the work needed for a two-phase commit.
- /// See <see cref="IndexWriter.PrepareCommit"/>.
- /// </summary>
- /// <exception cref="AlreadyClosedException">if this writer has been closed</exception>
- public virtual void PrepareCommit()
- {
-     lock (this)
-     {
-         EnsureOpen();
-
-         // LUCENE-4972: if we always set the commit data, we create empty commits.
-         // Look the epoch up with TryGetValue, as Commit() does: the dictionary
-         // indexer throws KeyNotFoundException when no epoch has been stored yet,
-         // which would make the first PrepareCommit() on a fresh index fail.
-         string epochStr;
-         indexWriter.CommitData.TryGetValue(INDEX_EPOCH, out epochStr);
-         if (epochStr == null || Convert.ToInt64(epochStr, 16) != indexEpoch)
-         {
-             indexWriter.CommitData = CombinedCommitData(indexWriter.CommitData);
-         }
-
-         indexWriter.PrepareCommit();
-     }
- }
-
- public virtual int Size // number of categories: the next ordinal that would be assigned
- {
- get
- {
- EnsureOpen(); // throws AlreadyClosedException once the writer is closed
- return nextID;
- }
- }
-
- /// <summary>
- /// Set the number of cache misses before an attempt is made to read the entire
- /// taxonomy into the in-memory cache.
- /// <para>
- /// This taxonomy writer holds an in-memory cache of recently seen categories
- /// to speed up operation. On each cache-miss, the on-disk index needs to be
- /// consulted. When an existing taxonomy is opened, a lot of slow disk reads
- /// like that are needed until the cache is filled, so it is more efficient to
- /// read the entire taxonomy into memory at once. We do this complete read
- /// after a certain number (defined by this property) of cache misses.
- /// </para>
- /// <para>
- /// If the number is set to <c>0</c>, the entire taxonomy is read into the
- /// cache on first use, without fetching individual categories first.
- /// </para>
- /// <para>
- /// NOTE: it is assumed that this property is set immediately after the
- /// taxonomy writer has been created.
- /// </para>
- /// </summary>
- public virtual int CacheMissesUntilFill
- {
- set
- {
- EnsureOpen();
- cacheMissesUntilFill = value;
- }
- }
-
- // Once enough cache misses have accumulated, reads the categories from the index into
- // the cache, at most once. Runs under the lock so that concurrent callers do not fill
- // in parallel; when it returns, cacheIsComplete says whether everything fit.
- private void PerhapsFillCache()
- {
- lock (this)
- {
- if (cacheMisses.Get() < cacheMissesUntilFill)
- {
- return;
- }
-
- if (!shouldFillCache)
- {
- // we already filled the cache once, there's no need to re-fill it
- return;
- }
- shouldFillCache = false;
-
- InitReaderManager();
-
- bool aborted = false; // set when the cache fills up before all terms were read
- DirectoryReader reader = readerManager.Acquire();
- try
- {
- TermsEnum termsEnum = null;
- DocsEnum docsEnum = null;
- foreach (AtomicReaderContext ctx in reader.Leaves)
- {
- Terms terms = ctx.AtomicReader.Terms(Consts.FULL);
- if (terms != null) // cannot really happen, but be on the safe side
- {
- termsEnum = terms.Iterator(termsEnum);
- while (termsEnum.Next() != null)
- {
- if (!cache.Full)
- {
- BytesRef t = termsEnum.Term();
- // Since we guarantee uniqueness of categories, each term has exactly
- // one document. Also, since we do not allow removing categories (and
- // hence documents), there are no deletions in the index. Therefore, it
- // is sufficient to call NextDoc() exactly once, with no
- // 'validation' checks.
- FacetLabel cp = new FacetLabel(FacetsConfig.StringToPath(t.Utf8ToString()));
- docsEnum = termsEnum.Docs(null, docsEnum, DocsEnum.FLAG_NONE);
- bool res = cache.Put(cp, docsEnum.NextDoc() + ctx.DocBase);
- Debug.Assert(!res, "entries should not have been evicted from the cache");
- }
- else
- {
- // the cache is full and the next Put() will evict entries from it, therefore abort the iteration.
- aborted = true;
- break;
- }
- }
- }
- if (aborted)
- {
- break;
- }
- }
- }
- finally
- {
- readerManager.Release(reader);
- }
-
- cacheIsComplete = !aborted;
- if (cacheIsComplete)
- {
- lock (this)
- {
- // everything is in the cache, so no need to keep readerManager open.
- // this block is executed in a sync block so that it works well with
- // InitReaderManager called in parallel.
- readerManager.Dispose();
- readerManager = null;
- initializedReaderManager = false;
- }
- }
- }
- }
-
- private TaxonomyIndexArrays TaxoArrays // lazily built from the index on first access
- {
- get
- {
- if (taxoArrays == null)
- {
- lock (this)
- {
- if (taxoArrays == null) // re-check under the lock: another thread may have built it meanwhile
- {
- InitReaderManager();
- DirectoryReader reader = readerManager.Acquire();
- try
- {
- // build into a local first and publish it with a single assignment
- // (carried over from the Java original; presumably taxoArrays is
- // declared volatile -- declaration not visible here, TODO confirm).
- TaxonomyIndexArrays tmpArrays = new TaxonomyIndexArrays(reader);
- taxoArrays = tmpArrays;
- }
- finally
- {
- readerManager.Release(reader);
- }
- }
- }
- }
- return taxoArrays;
- }
- }
-
- /// <summary>
- /// Returns the ordinal of the parent of the given category ordinal.
- /// </summary>
- /// <exception cref="System.IndexOutOfRangeException">
- /// if <paramref name="ordinal"/> is not smaller than the number of categories
- /// </exception>
- public virtual int GetParent(int ordinal)
- {
-     EnsureOpen();
-
-     // The parents array may be allocated bigger than it really needs to be, so
-     // explicitly refuse ordinals of categories that do not exist.
-     bool ordinalExists = ordinal < nextID;
-     if (!ordinalExists)
-     {
-         throw new System.IndexOutOfRangeException("requested ordinal is bigger than the largest ordinal in the taxonomy");
-     }
-
-     int[] parentOrdinals = TaxoArrays.Parents();
-     Debug.Assert(ordinal < parentOrdinals.Length, "requested ordinal (" + ordinal + "); parents.length (" + parentOrdinals.Length + ") !");
-     return parentOrdinals[ordinal];
- }
-
- /// <summary>
- /// Reads every category of the taxonomy stored in <paramref name="taxoDir"/>,
- /// adds the ones missing from this taxonomy, and records in
- /// <paramref name="map"/> the mapping from each original ordinal to its
- /// ordinal in this taxonomy.
- /// </summary>
- public virtual void AddTaxonomy(Directory taxoDir, OrdinalMap map)
- {
-     EnsureOpen();
-     DirectoryReader source = DirectoryReader.Open(taxoDir);
-     try
-     {
-         map.Size = source.NumDocs;
-
-         int docBase = 0; // ordinal offset of the current segment
-         TermsEnum termsEnum = null; // reused across segments
-         DocsEnum docsEnum = null; // reused across terms
-         foreach (AtomicReaderContext leaf in source.Leaves)
-         {
-             AtomicReader segment = leaf.AtomicReader;
-             termsEnum = segment.Terms(Consts.FULL).Iterator(termsEnum);
-             while (termsEnum.Next() != null)
-             {
-                 string path = termsEnum.Term().Utf8ToString();
-                 int newOrdinal = AddCategory(new FacetLabel(FacetsConfig.StringToPath(path)));
-                 docsEnum = termsEnum.Docs(null, docsEnum, DocsEnum.FLAG_NONE);
-                 map.AddMapping(docsEnum.NextDoc() + docBase, newOrdinal);
-             }
-             docBase += segment.MaxDoc; // no deletions, so we're ok
-         }
-
-         map.AddDone();
-     }
-     finally
-     {
-         source.Dispose();
-     }
- }
-
- /// <summary>
- /// Mapping from old ordinals to new ordinals, used when merging indexes
- /// with separate taxonomies.
- /// <para>
- /// AddTaxonomy() merges another taxonomy into the given taxonomy
- /// (this). An OrdinalMap is filled for each of the added taxonomies,
- /// containing the new ordinal (in the merged taxonomy) of each of the
- /// categories in the old taxonomy.
- /// </para><para>
- /// There exist two implementations of OrdinalMap: MemoryOrdinalMap and
- /// DiskOrdinalMap. As their names suggest, the former keeps the map in
- /// memory and the latter in a temporary disk file. Because these maps will
- /// later be needed one by one (to remap the counting lists), not all at the
- /// same time, it is recommended to put the first taxonomy's map in memory,
- /// and all the rest on disk (later to be automatically read into memory one
- /// by one, when needed).
- /// </para>
- /// </summary>
- public interface OrdinalMap
- {
- /// <summary>
- /// Set the size of the map. This MUST be called before AddMapping().
- /// It is assumed (but not verified) that AddMapping() will then be
- /// called exactly 'size' times, with different origOrdinals between 0
- /// and size-1.
- /// </summary>
- int Size { set; }
-
- /// <summary>
- /// Record a mapping. </summary>
- void AddMapping(int origOrdinal, int newOrdinal);
-
- /// <summary>
- /// Call AddDone() to say that all AddMapping() calls have been done.
- /// In some implementations this might free some resources.
- /// </summary>
- void AddDone();
-
- /// <summary>
- /// Return the map from the taxonomy's original (consecutive) ordinals
- /// to the new taxonomy's ordinals. If the map has to be read from disk
- /// and ordered appropriately, it is done when Map is read.
- /// Map should only be read once, and only when the map is actually
- /// needed. Reading it will also free all resources that the map might
- /// be holding (such as temporary disk space), other than the returned int[].
- /// </summary>
- int[] Map { get; }
- }
-
- /// <summary>
- /// <see cref="OrdinalMap"/> maintained in memory.
- /// </summary>
- public sealed class MemoryOrdinalMap : OrdinalMap
- {
-     internal int[] map;
-
-     /// <summary>
-     /// Sole constructor. The map starts empty and is sized by <see cref="Size"/>.
-     /// </summary>
-     public MemoryOrdinalMap()
-     {
-         map = new int[0];
-     }
-
-     /// <summary>
-     /// Allocates room for <c>value</c> mappings up front. Previously the value was
-     /// stored but never used, so the array was re-allocated and copied every time a
-     /// mapping landed past its end, and could end up shorter than the declared size.
-     /// Mappings that were already recorded are kept; the array never shrinks.
-     /// </summary>
-     public int Size
-     {
-         set
-         {
-             if (value > map.Length)
-             {
-                 Array.Resize(ref map, value);
-             }
-         }
-     }
-
-     /// <summary>
-     /// Records that <paramref name="origOrdinal"/> maps to <paramref name="newOrdinal"/>.
-     /// </summary>
-     public void AddMapping(int origOrdinal, int newOrdinal)
-     {
-         // Fallback for callers that did not set Size (or set it too small):
-         // grow just enough to hold this ordinal.
-         if (origOrdinal >= map.Length)
-         {
-             Array.Resize(ref map, origOrdinal + 1);
-         }
-         map[origOrdinal] = newOrdinal;
-     }
-
-     /// <summary>
-     /// Nothing to do: there are no resources to release.
-     /// </summary>
-     public void AddDone()
-     {
-     }
-
-     /// <summary>
-     /// Returns the backing array itself (not a copy).
-     /// </summary>
-     public int[] Map
-     {
-         get
-         {
-             return map;
-         }
-     }
- }
-
- /// <summary>
- /// <see cref="OrdinalMap"/> maintained on the file system: mappings are appended
- /// to a temporary file and read back into memory when <see cref="Map"/> is first read.
- /// </summary>
- public sealed class DiskOrdinalMap : OrdinalMap
- {
-     internal string tmpfile;
-     internal OutputStreamDataOutput @out;
-
-     // the map read back from disk; null until Map is first read successfully
-     int[] map = null;
-
-     /// <summary>
-     /// Sole constructor. Creates (or overwrites) the temporary file at <paramref name="tmpfile"/>.
-     /// </summary>
-     public DiskOrdinalMap(string tmpfile)
-     {
-         this.tmpfile = tmpfile;
-         // FileMode.Create truncates a pre-existing file. OpenOrCreate would leave
-         // stale bytes from an earlier, longer file behind the newly written data.
-         var outfs = new FileStream(tmpfile, FileMode.Create, FileAccess.Write);
-         @out = new OutputStreamDataOutput(outfs);
-     }
-
-     /// <summary>
-     /// Appends one (original ordinal, new ordinal) pair to the temporary file.
-     /// </summary>
-     public void AddMapping(int origOrdinal, int newOrdinal)
-     {
-         @out.WriteInt(origOrdinal);
-         @out.WriteInt(newOrdinal);
-     }
-
-     /// <summary>
-     /// Writes the number of mappings to the file. Map reads this value first,
-     /// so it must be set before any call to AddMapping().
-     /// </summary>
-     public int Size
-     {
-         set
-         {
-             @out.WriteInt(value);
-         }
-     }
-
-     /// <summary>
-     /// Closes the output file. Safe to call more than once.
-     /// </summary>
-     public void AddDone()
-     {
-         if (@out != null)
-         {
-             @out.Dispose();
-             @out = null;
-         }
-     }
-
-     /// <summary>
-     /// Reads the map back from the temporary file, deletes the file, and returns
-     /// the map. Later reads return the same array without touching the disk.
-     /// </summary>
-     public int[] Map
-     {
-         get
-         {
-             if (map != null)
-             {
-                 return map;
-             }
-             AddDone(); // in case this wasn't previously called
-
-             // FileMode.Open: a missing file is an error. OpenOrCreate would silently
-             // create an empty file and then fail on the first read.
-             var ifs = new FileStream(tmpfile, FileMode.Open, FileAccess.Read);
-             var @in = new InputStreamDataInput(ifs);
-             try
-             {
-                 int[] loaded = new int[@in.ReadInt()];
-                 // NOTE: The current code assumes here that the map is complete,
-                 // i.e., every ordinal gets one and exactly one value. Otherwise,
-                 // we may run into an EOF here, or vice versa, not read everything.
-                 for (int i = 0; i < loaded.Length; i++)
-                 {
-                     int origordinal = @in.ReadInt();
-                     int newordinal = @in.ReadInt();
-                     loaded[origordinal] = newordinal;
-                 }
-                 // publish only a fully read map, so that a failed read is not
-                 // cached and returned half-filled by a later call
-                 map = loaded;
-             }
-             finally
-             {
-                 // release the file handle even when reading fails
-                 @in.Dispose();
-             }
-
-             // Delete the temporary file, which is no longer needed.
-             if (File.Exists(tmpfile))
-             {
-                 File.Delete(tmpfile);
-             }
-             return map;
-         }
-     }
- }
-
- /// <summary>
- /// Rolls back changes to the taxonomy writer and closes the instance. Following
- /// this method the instance becomes unusable (calling any of its API methods
- /// will yield an <see cref="AlreadyClosedException"/>).
- /// </summary>
- public virtual void Rollback()
- {
- lock (this)
- {
- EnsureOpen();
- indexWriter.Rollback();
- DoClose();
- }
- }
-
- /// <summary>
- /// Replaces the current taxonomy with the given one. This method should
- /// generally be called in conjunction with
- /// <c>IndexWriter.AddIndexes(Directory[])</c> to replace both the taxonomy
- /// as well as the search index content.
- /// </summary>
- public virtual void ReplaceTaxonomy(Directory taxoDir)
- {
- lock (this)
- {
- // replace the taxonomy by doing IW optimized operations
- indexWriter.DeleteAll();
- indexWriter.AddIndexes(taxoDir);
- shouldRefreshReaderManager = true;
- InitReaderManager(); // ensure that it's initialized
- RefreshReaderManager();
- nextID = indexWriter.MaxDoc;
- taxoArrays = null; // must nullify so that it's re-computed next time it's needed
-
- // need to clear the cache, so that AddCategory won't accidentally return
- // old categories that are in the cache.
- cache.Clear();
- cacheIsComplete = false;
- shouldFillCache = true;
- cacheMisses.Set(0);
-
- // update indexEpoch, as replacing the taxonomy is just like recreating it
- ++indexEpoch;
- }
- }
-
- /// <summary>
- /// Returns the <see cref="Directory"/> of this taxonomy writer. </summary>
- public virtual Directory Directory
- {
- get
- {
- return dir;
- }
- }
-
- /// <summary>
- /// Used by <see cref="DirectoryTaxonomyReader"/> to support NRT.
- /// <para>
- /// <b>NOTE:</b> you should not use the obtained <see cref="IndexWriter"/> in any
- /// way other than opening an IndexReader on it; otherwise, the taxonomy
- /// index may become corrupt!
- /// </para>
- /// </summary>
- internal IndexWriter InternalIndexWriter
- {
- get
- {
- return indexWriter;
- }
- }
-
- /// <summary>
- /// Expert: returns the current index epoch.
- /// Used by <see cref="DirectoryTaxonomyReader"/>
- /// to support NRT.
- ///
- /// @lucene.internal
- /// </summary>
- public long TaxonomyEpoch
- {
- get
- {
- return indexEpoch;
- }
- }
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/eea269f3/Lucene.Net.Facet/Taxonomy/Directory/TaxonomyIndexArrays.cs
----------------------------------------------------------------------
diff --git a/Lucene.Net.Facet/Taxonomy/Directory/TaxonomyIndexArrays.cs b/Lucene.Net.Facet/Taxonomy/Directory/TaxonomyIndexArrays.cs
deleted file mode 100644
index 9a99f4a..0000000
--- a/Lucene.Net.Facet/Taxonomy/Directory/TaxonomyIndexArrays.cs
+++ /dev/null
@@ -1,252 +0,0 @@
-using System;
-using System.Diagnostics;
-
-namespace Lucene.Net.Facet.Taxonomy.Directory
-{
-
- using CorruptIndexException = Lucene.Net.Index.CorruptIndexException;
- using DocsAndPositionsEnum = Lucene.Net.Index.DocsAndPositionsEnum;
- using IndexReader = Lucene.Net.Index.IndexReader;
- using MultiFields = Lucene.Net.Index.MultiFields;
- using DocIdSetIterator = Lucene.Net.Search.DocIdSetIterator;
- using ArrayUtil = Lucene.Net.Util.ArrayUtil;
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- /// <summary>
- /// A <see cref="ParallelTaxonomyArrays"/> that is initialized from the taxonomy
- /// index.
- ///
- /// @lucene.experimental
- /// </summary>
- internal class TaxonomyIndexArrays : ParallelTaxonomyArrays
- {
-
- private readonly int[] parents_Renamed;
-
- // the following two arrays are lazily initialized. note that we only keep a
- // single boolean member as volatile, instead of declaring the arrays
- // volatile. the code guarantees that only after the boolean is set to true,
- // the arrays are returned.
- private volatile bool initializedChildren = false;
- private int[] children_Renamed, siblings_Renamed;
-
- /// <summary>
- /// Used by <see cref="Add(int, int)"/> after the array grew. </summary>
- private TaxonomyIndexArrays(int[] parents)
- {
- this.parents_Renamed = parents;
- }
-
- public TaxonomyIndexArrays(IndexReader reader)
- {
- parents_Renamed = new int[reader.MaxDoc];
- if (parents_Renamed.Length > 0)
- {
- InitParents(reader, 0);
- // Starting Lucene 2.9, following the change LUCENE-1542, we can
- // no longer reliably read the parent "-1" (see comment in
- // LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way
- // to fix this in indexing without breaking backward-compatibility
- // with existing indexes, so what we'll do instead is just
- // hard-code the parent of ordinal 0 to be -1, and assume (as is
- // indeed the case) that no other parent can be -1.
- parents_Renamed[0] = TaxonomyReader.INVALID_ORDINAL;
- }
- }
-
- public TaxonomyIndexArrays(IndexReader reader, TaxonomyIndexArrays copyFrom)
- {
- Debug.Assert(copyFrom != null);
-
- // note that copyParents.Length may be equal to reader.MaxDoc. this is not a bug;
- // it may be caused if e.g. the taxonomy segments were merged, and so an updated
- // NRT reader was obtained, even though nothing was changed. this is not very likely
- // to happen.
- int[] copyParents = copyFrom.Parents();
- this.parents_Renamed = new int[reader.MaxDoc];
- Array.Copy(copyParents, 0, parents_Renamed, 0, copyParents.Length);
- InitParents(reader, copyParents.Length); // read only the parents of the categories added since copyFrom
-
- if (copyFrom.initializedChildren)
- {
- InitChildrenSiblings(copyFrom);
- }
- }
-
- private void InitChildrenSiblings(TaxonomyIndexArrays copyFrom)
- {
- lock (this)
- {
- if (!initializedChildren) // must do this check !
- {
- children_Renamed = new int[parents_Renamed.Length];
- siblings_Renamed = new int[parents_Renamed.Length];
- if (copyFrom != null)
- {
- // called from the ctor, after we know copyFrom has initialized children/siblings
- Array.Copy(copyFrom.Children(), 0, children_Renamed, 0, copyFrom.Children().Length);
- Array.Copy(copyFrom.Siblings(), 0, siblings_Renamed, 0, copyFrom.Siblings().Length);
- ComputeChildrenSiblings(copyFrom.parents_Renamed.Length);
- }
- else
- {
- ComputeChildrenSiblings(0);
- }
- initializedChildren = true;
- }
- }
- }
-
- private void ComputeChildrenSiblings(int first)
- {
- // reset the youngest child of all ordinals. while this should be done only
- // for the leaves, we don't know up front which are the leaves, so we reset
- // all of them.
- for (int i = first; i < parents_Renamed.Length; i++)
- {
- children_Renamed[i] = TaxonomyReader.INVALID_ORDINAL;
- }
-
- // the root category has no parent, and therefore no siblings
- if (first == 0)
- {
- first = 1;
- siblings_Renamed[0] = TaxonomyReader.INVALID_ORDINAL;
- }
-
- for (int i = first; i < parents_Renamed.Length; i++)
- {
- // note that parents[i] is always < i, so the right-hand-side of
- // the following line is already set when we get here
- siblings_Renamed[i] = children_Renamed[parents_Renamed[i]];
- children_Renamed[parents_Renamed[i]] = i;
- }
- }
-
- // Read the parents of the new categories (ordinals first .. reader.MaxDoc - 1)
- private void InitParents(IndexReader reader, int first)
- {
- if (reader.MaxDoc == first)
- {
- return;
- }
-
- // it's ok to use MultiFields because we only iterate on one posting list.
- // breaking it to loop over the leaves only complicates code for no
- // apparent gain.
- DocsAndPositionsEnum positions = MultiFields.GetTermPositionsEnum(reader, null, Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF, DocsAndPositionsEnum.FLAG_PAYLOADS);
-
- // shouldn't really happen, if it does, something's wrong
- if (positions == null || positions.Advance(first) == DocIdSetIterator.NO_MORE_DOCS)
- {
- throw new CorruptIndexException("Missing parent data for category " + first);
- }
-
- int num = reader.MaxDoc;
- for (int i = first; i < num; i++)
- {
- if (positions.DocID() == i)
- {
- if (positions.Freq() == 0) // shouldn't happen
- {
- throw new CorruptIndexException("Missing parent data for category " + i);
- }
-
- parents_Renamed[i] = positions.NextPosition();
-
- if (positions.NextDoc() == DocIdSetIterator.NO_MORE_DOCS)
- {
- if (i + 1 < num)
- {
- throw new CorruptIndexException("Missing parent data for category " + (i + 1));
- }
- break;
- }
- } // this shouldn't happen
- else
- {
- throw new CorruptIndexException("Missing parent data for category " + i);
- }
- }
- }
-
- /// <summary>
- /// Adds the given ordinal/parent info and returns either a new instance if the
- /// underlying array had to grow, or this instance otherwise.
- /// <para>
- /// <b>NOTE:</b> you should call this method from thread-safe code.
- /// </para>
- /// </summary>
- internal virtual TaxonomyIndexArrays Add(int ordinal, int parentOrdinal)
- {
- if (ordinal >= parents_Renamed.Length)
- {
- int[] newarray = ArrayUtil.Grow(parents_Renamed, ordinal + 1);
- newarray[ordinal] = parentOrdinal;
- return new TaxonomyIndexArrays(newarray);
- }
- parents_Renamed[ordinal] = parentOrdinal;
- return this;
- }
-
- /// <summary>
- /// Returns the parents array, where <c>parents[i]</c> denotes the parent of
- /// category ordinal <c>i</c>.
- /// </summary>
- public override int[] Parents()
- {
- return parents_Renamed;
- }
-
- /// <summary>
- /// Returns the children array, where <c>children[i]</c> denotes the youngest
- /// child of category ordinal <c>i</c>. The youngest child is defined as the
- /// category that was added last to the taxonomy as an immediate child of
- /// <c>i</c>.
- /// </summary>
- public override int[] Children()
- {
- if (!initializedChildren)
- {
- InitChildrenSiblings(null);
- }
-
- // the array is guaranteed to be populated
- return children_Renamed;
- }
-
- /// <summary>
- /// Returns the siblings array, where <c>siblings[i]</c> denotes the sibling
- /// of category ordinal <c>i</c>. The sibling is defined as the previous
- /// youngest child of <c>parents[i]</c>.
- /// </summary>
- public override int[] Siblings()
- {
- if (!initializedChildren)
- {
- InitChildrenSiblings(null);
- }
-
- // the array is guaranteed to be populated
- return siblings_Renamed;
- }
-
- }
-
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/eea269f3/Lucene.Net.Facet/Taxonomy/DocValuesOrdinalsReader.cs
----------------------------------------------------------------------
diff --git a/Lucene.Net.Facet/Taxonomy/DocValuesOrdinalsReader.cs b/Lucene.Net.Facet/Taxonomy/DocValuesOrdinalsReader.cs
deleted file mode 100644
index 3d50275..0000000
--- a/Lucene.Net.Facet/Taxonomy/DocValuesOrdinalsReader.cs
+++ /dev/null
@@ -1,130 +0,0 @@
-namespace Lucene.Net.Facet.Taxonomy
-{
-
- /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- using AtomicReaderContext = Lucene.Net.Index.AtomicReaderContext;
- using BinaryDocValues = Lucene.Net.Index.BinaryDocValues;
- using DocValues = Lucene.Net.Index.DocValues;
- using ArrayUtil = Lucene.Net.Util.ArrayUtil;
- using BytesRef = Lucene.Net.Util.BytesRef;
- using IntsRef = Lucene.Net.Util.IntsRef;
-
- /// <summary>
- /// An <see cref="OrdinalsReader"/> that decodes ordinals previously indexed
- /// into a <c>BinaryDocValues</c> field.
- /// </summary>
- public class DocValuesOrdinalsReader : OrdinalsReader
- {
- private readonly string field;
-
- /// <summary>
- /// Creates a reader over the field named by
- /// <c>FacetsConfig.DEFAULT_INDEX_FIELD_NAME</c>.
- /// </summary>
- public DocValuesOrdinalsReader()
- : this(FacetsConfig.DEFAULT_INDEX_FIELD_NAME)
- {
- }
-
- /// <summary>
- /// Creates a reader over the specified indexed field.
- /// </summary>
- /// <param name="field">Name of the indexed field to read ordinals from.</param>
- public DocValuesOrdinalsReader(string field)
- {
- this.field = field;
- }
-
- /// <summary>
- /// Gets the name of the indexed field this reader was created with.
- /// </summary>
- public override string IndexFieldName
- {
- get { return field; }
- }
-
- /// <summary>
- /// Creates a per-segment reader over this reader's field in the given context.
- /// </summary>
- /// <param name="context">Segment context whose atomic reader supplies the doc values.</param>
- /// <returns>A segment reader that decodes the ordinals of a document on request.</returns>
- public override OrdinalsSegmentReader GetReader(AtomicReaderContext context)
- {
- // Fall back to the shared empty instance when the segment has no
- // binary doc values for the field.
- BinaryDocValues segmentValues =
- context.AtomicReader.GetBinaryDocValues(field) ?? DocValues.EMPTY_BINARY;
-
- return new BinaryDocValuesSegmentReader(this, segmentValues);
- }
-
- /// <summary>
- /// Segment reader that fetches a document's bytes from the doc values and
- /// passes them to the owning reader's <c>Decode</c> method.
- /// </summary>
- private sealed class BinaryDocValuesSegmentReader : OrdinalsSegmentReader
- {
- private readonly DocValuesOrdinalsReader owner;
- private readonly BinaryDocValues values;
-
- public BinaryDocValuesSegmentReader(DocValuesOrdinalsReader owner, BinaryDocValues values)
- {
- this.owner = owner;
- this.values = values;
- }
-
- public override void Get(int docID, IntsRef ordinals)
- {
- BytesRef encoded = new BytesRef();
- values.Get(docID, encoded);
- owner.Decode(encoded, ordinals);
- }
- }
-
- /// <summary>
- /// Decodes the bytes in <paramref name="buf"/> into <paramref name="ordinals"/>.
- /// Subclass and override if you change the encoding.
- /// </summary>
- /// <remarks>
- /// As implemented here, each value is read as a run of 7-bit groups, most
- /// significant group first: a byte with its high bit set contributes seven
- /// bits and continues the value, while a byte with its high bit clear ends
- /// it. Every finished value is added to the previously decoded ordinal.
- /// </remarks>
- /// <param name="buf">Encoded bytes; only the range given by its offset and length is read.</param>
- /// <param name="ordinals">Receives the decoded ordinals, starting at offset 0.</param>
- protected virtual void Decode(BytesRef buf, IntsRef ordinals)
- {
- // Size the output once, up front, to one slot per input byte. This may
- // over-allocate, but it removes the need for a capacity check on every
- // decoded value inside the loop.
- if (buf.Length > ordinals.Ints.Length)
- {
- ordinals.Ints = ArrayUtil.Grow(ordinals.Ints, buf.Length);
- }
-
- ordinals.Offset = 0;
- ordinals.Length = 0;
-
- // The decoding is deliberately inlined here rather than delegated to a
- // utility method.
- int end = buf.Offset + buf.Length;
- int accumulator = 0;
- int previous = 0;
- for (int pos = buf.Offset; pos < end; pos++)
- {
- byte current = buf.Bytes[pos];
- if ((sbyte)current < 0)
- {
- // High bit set: fold in the low seven bits and keep reading.
- accumulator = (accumulator << 7) | (current & 0x7F);
- continue;
- }
-
- // High bit clear: this byte completes the value; add the previous ordinal.
- int ordinal = ((accumulator << 7) | current) + previous;
- ordinals.Ints[ordinals.Length] = ordinal;
- ordinals.Length++;
- previous = ordinal;
- accumulator = 0;
- }
- }
- }
-
-}
\ No newline at end of file