Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/09/16 00:24:51 UTC
[4/8] Porting Lucene.Net.Suggest (still not compiling)
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Analyzing/FSTUtil.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Analyzing/FSTUtil.cs b/src/Lucene.Net.Suggest/Suggest/Analyzing/FSTUtil.cs
new file mode 100644
index 0000000..3ff019e
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Analyzing/FSTUtil.cs
@@ -0,0 +1,146 @@
+using System.Collections.Generic;
+using System.Diagnostics;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Automaton;
+using Lucene.Net.Util.Fst;
+
+namespace Lucene.Net.Search.Suggest.Analyzing
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ // TODO: move to core? nobody else uses it yet though...
+
+ /// <summary>
+ /// Exposes a utility method to enumerate all paths
+ /// intersecting an <seealso cref="Automaton"/> with an <seealso cref="FST"/>.
+ /// </summary>
+ public class FSTUtil
+ {
+
+ private FSTUtil()
+ {
+ }
+
+ /// <summary>
+ /// Holds a pair (automaton, fst) of states and accumulated output in the intersected machine. </summary>
+ public sealed class Path<T>
+ {
+
+ /// <summary>
+ /// Node in the automaton where path ends: </summary>
+ public readonly State state;
+
+ /// <summary>
+ /// Node in the FST where path ends: </summary>
+ public readonly FST.Arc<T> fstNode;
+
+ /// <summary>
+ /// Output of the path so far: </summary>
+ internal T output;
+
+ /// <summary>
+ /// Input of the path so far: </summary>
+ public readonly IntsRef input;
+
+ /// <summary>
+ /// Sole constructor. </summary>
+ public Path(State state, FST.Arc<T> fstNode, T output, IntsRef input)
+ {
+ this.state = state;
+ this.fstNode = fstNode;
+ this.output = output;
+ this.input = input;
+ }
+ }
+
+ /// <summary>
+ /// Enumerates all minimal prefix paths in the automaton that also intersect the FST,
+ /// accumulating the FST end node and output for each path.
+ /// </summary>
+ public static IList<Path<T>> IntersectPrefixPaths<T>(Automaton a, FST<T> fst)
+ {
+ Debug.Assert(a.Deterministic);
+ IList<Path<T>> queue = new List<Path<T>>();
+ IList<Path<T>> endNodes = new List<Path<T>>();
+ queue.Add(new Path<T>(a.InitialState, fst.GetFirstArc(new FST.Arc<T>()), fst.Outputs.NoOutput, new IntsRef()));
+
+ FST.Arc<T> scratchArc = new FST.Arc<T>();
+ FST.BytesReader fstReader = fst.BytesReader;
+
+ while (queue.Count != 0)
+ {
+ // IList.RemoveAt returns void in .NET (unlike Java's List.remove(int)),
+ // so pop the last element in two steps:
+ Path<T> path = queue[queue.Count - 1];
+ queue.RemoveAt(queue.Count - 1);
+ if (path.state.Accept)
+ {
+ endNodes.Add(path);
+ // we can stop here if we accept this path,
+ // we accept all further paths too
+ continue;
+ }
+
+ IntsRef currentInput = path.input;
+ foreach (Transition t in path.state.Transitions)
+ {
+ int min = t.Min;
+ int max = t.Max;
+ if (min == max)
+ {
+ FST.Arc<T> nextArc = fst.FindTargetArc(t.Min, path.fstNode, scratchArc, fstReader);
+ if (nextArc != null)
+ {
+ IntsRef newInput = new IntsRef(currentInput.Length + 1);
+ newInput.CopyInts(currentInput);
+ newInput.Ints[currentInput.Length] = t.Min;
+ newInput.Length = currentInput.Length + 1;
+ queue.Add(new Path<T>(t.Dest, new FST.Arc<T>()
+ .CopyFrom(nextArc), fst.Outputs.Add(path.output, nextArc.Output), newInput));
+ }
+ }
+ else
+ {
+ // TODO: if this transition's TO state is accepting, and
+ // it accepts the entire range possible in the FST (ie. 0 to 255),
+ // we can simply use the prefix as the accepted state instead of
+ // looking up all the ranges and terminate early
+ // here. This just shifts the work from one queue
+ // (this one) to another (the completion search
+ // done in AnalyzingSuggester).
+ FST.Arc<T> nextArc = Util.ReadCeilArc(min, fst, path.fstNode, scratchArc, fstReader);
+ while (nextArc != null && nextArc.Label <= max)
+ {
+ Debug.Assert(nextArc.Label <= max);
+ Debug.Assert(nextArc.Label >= min, nextArc.Label + " " + min);
+ IntsRef newInput = new IntsRef(currentInput.Length + 1);
+ newInput.CopyInts(currentInput);
+ newInput.Ints[currentInput.Length] = nextArc.Label;
+ newInput.Length = currentInput.Length + 1;
+ queue.Add(new Path<T>(t.Dest, new FST.Arc<T>()
+ .CopyFrom(nextArc), fst.Outputs.Add(path.output, nextArc.Output), newInput));
+ int label = nextArc.Label; // used in assert
+ nextArc = nextArc.Last ? null : fst.ReadNextRealArc(nextArc, fstReader);
+ // Unlike Java's assert, Debug.Assert builds its message eagerly,
+ // so nextArc must be null-checked inside the message too:
+ Debug.Assert(nextArc == null || label < nextArc.Label, "last: " + label + " next: " + (nextArc == null ? "null" : nextArc.Label.ToString()));
+ }
+ }
+ }
+ }
+ return endNodes;
+ }
+
+ }
+
+}
\ No newline at end of file
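
A usage sketch for the class above (hedged: the Builder and BasicAutomata calls mirror how they are used elsewhere in this port, and the using directives match the top of this file; FST inputs must be added in sorted order):

    Outputs<long?> outputs = PositiveIntOutputs.Singleton;
    var builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
    var scratch = new IntsRef();
    builder.Add(Util.ToIntsRef(new BytesRef("star"), scratch), 10L);
    builder.Add(Util.ToIntsRef(new BytesRef("start"), scratch), 20L);
    FST<long?> fst = builder.Finish();

    // A deterministic automaton accepting exactly "star"; the intersection
    // returns each minimal accepted prefix with its accumulated FST output:
    Automaton a = BasicAutomata.MakeString("star");
    foreach (FSTUtil.Path<long?> path in FSTUtil.IntersectPrefixPaths(a, fst))
    {
        System.Console.WriteLine(Util.ToBytesRef(path.input, new BytesRef()).Utf8ToString());
    }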
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Analyzing/FreeTextSuggester.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Analyzing/FreeTextSuggester.cs b/src/Lucene.Net.Suggest/Suggest/Analyzing/FreeTextSuggester.cs
new file mode 100644
index 0000000..4c82305
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Analyzing/FreeTextSuggester.cs
@@ -0,0 +1,929 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Shingle; // ShingleFilter, used by AddShingles below
+using Lucene.Net.Analysis.Tokenattributes;
+using Lucene.Net.Codecs;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Store;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Fst;
+using Directory = Lucene.Net.Store.Directory;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.Search.Suggest.Analyzing
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ // TODO
+ // - test w/ syns
+ // - add pruning of low-freq ngrams?
+
+ /// <summary>
+ /// Builds an ngram model from the text sent to
+ /// <seealso cref="Build"/> and predicts based on the last grams-1 tokens in
+ /// the request sent to <seealso cref="Lookup"/>. This tries to
+ /// handle the "long tail" of suggestions for when the
+ /// incoming query is a never before seen query string.
+ ///
+ /// <para>Likely this suggester would only be used as a
+ /// fallback, when the primary suggester fails to find
+ /// any suggestions.
+ ///
+ /// </para>
+ /// <para>Note that the weight for each suggestion is unused,
+ /// and the suggestions are the analyzed forms (so your
+ /// analysis process should normally be very "light").
+ ///
+ /// </para>
+ /// <para>This uses the stupid backoff language model to smooth
+ /// scores across ngram models; see
+ /// "Large language models in machine translation",
+ /// http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.76.1126
+ /// for details.
+ ///
+ /// </para>
+ /// <para> From <seealso cref="Lookup"/>, the key of each result is the
+ /// ngram token; the value is Long.MAX_VALUE * score (fixed
+ /// point, cast to long). Divide by Long.MAX_VALUE to get
+ /// the score back, which ranges from 0.0 to 1.0.
+ ///
+ /// onlyMorePopular is unused.
+ ///
+ /// @lucene.experimental
+ /// </para>
+ /// </summary>
+ public class FreeTextSuggester : Lookup
+ {
+
+ /// <summary>
+ /// Codec name used in the header for the saved model. </summary>
+ public const string CODEC_NAME = "freetextsuggest";
+
+ /// <summary>
+ /// Initial version of the saved model file format. </summary>
+ public const int VERSION_START = 0;
+
+ /// <summary>
+ /// Current version of the saved model file format. </summary>
+ public const int VERSION_CURRENT = VERSION_START;
+
+ /// <summary>
+ /// By default we use a bigram model. </summary>
+ public const int DEFAULT_GRAMS = 2;
+
+ // In general this could vary with gram, but the
+ // original paper seems to use this constant:
+ /// <summary>
+ /// The constant used for backoff smoothing; during
+ /// lookup, this means that if a given trigram did not
+ /// occur, and we backoff to the bigram, the overall score
+ /// will be 0.4 times what the bigram model would have
+ /// assigned.
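+ /// For example, if the trigram "a b c" was never seen but
+ /// the bigram "b c" would have scored 0.5, the backed-off
+ /// score is 0.4 * 0.5 = 0.2; each further backoff (down to
+ /// the unigram "c") multiplies by 0.4 again.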
+ /// </summary>
+ public const double ALPHA = 0.4;
+
+ /// <summary>
+ /// Holds 1gram, 2gram, 3gram models as a single FST. </summary>
+ private FST<long?> fst;
+
+ /// <summary>
+ /// Analyzer that will be used for analyzing suggestions at
+ /// index time.
+ /// </summary>
+ private readonly Analyzer indexAnalyzer;
+
+ private long totTokens;
+
+ /// <summary>
+ /// Analyzer that will be used for analyzing suggestions at
+ /// query time.
+ /// </summary>
+ private readonly Analyzer queryAnalyzer;
+
+ // 2 = bigram, 3 = trigram
+ private readonly int grams;
+
+ private readonly sbyte separator;
+
+ /// <summary>
+ /// Number of entries the lookup was built with </summary>
+ private long count = 0;
+
+ /// <summary>
+ /// The default character used to join multiple tokens
+ /// into a single ngram token. The input tokens produced
+ /// by the analyzer must not contain this character.
+ /// </summary>
+ public const sbyte DEFAULT_SEPARATOR = 0x1e;
+
+ /// <summary>
+ /// Instantiate, using the provided analyzer for both
+ /// indexing and lookup, using bigram model by default.
+ /// </summary>
+ public FreeTextSuggester(Analyzer analyzer) : this(analyzer, analyzer, DEFAULT_GRAMS)
+ {
+ }
+
+ /// <summary>
+ /// Instantiate, using the provided indexing and lookup
+ /// analyzers, using bigram model by default.
+ /// </summary>
+ public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer) : this(indexAnalyzer, queryAnalyzer, DEFAULT_GRAMS)
+ {
+ }
+
+ /// <summary>
+ /// Instantiate, using the provided indexing and lookup
+ /// analyzers, with the specified model (2
+ /// = bigram, 3 = trigram, etc.).
+ /// </summary>
+ public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams) : this(indexAnalyzer, queryAnalyzer, grams, DEFAULT_SEPARATOR)
+ {
+ }
+
+ /// <summary>
+ /// Instantiate, using the provided indexing and lookup
+ /// analyzers, and specified model (2 = bigram, 3 =
+ /// trigram, etc.). The separator is passed to
+ /// <seealso cref="ShingleFilter.TokenSeparator"/> to join multiple
+ /// tokens into a single ngram token; it must be an ascii
+ /// (7-bit-clean) byte. No input tokens should have this
+ /// byte, otherwise {@code IllegalArgumentException} is
+ /// thrown.
+ /// </summary>
+ public FreeTextSuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int grams, sbyte separator)
+ {
+ this.grams = grams;
+ this.indexAnalyzer = AddShingles(indexAnalyzer);
+ this.queryAnalyzer = AddShingles(queryAnalyzer);
+ if (grams < 1)
+ {
+ throw new System.ArgumentException("grams must be >= 1");
+ }
+ if ((separator & 0x80) != 0)
+ {
+ throw new System.ArgumentException("separator must be simple ascii character");
+ }
+ this.separator = separator;
+ }
+
+ /// <summary>
+ /// Returns byte size of the underlying FST. </summary>
+ public override long SizeInBytes()
+ {
+ if (fst == null)
+ {
+ return 0;
+ }
+ return fst.SizeInBytes();
+ }
+
+ private class AnalyzingComparator : IComparer<BytesRef>
+ {
+
+ internal readonly ByteArrayDataInput readerA = new ByteArrayDataInput();
+ internal readonly ByteArrayDataInput readerB = new ByteArrayDataInput();
+ internal readonly BytesRef scratchA = new BytesRef();
+ internal readonly BytesRef scratchB = new BytesRef();
+
+ public virtual int Compare(BytesRef a, BytesRef b)
+ {
+ readerA.Reset(a.Bytes, a.Offset, a.Length);
+ readerB.Reset(b.Bytes, b.Offset, b.Length);
+
+ // By token:
+ scratchA.Length = readerA.ReadShort();
+ scratchA.Bytes = a.Bytes;
+ scratchA.Offset = readerA.Position;
+
+ scratchB.Bytes = b.Bytes;
+ scratchB.Length = readerB.ReadShort();
+ scratchB.Offset = readerB.Position;
+
+ int cmp = scratchA.CompareTo(scratchB);
+ if (cmp != 0)
+ {
+ return cmp;
+ }
+ readerA.SkipBytes(scratchA.Length);
+ readerB.SkipBytes(scratchB.Length);
+
+ // By length (smaller surface forms sorted first):
+ cmp = a.Length - b.Length;
+ if (cmp != 0)
+ {
+ return cmp;
+ }
+
+ // By surface form:
+ scratchA.Offset = readerA.Position;
+ scratchA.Length = a.Length - scratchA.Offset;
+ scratchB.Offset = readerB.Position;
+ scratchB.Length = b.Length - scratchB.Offset;
+
+ return scratchA.CompareTo(scratchB);
+ }
+ }
+
+ private Analyzer AddShingles(Analyzer other)
+ {
+ if (grams == 1)
+ {
+ return other;
+ }
+ else
+ {
+ // TODO: use ShingleAnalyzerWrapper?
+ // Tack on ShingleFilter to the end, to generate token ngrams:
+ return new AnalyzerWrapperAnonymousInnerClassHelper(this, other.ReuseStrategy, other);
+ }
+ }
+
+ private class AnalyzerWrapperAnonymousInnerClassHelper : AnalyzerWrapper
+ {
+ private readonly FreeTextSuggester outerInstance;
+ private readonly Analyzer other;
+
+ public AnalyzerWrapperAnonymousInnerClassHelper(FreeTextSuggester outerInstance, ReuseStrategy reuseStrategy, Analyzer other) : base(reuseStrategy)
+ {
+ this.outerInstance = outerInstance;
+ this.other = other;
+ }
+
+ protected override Analyzer GetWrappedAnalyzer(string fieldName)
+ {
+ return other;
+ }
+
+ protected override TokenStreamComponents WrapComponents(string fieldName, TokenStreamComponents components)
+ {
+ ShingleFilter shingles = new ShingleFilter(components.TokenStream, 2, outerInstance.grams);
+ shingles.TokenSeparator = char.ToString((char) outerInstance.separator);
+ return new TokenStreamComponents(components.Tokenizer, shingles);
+ }
+ }
+
+ public override void Build(InputIterator iterator)
+ {
+ Build(iterator, IndexWriterConfig.DEFAULT_RAM_BUFFER_SIZE_MB);
+ }
+
+ /// <summary>
+ /// Build the suggest index, using up to the specified
+ /// amount of temporary RAM while building. Note that
+ /// the weights for the suggestions are ignored.
+ /// </summary>
+ public virtual void Build(InputIterator iterator, double ramBufferSizeMB)
+ {
+ if (iterator.HasPayloads)
+ {
+ throw new System.ArgumentException("this suggester doesn't support payloads");
+ }
+ if (iterator.HasContexts)
+ {
+ throw new System.ArgumentException("this suggester doesn't support contexts");
+ }
+
+ string prefix = this.GetType().Name;
+ var directory = OfflineSorter.DefaultTempDir();
+ // TODO: messy ... java7 has Files.createTempDirectory
+ // ... but 4.x is java6; in .NET we roll our own with
+ // DirectoryInfo (assuming OfflineSorter.DefaultTempDir()
+ // returns one):
+ DirectoryInfo tempIndexPath = null;
+ Random random = new Random();
+ while (true)
+ {
+ tempIndexPath = new DirectoryInfo(Path.Combine(directory.FullName, prefix + ".index." + random.Next(int.MaxValue)));
+ if (!tempIndexPath.Exists)
+ {
+ tempIndexPath.Create();
+ break;
+ }
+ }
+
+ Directory dir = FSDirectory.Open(tempIndexPath);
+
+ IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_CURRENT, indexAnalyzer);
+ iwc.OpenMode = IndexWriterConfig.OpenMode.CREATE;
+ iwc.RAMBufferSizeMB = ramBufferSizeMB;
+ IndexWriter writer = new IndexWriter(dir, iwc);
+
+ FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
+ // TODO: if only we had IndexOptions.TERMS_ONLY...
+ ft.IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS;
+ ft.OmitNorms = true;
+ ft.Freeze();
+
+ Document doc = new Document();
+ Field field = new Field("body", "", ft);
+ doc.Add(field);
+
+ totTokens = 0;
+ IndexReader reader = null;
+
+ bool success = false;
+ count = 0;
+ try
+ {
+ while (true)
+ {
+ BytesRef surfaceForm = iterator.Next();
+ if (surfaceForm == null)
+ {
+ break;
+ }
+ field.StringValue = surfaceForm.Utf8ToString();
+ writer.AddDocument(doc);
+ count++;
+ }
+ reader = DirectoryReader.Open(writer, false);
+
+ Terms terms = MultiFields.GetTerms(reader, "body");
+ if (terms == null)
+ {
+ throw new System.ArgumentException("need at least one suggestion");
+ }
+
+ // Move all ngrams into an FST:
+ TermsEnum termsEnum = terms.Iterator(null);
+
+ Outputs<long?> outputs = PositiveIntOutputs.Singleton;
+ Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
+
+ IntsRef scratchInts = new IntsRef();
+ while (true)
+ {
+ BytesRef term = termsEnum.Next();
+ if (term == null)
+ {
+ break;
+ }
+ int ngramCount = CountGrams(term);
+ if (ngramCount > grams)
+ {
+ throw new System.ArgumentException("tokens must not contain separator byte; got token=" + term + " but gramCount=" + ngramCount + ", which is greater than expected max ngram size=" + grams);
+ }
+ if (ngramCount == 1)
+ {
+ totTokens += termsEnum.TotalTermFreq();
+ }
+
+ builder.Add(Util.ToIntsRef(term, scratchInts), EncodeWeight(termsEnum.TotalTermFreq()));
+ }
+
+ fst = builder.Finish();
+ if (fst == null)
+ {
+ throw new System.ArgumentException("need at least one suggestion");
+ }
+ //System.out.println("FST: " + fst.getNodeCount() + " nodes");
+
+ /*
+ PrintWriter pw = new PrintWriter("/x/tmp/out.dot");
+ Util.toDot(fst, pw, true, true);
+ pw.close();
+ */
+
+ success = true;
+ }
+ finally
+ {
+ try
+ {
+ if (success)
+ {
+ IOUtils.Close(writer, reader);
+ }
+ else
+ {
+ IOUtils.CloseWhileHandlingException(writer, reader);
+ }
+ }
+ finally
+ {
+ foreach (string file in dir.ListAll())
+ {
+ // System.IO.File.Delete throws on failure rather than
+ // returning false, so no explicit check is needed:
+ File.Delete(Path.Combine(tempIndexPath.FullName, file));
+ }
+
+ // DirectoryInfo.Delete likewise throws if the directory
+ // cannot be removed:
+ tempIndexPath.Delete();
+
+ dir.Dispose();
+ }
+ }
+ }
+
+ public override bool Store(DataOutput output)
+ {
+ CodecUtil.WriteHeader(output, CODEC_NAME, VERSION_CURRENT);
+ output.WriteVLong(count);
+ output.WriteByte(separator);
+ output.WriteVInt(grams);
+ output.WriteVLong(totTokens);
+ fst.Save(output);
+ return true;
+ }
+
+ public override bool Load(DataInput input)
+ {
+ CodecUtil.CheckHeader(input, CODEC_NAME, VERSION_START, VERSION_START);
+ count = input.ReadVLong();
+ sbyte separatorOrig = input.ReadByte();
+ if (separatorOrig != separator)
+ {
+ throw new InvalidOperationException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig);
+ }
+ int gramsOrig = input.ReadVInt();
+ if (gramsOrig != grams)
+ {
+ throw new InvalidOperationException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig);
+ }
+ totTokens = input.ReadVLong();
+
+ fst = new FST<long?>(input, PositiveIntOutputs.Singleton);
+
+ return true;
+ }
+
+ public override IList<LookupResult> Lookup(string key, bool onlyMorePopular, int num) // ignored
+ {
+ return Lookup(key, null, onlyMorePopular, num);
+ }
+
+ /// <summary>
+ /// Lookup, without any context. </summary>
+ public virtual IList<LookupResult> Lookup(string key, int num)
+ {
+ return Lookup(key, null, true, num);
+ }
+
+ public override IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num) // ignored
+ {
+ try
+ {
+ return Lookup(key, contexts, num);
+ }
+ catch (IOException ioe)
+ {
+ // bogus:
+ throw new Exception(ioe.Message, ioe); // .NET Exception has no single-Exception ctor
+ }
+ }
+
+ public override long Count
+ {
+ get
+ {
+ return count;
+ }
+ }
+
+ private int CountGrams(BytesRef token)
+ {
+ int count = 1;
+ for (int i = 0; i < token.Length; i++)
+ {
+ if (token.Bytes[token.Offset + i] == separator)
+ {
+ count++;
+ }
+ }
+
+ return count;
+ }
+
+ /// <summary>
+ /// Retrieve suggestions.
+ /// </summary>
+ public virtual IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, int num)
+ {
+ if (contexts != null)
+ {
+ throw new System.ArgumentException("this suggester doesn't support contexts");
+ }
+
+ TokenStream ts = queryAnalyzer.TokenStream("", key.ToString());
+ try
+ {
+ TermToBytesRefAttribute termBytesAtt = ts.AddAttribute<TermToBytesRefAttribute>();
+ OffsetAttribute offsetAtt = ts.AddAttribute<OffsetAttribute>();
+ PositionLengthAttribute posLenAtt = ts.AddAttribute<PositionLengthAttribute>();
+ PositionIncrementAttribute posIncAtt = ts.AddAttribute<PositionIncrementAttribute>();
+ ts.Reset();
+
+ var lastTokens = new BytesRef[grams];
+ //System.out.println("lookup: key='" + key + "'");
+
+ // Run full analysis, but save only the
+ // last 1gram, last 2gram, etc.:
+ BytesRef tokenBytes = termBytesAtt.BytesRef;
+ int maxEndOffset = -1;
+ bool sawRealToken = false;
+ while (ts.IncrementToken())
+ {
+ termBytesAtt.FillBytesRef();
+ sawRealToken |= tokenBytes.Length > 0;
+ // TODO: this is somewhat iffy; today, ShingleFilter
+ // sets posLen to the gram count; maybe we should make
+ // a separate dedicated att for this?
+ int gramCount = posLenAtt.PositionLength;
+
+ Debug.Assert(gramCount <= grams);
+
+ // Safety: make sure the recalculated count "agrees":
+ if (CountGrams(tokenBytes) != gramCount)
+ {
+ throw new System.ArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
+ }
+ maxEndOffset = Math.Max(maxEndOffset, offsetAtt.EndOffset());
+ lastTokens[gramCount - 1] = BytesRef.DeepCopyOf(tokenBytes);
+ }
+ ts.End();
+
+ if (!sawRealToken)
+ {
+ throw new System.ArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
+ }
+
+ // Carefully fill last tokens with _ tokens;
+ // ShingleFilter apparently won't emit "only hole"
+ // tokens:
+ int endPosInc = posIncAtt.PositionIncrement;
+
+ // Note this will also be true if input is the empty
+ // string (in which case we saw no tokens and
+ // maxEndOffset is still -1), which in fact works out OK
+ // because we fill the unigram with an empty BytesRef
+ // below:
+ bool lastTokenEnded = offsetAtt.EndOffset() > maxEndOffset || endPosInc > 0;
+ //System.out.println("maxEndOffset=" + maxEndOffset + " vs " + offsetAtt.endOffset());
+
+ if (lastTokenEnded)
+ {
+ //System.out.println(" lastTokenEnded");
+ // If user hit space after the last token, then
+ // "upgrade" all tokens. This way "foo " will suggest
+ // all bigrams starting w/ foo, and not any unigrams
+ // starting with "foo":
+ for (int i = grams - 1; i > 0; i--)
+ {
+ BytesRef token = lastTokens[i - 1];
+ if (token == null)
+ {
+ continue;
+ }
+ token.Grow(token.Length + 1);
+ token.Bytes[token.Length] = separator;
+ token.Length++;
+ lastTokens[i] = token;
+ }
+ lastTokens[0] = new BytesRef();
+ }
+
+ FST.Arc<long?> arc = new FST.Arc<long?>();
+
+ FST.BytesReader bytesReader = fst.BytesReader;
+
+ // Try highest order models first, and if they return
+ // results, return that; else, fallback:
+ double backoff = 1.0;
+
+ List<LookupResult> results = new List<LookupResult>(num); // List, so Sort/RemoveRange below work
+
+ // We only add a given suffix once, from the highest
+ // order model that saw it; for subsequent lower order
+ // models we skip it:
+ var seen = new HashSet<BytesRef>();
+
+ for (int gram = grams - 1; gram >= 0; gram--)
+ {
+ BytesRef token = lastTokens[gram];
+ // Don't make unigram predictions from empty string:
+ if (token == null || (token.Length == 0 && key.Length > 0))
+ {
+ // Input didn't have enough tokens:
+ //System.out.println(" gram=" + gram + ": skip: not enough input");
+ continue;
+ }
+
+ if (endPosInc > 0 && gram <= endPosInc)
+ {
+ // Skip hole-only predictions; in theory we
+ // shouldn't have to do this, but we'd need to fix
+ // ShingleFilter to produce only-hole tokens:
+ //System.out.println(" break: only holes now");
+ break;
+ }
+
+ //System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
+
+ // TODO: we could add fuzziness here
+ // match the prefix portion exactly
+ //Pair<Long,BytesRef> prefixOutput = null;
+ long? prefixOutput = null;
+ try
+ {
+ prefixOutput = LookupPrefix(fst, bytesReader, token, arc);
+ }
+ catch (IOException bogus)
+ {
+ throw new Exception(bogus.Message, bogus);
+ }
+ //System.out.println(" prefixOutput=" + prefixOutput);
+
+ if (prefixOutput == null)
+ {
+ // This model never saw this prefix, e.g. the
+ // trigram model never saw context "purple mushroom"
+ backoff *= ALPHA;
+ continue;
+ }
+
+ // TODO: we could do this division at build time, and
+ // bake it into the FST?
+
+ // Denominator for computing scores from current
+ // model's predictions:
+ long contextCount = totTokens;
+
+ BytesRef lastTokenFragment = null;
+
+ for (int i = token.Length - 1; i >= 0; i--)
+ {
+ if (token.Bytes[token.Offset + i] == separator)
+ {
+ BytesRef context = new BytesRef(token.Bytes, token.Offset, i);
+ long? output = Util.Get(fst, Util.ToIntsRef(context, new IntsRef()));
+ Debug.Assert(output != null);
+ contextCount = DecodeWeight(output);
+ lastTokenFragment = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
+ break;
+ }
+ }
+
+ BytesRef finalLastToken;
+
+ if (lastTokenFragment == null)
+ {
+ finalLastToken = BytesRef.DeepCopyOf(token);
+ }
+ else
+ {
+ finalLastToken = BytesRef.DeepCopyOf(lastTokenFragment);
+ }
+ Debug.Assert(finalLastToken.Offset == 0);
+
+ CharsRef spare = new CharsRef();
+
+ // complete top-N
+ Util.TopResults<long?> completions = null;
+ try
+ {
+
+ // Because we store multiple models in one FST
+ // (1gram, 2gram, 3gram), we must restrict the
+ // search so that it only considers the current
+ // model. For highest order model, this is not
+ // necessary since all completions in the FST
+ // must be from this model, but for lower order
+ // models we have to filter out the higher order
+ // ones:
+
+ // Must do num+seen.size() for queue depth because we may
+ // reject up to seen.size() paths in acceptResult():
+ Util.TopNSearcher<long?> searcher = new TopNSearcherAnonymousInnerClassHelper(this, fst, num, num + seen.Count, weightComparator, seen, finalLastToken);
+
+ // since this search is initialized with a single start node
+ // it is okay to start with an empty input path here
+ searcher.AddStartPaths(arc, prefixOutput, true, new IntsRef());
+
+ completions = searcher.Search();
+ Debug.Assert(completions.IsComplete);
+ }
+ catch (IOException bogus)
+ {
+ throw new Exception(bogus.Message, bogus);
+ }
+
+ int prefixLength = token.Length;
+
+ BytesRef suffix = new BytesRef(8);
+ //System.out.println(" " + completions.length + " completions");
+
+ foreach (Util.Result<long?> completion in completions)
+ {
+ token.Length = prefixLength;
+ // append suffix
+ Util.ToBytesRef(completion.Input, suffix);
+ token.Append(suffix);
+
+ //System.out.println(" completion " + token.utf8ToString());
+
+ // Skip this path if a higher-order model already
+ // saw/predicted its last token:
+ BytesRef lastToken = token;
+ for (int i = token.Length - 1; i >= 0; i--)
+ {
+ if (token.Bytes[token.Offset + i] == separator)
+ {
+ Debug.Assert(token.Length - i - 1 > 0);
+ lastToken = new BytesRef(token.Bytes, token.Offset + i + 1, token.Length - i - 1);
+ break;
+ }
+ }
+ if (seen.Contains(lastToken))
+ {
+ //System.out.println(" skip dup " + lastToken.utf8ToString());
+ goto nextCompletionContinue;
+ }
+ seen.Add(BytesRef.DeepCopyOf(lastToken));
+ spare.Grow(token.Length);
+ UnicodeUtil.UTF8toUTF16(token, spare);
+ LookupResult result = new LookupResult(spare.ToString(), (long)(long.MaxValue * backoff * ((double) DecodeWeight(completion.Output)) / contextCount));
+ results.Add(result);
+ Debug.Assert(results.Count == seen.Count);
+ //System.out.println(" add result=" + result);
+ nextCompletionContinue:;
+ }
+ backoff *= ALPHA;
+ }
+
+ results.Sort(new ComparatorAnonymousInnerClassHelper(this));
+
+ if (results.Count > num)
+ {
+ // .NET lists have no live subList view; RemoveRange trims in place:
+ results.RemoveRange(num, results.Count - num);
+ }
+
+ return results;
+ }
+ finally
+ {
+ IOUtils.CloseWhileHandlingException(ts);
+ }
+ }
+
+ private class TopNSearcherAnonymousInnerClassHelper : Util.TopNSearcher<long?>
+ {
+ private readonly FreeTextSuggester outerInstance;
+
+ private HashSet<BytesRef> seen;
+ private BytesRef finalLastToken;
+
+ public TopNSearcherAnonymousInnerClassHelper(FreeTextSuggester outerInstance, FST<long?> fst, int num, int size, IComparer<long?> weightComparator, HashSet<BytesRef> seen, BytesRef finalLastToken) : base(fst, num, size, weightComparator)
+ {
+ this.outerInstance = outerInstance;
+ this.seen = seen;
+ this.finalLastToken = finalLastToken;
+ scratchBytes = new BytesRef();
+ }
+
+
+ internal BytesRef scratchBytes;
+
+ protected internal override void AddIfCompetitive(Util.FSTPath<long?> path)
+ {
+ if (path.Arc.Label != outerInstance.separator)
+ {
+ //System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
+ base.AddIfCompetitive(path);
+ }
+ else
+ {
+ //System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
+ }
+ }
+
+ protected internal override bool AcceptResult(IntsRef input, long? output)
+ {
+ Util.ToBytesRef(input, scratchBytes);
+ finalLastToken.Grow(finalLastToken.Length + scratchBytes.Length);
+ int lenSav = finalLastToken.Length;
+ finalLastToken.Append(scratchBytes);
+ //System.out.println("  accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
+ bool ret = seen.Contains(finalLastToken) == false;
+
+ finalLastToken.Length = lenSav;
+ return ret;
+ }
+ }
+
+ private class ComparatorAnonymousInnerClassHelper : IComparer<Lookup.LookupResult>
+ {
+ private readonly FreeTextSuggester outerInstance;
+
+ public ComparatorAnonymousInnerClassHelper(FreeTextSuggester outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ public virtual int Compare(LookupResult a, LookupResult b)
+ {
+ if (a.Value > b.Value)
+ {
+ return -1;
+ }
+ else if (a.Value < b.Value)
+ {
+ return 1;
+ }
+ else
+ {
+ // Tie break by UTF16 sort order:
+ return ((string) a.Key).CompareTo((string) b.Key);
+ }
+ }
+ }
+
+ /// <summary>
+ /// weight -> cost </summary>
+ private long EncodeWeight(long ngramCount)
+ {
+ return long.MaxValue - ngramCount;
+ }
+
+ /// <summary>
+ /// cost -> weight </summary>
+ //private long decodeWeight(Pair<Long,BytesRef> output) {
+ private long DecodeWeight(long? output)
+ {
+ Debug.Assert(output != null);
+ return (int)(long.MaxValue - output.Value); // .Value: a nullable long cannot be cast directly
+ }
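+
+ // For example, EncodeWeight(3) == long.MaxValue - 3 and
+ // DecodeWeight(long.MaxValue - 3) == 3: the top-N search finds
+ // minimal outputs first, so smaller costs (higher term
+ // frequencies) surface as the top completions.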
+
+ // NOTE: copied from WFSTCompletionLookup & tweaked
+ private long? LookupPrefix(FST<long?> fst, FST.BytesReader bytesReader, BytesRef scratch, FST.Arc<long?> arc) //Bogus
+ {
+
+ long? output = fst.Outputs.NoOutput;
+
+ fst.GetFirstArc(arc);
+
+ sbyte[] bytes = scratch.Bytes;
+ int pos = scratch.Offset;
+ int end = pos + scratch.Length;
+ while (pos < end)
+ {
+ if (fst.FindTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null)
+ {
+ return null;
+ }
+ else
+ {
+ output = fst.Outputs.Add(output, arc.Output);
+ }
+ }
+
+ return output;
+ }
+
+ internal static readonly IComparer<long?> weightComparator = new ComparatorAnonymousInnerClassHelper2();
+
+ private class ComparatorAnonymousInnerClassHelper2 : IComparer<long?>
+ {
+ public ComparatorAnonymousInnerClassHelper2()
+ {
+ }
+
+ public virtual int Compare(long? left, long? right)
+ {
+ // Nullable<long> has no CompareTo; defer to the default comparer:
+ return Comparer<long?>.Default.Compare(left, right);
+ }
+ }
+
+ /// <summary>
+ /// Returns the weight associated with an input string,
+ /// or null if it does not exist.
+ /// </summary>
+ public virtual object Get(string key)
+ {
+ throw new System.NotSupportedException();
+ }
+ }
+}
\ No newline at end of file
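
A lookup sketch for the suggester above (hedged: StandardAnalyzer is a stand-in for the application's analyzer, `iterator` is any InputIterator over the corpus, and LookupResult is assumed to expose Key/Value; the score decode follows the class comment, value = long.MaxValue * score):

    var suggester = new FreeTextSuggester(new StandardAnalyzer(Version.LUCENE_CURRENT));
    suggester.Build(iterator);

    // With the default bigram model, "foo b" predicts completions of "b"
    // in the context of "foo"; a trailing space ("foo ") instead asks for
    // bigram continuations of "foo" itself:
    foreach (Lookup.LookupResult r in suggester.Lookup("foo b", 5))
    {
        double score = (double) r.Value / long.MaxValue;
        System.Console.WriteLine(r.Key + " => " + score);
    }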
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Analyzing/FuzzySuggester.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Analyzing/FuzzySuggester.cs b/src/Lucene.Net.Suggest/Suggest/Analyzing/FuzzySuggester.cs
new file mode 100644
index 0000000..df65851
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Analyzing/FuzzySuggester.cs
@@ -0,0 +1,271 @@
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Analysis;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Automaton;
+
+namespace Lucene.Net.Search.Suggest.Analyzing
+{
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Implements a fuzzy <seealso cref="AnalyzingSuggester"/>. The similarity measurement is
+ /// based on the Damerau-Levenshtein (optimal string alignment) algorithm, though
+ /// you can explicitly choose classic Levenshtein by passing <code>false</code>
+ /// for the <code>transpositions</code> parameter.
+ /// <para>
+ /// At most, this query will match terms up to
+ /// <seealso cref="LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE"/>
+ /// edits. Higher distances are not supported. Note that the
+ /// fuzzy distance is measured in "byte space" on the bytes
+ /// returned by the <seealso cref="TokenStream"/>'s
+ /// <seealso cref="TermToBytesRefAttribute"/>, usually UTF8. By default
+ /// the analyzed bytes must be at least 3 (<seealso cref="DEFAULT_MIN_FUZZY_LENGTH"/>)
+ /// bytes before any edits are considered. Furthermore, the first 1
+ /// (<seealso cref="DEFAULT_NON_FUZZY_PREFIX"/>) byte is not allowed to be
+ /// edited. We allow up to 1 (<seealso cref="DEFAULT_MAX_EDITS"/>) edit.
+ /// If the <code>unicodeAware</code> parameter in the constructor is set to true, maxEdits,
+ /// minFuzzyLength, transpositions and nonFuzzyPrefix are measured in Unicode code
+ /// points (actual letters) instead of bytes.
+ ///
+ /// </para>
+ /// <para>
+ /// NOTE: This suggester does not boost suggestions that
+ /// required no edits over suggestions that did require
+ /// edits. This is a known limitation.
+ ///
+ /// </para>
+ /// <para>
+ /// Note: complex query analyzers can have a significant impact on the lookup
+ /// performance. It's recommended to not use analyzers that drop or inject terms
+ /// like synonyms to keep the complexity of the prefix intersection low for good
+ /// lookup performance. At index time, complex analyzers can safely be used.
+ /// </para>
+ ///
+ /// @lucene.experimental
+ /// </summary>
+ public sealed class FuzzySuggester : AnalyzingSuggester
+ {
+ private readonly int maxEdits;
+ private readonly bool transpositions;
+ private readonly int nonFuzzyPrefix;
+ private readonly int minFuzzyLength;
+ private readonly bool unicodeAware;
+
+ /// <summary>
+ /// Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
+ /// parameters in Unicode code points (actual letters)
+ /// instead of bytes.
+ /// </summary>
+ public const bool DEFAULT_UNICODE_AWARE = false;
+
+ /// <summary>
+ /// The default minimum length of the key passed to
+ /// <seealso cref="Lookup"/> before any edits are allowed.
+ /// </summary>
+ public const int DEFAULT_MIN_FUZZY_LENGTH = 3;
+
+ /// <summary>
+ /// The default prefix length where edits are not allowed.
+ /// </summary>
+ public const int DEFAULT_NON_FUZZY_PREFIX = 1;
+
+ /// <summary>
+ /// The default maximum number of edits for fuzzy
+ /// suggestions.
+ /// </summary>
+ public const int DEFAULT_MAX_EDITS = 1;
+
+ /// <summary>
+ /// The default transposition value passed to <seealso cref="LevenshteinAutomata"/>
+ /// </summary>
+ public const bool DEFAULT_TRANSPOSITIONS = true;
+
+ /// <summary>
+ /// Creates a <seealso cref="FuzzySuggester"/> instance initialized with default values.
+ /// </summary>
+ /// <param name="analyzer"> the analyzer used for this suggester </param>
+ public FuzzySuggester(Analyzer analyzer)
+ : this(analyzer, analyzer)
+ {
+ }
+
+ /// <summary>
+ /// Creates a <seealso cref="FuzzySuggester"/> instance with an index & a query analyzer initialized with default values.
+ /// </summary>
+ /// <param name="indexAnalyzer">
+ /// Analyzer that will be used for analyzing suggestions while building the index. </param>
+ /// <param name="queryAnalyzer">
+ /// Analyzer that will be used for analyzing query text during lookup </param>
+ public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer)
+ : this(indexAnalyzer, queryAnalyzer, EXACT_FIRST | PRESERVE_SEP, 256, -1, true, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, DEFAULT_NON_FUZZY_PREFIX, DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE)
+ {
+ }
+
+ /// <summary>
+ /// Creates a <seealso cref="FuzzySuggester"/> instance.
+ /// </summary>
+ /// <param name="indexAnalyzer"> Analyzer that will be used for
+ /// analyzing suggestions while building the index. </param>
+ /// <param name="queryAnalyzer"> Analyzer that will be used for
+ /// analyzing query text during lookup </param>
+ /// <param name="options"> see <seealso cref="#EXACT_FIRST"/>, <seealso cref="#PRESERVE_SEP"/> </param>
+ /// <param name="maxSurfaceFormsPerAnalyzedForm"> Maximum number of
+ /// surface forms to keep for a single analyzed form.
+ /// When there are too many surface forms we discard the
+ /// lowest weighted ones. </param>
+ /// <param name="maxGraphExpansions"> Maximum number of graph paths
+ /// to expand from the analyzed form. Set this to -1 for
+ /// no limit. </param>
+ /// <param name="preservePositionIncrements"> Whether position holes should appear in the automaton </param>
+ /// <param name="maxEdits"> must be >= 0 and <= <seealso cref="LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE"/> . </param>
+ /// <param name="transpositions"> <code>true</code> if transpositions should be treated as a primitive
+ /// edit operation. If this is false, comparisons will implement the classic
+ /// Levenshtein algorithm. </param>
+ /// <param name="nonFuzzyPrefix"> length of common (non-fuzzy) prefix (see default <seealso cref="#DEFAULT_NON_FUZZY_PREFIX"/> </param>
+ /// <param name="minFuzzyLength"> minimum length of lookup key before any edits are allowed (see default <seealso cref="#DEFAULT_MIN_FUZZY_LENGTH"/>) </param>
+ /// <param name="unicodeAware"> operate Unicode code points instead of bytes. </param>
+ public FuzzySuggester(Analyzer indexAnalyzer, Analyzer queryAnalyzer, int options, int maxSurfaceFormsPerAnalyzedForm, int maxGraphExpansions, bool preservePositionIncrements, int maxEdits, bool transpositions, int nonFuzzyPrefix, int minFuzzyLength, bool unicodeAware)
+ : base(indexAnalyzer, queryAnalyzer, options, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions, preservePositionIncrements)
+ {
+ if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
+ {
+ throw new System.ArgumentException("maxEdits must be between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
+ }
+ if (nonFuzzyPrefix < 0)
+ {
+ throw new System.ArgumentException("nonFuzzyPrefix must not be >= 0 (got " + nonFuzzyPrefix + ")");
+ }
+ if (minFuzzyLength < 0)
+ {
+ throw new System.ArgumentException("minFuzzyLength must not be >= 0 (got " + minFuzzyLength + ")");
+ }
+
+ this.maxEdits = maxEdits;
+ this.transpositions = transpositions;
+ this.nonFuzzyPrefix = nonFuzzyPrefix;
+ this.minFuzzyLength = minFuzzyLength;
+ this.unicodeAware = unicodeAware;
+ }
+
+ protected internal override IList<FSTUtil.Path<Pair<long?, BytesRef>>> GetFullPrefixPaths(IList<FSTUtil.Path<Pair<long?, BytesRef>>> prefixPaths, Automaton lookupAutomaton, FST<Pair<long?, BytesRef>> fst)
+ {
+
+ // TODO: right now there's no penalty for fuzzy/edits,
+ // ie a completion whose prefix matched exactly what the
+ // user typed gets no boost over completions that
+ // required an edit, which get no boost over completions
+ // requiring two edits. I suspect a multiplicative
+ // factor is appropriate (eg, say a fuzzy match must be at
+ // least 2X better weight than the non-fuzzy match to
+ // "compete") ... in which case I think the wFST needs
+ // to be log weights or something ...
+
+ Automaton levA = ConvertAutomaton(ToLevenshteinAutomata(lookupAutomaton));
+ /*
+ Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
+ w.write(levA.toDot());
+ w.close();
+ System.out.println("Wrote LevA to out.dot");
+ */
+ return FSTUtil.IntersectPrefixPaths(levA, fst);
+ }
+
+ protected internal override Automaton ConvertAutomaton(Automaton a)
+ {
+ if (unicodeAware)
+ {
+ Automaton utf8automaton = (new UTF32ToUTF8()).Convert(a);
+ BasicOperations.Determinize(utf8automaton);
+ return utf8automaton;
+ }
+ else
+ {
+ return a;
+ }
+ }
+
+ internal override TokenStreamToAutomaton TokenStreamToAutomaton
+ {
+ get
+ {
+ var tsta = base.TokenStreamToAutomaton;
+ tsta.UnicodeArcs = unicodeAware;
+ return tsta;
+ }
+ }
+
+ internal Automaton ToLevenshteinAutomata(Automaton automaton)
+ {
+ var @ref = SpecialOperations.GetFiniteStrings(automaton, -1);
+ Automaton[] subs = new Automaton[@ref.Count];
+ int upto = 0;
+ foreach (IntsRef path in @ref)
+ {
+ if (path.Length <= nonFuzzyPrefix || path.Length < minFuzzyLength)
+ {
+ subs[upto] = BasicAutomata.MakeString(path.Ints, path.Offset, path.Length);
+ upto++;
+ }
+ else
+ {
+ Automaton prefix = BasicAutomata.MakeString(path.Ints, path.Offset, nonFuzzyPrefix);
+ int[] ints = new int[path.Length - nonFuzzyPrefix];
+ Array.Copy(path.Ints, path.Offset + nonFuzzyPrefix, ints, 0, ints.Length);
+ // TODO: maybe add alphaMin to LevenshteinAutomata,
+ // and pass 1 instead of 0? We probably don't want
+ // to allow the trailing dedup bytes to be
+ // edited... but then 0 byte is "in general" allowed
+ // on input (but not in UTF8).
+ LevenshteinAutomata lev = new LevenshteinAutomata(ints, unicodeAware ? 0x10FFFF /* Character.MAX_CODE_POINT */ : 255, transpositions);
+ Automaton levAutomaton = lev.ToAutomaton(maxEdits);
+ Automaton combined = BasicOperations.Concatenate(Arrays.AsList(prefix, levAutomaton));
+ combined.Deterministic = true; // its like the special case in concatenate itself, except we cloneExpanded already
+ subs[upto] = combined;
+ upto++;
+ }
+ }
+
+ if (subs.Length == 0)
+ {
+ // automaton is empty, there are no accepted paths through it
+ return BasicAutomata.MakeEmpty(); // matches nothing
+ }
+ else if (subs.Length == 1)
+ {
+ // no synonyms or anything: just a single path through the tokenstream
+ return subs[0];
+ }
+ else
+ {
+ // multiple paths: this is really scary! is it slow?
+ // maybe we should not do this and throw UOE?
+ Automaton a = BasicOperations.Union(Arrays.AsList(subs));
+ // TODO: we could call toLevenshteinAutomata() before det?
+ // this only happens if you have multiple paths anyway (e.g. synonyms)
+ BasicOperations.Determinize(a);
+
+ return a;
+ }
+ }
+ }
+
+}
\ No newline at end of file
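
A lookup sketch (same hedges as the FreeTextSuggester example above; "nerw" is one deletion away from "new", within DEFAULT_MAX_EDITS, it is at least DEFAULT_MIN_FUZZY_LENGTH bytes long, and its first DEFAULT_NON_FUZZY_PREFIX byte 'n' matches, so entries starting with "new" can still be suggested):

    var suggester = new FuzzySuggester(new StandardAnalyzer(Version.LUCENE_CURRENT));
    suggester.Build(iterator); // any InputIterator over (term, weight) pairs
    IList<Lookup.LookupResult> hits = suggester.Lookup("nerw", false, 3);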
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Analyzing/SuggestStopFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Analyzing/SuggestStopFilter.cs b/src/Lucene.Net.Suggest/Suggest/Analyzing/SuggestStopFilter.cs
new file mode 100644
index 0000000..3f2f72c
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Analyzing/SuggestStopFilter.cs
@@ -0,0 +1,138 @@
+using System.Diagnostics;
+using Lucene.Net.Analysis;
+using Lucene.Net.Analysis.Tokenattributes;
+
+namespace Lucene.Net.Search.Suggest.Analyzing
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Like <seealso cref="StopFilter"/> except it will not remove the
+ /// last token if that token was not followed by some token
+ /// separator. For example, a query 'find the' would
+ /// preserve the 'the' since it was not followed by a space or
+ /// punctuation or something, and mark it KEYWORD so future
+ /// stemmers won't touch it either, while a query like 'find
+ /// the popsicle' would remove 'the' as a stopword.
+ ///
+ /// <para>Normally you'd use the ordinary <seealso cref="StopFilter"/>
+ /// in your indexAnalyzer and then this class in your
+ /// queryAnalyzer, when using one of the analyzing suggesters.
+ /// </para>
+ /// </summary>
+
+ public sealed class SuggestStopFilter : TokenFilter
+ {
+
+ private readonly CharTermAttribute termAtt;
+ private readonly PositionIncrementAttribute posIncAtt;
+ private readonly KeywordAttribute keywordAtt;
+ private readonly OffsetAttribute offsetAtt;
+ private readonly CharArraySet stopWords;
+
+ private State endState;
+
+ /// <summary>
+ /// Sole constructor. </summary>
+ public SuggestStopFilter(TokenStream input, CharArraySet stopWords)
+ : base(input)
+ {
+ this.stopWords = stopWords;
+ // C# field initializers cannot call instance methods, so the
+ // attributes are obtained here instead:
+ termAtt = AddAttribute<CharTermAttribute>();
+ posIncAtt = AddAttribute<PositionIncrementAttribute>();
+ keywordAtt = AddAttribute<KeywordAttribute>();
+ offsetAtt = AddAttribute<OffsetAttribute>();
+ }
+
+ public override void Reset()
+ {
+ base.Reset();
+ endState = null;
+ }
+
+ public override void End()
+ {
+ if (endState == null)
+ {
+ base.End();
+ }
+ else
+ {
+ // NOTE: we already called End() from our IncrementToken() when
+ // the stream was complete, so we do not call
+ // base.End() here
+ RestoreState(endState);
+ }
+ }
+
+ public override bool IncrementToken()
+ {
+ if (endState != null)
+ {
+ return false;
+ }
+
+ if (!Input.IncrementToken())
+ {
+ return false;
+ }
+
+ int skippedPositions = 0;
+ while (true)
+ {
+ if (stopWords.Contains(termAtt.Buffer(), 0, termAtt.Length))
+ {
+ int posInc = posIncAtt.PositionIncrement;
+ int endOffset = offsetAtt.EndOffset();
+ // This token may be a stopword, if it's not end:
+ State sav = CaptureState();
+ if (Input.IncrementToken())
+ {
+ // It was a stopword; skip it
+ skippedPositions += posInc;
+ }
+ else
+ {
+ ClearAttributes();
+ Input.End();
+ endState = CaptureState();
+ int finalEndOffset = offsetAtt.EndOffset();
+ Debug.Assert(finalEndOffset >= endOffset);
+ if (finalEndOffset > endOffset)
+ {
+ // OK there was a token separator after the
+ // stopword, so it was a stopword
+ return false;
+ }
+ else
+ {
+ // No token separator after final token that
+ // looked like a stop-word; don't filter it:
+ RestoreState(sav);
+ posIncAtt.PositionIncrement = skippedPositions + posIncAtt.PositionIncrement;
+ keywordAtt.Keyword = true;
+ return true;
+ }
+ }
+ }
+ else
+ {
+ // Not a stopword; return the current token:
+ posIncAtt.PositionIncrement = skippedPositions + posIncAtt.PositionIncrement;
+ return true;
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
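
A wiring sketch (the analyzer class is hypothetical and WhitespaceTokenizer stands in for the application's tokenizer; the point is that this filter belongs on the query side, while the index side keeps the ordinary StopFilter):

    public sealed class SuggestQueryAnalyzer : Analyzer
    {
        private readonly CharArraySet stopWords;

        public SuggestQueryAnalyzer(CharArraySet stopWords)
        {
            this.stopWords = stopWords;
        }

        protected override TokenStreamComponents CreateComponents(string fieldName, System.IO.TextReader reader)
        {
            var tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
            // Keeps a trailing "the" in "find the" (marked KEYWORD), but
            // drops it in "find the popsicle":
            return new TokenStreamComponents(tokenizer, new SuggestStopFilter(tokenizer, stopWords));
        }
    }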
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/BufferedInputIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/BufferedInputIterator.cs b/src/Lucene.Net.Suggest/Suggest/BufferedInputIterator.cs
new file mode 100644
index 0000000..916c41c
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/BufferedInputIterator.cs
@@ -0,0 +1,139 @@
+using System.Collections.Generic;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// This wrapper buffers incoming elements.
+ /// @lucene.experimental
+ /// </summary>
+ public class BufferedInputIterator : InputIterator
+ {
+ // TODO keep this for now
+ /// <summary>
+ /// buffered term entries </summary>
+ protected internal BytesRefArray entries = new BytesRefArray(Counter.NewCounter());
+ /// <summary>
+ /// buffered payload entries </summary>
+ protected internal BytesRefArray payloads = new BytesRefArray(Counter.NewCounter());
+ /// <summary>
+ /// buffered context set entries </summary>
+ protected internal IList<HashSet<BytesRef>> contextSets = new List<HashSet<BytesRef>>();
+ /// <summary>
+ /// current buffer position </summary>
+ protected internal int curPos = -1;
+ /// <summary>
+ /// buffered weights, parallel with <seealso cref="#entries"/> </summary>
+ protected internal long[] freqs = new long[1];
+ private readonly BytesRef spare = new BytesRef();
+ private readonly BytesRef payloadSpare = new BytesRef();
+ private readonly bool hasPayloads;
+ private readonly IComparer<BytesRef> comp;
+
+ private readonly bool hasContexts;
+
+ /// <summary>
+ /// Creates a new iterator, buffering entries from the specified iterator </summary>
+ public BufferedInputIterator(InputIterator source)
+ {
+ BytesRef spare;
+ int freqIndex = 0;
+ hasPayloads = source.HasPayloads;
+ hasContexts = source.HasContexts;
+ while ((spare = source.Next()) != null)
+ {
+ entries.Append(spare);
+ if (hasPayloads)
+ {
+ payloads.Append(source.Payload);
+ }
+ if (hasContexts)
+ {
+ contextSets.Add(source.Contexts);
+ }
+ if (freqIndex >= freqs.Length)
+ {
+ freqs = ArrayUtil.Grow(freqs, freqs.Length + 1);
+ }
+ freqs[freqIndex++] = source.Weight;
+ }
+ comp = source.Comparator;
+ }
+
+ public virtual long Weight
+ {
+ get { return freqs[curPos]; }
+ }
+
+ public BytesRef Next()
+ {
+ if (++curPos < entries.Size())
+ {
+ entries.Get(spare, curPos);
+ return spare;
+ }
+ return null;
+ }
+
+ public virtual BytesRef Payload
+ {
+ get
+ {
+ if (hasPayloads && curPos < payloads.Size())
+ {
+ return payloads.Get(payloadSpare, curPos);
+ }
+ return null;
+ }
+ }
+
+ public virtual bool HasPayloads
+ {
+ get { return hasPayloads; }
+ }
+
+ public IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return comp;
+ }
+ }
+
+ public virtual HashSet<BytesRef> Contexts
+ {
+ get
+ {
+ if (hasContexts && curPos < contextSets.Count)
+ {
+ return contextSets[curPos];
+ }
+ return null;
+ }
+ }
+
+ public virtual bool HasContexts
+ {
+ get { return hasContexts; }
+ }
+ }
+
+}
\ No newline at end of file
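
A minimal sketch: the constructor above drains the source eagerly, after which the buffered copy replays terms, weights, payloads and contexts in the original order:

    InputIterator buffered = new BufferedInputIterator(source);
    for (BytesRef term = buffered.Next(); term != null; term = buffered.Next())
    {
        long weight = buffered.Weight; // parallel to the current term
    }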
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/BufferingTermFreqIteratorWrapper.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/BufferingTermFreqIteratorWrapper.cs b/src/Lucene.Net.Suggest/Suggest/BufferingTermFreqIteratorWrapper.cs
new file mode 100644
index 0000000..2121a35
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/BufferingTermFreqIteratorWrapper.cs
@@ -0,0 +1,89 @@
+using System.Collections.Generic;
+using Lucene.Net.Search.Spell;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// This wrapper buffers incoming elements.
+ /// @lucene.experimental
+ /// </summary>
+ public class BufferingTermFreqIteratorWrapper : TermFreqIterator
+ {
+ // TODO keep this for now
+ /// <summary>
+ /// buffered term entries </summary>
+ protected internal BytesRefArray entries = new BytesRefArray(Counter.NewCounter());
+ /// <summary>
+ /// current buffer position </summary>
+ protected internal int curPos = -1;
+ /// <summary>
+ /// buffered weights, parallel with <seealso cref="#entries"/> </summary>
+ protected internal long[] freqs = new long[1];
+ private readonly BytesRef spare = new BytesRef();
+ private readonly IComparer<BytesRef> comp;
+
+ /// <summary>
+ /// Creates a new iterator, buffering entries from the specified iterator
+ /// </summary>
+ public BufferingTermFreqIteratorWrapper(TermFreqIterator source)
+ {
+ this.comp = source.Comparator;
+ BytesRef spare;
+ int freqIndex = 0;
+ while ((spare = source.Next()) != null)
+ {
+ entries.Append(spare);
+ if (freqIndex >= freqs.Length)
+ {
+ freqs = ArrayUtil.Grow(freqs, freqs.Length + 1);
+ }
+ freqs[freqIndex++] = source.Weight;
+ }
+
+ }
+
+ public virtual long Weight
+ {
+ get { return freqs[curPos]; }
+ }
+
+ public BytesRef Next()
+ {
+ if (++curPos < entries.Size())
+ {
+ entries.Get(spare, curPos);
+ return spare;
+ }
+ return null;
+ }
+
+ public IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return comp;
+ }
+ }
+
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/DocumentDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/DocumentDictionary.cs b/src/Lucene.Net.Suggest/Suggest/DocumentDictionary.cs
new file mode 100644
index 0000000..47cd026
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/DocumentDictionary.cs
@@ -0,0 +1,278 @@
+using System.Collections.Generic;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+using Lucene.Net.Search.Spell;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// <para>
+ /// Dictionary with terms, weights, payload (optional) and contexts (optional)
+ /// information taken from stored/indexed fields in a Lucene index.
+ /// </para>
+ /// <b>NOTE:</b>
+ /// <ul>
+ /// <li>
+ /// The term and (optionally) payload fields have to be
+ /// stored
+ /// </li>
+ /// <li>
+ /// The weight field can be stored or can be a <seealso cref="NumericDocValues"/>.
+ /// If the weight field is not defined, the value of the weight is <code>0</code>
+ /// </li>
+ /// <li>
+ /// if any of the term or (optionally) payload fields supplied
+ /// do not have a value for a document, then the document is
+ /// skipped by the dictionary
+ /// </li>
+ /// </ul>
+ /// </summary>
+ public class DocumentDictionary : Dictionary
+ {
+
+ /// <summary>
+ /// <seealso cref="IndexReader"/> to load documents from </summary>
+ protected internal readonly IndexReader reader;
+
+ /// <summary>
+ /// Field to read payload from </summary>
+ protected internal readonly string payloadField;
+ /// <summary>
+ /// Field to read contexts from </summary>
+ protected internal readonly string contextsField;
+ private readonly string field;
+ private readonly string weightField;
+
+ /// <summary>
+ /// Creates a new dictionary with the contents of the fields named <code>field</code>
+ /// for the terms and <code>weightField</code> for the weights that will be used for
+ /// the corresponding terms.
+ /// </summary>
+ public DocumentDictionary(IndexReader reader, string field, string weightField)
+ : this(reader, field, weightField, null)
+ {
+ }
+
+ /// <summary>
+ /// Creates a new dictionary with the contents of the fields named <code>field</code>
+ /// for the terms, <code>weightField</code> for the weights that will be used for
+ /// the corresponding terms and <code>payloadField</code> for the corresponding payloads
+ /// for the entry.
+ /// </summary>
+ public DocumentDictionary(IndexReader reader, string field, string weightField, string payloadField)
+ : this(reader, field, weightField, payloadField, null)
+ {
+ }
+
+ /// <summary>
+ /// Creates a new dictionary with the contents of the fields named <code>field</code>
+ /// for the terms, <code>weightField</code> for the weights that will be used for
+ /// the corresponding terms, <code>payloadField</code> for the corresponding payloads
+ /// for the entry and <code>contextsField</code> for the associated contexts.
+ /// </summary>
+ public DocumentDictionary(IndexReader reader, string field, string weightField, string payloadField, string contextsField)
+ {
+ this.reader = reader;
+ this.field = field;
+ this.weightField = weightField;
+ this.payloadField = payloadField;
+ this.contextsField = contextsField;
+ }
+
+ public virtual InputIterator EntryIterator
+ {
+ get
+ {
+ return new DocumentInputIterator(this, payloadField != null, contextsField != null);
+ }
+ }
+
+ /// <summary>
+ /// Implements <seealso cref="InputIterator"/> from stored fields. </summary>
+ protected internal class DocumentInputIterator : InputIterator
+ {
+ private readonly DocumentDictionary outerInstance;
+
+ internal readonly int docCount;
+ internal readonly HashSet<string> relevantFields;
+ internal readonly bool hasPayloads;
+ internal readonly bool hasContexts;
+ internal readonly Bits liveDocs;
+ internal int currentDocId = -1;
+ internal long currentWeight;
+ internal BytesRef currentPayload;
+ internal HashSet<BytesRef> currentContexts;
+ internal readonly NumericDocValues weightValues;
+
+ /// <summary>
+ /// Creates an iterator over term, weight and payload fields from the Lucene
+ /// index. Setting <code>hasPayloads</code> to false implies an iterator
+ /// over only term and weight.
+ /// </summary>
+ public DocumentInputIterator(DocumentDictionary outerInstance, bool hasPayloads, bool hasContexts)
+ {
+ this.outerInstance = outerInstance;
+ this.hasPayloads = hasPayloads;
+ this.hasContexts = hasContexts;
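+ // largest valid document id (MaxDoc() - 1); Next() iterates up to and including it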
+ docCount = outerInstance.reader.MaxDoc() - 1;
+ weightValues = (outerInstance.weightField != null) ? MultiDocValues.GetNumericValues(outerInstance.reader, outerInstance.weightField) : null;
+ liveDocs = (outerInstance.reader.Leaves().Count > 0) ? MultiFields.GetLiveDocs(outerInstance.reader) : null;
+ relevantFields = GetRelevantFields(new string[] { outerInstance.field, outerInstance.weightField, outerInstance.payloadField, outerInstance.contextsField });
+ }
+
+ public virtual long Weight
+ {
+ get { return currentWeight; }
+ }
+
+ public IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return null;
+ }
+ }
+
+ public BytesRef Next()
+ {
+ while (currentDocId < docCount)
+ {
+ currentDocId++;
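+ // skip documents that have been deleted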
+ if (liveDocs != null && !liveDocs.Get(currentDocId))
+ {
+ continue;
+ }
+
+ Document doc = outerInstance.reader.Document(currentDocId, relevantFields);
+
+ BytesRef tempPayload = null;
+ BytesRef tempTerm = null;
+ HashSet<BytesRef> tempContexts = new HashSet<BytesRef>();
+
+ if (hasPayloads)
+ {
+ IndexableField payload = doc.GetField(outerInstance.payloadField);
+ if (payload == null || (payload.BinaryValue() == null && payload.StringValue == null))
+ {
+ continue;
+ }
+ tempPayload = payload.BinaryValue() ?? new BytesRef(payload.StringValue);
+ }
+
+ if (hasContexts)
+ {
+ IndexableField[] contextFields = doc.GetFields(outerInstance.contextsField);
+ foreach (IndexableField contextField in contextFields)
+ {
+ if (contextField.BinaryValue() == null && contextField.StringValue == null)
+ {
+ continue;
+ }
+ else
+ {
+ tempContexts.Add(contextField.BinaryValue() ?? new BytesRef(contextField.StringValue));
+ }
+ }
+ }
+
+ IndexableField fieldVal = doc.GetField(outerInstance.field);
+ if (fieldVal == null || (fieldVal.BinaryValue() == null && fieldVal.StringValue == null))
+ {
+ continue;
+ }
+ tempTerm = (fieldVal.StringValue != null) ? new BytesRef(fieldVal.StringValue) : fieldVal.BinaryValue();
+
+ currentPayload = tempPayload;
+ currentContexts = tempContexts;
+ currentWeight = GetWeight(doc, currentDocId);
+
+ return tempTerm;
+ }
+ return null;
+ }
+
+ public virtual BytesRef Payload
+ {
+ get { return currentPayload; }
+ }
+
+ public virtual bool HasPayloads
+ {
+ get { return hasPayloads; }
+ }
+
+ /// <summary>
+ /// Returns the value of the <code>weightField</code> for the current document.
+ /// Retrieves the value for the <code>weightField</code> if it is stored (using <code>doc</code>)
+ /// or if it is indexed as <seealso cref="NumericDocValues"/> (using <code>docId</code>) for the document.
+ /// If no value is found, then the weight is 0.
+ /// </summary>
+ protected internal virtual long GetWeight(Document doc, int docId)
+ {
+ IndexableField weight = doc.GetField(outerInstance.weightField);
+ if (weight != null)
+ {
+ // found the weight as a stored field
+ return (weight.NumericValue != null) ? (long)weight.NumericValue : 0;
+ }
+ else if (weightValues != null)
+ {
+ // found the weight as a NumericDocValues entry
+ return weightValues.Get(docId);
+ }
+ else
+ {
+ // no weight source available; fall back to 0
+ return 0;
+ }
+ }
+
+ internal HashSet<string> GetRelevantFields(params string[] fields)
+ {
+ var relevantFields = new HashSet<string>();
+ foreach (string relevantField in fields)
+ {
+ if (relevantField != null)
+ {
+ relevantFields.Add(relevantField);
+ }
+ }
+ return relevantFields;
+ }
+
+ public virtual HashSet<BytesRef> Contexts
+ {
+ get
+ {
+ if (hasContexts)
+ {
+ return currentContexts;
+ }
+ return null;
+ }
+ }
+
+ public virtual bool HasContexts
+ {
+ get { return hasContexts; }
+ }
+ }
+ }
+}
\ No newline at end of file
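
A minimal usage sketch for DocumentDictionary. The field names "title", "weight" and "payload" are hypothetical, as is the already-opened 'reader'; the iterator contract mirrors the Java original.

    // Iterate term/weight/payload tuples taken from stored fields.
    var dict = new DocumentDictionary(reader, "title", "weight", "payload");
    InputIterator it = dict.EntryIterator;
    BytesRef term;
    while ((term = it.Next()) != null)
    {
        long weight = it.Weight;       // 0 when no weight source is present
        BytesRef payload = it.Payload; // null when payloads were not requested
    }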
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/DocumentValueSourceDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/DocumentValueSourceDictionary.cs b/src/Lucene.Net.Suggest/Suggest/DocumentValueSourceDictionary.cs
new file mode 100644
index 0000000..5793c44
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/DocumentValueSourceDictionary.cs
@@ -0,0 +1,169 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Documents;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Search.Suggest
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// <para>
+ /// Dictionary with terms and optionally payload information
+ /// taken from stored fields in a Lucene index. Similar to
+ /// <seealso cref="DocumentDictionary"/>, except it obtains the weight
+ /// of the terms in a document based on a <seealso cref="ValueSource"/>.
+ /// </para>
+ /// <b>NOTE:</b>
+ /// <ul>
+ /// <li>
+ /// The term and (optionally) payload fields have to be
+ /// stored
+ /// </li>
+ /// <li>
+ /// if the term or (optionally) payload fields supplied
+ /// do not have a value for a document, then the document is
+ /// rejected by the dictionary
+ /// </li>
+ /// </ul>
+ /// <para>
+ /// In practice the <seealso cref="ValueSource"/> will likely be obtained
+ /// using the Lucene expressions module. The following example shows
+ /// how to create a <seealso cref="ValueSource"/> from a simple addition of two
+ /// fields:
+ /// <code>
+ /// Expression expression = JavascriptCompiler.Compile("f1 + f2");
+ /// SimpleBindings bindings = new SimpleBindings();
+ /// bindings.Add(new SortField("f1", SortField.Type.LONG));
+ /// bindings.Add(new SortField("f2", SortField.Type.LONG));
+ /// ValueSource valueSource = expression.GetValueSource(bindings);
+ /// </code>
+ /// </para>
+ ///
+ /// </summary>
+ public class DocumentValueSourceDictionary : DocumentDictionary
+ {
+
+ private readonly ValueSource weightsValueSource;
+
+ /// <summary>
+ /// Creates a new dictionary with the contents of the fields named <code>field</code>
+ /// for the terms, <code>payload</code> for the corresponding payloads, <code>contexts</code>
+ /// for the associated contexts and uses the <code>weightsValueSource</code> supplied
+ /// to determine the score.
+ /// </summary>
+ public DocumentValueSourceDictionary(IndexReader reader, string field, ValueSource weightsValueSource, string payload, string contexts)
+ : base(reader, field, null, payload, contexts)
+ {
+ this.weightsValueSource = weightsValueSource;
+ }
+
+ /// <summary>
+ /// Creates a new dictionary with the contents of the fields named <code>field</code>
+ /// for the terms, <code>payloadField</code> for the corresponding payloads
+ /// and uses the <code>weightsValueSource</code> supplied to determine the
+ /// score.
+ /// </summary>
+ public DocumentValueSourceDictionary(IndexReader reader, string field, ValueSource weightsValueSource, string payload)
+ : base(reader, field, null, payload)
+ {
+ this.weightsValueSource = weightsValueSource;
+ }
+
+ /// <summary>
+ /// Creates a new dictionary with the contents of the fields named <code>field</code>
+ /// for the terms and uses the <code>weightsValueSource</code> supplied to determine the
+ /// score.
+ /// </summary>
+ public DocumentValueSourceDictionary(IndexReader reader, string field, ValueSource weightsValueSource)
+ : base(reader, field, null, null)
+ {
+ this.weightsValueSource = weightsValueSource;
+ }
+
+ public override InputIterator EntryIterator
+ {
+ get
+ {
+ return new DocumentValueSourceInputIterator(this, payloadField != null, contextsField != null);
+ }
+ }
+
+ internal sealed class DocumentValueSourceInputIterator : DocumentDictionary.DocumentInputIterator
+ {
+ private readonly DocumentValueSourceDictionary outerInstance;
+
+ internal FunctionValues currentWeightValues;
+ /// <summary>
+ /// leaves of the reader </summary>
+ internal readonly IList<AtomicReaderContext> leaves;
+ /// <summary>
+ /// starting docIds of all the leaves </summary>
+ internal readonly int[] starts;
+ /// <summary>
+ /// current leaf index </summary>
+ internal int currentLeafIndex = 0;
+
+ public DocumentValueSourceInputIterator(DocumentValueSourceDictionary outerInstance, bool hasPayloads, bool hasContexts)
+ : base(outerInstance, hasPayloads, hasContexts)
+ {
+ this.outerInstance = outerInstance;
+ leaves = outerInstance.reader.Leaves();
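+ // docBase of every leaf, plus a MaxDoc() sentinel, so the leaf owning a docId can be located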
+ starts = new int[leaves.Count + 1];
+ for (int i = 0; i < leaves.Count; i++)
+ {
+ starts[i] = leaves[i].DocBase;
+ }
+ starts[leaves.Count] = outerInstance.reader.MaxDoc();
+ currentWeightValues = (leaves.Count > 0) ? outerInstance.weightsValueSource.GetValues(new Dictionary<string, object>(), leaves[currentLeafIndex]) : null;
+ }
+
+ /// <summary>
+ /// Returns the weight for the current <code>docId</code> as computed
+ /// by the <code>weightsValueSource</code>.
+ /// </summary>
+ protected internal override long GetWeight(Document doc, int docId)
+ {
+ if (currentWeightValues == null)
+ {
+ return 0;
+ }
+ int subIndex = ReaderUtil.SubIndex(docId, starts);
+ if (subIndex != currentLeafIndex)
+ {
+ currentLeafIndex = subIndex;
+ try
+ {
+ currentWeightValues = outerInstance.weightsValueSource.GetValues(new Dictionary<string, object>(), leaves[currentLeafIndex]);
+ }
+ catch (IOException e)
+ {
+ // mirror the Java original's RuntimeException(e), preserving the cause
+ throw new Exception(e.Message, e);
+ }
+ }
+ return currentWeightValues.LongVal(docId - starts[subIndex]);
+ }
+
+ }
+ }
+
+}
\ No newline at end of file
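
Finally, a sketch tying the expression example from the class comment to the dictionary itself. The 'reader' and field names are hypothetical, and the exact .NET casing of the expressions APIs may differ once that module is ported.

    // Weight each suggestion by the compiled expression f1 + f2.
    Expression expression = JavascriptCompiler.Compile("f1 + f2");
    SimpleBindings bindings = new SimpleBindings();
    bindings.Add(new SortField("f1", SortField.Type.LONG));
    bindings.Add(new SortField("f2", SortField.Type.LONG));
    ValueSource valueSource = expression.GetValueSource(bindings);
    var dict = new DocumentValueSourceDictionary(reader, "title", valueSource, "payload");
    InputIterator it = dict.EntryIterator; // consume as with DocumentDictionary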