You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/09/16 00:24:50 UTC
[3/8] Porting Lucene.Net.Suggest (still not compiling)
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/FileDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/FileDictionary.cs b/src/Lucene.Net.Suggest/Suggest/FileDictionary.cs
new file mode 100644
index 0000000..5ef853b
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/FileDictionary.cs
@@ -0,0 +1,284 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Search.Spell;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Dictionary represented by a text file.
+ ///
+ /// <p/>Format allowed: 1 entry per line:<br/>
+ /// An entry can be: <br/>
+ /// <ul>
+ /// <li>suggestion</li>
+ /// <li>suggestion <code>fieldDelimiter</code> weight</li>
+ /// <li>suggestion <code>fieldDelimiter</code> weight <code>fieldDelimiter</code> payload</li>
+ /// </ul>
+ /// where the default <code>fieldDelimiter</code> is {@value #DEFAULT_FIELD_DELIMITER}<br/>
+ /// <p/>
+ /// <b>NOTE:</b>
+ /// <ul>
+ /// <li>In order to have payload enabled, the first entry has to have a payload</li>
+ /// <li>If the weight for an entry is not specified then a value of 1 is used</li>
+ /// <li>A payload cannot be specified without having the weight specified for an entry</li>
+ /// <li>If the payload for an entry is not specified (assuming payload is enabled)
+ /// then an empty payload is returned</li>
+ /// <li>An entry cannot have more than two <code>fieldDelimiter</code></li>
+ /// </ul>
+ /// <p/>
+ /// <b>Example:</b><br/>
+ /// word1 word2 TAB 100 TAB payload1<br/>
+ /// word3 TAB 101<br/>
+ /// word4 word3 TAB 102<br/>
+ /// </summary>
+ public class FileDictionary : Dictionary
+ {
+
+ /// <summary>
+ /// Tab-delimited fields are most common thus the default, but one can override this via the constructor
+ /// </summary>
+ public const string DEFAULT_FIELD_DELIMITER = "\t";
+ private BufferedReader @in;
+ private string line;
+ private bool done = false;
+ private readonly string fieldDelimiter;
+
+ /// <summary>
+ /// Creates a dictionary based on an inputstream.
+ /// Using <seealso cref="#DEFAULT_FIELD_DELIMITER"/> as the
+ /// field seperator in a line.
+ /// <para>
+ /// NOTE: content is treated as UTF-8
+ /// </para>
+ /// </summary>
+ public FileDictionary(InputStream dictFile)
+ : this(dictFile, DEFAULT_FIELD_DELIMITER)
+ {
+ }
+
+ /// <summary>
+ /// Creates a dictionary based on a reader.
+ /// Using <seealso cref="#DEFAULT_FIELD_DELIMITER"/> as the
+ /// field seperator in a line.
+ /// </summary>
+ public FileDictionary(Reader reader)
+ : this(reader, DEFAULT_FIELD_DELIMITER)
+ {
+ }
+
+ /// <summary>
+ /// Creates a dictionary based on a reader.
+ /// Using <code>fieldDelimiter</code> to seperate out the
+ /// fields in a line.
+ /// </summary>
+ public FileDictionary(Reader reader, string fieldDelimiter)
+ {
+ @in = new BufferedReader(reader);
+ this.fieldDelimiter = fieldDelimiter;
+ }
+
+ /// <summary>
+ /// Creates a dictionary based on an inputstream.
+ /// Using <code>fieldDelimiter</code> to seperate out the
+ /// fields in a line.
+ /// <para>
+ /// NOTE: content is treated as UTF-8
+ /// </para>
+ /// </summary>
+ public FileDictionary(InputStream dictFile, string fieldDelimiter)
+ {
+ @in = new BufferedReader(IOUtils.GetDecodingReader(dictFile, StandardCharsets.UTF_8));
+ this.fieldDelimiter = fieldDelimiter;
+ }
+
+ public virtual InputIterator EntryIterator
+ {
+ get
+ {
+ try
+ {
+ return new FileIterator(this);
+ }
+ catch (IOException)
+ {
+ throw new Exception();
+ }
+ }
+ }
+
+ internal sealed class FileIterator : InputIterator
+ {
+ private readonly FileDictionary outerInstance;
+
+ internal long curWeight;
+ internal readonly BytesRef spare = new BytesRef();
+ internal BytesRef curPayload = new BytesRef();
+ internal bool isFirstLine = true;
+ internal bool hasPayloads = false;
+
+ internal FileIterator(FileDictionary outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ outerInstance.line = outerInstance.@in.readLine();
+ if (outerInstance.line == null)
+ {
+ outerInstance.done = true;
+ IOUtils.Close(outerInstance.@in);
+ }
+ else
+ {
+ string[] fields = outerInstance.line.Split(outerInstance.fieldDelimiter, true);
+ if (fields.Length > 3)
+ {
+ throw new System.ArgumentException("More than 3 fields in one line");
+ } // term, weight, payload
+ else if (fields.Length == 3)
+ {
+ hasPayloads = true;
+ spare.CopyChars(fields[0]);
+ ReadWeight(fields[1]);
+ curPayload.CopyChars(fields[2]);
+ } // term, weight
+ else if (fields.Length == 2)
+ {
+ spare.CopyChars(fields[0]);
+ ReadWeight(fields[1]);
+ } // only term
+ else
+ {
+ spare.CopyChars(fields[0]);
+ curWeight = 1;
+ }
+ }
+ }
+
+ public long Weight
+ {
+ get { return curWeight; }
+ }
+
+ public BytesRef Next()
+ {
+ if (outerInstance.done)
+ {
+ return null;
+ }
+ if (isFirstLine)
+ {
+ isFirstLine = false;
+ return spare;
+ }
+ outerInstance.line = outerInstance.@in.ReadLine();
+ if (outerInstance.line != null)
+ {
+ string[] fields = outerInstance.line.Split(outerInstance.fieldDelimiter, true);
+ if (fields.Length > 3)
+ {
+ throw new System.ArgumentException("More than 3 fields in one line");
+ } // term, weight and payload
+ else if (fields.Length == 3)
+ {
+ spare.CopyChars(fields[0]);
+ ReadWeight(fields[1]);
+ if (hasPayloads)
+ {
+ curPayload.CopyChars(fields[2]);
+ }
+ } // term, weight
+ else if (fields.Length == 2)
+ {
+ spare.CopyChars(fields[0]);
+ ReadWeight(fields[1]);
+ if (hasPayloads) // have an empty payload
+ {
+ curPayload = new BytesRef();
+ }
+ } // only term
+ else
+ {
+ spare.CopyChars(fields[0]);
+ curWeight = 1;
+ if (hasPayloads)
+ {
+ curPayload = new BytesRef();
+ }
+ }
+ return spare;
+ }
+ else
+ {
+ outerInstance.done = true;
+ IOUtils.Close(outerInstance.@in);
+ return null;
+ }
+ }
+
+ public IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return null;
+ }
+ }
+
+ public BytesRef Payload
+ {
+ get
+ {
+ {
+ return (hasPayloads) ? curPayload : null;
+ }
+ }
+ }
+
+ public bool HasPayloads
+ {
+ get { return hasPayloads; }
+ }
+
+ internal void ReadWeight(string weight)
+ {
+ // keep reading floats for bw compat
+ try
+ {
+ curWeight = Convert.ToInt64(weight);
+ }
+ catch (FormatException)
+ {
+ curWeight = (long)Convert.ToDouble(weight);
+ }
+ }
+
+ public HashSet<BytesRef> Contexts
+ {
+ get { return null; }
+ }
+
+ public bool HasContexts
+ {
+ get { return false; }
+ }
+ }
+
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Fst/BytesRefSorter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/BytesRefSorter.cs b/src/Lucene.Net.Suggest/Suggest/Fst/BytesRefSorter.cs
new file mode 100644
index 0000000..ef4ef54
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Fst/BytesRefSorter.cs
@@ -0,0 +1,34 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest.Fst
+{
+ /// <summary>
+ /// Collects <seealso cref="BytesRef"/> and then allows one to iterate over their sorted order. Implementations
+ /// of this interface will be called in a single-threaded scenario.
+ /// </summary>
+ public interface BytesRefSorter
+ {
+ /// <summary>
+ /// Adds a single suggestion entry (possibly compound with its bucket).
+ /// </summary>
+ /// <exception cref="IOException"> If an I/O exception occurs. </exception>
+ /// <exception cref="InvalidOperationException"> If an addition attempt is performed after
+ /// a call to <seealso cref="#iterator()"/> has been made. </exception>
+ void Add(BytesRef utf8);
+
+ /// <summary>
+ /// Sorts the entries added in <seealso cref="#add(BytesRef)"/> and returns
+ /// an iterator over all sorted entries.
+ /// </summary>
+ /// <exception cref="IOException"> If an I/O exception occurs. </exception>
+ BytesRefIterator Iterator();
+
+ /// <summary>
+ /// Comparator used to determine the sort order of entries.
+ /// </summary>
+ IComparer<BytesRef> Comparator { get; }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Fst/ExternalRefSorter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/ExternalRefSorter.cs b/src/Lucene.Net.Suggest/Suggest/Fst/ExternalRefSorter.cs
new file mode 100644
index 0000000..a2ebe2b
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Fst/ExternalRefSorter.cs
@@ -0,0 +1,150 @@
+using System;
+using System.IO;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest.Fst
+{
+ using System.Collections.Generic;
+
+ /// <summary>
+ /// Builds and iterates over sequences stored on disk.
+ /// </summary>
+ public class ExternalRefSorter : BytesRefSorter, IDisposable
+ {
+ private readonly OfflineSorter sort;
+ private OfflineSorter.ByteSequencesWriter writer;
+ private FileInfo input;
+ private FileInfo sorted;
+
+ /// <summary>
+ /// Will buffer all sequences to a temporary file and then sort (all on-disk).
+ /// </summary>
+ public ExternalRefSorter(OfflineSorter sort)
+ {
+ this.sort = sort;
+ this.input = new FileInfo(Path.GetTempFileName());
+ this.writer = new OfflineSorter.ByteSequencesWriter(input);
+ }
+
+ public void Add(BytesRef utf8)
+ {
+ if (writer == null)
+ {
+ throw new InvalidOperationException();
+ }
+ writer.Write(utf8);
+ }
+
+ public BytesRefIterator Iterator()
+ {
+ if (sorted == null)
+ {
+ CloseWriter();
+
+ sorted = new FileInfo(Path.GetTempFileName());
+ sort.Sort(input, sorted);
+
+ input.Delete();
+ input = null;
+ }
+
+ return new ByteSequenceIterator(new OfflineSorter.ByteSequencesReader(sorted), sort.Comparator);
+ }
+
+ private void CloseWriter()
+ {
+ if (writer != null)
+ {
+ writer.Dispose();
+ writer = null;
+ }
+ }
+
+ public IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return sort.Comparator;
+ }
+ }
+
+ /// <summary>
+ /// Removes any written temporary files.
+ /// </summary>
+ public void Dispose()
+ {
+ try
+ {
+ CloseWriter();
+ }
+ finally
+ {
+ if (input != null)
+ {
+ input.Delete();
+ }
+ if (sorted != null)
+ {
+ sorted.Delete();
+ }
+ }
+ }
+
+ /// <summary>
+ /// Iterate over byte refs in a file.
+ /// </summary>
+ internal class ByteSequenceIterator : BytesRefIterator
+ {
+ private readonly OfflineSorter.ByteSequencesReader reader;
+ private BytesRef scratch = new BytesRef();
+ private readonly IComparer<BytesRef> comparator;
+
+ public ByteSequenceIterator(OfflineSorter.ByteSequencesReader reader, IComparer<BytesRef> comparator)
+ {
+ this.reader = reader;
+ this.comparator = comparator;
+ }
+
+ public BytesRef Next()
+ {
+ if (scratch == null)
+ {
+ return null;
+ }
+ bool success = false;
+ try
+ {
+ sbyte[] next = reader.Read();
+ if (next != null)
+ {
+ scratch.Bytes = next;
+ scratch.Length = next.Length;
+ scratch.Offset = 0;
+ }
+ else
+ {
+ IOUtils.Close(reader);
+ scratch = null;
+ }
+ success = true;
+ return scratch;
+ }
+ finally
+ {
+ if (!success)
+ {
+ IOUtils.CloseWhileHandlingException(reader);
+ }
+ }
+ }
+
+ public IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return comparator;
+ }
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
new file mode 100644
index 0000000..0245d46
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletion.cs
@@ -0,0 +1,467 @@
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using System.Linq;
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Fst;
+
+namespace Lucene.Net.Search.Suggest.Fst
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Finite state automata based implementation of "autocomplete" functionality.
+ /// </summary>
+ /// <seealso cref=FSTCompletionBuilder
+ /// @lucene.experimental </seealso>
+
+ // TODO: we could store exact weights as outputs from the FST (int4 encoded
+ // floats). This would provide exact outputs from this method and to some
+ // degree allowed post-sorting on a more fine-grained weight.
+
+ // TODO: support for Analyzers (infix suggestions, synonyms?)
+
+ public class FSTCompletion
+ {
+ /// <summary>
+ /// A single completion for a given key.
+ /// </summary>
+ public sealed class Completion : IComparable<Completion>
+ {
+ /// <summary>
+ /// UTF-8 bytes of the suggestion </summary>
+ public readonly BytesRef utf8;
+ /// <summary>
+ /// source bucket (weight) of the suggestion </summary>
+ public readonly int bucket;
+
+ internal Completion(BytesRef key, int bucket)
+ {
+ this.utf8 = BytesRef.DeepCopyOf(key);
+ this.bucket = bucket;
+ }
+
+ public override string ToString()
+ {
+ return utf8.Utf8ToString() + "/" + bucket;
+ }
+
+ /// <seealso cref= BytesRef#compareTo(BytesRef) </seealso>
+ public int CompareTo(Completion o)
+ {
+ return this.utf8.CompareTo(o.utf8);
+ }
+ }
+
+ /// <summary>
+ /// Default number of buckets.
+ /// </summary>
+ public const int DEFAULT_BUCKETS = 10;
+
+ /// <summary>
+ /// An empty result. Keep this an <seealso cref="ArrayList"/> to keep all the returned
+ /// lists of single type (monomorphic calls).
+ /// </summary>
+ private static readonly List<Completion> EMPTY_RESULT = new List<Completion>();
+
+ /// <summary>
+ /// Finite state automaton encoding all the lookup terms. See class notes for
+ /// details.
+ /// </summary>
+ private readonly FST<object> automaton;
+
+ /// <summary>
+ /// An array of arcs leaving the root automaton state and encoding weights of
+ /// all completions in their sub-trees.
+ /// </summary>
+ private readonly FST.Arc<object>[] rootArcs;
+
+ /// <seealso cref= #FSTCompletion(FST, boolean, boolean) </seealso>
+ private readonly bool exactFirst;
+
+ /// <seealso cref= #FSTCompletion(FST, boolean, boolean) </seealso>
+ private readonly bool higherWeightsFirst;
+
+ /// <summary>
+ /// Constructs an FSTCompletion, specifying higherWeightsFirst and exactFirst. </summary>
+ /// <param name="automaton">
+ /// Automaton with completions. See <seealso cref="FSTCompletionBuilder"/>. </param>
+ /// <param name="exactFirst">
+ /// Return most popular suggestions first. This is the default
+ /// behavior for this implementation. Setting it to <code>false</code>
+ /// has no effect (use constant term weights to sort alphabetically
+ /// only). </param>
+ /// <param name="exactFirst">
+ /// Find and push an exact match to the first position of the result
+ /// list if found. </param>
+ public FSTCompletion(FST<object> automaton, bool higherWeightsFirst, bool exactFirst)
+ {
+ this.automaton = automaton;
+ if (automaton != null)
+ {
+ this.rootArcs = CacheRootArcs(automaton);
+ }
+ else
+ {
+ this.rootArcs = new FST.Arc[0];
+ }
+ this.higherWeightsFirst = higherWeightsFirst;
+ this.exactFirst = exactFirst;
+ }
+
+ /// <summary>
+ /// Defaults to higher weights first and exact first. </summary>
+ /// <seealso cref= #FSTCompletion(FST, boolean, boolean) </seealso>
+ public FSTCompletion(FST<object> automaton)
+ : this(automaton, true, true)
+ {
+ }
+
+ /// <summary>
+ /// Cache the root node's output arcs starting with completions with the
+ /// highest weights.
+ /// </summary>
+ private static FST.Arc<object>[] CacheRootArcs(FST<object> automaton)
+ {
+ try
+ {
+ IList<FST.Arc<object>> rootArcs = new List<FST.Arc<object>>();
+ FST.Arc<object> arc = automaton.GetFirstArc(new FST.Arc<object>());
+ FST.BytesReader fstReader = automaton.BytesReader;
+ automaton.ReadFirstTargetArc(arc, arc, fstReader);
+ while (true)
+ {
+ rootArcs.Add((new FST.Arc<>()).copyFrom(arc));
+ if (arc.Last)
+ {
+ break;
+ }
+ automaton.ReadNextArc(arc, fstReader);
+ }
+
+ rootArcs.Reverse(); // we want highest weights first.
+ return rootArcs.ToArray();
+ }
+ catch (IOException e)
+ {
+ throw new Exception(e);
+ }
+ }
+
+ /// <summary>
+ /// Returns the first exact match by traversing root arcs, starting from the
+ /// arc <code>rootArcIndex</code>.
+ /// </summary>
+ /// <param name="rootArcIndex">
+ /// The first root arc index in <seealso cref="#rootArcs"/> to consider when
+ /// matching.
+ /// </param>
+ /// <param name="utf8">
+ /// The sequence of utf8 bytes to follow.
+ /// </param>
+ /// <returns> Returns the bucket number of the match or <code>-1</code> if no
+ /// match was found. </returns>
+ private int GetExactMatchStartingFromRootArc(int rootArcIndex, BytesRef utf8)
+ {
+ // Get the UTF-8 bytes representation of the input key.
+ try
+ {
+ //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+ //ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc<Object> scratch = new org.apache.lucene.util.fst.FST.Arc<>();
+ FST.Arc<object> scratch = new FST.Arc<object>();
+ FST.BytesReader fstReader = automaton.BytesReader;
+ for (; rootArcIndex < rootArcs.Length; rootArcIndex++)
+ {
+ //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+ //ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc<Object> rootArc = rootArcs[rootArcIndex];
+ FST.Arc<object> rootArc = rootArcs[rootArcIndex];
+ //JAVA TO C# CONVERTER WARNING: The original Java variable was marked 'final':
+ //ORIGINAL LINE: final org.apache.lucene.util.fst.FST.Arc<Object> arc = scratch.copyFrom(rootArc);
+ FST.Arc<object> arc = scratch.CopyFrom(rootArc);
+
+ // Descend into the automaton using the key as prefix.
+ if (descendWithPrefix(arc, utf8))
+ {
+ automaton.ReadFirstTargetArc(arc, arc, fstReader);
+ if (arc.Label == FST.END_LABEL)
+ {
+ // Normalize prefix-encoded weight.
+ return rootArc.Label;
+ }
+ }
+ }
+ }
+ catch (IOException e)
+ {
+ // Should never happen, but anyway.
+ throw new Exception(e);
+ }
+
+ // No match.
+ return -1;
+ }
+
+ /// <summary>
+ /// Lookup suggestions to <code>key</code>.
+ /// </summary>
+ /// <param name="key">
+ /// The prefix to which suggestions should be sought. </param>
+ /// <param name="num">
+ /// At most this number of suggestions will be returned. </param>
+ /// <returns> Returns the suggestions, sorted by their approximated weight first
+ /// (decreasing) and then alphabetically (UTF-8 codepoint order). </returns>
+ public virtual IList<Completion> Lookup(string key, int num)
+ {
+ if (key.Length == 0 || automaton == null)
+ {
+ return EMPTY_RESULT;
+ }
+
+ try
+ {
+ var keyUtf8 = new BytesRef(key);
+ if (!higherWeightsFirst && rootArcs.Length > 1)
+ {
+ // We could emit a warning here (?). An optimal strategy for
+ // alphabetically sorted
+ // suggestions would be to add them with a constant weight -- this saves
+ // unnecessary
+ // traversals and sorting.
+ return LookupSortedAlphabetically(keyUtf8, num);
+ }
+ else
+ {
+ return LookupSortedByWeight(keyUtf8, num, false);
+ }
+ }
+ catch (IOException e)
+ {
+ // Should never happen, but anyway.
+ throw new Exception(e);
+ }
+ }
+
+ /// <summary>
+ /// Lookup suggestions sorted alphabetically <b>if weights are not
+ /// constant</b>. This is a workaround: in general, use constant weights for
+ /// alphabetically sorted result.
+ /// </summary>
+ private IList<Completion> LookupSortedAlphabetically(BytesRef key, int num)
+ {
+ // Greedily get num results from each weight branch.
+ var res = LookupSortedByWeight(key, num, true);
+
+ // Sort and trim.
+ res.Sort();
+ if (res.Count > num)
+ {
+ res = res.SubList(0, num);
+ }
+ return res;
+ }
+
+ /// <summary>
+ /// Lookup suggestions sorted by weight (descending order).
+ /// </summary>
+ /// <param name="collectAll">
+ /// If <code>true</code>, the routine terminates immediately when
+ /// <code>num</code> suggestions have been collected. If
+ /// <code>false</code>, it will collect suggestions from all weight
+ /// arcs (needed for <seealso cref="#lookupSortedAlphabetically"/>. </param>
+ private List<Completion> LookupSortedByWeight(BytesRef key, int num, bool collectAll)
+ {
+ // Don't overallocate the results buffers. This also serves the purpose of
+ // allowing the user of this class to request all matches using Integer.MAX_VALUE as
+ // the number of results.
+ List<Completion> res = new List<Completion>(Math.Min(10, num));
+
+ BytesRef output = BytesRef.DeepCopyOf(key);
+ for (int i = 0; i < rootArcs.Length; i++)
+ {
+ FST.Arc<object> rootArc = rootArcs[i];
+ FST.Arc<object> arc = (new FST.Arc<object>()).CopyFrom(rootArc);
+
+ // Descend into the automaton using the key as prefix.
+ if (descendWithPrefix(arc, key))
+ {
+ // A subgraph starting from the current node has the completions
+ // of the key prefix. The arc we're at is the last key's byte,
+ // so we will collect it too.
+ output.Length = key.Length - 1;
+ if (Collect(res, num, rootArc.Label, output, arc) && !collectAll)
+ {
+ // We have enough suggestions to return immediately. Keep on looking
+ // for an
+ // exact match, if requested.
+ if (exactFirst)
+ {
+ if (!CheckExistingAndReorder(res, key))
+ {
+ int exactMatchBucket = GetExactMatchStartingFromRootArc(i, key);
+ if (exactMatchBucket != -1)
+ {
+ // Insert as the first result and truncate at num.
+ while (res.Count >= num)
+ {
+ res.Remove(res.Count - 1);
+ }
+ res.Insert(0, new Completion(key, exactMatchBucket));
+ }
+ }
+ }
+ break;
+ }
+ }
+ }
+ return res;
+ }
+
+ /// <summary>
+ /// Checks if the list of
+ /// <seealso cref="Suggest.Lookup.LookupResult"/>s already has a
+ /// <code>key</code>. If so, reorders that
+ /// <seealso cref="Suggest.Lookup.LookupResult"/> to the first
+ /// position.
+ /// </summary>
+ /// <returns> Returns <code>true<code> if and only if <code>list</code> contained
+ /// <code>key</code>. </returns>
+ private bool CheckExistingAndReorder(List<Completion> list, BytesRef key)
+ {
+ // We assume list does not have duplicates (because of how the FST is created).
+ for (int i = list.Count; --i >= 0; )
+ {
+ if (key.Equals(list[i].utf8))
+ {
+ // Key found. Unless already at i==0, remove it and push up front so
+ // that the ordering
+ // remains identical with the exception of the exact match.
+ list.Insert(0, list.Remove(i));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /// <summary>
+ /// Descend along the path starting at <code>arc</code> and going through bytes
+ /// in the argument.
+ /// </summary>
+ /// <param name="arc">
+ /// The starting arc. This argument is modified in-place. </param>
+ /// <param name="utf8">
+ /// The term to descend along. </param>
+ /// <returns> If <code>true</code>, <code>arc</code> will be set to the arc
+ /// matching last byte of <code>term</code>. <code>false</code> is
+ /// returned if no such prefix exists. </returns>
+ private bool descendWithPrefix(FST.Arc<object> arc, BytesRef utf8)
+ {
+ int max = utf8.Offset + utf8.Length;
+ // Cannot save as instance var since multiple threads
+ // can use FSTCompletion at once...
+ FST.BytesReader fstReader = automaton.BytesReader;
+ for (int i = utf8.Offset; i < max; i++)
+ {
+ if (automaton.FindTargetArc(utf8.Bytes[i] & 0xff, arc, arc, fstReader) == null)
+ {
+ // No matching prefixes, return an empty result.
+ return false;
+ }
+ }
+ return true;
+ }
+
+ /// <summary>
+ /// Recursive collect lookup results from the automaton subgraph starting at
+ /// <code>arc</code>.
+ /// </summary>
+ /// <param name="num">
+ /// Maximum number of results needed (early termination). </param>
+ private bool Collect(IList<Completion> res, int num, int bucket, BytesRef output, FST.Arc<object> arc)
+ {
+ if (output.Length == output.Bytes.Length)
+ {
+ output.Bytes = ArrayUtil.Grow(output.Bytes);
+ }
+ Debug.Assert(output.Offset == 0);
+ output.Bytes[output.Length++] = (sbyte) arc.Label;
+ FST.BytesReader fstReader = automaton.BytesReader;
+ automaton.ReadFirstTargetArc(arc, arc, fstReader);
+ while (true)
+ {
+ if (arc.Label == FST.END_LABEL)
+ {
+ res.Add(new Completion(output, bucket));
+ if (res.Count >= num)
+ {
+ return true;
+ }
+ }
+ else
+ {
+ int save = output.Length;
+ if (Collect(res, num, bucket, output, (new FST.Arc<>()).copyFrom(arc)))
+ {
+ return true;
+ }
+ output.Length = save;
+ }
+
+ if (arc.Last)
+ {
+ break;
+ }
+ automaton.ReadNextArc(arc, fstReader);
+ }
+ return false;
+ }
+
+ /// <summary>
+ /// Returns the bucket count (discretization thresholds).
+ /// </summary>
+ public virtual int BucketCount
+ {
+ get
+ {
+ return rootArcs.Length;
+ }
+ }
+
+ /// <summary>
+ /// Returns the bucket assigned to a given key (if found) or <code>-1</code> if
+ /// no exact match exists.
+ /// </summary>
+ public virtual int GetBucket(string key)
+ {
+ return GetExactMatchStartingFromRootArc(0, new BytesRef(key));
+ }
+
+ /// <summary>
+ /// Returns the internal automaton.
+ /// </summary>
+ public virtual FST<object> FST
+ {
+ get
+ {
+ return automaton;
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletionBuilder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletionBuilder.cs b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletionBuilder.cs
new file mode 100644
index 0000000..bea3c62
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletionBuilder.cs
@@ -0,0 +1,274 @@
+using System;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Fst;
+using Lucene.Net.Util.Packed;
+
+namespace Lucene.Net.Search.Suggest.Fst
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Finite state automata based implementation of "autocomplete" functionality.
+ ///
+ /// <h2>Implementation details</h2>
+ ///
+ /// <para>
+ /// The construction step in <seealso cref="#finalize()"/> works as follows:
+ /// <ul>
+ /// <li>A set of input terms and their buckets is given.</li>
+ /// <li>All terms in the input are prefixed with a synthetic pseudo-character
+ /// (code) of the weight bucket the term fell into. For example a term
+ /// <code>abc</code> with a discretized weight equal '1' would become
+ /// <code>1abc</code>.</li>
+ /// <li>The terms are then sorted by their raw value of UTF-8 character values
+ /// (including the synthetic bucket code in front).</li>
+ /// <li>A finite state automaton (<seealso cref="FST"/>) is constructed from the input. The
+ /// root node has arcs labeled with all possible weights. We cache all these
+ /// arcs, highest-weight first.</li>
+ /// </ul>
+ ///
+ /// </para>
+ /// <para>
+ /// At runtime, in <seealso cref="FSTCompletion#lookup(CharSequence, int)"/>,
+ /// the automaton is utilized as follows:
+ /// <ul>
+ /// <li>For each possible term weight encoded in the automaton (cached arcs from
+ /// the root above), starting with the highest one, we descend along the path of
+ /// the input key. If the key is not a prefix of a sequence in the automaton
+ /// (path ends prematurely), we exit immediately -- no completions.</li>
+ /// <li>Otherwise, we have found an internal automaton node that ends the key.
+ /// <b>The entire subautomaton (all paths) starting from this node form the key's
+ /// completions.</b> We start the traversal of this subautomaton. Every time we
+ /// reach a final state (arc), we add a single suggestion to the list of results
+ /// (the weight of this suggestion is constant and equal to the root path we
+ /// started from). The tricky part is that because automaton edges are sorted and
+ /// we scan depth-first, we can terminate the entire procedure as soon as we
+ /// collect enough suggestions the user requested.</li>
+ /// <li>In case the number of suggestions collected in the step above is still
+ /// insufficient, we proceed to the next (smaller) weight leaving the root node
+ /// and repeat the same algorithm again.</li>
+ /// </ul>
+ ///
+ /// <h2>Runtime behavior and performance characteristic</h2>
+ ///
+ /// The algorithm described above is optimized for finding suggestions to short
+ /// prefixes in a top-weights-first order. This is probably the most common use
+ /// case: it allows presenting suggestions early and sorts them by the global
+ /// frequency (and then alphabetically).
+ ///
+ /// </para>
+ /// <para>
+ /// If there is an exact match in the automaton, it is returned first on the
+ /// results list (even with by-weight sorting).
+ ///
+ /// </para>
+ /// <para>
+ /// Note that the maximum lookup time for <b>any prefix</b> is the time of
+ /// descending to the subtree, plus traversal of the subtree up to the number of
+ /// requested suggestions (because they are already presorted by weight on the
+ /// root level and alphabetically at any node level).
+ ///
+ /// </para>
+ /// <para>
+ /// To order alphabetically only (no ordering by priorities), use identical term
+ /// weights for all terms. Alphabetical suggestions are returned even if
+ /// non-constant weights are used, but the algorithm for doing this is
+ /// suboptimal.
+ ///
+ /// </para>
+ /// <para>
+ /// "alphabetically" in any of the documentation above indicates UTF-8
+ /// representation order, nothing else.
+ ///
+ /// </para>
+ /// <para>
+ /// <b>NOTE</b>: the FST file format is experimental and subject to suddenly
+ /// change, requiring you to rebuild the FST suggest index.
+ ///
+ /// </para>
+ /// </summary>
+ /// <seealso cref= FSTCompletion
+ /// @lucene.experimental </seealso>
+ public class FSTCompletionBuilder
+ {
+ /// <summary>
+ /// Default number of buckets.
+ /// </summary>
+ public const int DEFAULT_BUCKETS = 10;
+
+ /// <summary>
+ /// The number of separate buckets for weights (discretization). The more
+ /// buckets, the more fine-grained term weights (priorities) can be assigned.
+ /// The speed of lookup will not decrease for prefixes which have
+ /// highly-weighted completions (because these are filled-in first), but will
+ /// decrease significantly for low-weighted terms (but these should be
+ /// infrequent, so it is all right).
+ ///
+ /// <para>
+ /// The number of buckets must be within [1, 255] range.
+ /// </para>
+ /// </summary>
+ private readonly int buckets;
+
+ /// <summary>
+ /// Finite state automaton encoding all the lookup terms. See class notes for
+ /// details.
+ /// </summary>
+ internal FST<object> automaton;
+
+ /// <summary>
+ /// FST construction require re-sorting the input. This is the class that
+ /// collects all the input entries, their weights and then provides sorted
+ /// order.
+ /// </summary>
+ private readonly BytesRefSorter sorter;
+
+ /// <summary>
+ /// Scratch buffer for <seealso cref="#add(BytesRef, int)"/>.
+ /// </summary>
+ private readonly BytesRef scratch = new BytesRef();
+
+ /// <summary>
+ /// Max tail sharing length.
+ /// </summary>
+ private readonly int shareMaxTailLength;
+
+ /// <summary>
+ /// Creates an <seealso cref="FSTCompletion"/> with default options: 10 buckets, exact match
+ /// promoted to first position and <seealso cref="InMemorySorter"/> with a comparator obtained from
+ /// <seealso cref="BytesRef#getUTF8SortedAsUnicodeComparator()"/>.
+ /// </summary>
+ public FSTCompletionBuilder()
+ : this(DEFAULT_BUCKETS, new InMemorySorter(BytesRef.UTF8SortedAsUnicodeComparator), int.MaxValue)
+ {
+ }
+
+ /// <summary>
+ /// Creates an FSTCompletion with the specified options. </summary>
+ /// <param name="buckets">
+ /// The number of buckets for weight discretization. Buckets are used
+ /// in <seealso cref="#add(BytesRef, int)"/> and must be smaller than the number
+ /// given here.
+ /// </param>
+ /// <param name="sorter">
+ /// <seealso cref="BytesRefSorter"/> used for re-sorting input for the automaton.
+ /// For large inputs, use on-disk sorting implementations. The sorter
+ /// is closed automatically in <seealso cref="#build()"/> if it implements
+ /// <seealso cref="Closeable"/>.
+ /// </param>
+ /// <param name="shareMaxTailLength">
+ /// Max shared suffix sharing length.
+ ///
+ /// See the description of this parameter in <seealso cref="Builder"/>'s constructor.
+ /// In general, for very large inputs you'll want to construct a non-minimal
+ /// automaton which will be larger, but the construction will take far less ram.
+ /// For minimal automata, set it to <seealso cref="Integer#MAX_VALUE"/>. </param>
+ public FSTCompletionBuilder(int buckets, BytesRefSorter sorter, int shareMaxTailLength)
+ {
+ if (buckets < 1 || buckets > 255)
+ {
+ throw new System.ArgumentException("Buckets must be >= 1 and <= 255: " + buckets);
+ }
+
+ if (sorter == null)
+ {
+ throw new System.ArgumentException("BytesRefSorter must not be null.");
+ }
+
+ this.sorter = sorter;
+ this.buckets = buckets;
+ this.shareMaxTailLength = shareMaxTailLength;
+ }
+
+ /// <summary>
+ /// Appends a single suggestion and its weight to the internal buffers.
+ /// </summary>
+ /// <param name="utf8">
+ /// The suggestion (utf8 representation) to be added. The content is
+ /// copied and the object can be reused. </param>
+ /// <param name="bucket">
+ /// The bucket to place this suggestion in. Must be non-negative and
+ /// smaller than the number of buckets passed in the constructor.
+ /// Higher numbers indicate suggestions that should be presented
+ /// before suggestions placed in smaller buckets. </param>
+ public virtual void Add(BytesRef utf8, int bucket)
+ {
+ if (bucket < 0 || bucket >= buckets)
+ {
+ throw new ArgumentException("Bucket outside of the allowed range [0, " + buckets + "): " + bucket);
+ }
+
+ if (scratch.Bytes.Length < utf8.Length + 1)
+ {
+ scratch.Grow(utf8.Length + 10);
+ }
+
+ scratch.Length = 1;
+ scratch.Bytes[0] = (sbyte)bucket;
+ scratch.Append(utf8);
+ sorter.Add(scratch);
+ }
+
+ /// <summary>
+ /// Builds the final automaton from a list of added entries. This method may
+ /// take a longer while as it needs to build the automaton.
+ /// </summary>
+ public virtual FSTCompletion Build()
+ {
+ this.automaton = BuildAutomaton(sorter);
+
+ // Dispose of it if it is a disposable
+ using (sorter as IDisposable)
+ {
+
+ }
+
+ return new FSTCompletion(automaton);
+ }
+
+ /// <summary>
+ /// Builds the final automaton from a list of entries.
+ /// </summary>
+ private FST<object> BuildAutomaton(BytesRefSorter sorter)
+ {
+ // Build the automaton.
+ Outputs<object> outputs = NoOutputs.Singleton;
+ object empty = outputs.NoOutput;
+ Builder<object> builder = new Builder<object>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, shareMaxTailLength, outputs, null, false, PackedInts.DEFAULT, true, 15);
+
+ BytesRef scratch = new BytesRef();
+ BytesRef entry;
+ IntsRef scratchIntsRef = new IntsRef();
+ int count = 0;
+ BytesRefIterator iter = sorter.GetEnumerator();
+ while ((entry = iter.Next()) != null)
+ {
+ count++;
+ if (scratch.CompareTo(entry) != 0)
+ {
+ builder.Add(Util.Fst.Util.ToIntsRef(entry, scratchIntsRef), empty);
+ scratch.CopyBytes(entry);
+ }
+ }
+
+ return count == 0 ? null : builder.Finish();
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletionLookup.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletionLookup.cs b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletionLookup.cs
new file mode 100644
index 0000000..aa4992c
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Fst/FSTCompletionLookup.cs
@@ -0,0 +1,337 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Search.Suggest.Tst;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Fst;
+
+namespace Lucene.Net.Search.Suggest.Fst
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// An adapter from <seealso cref="Lookup"/> API to <seealso cref="FSTCompletion"/>.
+ ///
+ /// <para>This adapter differs from <seealso cref="FSTCompletion"/> in that it attempts
+ /// to discretize any "weights" as passed from in <seealso cref="InputIterator#weight()"/>
+ /// to match the number of buckets. For the rationale for bucketing, see
+ /// <seealso cref="FSTCompletion"/>.
+ ///
+ /// </para>
+ /// <para><b>Note:</b>Discretization requires an additional sorting pass.
+ ///
+ /// </para>
+ /// <para>The range of weights for bucketing/ discretization is determined
+ /// by sorting the input by weight and then dividing into
+ /// equal ranges. Then, scores within each range are assigned to that bucket.
+ ///
+ /// </para>
+ /// <para>Note that this means that even large differences in weights may be lost
+ /// during automaton construction, but the overall distinction between "classes"
+ /// of weights will be preserved regardless of the distribution of weights.
+ ///
+ /// </para>
+ /// <para>For fine-grained control over which weights are assigned to which buckets,
+ /// use <seealso cref="FSTCompletion"/> directly or <seealso cref="TSTLookup"/>, for example.
+ ///
+ /// </para>
+ /// </summary>
+ /// <seealso cref= FSTCompletion
+ /// @lucene.experimental </seealso>
+ public class FSTCompletionLookup : Lookup
+ {
+ /// <summary>
+ /// An invalid bucket count if we're creating an object
+ /// of this class from an existing FST.
+ /// </summary>
+ /// <seealso cref= #FSTCompletionLookup(FSTCompletion, boolean) </seealso>
+ private static int INVALID_BUCKETS_COUNT = -1;
+
+ /// <summary>
+ /// Shared tail length for conflating in the created automaton. Setting this
+ /// to larger values (<seealso cref="Integer#MAX_VALUE"/>) will create smaller (or minimal)
+ /// automata at the cost of RAM for keeping nodes hash in the <seealso cref="FST"/>.
+ ///
+ /// <para>Empirical pick.
+ /// </para>
+ /// </summary>
+ private const int sharedTailLength = 5;
+
+ private int buckets;
+ private bool exactMatchFirst;
+
+ /// <summary>
+ /// Automaton used for completions with higher weights reordering.
+ /// </summary>
+ private FSTCompletion higherWeightsCompletion;
+
+ /// <summary>
+ /// Automaton used for normal completions.
+ /// </summary>
+ private FSTCompletion normalCompletion;
+
+ /// <summary>
+ /// Number of entries the lookup was built with </summary>
+ private long count = 0;
+
+ /// <summary>
+ /// This constructor prepares for creating a suggested FST using the
+ /// <seealso cref="#build(InputIterator)"/> method. The number of weight
+ /// discretization buckets is set to <seealso cref="FSTCompletion#DEFAULT_BUCKETS"/> and
+ /// exact matches are promoted to the top of the suggestions list.
+ /// </summary>
+ public FSTCompletionLookup()
+ : this(FSTCompletion.DEFAULT_BUCKETS, true)
+ {
+ }
+
+ /// <summary>
+ /// This constructor prepares for creating a suggested FST using the
+ /// <seealso cref="#build(InputIterator)"/> method.
+ /// </summary>
+ /// <param name="buckets">
+ /// The number of weight discretization buckets (see
+ /// <seealso cref="FSTCompletion"/> for details).
+ /// </param>
+ /// <param name="exactMatchFirst">
+ /// If <code>true</code> exact matches are promoted to the top of the
+ /// suggestions list. Otherwise they appear in the order of
+ /// discretized weight and alphabetical within the bucket. </param>
+ public FSTCompletionLookup(int buckets, bool exactMatchFirst)
+ {
+ this.buckets = buckets;
+ this.exactMatchFirst = exactMatchFirst;
+ }
+
+ /// <summary>
+ /// This constructor takes a pre-built automaton.
+ /// </summary>
+ /// <param name="completion">
+ /// An instance of <seealso cref="FSTCompletion"/>. </param>
+ /// <param name="exactMatchFirst">
+ /// If <code>true</code> exact matches are promoted to the top of the
+ /// suggestions list. Otherwise they appear in the order of
+ /// discretized weight and alphabetical within the bucket. </param>
+ public FSTCompletionLookup(FSTCompletion completion, bool exactMatchFirst)
+ : this(INVALID_BUCKETS_COUNT, exactMatchFirst)
+ {
+ this.normalCompletion = new FSTCompletion(completion.FST, false, exactMatchFirst);
+ this.higherWeightsCompletion = new FSTCompletion(completion.FST, true, exactMatchFirst);
+ }
+
+ public override void Build(InputIterator iterator)
+ {
+ if (iterator.HasPayloads)
+ {
+ throw new System.ArgumentException("this suggester doesn't support payloads");
+ }
+ if (iterator.HasContexts)
+ {
+ throw new System.ArgumentException("this suggester doesn't support contexts");
+ }
+ File tempInput = File.CreateTempFile(typeof(FSTCompletionLookup).Name, ".input", OfflineSorter.defaultTempDir());
+ File tempSorted = File.CreateTempFile(typeof(FSTCompletionLookup).Name, ".sorted", OfflineSorter.defaultTempDir());
+
+ OfflineSorter.ByteSequencesWriter writer = new OfflineSorter.ByteSequencesWriter(tempInput);
+ OfflineSorter.ByteSequencesReader reader = null;
+ ExternalRefSorter sorter = null;
+
+ // Push floats up front before sequences to sort them. For now, assume they are non-negative.
+ // If negative floats are allowed some trickery needs to be done to find their byte order.
+ bool success = false;
+ count = 0;
+ try
+ {
+ sbyte[] buffer = new sbyte[0];
+ ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
+ BytesRef spare;
+ while ((spare = iterator.Next()) != null)
+ {
+ if (spare.Length + 4 >= buffer.Length)
+ {
+ buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
+ }
+
+ output.Reset(buffer);
+ output.WriteInt(EncodeWeight(iterator.Weight));
+ output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
+ writer.Write(buffer, 0, output.Position);
+ }
+ writer.Dispose();
+
+ // We don't know the distribution of scores and we need to bucket them, so we'll sort
+ // and divide into equal buckets.
+ OfflineSorter.SortInfo info = (new OfflineSorter()).Sort(tempInput, tempSorted);
+ tempInput.Delete();
+ FSTCompletionBuilder builder = new FSTCompletionBuilder(buckets, sorter = new ExternalRefSorter(new OfflineSorter()), sharedTailLength);
+
+ int inputLines = info.Lines;
+ reader = new OfflineSorter.ByteSequencesReader(tempSorted);
+ long line = 0;
+ int previousBucket = 0;
+ int previousScore = 0;
+ ByteArrayDataInput input = new ByteArrayDataInput();
+ BytesRef tmp1 = new BytesRef();
+ BytesRef tmp2 = new BytesRef();
+ while (reader.Read(tmp1))
+ {
+ input.Reset(tmp1.Bytes);
+ int currentScore = input.ReadInt();
+
+ int bucket;
+ if (line > 0 && currentScore == previousScore)
+ {
+ bucket = previousBucket;
+ }
+ else
+ {
+ bucket = (int)(line * buckets / inputLines);
+ }
+ previousScore = currentScore;
+ previousBucket = bucket;
+
+ // Only append the input, discard the weight.
+ tmp2.Bytes = tmp1.Bytes;
+ tmp2.Offset = input.Position;
+ tmp2.Length = tmp1.Length - input.Position;
+ builder.Add(tmp2, bucket);
+
+ line++;
+ count++;
+ }
+
+ // The two FSTCompletions share the same automaton.
+ this.higherWeightsCompletion = builder.Build();
+ this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
+
+ success = true;
+ }
+ finally
+ {
+ if (success)
+ {
+ IOUtils.Close(reader, writer, sorter);
+ }
+ else
+ {
+ IOUtils.CloseWhileHandlingException(reader, writer, sorter);
+ }
+
+ tempInput.Delete();
+ tempSorted.Delete();
+ }
+ }
+
+ /// <summary>
+ /// weight -> cost </summary>
+ private static int EncodeWeight(long value)
+ {
+ if (value < int.MinValue || value > int.MaxValue)
+ {
+ throw new NotSupportedException("cannot encode value: " + value);
+ }
+ return (int)value;
+ }
+
+ public override IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool higherWeightsFirst, int num)
+ {
+ if (contexts != null)
+ {
+ throw new ArgumentException("this suggester doesn't support contexts");
+ }
+ IList<FSTCompletion.Completion> completions;
+ if (higherWeightsFirst)
+ {
+ completions = higherWeightsCompletion.Lookup(key, num);
+ }
+ else
+ {
+ completions = normalCompletion.Lookup(key, num);
+ }
+
+ List<LookupResult> results = new List<LookupResult>(completions.Count);
+ CharsRef spare = new CharsRef();
+ foreach (FSTCompletion.Completion c in completions)
+ {
+ spare.Grow(c.utf8.Length);
+ UnicodeUtil.UTF8toUTF16(c.utf8, spare);
+ results.Add(new LookupResult(spare.ToString(), c.bucket));
+ }
+ return results;
+ }
+
+ /// <summary>
+ /// Returns the bucket (weight) as a Long for the provided key if it exists,
+ /// otherwise null if it does not.
+ /// </summary>
+ public virtual object Get(string key)
+ {
+ int bucket = normalCompletion.GetBucket(key);
+ return bucket == -1 ? null : Convert.ToInt64(bucket);
+ }
+
+ public override bool Store(DataOutput output)
+ {
+ lock (this)
+ {
+ output.WriteVLong(count);
+ if (this.normalCompletion == null || normalCompletion.FST == null)
+ {
+ return false;
+ }
+ normalCompletion.FST.Save(output);
+ return true;
+ }
+ }
+
+ public override bool Load(DataInput input)
+ {
+ lock (this)
+ {
+ count = input.ReadVLong();
+ this.higherWeightsCompletion = new FSTCompletion(new FST<>(input, NoOutputs.Singleton));
+ this.normalCompletion = new FSTCompletion(higherWeightsCompletion.FST, false, exactMatchFirst);
+ return true;
+ }
+ }
+
+ public override long SizeInBytes()
+ {
+ long mem = RamUsageEstimator.ShallowSizeOf(this) + RamUsageEstimator.ShallowSizeOf(normalCompletion) + RamUsageEstimator.ShallowSizeOf(higherWeightsCompletion);
+ if (normalCompletion != null)
+ {
+ mem += normalCompletion.FST.SizeInBytes();
+ }
+ if (higherWeightsCompletion != null && (normalCompletion == null || normalCompletion.FST != higherWeightsCompletion.FST))
+ {
+ // the fst should be shared between the 2 completion instances, don't count it twice
+ mem += higherWeightsCompletion.FST.SizeInBytes();
+ }
+ return mem;
+ }
+
+ public override long Count
+ {
+ get
+ {
+ return count;
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/Fst/WFSTCompletionLookup.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/Fst/WFSTCompletionLookup.cs b/src/Lucene.Net.Suggest/Suggest/Fst/WFSTCompletionLookup.cs
new file mode 100644
index 0000000..2ad8e76
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/Fst/WFSTCompletionLookup.cs
@@ -0,0 +1,348 @@
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.IO;
+using Lucene.Net.Store;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Fst;
+
+namespace Lucene.Net.Search.Suggest.Fst
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Suggester based on a weighted FST: it first traverses the prefix,
+ /// then walks the <i>n</i> shortest paths to retrieve top-ranked
+ /// suggestions.
+ /// <para>
+ /// <b>NOTE</b>:
+ /// Input weights must be between 0 and <seealso cref="Integer#MAX_VALUE"/>, any
+ /// other values will be rejected.
+ ///
+ /// @lucene.experimental
+ /// </para>
+ /// </summary>
+ public class WFSTCompletionLookup : Lookup
+ {
+
+ /// <summary>
+ /// FST<Long>, weights are encoded as costs: (Integer.MAX_VALUE-weight)
+ /// </summary>
+ // NOTE: like FSTSuggester, this is really a WFSA, if you want to
+ // customize the code to add some output you should use PairOutputs.
+ private FST<long?> fst = null;
+
+ /// <summary>
+ /// True if exact match suggestions should always be returned first.
+ /// </summary>
+ private readonly bool exactFirst;
+
+ /// <summary>
+ /// Number of entries the lookup was built with </summary>
+ private long count = 0;
+
+ /// <summary>
+ /// Calls <seealso cref="#WFSTCompletionLookup(boolean) WFSTCompletionLookup(true)"/>
+ /// </summary>
+ public WFSTCompletionLookup()
+ : this(true)
+ {
+ }
+
+ /// <summary>
+ /// Creates a new suggester.
+ /// </summary>
+ /// <param name="exactFirst"> <code>true</code> if suggestions that match the
+ /// prefix exactly should always be returned first, regardless
+ /// of score. This has no performance impact, but could result
+ /// in low-quality suggestions. </param>
+ public WFSTCompletionLookup(bool exactFirst)
+ {
+ this.exactFirst = exactFirst;
+ }
+
+ public override void Build(InputIterator iterator)
+ {
+ if (iterator.HasPayloads)
+ {
+ throw new ArgumentException("this suggester doesn't support payloads");
+ }
+ if (iterator.HasContexts)
+ {
+ throw new ArgumentException("this suggester doesn't support contexts");
+ }
+ count = 0;
+ BytesRef scratch = new BytesRef();
+ InputIterator iter = new WFSTInputIterator(this, iterator);
+ IntsRef scratchInts = new IntsRef();
+ BytesRef previous = null;
+ PositiveIntOutputs outputs = PositiveIntOutputs.Singleton;
+ Builder<long?> builder = new Builder<long?>(FST.INPUT_TYPE.BYTE1, outputs);
+ while ((scratch = iter.Next()) != null)
+ {
+ long cost = iter.Weight;
+
+ if (previous == null)
+ {
+ previous = new BytesRef();
+ }
+ else if (scratch.Equals(previous))
+ {
+ continue; // for duplicate suggestions, the best weight is actually
+ // added
+ }
+ Util.ToIntsRef(scratch, scratchInts);
+ builder.Add(scratchInts, cost);
+ previous.CopyBytes(scratch);
+ count++;
+ }
+ fst = builder.Finish();
+ }
+
+ public override bool Store(DataOutput output)
+ {
+ output.WriteVLong(count);
+ if (fst == null)
+ {
+ return false;
+ }
+ fst.Save(output);
+ return true;
+ }
+
+ public override bool Load(DataInput input)
+ {
+ count = input.ReadVLong();
+ this.fst = new FST<>(input, PositiveIntOutputs.Singleton);
+ return true;
+ }
+
+ public override IList<LookupResult> Lookup(string key, HashSet<BytesRef> contexts, bool onlyMorePopular, int num)
+ {
+ if (contexts != null)
+ {
+ throw new System.ArgumentException("this suggester doesn't support contexts");
+ }
+ Debug.Assert(num > 0);
+
+ if (onlyMorePopular)
+ {
+ throw new System.ArgumentException("this suggester only works with onlyMorePopular=false");
+ }
+
+ if (fst == null)
+ {
+ return Collections.emptyList();
+ }
+
+ BytesRef scratch = new BytesRef(key);
+ int prefixLength = scratch.Length;
+ Arc<long?> arc = new Arc<long?>();
+
+ // match the prefix portion exactly
+ long? prefixOutput = null;
+ try
+ {
+ prefixOutput = LookupPrefix(scratch, arc);
+ }
+ catch (IOException bogus)
+ {
+ throw new Exception(bogus);
+ }
+
+ if (prefixOutput == null)
+ {
+ return Collections.emptyList();
+ }
+
+ IList<LookupResult> results = new List<LookupResult>(num);
+ CharsRef spare = new CharsRef();
+ if (exactFirst && arc.Final)
+ {
+ spare.grow(scratch.length);
+ UnicodeUtil.UTF8toUTF16(scratch, spare);
+ results.Add(new LookupResult(spare.ToString(), decodeWeight(prefixOutput + arc.nextFinalOutput)));
+ if (--num == 0)
+ {
+ return results; // that was quick
+ }
+ }
+
+ // complete top-N
+ TopResults<long?> completions = null;
+ try
+ {
+ completions = Util.ShortestPaths(fst, arc, prefixOutput, weightComparator, num, !exactFirst);
+ Debug.Assert(completions.isComplete);
+ }
+ catch (IOException bogus)
+ {
+ throw new Exception(bogus);
+ }
+
+ BytesRef suffix = new BytesRef(8);
+ foreach (Result<long?> completion in completions)
+ {
+ scratch.length = prefixLength;
+ // append suffix
+ Util.ToBytesRef(completion.input, suffix);
+ scratch.Append(suffix);
+ spare.Grow(scratch.Length);
+ UnicodeUtil.UTF8toUTF16(scratch, spare);
+ results.Add(new LookupResult(spare.ToString(), decodeWeight(completion.output)));
+ }
+ return results;
+ }
+
+ private long? LookupPrefix(BytesRef scratch, Arc<long?> arc) //Bogus
+ {
+ Debug.Assert(0 == (long)fst.outputs.NoOutput);
+ long output = 0;
+ var bytesReader = fst.BytesReader;
+
+ fst.GetFirstArc(arc);
+
+ sbyte[] bytes = scratch.Bytes;
+ int pos = scratch.Offset;
+ int end = pos + scratch.Length;
+ while (pos < end)
+ {
+ if (fst.FindTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null)
+ {
+ return null;
+ }
+ else
+ {
+ output += (long)arc.Output;
+ }
+ }
+
+ return output;
+ }
+
+ /// <summary>
+ /// Returns the weight associated with an input string,
+ /// or null if it does not exist.
+ /// </summary>
+ public virtual object Get(string key)
+ {
+ if (fst == null)
+ {
+ return null;
+ }
+ Arc<long?> arc = new Arc<long?>();
+ long? result = null;
+ try
+ {
+ result = LookupPrefix(new BytesRef(key), arc);
+ }
+ catch (IOException bogus)
+ {
+ throw new Exception(bogus);
+ }
+ if (result == null || !arc.Final)
+ {
+ return null;
+ }
+ else
+ {
+ return Convert.ToInt32(decodeWeight(result + arc.NextFinalOutput));
+ }
+ }
+
+ /// <summary>
+ /// cost -> weight </summary>
+ private static int decodeWeight(long encoded)
+ {
+ return (int)(int.MaxValue - encoded);
+ }
+
+ /// <summary>
+ /// weight -> cost </summary>
+ private static int encodeWeight(long value)
+ {
+ if (value < 0 || value > int.MaxValue)
+ {
+ throw new System.NotSupportedException("cannot encode value: " + value);
+ }
+ return int.MaxValue - (int)value;
+ }
+
+ private sealed class WFSTInputIterator : SortedInputIterator
+ {
+ private readonly WFSTCompletionLookup outerInstance;
+
+
+ internal WFSTInputIterator(WFSTCompletionLookup outerInstance, InputIterator source)
+ : base(source)
+ {
+ this.outerInstance = outerInstance;
+ Debug.Assert(source.HasPayloads == false);
+ }
+
+ protected internal override void Encode(OfflineSorter.ByteSequencesWriter writer, ByteArrayDataOutput output, sbyte[] buffer, BytesRef spare, BytesRef payload, HashSet<BytesRef> contexts, long weight)
+ {
+ if (spare.Length + 4 >= buffer.Length)
+ {
+ buffer = ArrayUtil.Grow(buffer, spare.Length + 4);
+ }
+ output.Reset(buffer);
+ output.WriteBytes(spare.Bytes, spare.Offset, spare.Length);
+ output.WriteInt(encodeWeight(weight));
+ writer.Write(buffer, 0, output.Position);
+ }
+
+ protected internal override long decode(BytesRef scratch, ByteArrayDataInput tmpInput)
+ {
+ scratch.Length -= 4; // int
+ // skip suggestion:
+ tmpInput.Reset(scratch.Bytes, scratch.Offset + scratch.Length, 4);
+ return tmpInput.ReadInt();
+ }
+ }
+
+ internal static readonly IComparer<long?> weightComparator = new ComparatorAnonymousInnerClassHelper();
+
+ private class ComparatorAnonymousInnerClassHelper : IComparer<long?>
+ {
+ public ComparatorAnonymousInnerClassHelper()
+ {
+ }
+
+ public virtual int Compare(long? left, long? right)
+ {
+ return left.CompareTo(right);
+ }
+ }
+
+ /// <summary>
+ /// Returns byte size of the underlying FST. </summary>
+ public override long SizeInBytes()
+ {
+ return (fst == null) ? 0 : fst.SizeInBytes();
+ }
+
+ public override long Count
+ {
+ get
+ {
+ return count;
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/InMemorySorter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/InMemorySorter.cs b/src/Lucene.Net.Suggest/Suggest/InMemorySorter.cs
new file mode 100644
index 0000000..5a97e31
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/InMemorySorter.cs
@@ -0,0 +1,70 @@
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Search.Suggest.Fst;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// An <seealso cref="BytesRefSorter"/> that keeps all the entries in memory.
+ /// @lucene.experimental
+ /// @lucene.internal
+ /// </summary>
+ public sealed class InMemorySorter : BytesRefSorter
+ {
+ private readonly BytesRefArray buffer = new BytesRefArray(Counter.NewCounter());
+ private bool closed = false;
+ private readonly IComparer<BytesRef> comparator;
+
+ /// <summary>
+ /// Creates an InMemorySorter, sorting entries by the
+ /// provided comparator.
+ /// </summary>
+ public InMemorySorter(IComparer<BytesRef> comparator)
+ {
+ this.comparator = comparator;
+ }
+
+ public void Add(BytesRef utf8)
+ {
+ if (closed)
+ {
+ throw new InvalidOperationException();
+ }
+ buffer.Append(utf8);
+ }
+
+ public BytesRefIterator Iterator()
+ {
+ closed = true;
+ return buffer.Iterator(comparator);
+ }
+
+ public IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return comparator;
+ }
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Suggest/InputIterator.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Suggest/InputIterator.cs b/src/Lucene.Net.Suggest/Suggest/InputIterator.cs
new file mode 100644
index 0000000..8e18698
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Suggest/InputIterator.cs
@@ -0,0 +1,124 @@
+using System.Collections.Generic;
+using Lucene.Net.Search.Suggest.Analyzing;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Suggest
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Interface for enumerating term,weight,payload triples for suggester consumption;
+ /// currently only <seealso cref="AnalyzingSuggester"/>, {@link
+ /// FuzzySuggester} and <seealso cref="AnalyzingInfixSuggester"/> support payloads.
+ /// </summary>
+ public interface InputIterator : BytesRefIterator
+ {
+
+ /// <summary>
+ /// A term's weight, higher numbers mean better suggestions. </summary>
+ long Weight { get; }
+
+ /// <summary>
+ /// An arbitrary byte[] to record per suggestion. See
+ /// <seealso cref="Lookup.LookupResult#payload"/> to retrieve the payload
+ /// for each suggestion.
+ /// </summary>
+ BytesRef Payload { get; }
+
+ /// <summary>
+ /// Returns true if the iterator has payloads </summary>
+ bool HasPayloads { get; }
+
+ /// <summary>
+ /// A term's contexts context can be used to filter suggestions.
+ /// May return null, if suggest entries do not have any context
+ /// </summary>
+ HashSet<BytesRef> Contexts { get; }
+
+ /// <summary>
+ /// Returns true if the iterator has contexts </summary>
+ bool HasContexts { get; }
+ }
+
+ /// <summary>
+ /// Singleton InputIterator that iterates over 0 BytesRefs.
+ /// </summary>
+ public static class EmptyInputIterator
+ {
+ public static readonly InputIterator Instance = new InputIteratorWrapper(EmptyBytesRefIterator.Instance);
+ }
+
+ /// <summary>
+ /// Wraps a BytesRefIterator as a suggester InputIterator, with all weights
+ /// set to <code>1</code> and carries no payload
+ /// </summary>
+ public class InputIteratorWrapper : InputIterator
+ {
+ internal readonly BytesRefIterator wrapped;
+
+ /// <summary>
+ /// Creates a new wrapper, wrapping the specified iterator and
+ /// specifying a weight value of <code>1</code> for all terms
+ /// and nullifies associated payloads.
+ /// </summary>
+ public InputIteratorWrapper(BytesRefIterator wrapped)
+ {
+ this.wrapped = wrapped;
+ }
+
+ public virtual long Weight
+ {
+ get { return 1; }
+ }
+
+ public BytesRef Next()
+ {
+ return wrapped.Next();
+ }
+
+ public virtual BytesRef Payload
+ {
+ get { return null; }
+ }
+
+ public virtual bool HasPayloads
+ {
+ get { return false; }
+ }
+
+ public IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return wrapped.Comparator;
+ }
+ }
+
+ public virtual HashSet<BytesRef> Contexts
+ {
+ get { return null; }
+ }
+
+ public virtual bool HasContexts
+ {
+ get { return false; }
+ }
+ }
+
+}
\ No newline at end of file