You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucenenet.apache.org by sy...@apache.org on 2014/09/16 00:24:54 UTC
[7/8] Porting Lucene.Net.Suggest (still not compiling)
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/DirectSpellChecker.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/DirectSpellChecker.cs b/src/Lucene.Net.Suggest/Spell/DirectSpellChecker.cs
new file mode 100644
index 0000000..181d24e
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/DirectSpellChecker.cs
@@ -0,0 +1,575 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Lucene.Net.Index;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Automaton;
+
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Simple automaton-based spellchecker.
+ /// <para>
+ /// Candidates are presented directly from the term dictionary, based on
+ /// Levenshtein distance. This is an alternative to <seealso cref="SpellChecker"/>
+ /// if you are using an edit-distance-like metric such as Levenshtein
+ /// or <seealso cref="JaroWinklerDistance"/>.
+ /// </para>
+ /// <para>
+ /// A practical benefit of this spellchecker is that it requires no additional
+ /// data structures (neither in RAM nor on disk) to do its work.
+ ///
+ /// </para>
+ /// </summary>
+ /// <seealso cref="LevenshteinAutomata"/>
+ /// <seealso cref="FuzzyTermsEnum"/>
+ ///
+ /// @lucene.experimental
+ public class DirectSpellChecker
+ {
+ /// <summary>
+ /// The default StringDistance, Damerau-Levenshtein distance implemented internally
+ /// via <seealso cref="LevenshteinAutomata"/>.
+ /// <para>
+ /// Note: this is the fastest distance metric, because Damerau-Levenshtein is used
+ /// to draw candidates from the term dictionary: this just re-uses the scoring.
+ /// </para>
+ /// </summary>
+ public static readonly StringDistance INTERNAL_LEVENSHTEIN = new LuceneLevenshteinDistance();
+
+ /// <summary>
+ /// maximum edit distance for candidate terms </summary>
+ private int maxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;
+ /// <summary>
+ /// minimum prefix for candidate terms </summary>
+ private int minPrefix = 1;
+ /// <summary>
+ /// maximum number of top-N inspections per suggestion </summary>
+ private int maxInspections = 5;
+ /// <summary>
+ /// minimum accuracy for a term to match </summary>
+ private float accuracy = SpellChecker.DEFAULT_ACCURACY;
+ /// <summary>
+ /// value in [0..1] (or absolute number >=1) representing the minimum
+ /// number of documents (of the total) where a term should appear.
+ /// </summary>
+ private float thresholdFrequency = 0f;
+ /// <summary>
+ /// minimum length of a query word to return suggestions </summary>
+ private int minQueryLength = 4;
+ /// <summary>
+ /// value in [0..1] (or absolute number >=1) representing the maximum
+ /// number of documents (of the total) a query term can appear in to
+ /// be corrected.
+ /// </summary>
+ private float maxQueryFrequency = 0.01f;
+ /// <summary>
+ /// true if the spellchecker should lowercase terms </summary>
+ private bool lowerCaseTerms = true;
+ /// <summary>
+ /// the comparator to use </summary>
+ private IComparer<SuggestWord> comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
+ /// <summary>
+ /// the string distance to use </summary>
+ private StringDistance distance = INTERNAL_LEVENSHTEIN;
+
+ /// <summary>
+ /// Creates a DirectSpellChecker with default configuration values </summary>
+ public DirectSpellChecker()
+ {
+ }
+
+ /// <summary>
+ /// Get the maximum number of Levenshtein edit-distances to draw
+ /// candidate terms from.
+ /// </summary>
+ public virtual int MaxEdits
+ {
+ get
+ {
+ return maxEdits;
+ }
+ set
+ {
+ if (value < 1 || value > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE)
+ {
+ throw new NotSupportedException("Invalid maxEdits");
+ }
+ this.maxEdits = value;
+ }
+ }
+
+
+ /// <summary>
+ /// Get the minimal number of characters that must match exactly
+ /// </summary>
+ public virtual int MinPrefix
+ {
+ get
+ {
+ return minPrefix;
+ }
+ set
+ {
+ this.minPrefix = value;
+ }
+ }
+
+
+ /// <summary>
+ /// Get the maximum number of top-N inspections per suggestion
+ /// </summary>
+ public virtual int MaxInspections
+ {
+ get
+ {
+ return maxInspections;
+ }
+ set
+ {
+ this.maxInspections = value;
+ }
+ }
+
+
+ /// <summary>
+ /// Get the minimal accuracy from the StringDistance for a match
+ /// </summary>
+ public virtual float Accuracy
+ {
+ get
+ {
+ return accuracy;
+ }
+ set
+ {
+ this.accuracy = value;
+ }
+ }
+
+
+ /// <summary>
+ /// Get the minimal threshold of documents a term must appear for a match
+ /// </summary>
+ public virtual float ThresholdFrequency
+ {
+ get
+ {
+ return thresholdFrequency;
+ }
+ set
+ {
+ if (value >= 1f && value != (int)value)
+ {
+ throw new System.ArgumentException("Fractional absolute document frequencies are not allowed");
+ }
+ this.thresholdFrequency = value;
+ }
+ }
+
+
+ /// <summary>
+ /// Get the minimum length of a query term needed to return suggestions </summary>
+ public virtual int MinQueryLength
+ {
+ get
+ {
+ return minQueryLength;
+ }
+ set
+ {
+ this.minQueryLength = value;
+ }
+ }
+
+
+ /// <summary>
+ /// Get the maximum threshold of documents a query term can appear in order
+ /// to provide suggestions.
+ /// </summary>
+ public virtual float MaxQueryFrequency
+ {
+ get
+ {
+ return maxQueryFrequency;
+ }
+ set
+ {
+ if (value >= 1f && value != (int)value)
+ {
+ throw new System.ArgumentException("Fractional absolute document frequencies are not allowed");
+ }
+ this.maxQueryFrequency = value;
+ }
+ }
+
+
+ /// <summary>
+ /// true if the spellchecker should lowercase terms </summary>
+ public virtual bool LowerCaseTerms
+ {
+ get
+ {
+ return lowerCaseTerms;
+ }
+ set
+ {
+ this.lowerCaseTerms = value;
+ }
+ }
+
+
+ /// <summary>
+ /// Get the current comparator in use.
+ /// </summary>
+ public virtual IComparer<SuggestWord> Comparator
+ {
+ get
+ {
+ return comparator;
+ }
+ set
+ {
+ this.comparator = value;
+ }
+ }
+
+
+ /// <summary>
+ /// Get the string distance metric in use.
+ /// </summary>
+ public virtual StringDistance Distance
+ {
+ get
+ {
+ return distance;
+ }
+ set
+ {
+ this.distance = value;
+ }
+ }
+
+
+ /// <summary>
+ /// Calls <see cref="SuggestSimilar(Term, int, IndexReader, SuggestMode)"/>
+ /// with <c>SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX</c>.
+ /// </summary>
+ public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir)
+ {
+ return SuggestSimilar(term, numSug, ir, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
+ }
+
+ /// <summary>
+ /// Calls <see cref="SuggestSimilar(Term, int, IndexReader, SuggestMode, float)"/>
+ /// with the given <c>suggestMode</c> and this spellchecker's configured accuracy.
+ ///
+ /// </summary>
+ public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode)
+ {
+ return SuggestSimilar(term, numSug, ir, suggestMode, this.accuracy);
+ }
+
+ /// <summary>
+ /// Suggest similar words.
+ ///
+ /// <para>Unlike <seealso cref="SpellChecker"/>, the similarity used to fetch the most
+ /// relevant terms is an edit distance, therefore typically a low value
+ /// for numSug will work very well.
+ ///
+ /// </para>
+ /// </summary>
+ /// <param name="term"> Term you want to spell check on </param>
+ /// <param name="numSug"> the maximum number of suggested words </param>
+ /// <param name="ir"> IndexReader to find terms from </param>
+ /// <param name="suggestMode"> specifies when to return suggested words </param>
+ /// <param name="accuracy"> return only suggested words that match with this similarity </param>
+ /// <returns> sorted list of the suggested words according to the comparator </returns>
+ /// <exception cref="IOException"> If there is a low-level I/O error. </exception>
+ public virtual SuggestWord[] SuggestSimilar(Term term, int numSug, IndexReader ir, SuggestMode suggestMode, float accuracy)
+ {
+ CharsRef spare = new CharsRef();
+ string text = term.Text();
+ if (minQueryLength > 0 && text.CodePointCount(0, text.Length) < minQueryLength)
+ {
+ return new SuggestWord[0];
+ }
+
+ if (lowerCaseTerms)
+ {
+ term = new Term(term.Field(), text.ToLower(Locale.ROOT));
+ }
+
+ int docfreq = ir.DocFreq(term);
+
+ if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && docfreq > 0)
+ {
+ return new SuggestWord[0];
+ }
+
+ int maxDoc = ir.MaxDoc();
+
+ if (maxQueryFrequency >= 1f && docfreq > maxQueryFrequency)
+ {
+ return new SuggestWord[0];
+ }
+ else if (docfreq > (int)Math.Ceiling(maxQueryFrequency * (float)maxDoc))
+ {
+ return new SuggestWord[0];
+ }
+
+ if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR)
+ {
+ docfreq = 0;
+ }
+
+ if (thresholdFrequency >= 1f)
+ {
+ docfreq = Math.Max(docfreq, (int)thresholdFrequency);
+ }
+ else if (thresholdFrequency > 0f)
+ {
+ docfreq = Math.Max(docfreq, (int)(thresholdFrequency * (float)maxDoc) - 1);
+ }
+
+ ICollection<ScoreTerm> terms = null;
+ int inspections = numSug * maxInspections;
+
+ // try ed=1 first, in case we get lucky
+ terms = suggestSimilar(term, inspections, ir, docfreq, 1, accuracy, spare);
+ if (maxEdits > 1 && terms.Count < inspections)
+ {
+ var moreTerms = new HashSet<ScoreTerm>();
+ moreTerms.AddAll(terms);
+ moreTerms.AddAll(suggestSimilar(term, inspections, ir, docfreq, maxEdits, accuracy, spare));
+ terms = moreTerms;
+ }
+
+ // create the suggestword response, sort it, and trim it to size.
+
+ var suggestions = new SuggestWord[terms.Count];
+ int index = suggestions.Length - 1;
+ foreach (ScoreTerm s in terms)
+ {
+ SuggestWord suggestion = new SuggestWord();
+ if (s.termAsString == null)
+ {
+ UnicodeUtil.UTF8toUTF16(s.term, spare);
+ s.termAsString = spare.ToString();
+ }
+ suggestion.@string = s.termAsString;
+ suggestion.score = s.score;
+ suggestion.freq = s.docfreq;
+ suggestions[index--] = suggestion;
+ }
+
+ ArrayUtil.TimSort(suggestions, Collections.ReverseOrder(comparator));
+ if (numSug < suggestions.Length)
+ {
+ SuggestWord[] trimmed = new SuggestWord[numSug];
+ Array.Copy(suggestions, 0, trimmed, 0, numSug);
+ suggestions = trimmed;
+ }
+ return suggestions;
+ }
+
+ /// <summary>
+ /// Provide spelling corrections based on several parameters.
+ /// </summary>
+ /// <param name="term"> The term to suggest spelling corrections for </param>
+ /// <param name="numSug"> The maximum number of spelling corrections </param>
+ /// <param name="ir"> The index reader to fetch the candidate spelling corrections from </param>
+ /// <param name="docfreq"> The minimum document frequency a potential suggestion need to have in order to be included </param>
+ /// <param name="editDistance"> The maximum edit distance candidates are allowed to have </param>
+ /// <param name="accuracy"> The minimum accuracy a suggested spelling correction needs to have in order to be included </param>
+ /// <param name="spare"> a chars scratch </param>
+ /// <returns> a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order. </returns>
+ /// <exception cref="IOException"> If I/O related errors occur </exception>
+ protected internal virtual ICollection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, CharsRef spare)
+ {
+
+ var atts = new AttributeSource();
+ MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.AddAttribute<MaxNonCompetitiveBoostAttribute>();
+ Terms terms = MultiFields.GetTerms(ir, term.Field());
+ if (terms == null)
+ {
+ return Enumerable.Empty<ScoreDoc>();
+ }
+ FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.Max(minPrefix, editDistance - 1), true);
+
+ var stQueue = new PriorityQueue<ScoreTerm>();
+
+ BytesRef queryTerm = new BytesRef(term.Text());
+ BytesRef candidateTerm;
+ ScoreTerm st = new ScoreTerm();
+ BoostAttribute boostAtt = e.Attributes().AddAttribute<BoostAttribute>();
+ while ((candidateTerm = e.Next()) != null)
+ {
+ float boost = boostAtt.Boost;
+ // ignore uncompetitive hits
+ if (stQueue.Size() >= numSug && boost <= stQueue.Peek().boost)
+ {
+ continue;
+ }
+
+ // ignore exact match of the same term
+ if (queryTerm.BytesEquals(candidateTerm))
+ {
+ continue;
+ }
+
+ int df = e.DocFreq();
+
+ // check docFreq if required
+ if (df <= docfreq)
+ {
+ continue;
+ }
+
+ float score;
+ string termAsString;
+ if (distance == INTERNAL_LEVENSHTEIN)
+ {
+ // delay creating strings until the end
+ termAsString = null;
+ // undo FuzzyTermsEnum's scale factor for a real scaled lev score
+ score = boost / e.ScaleFactor + e.MinSimilarity;
+ }
+ else
+ {
+ UnicodeUtil.UTF8toUTF16(candidateTerm, spare);
+ termAsString = spare.ToString();
+ score = distance.GetDistance(term.Text(), termAsString);
+ }
+
+ if (score < accuracy)
+ {
+ continue;
+ }
+
+ // add new entry in PQ
+ st.term = BytesRef.DeepCopyOf(candidateTerm);
+ st.boost = boost;
+ st.docfreq = df;
+ st.termAsString = termAsString;
+ st.score = score;
+ stQueue.Offer(st);
+ // possibly drop entries from queue
+ st = (stQueue.Size() > numSug) ? stQueue.Poll() : new ScoreTerm();
+ maxBoostAtt.MaxNonCompetitiveBoost = (stQueue.Size() >= numSug) ? stQueue.Peek().boost : float.NegativeInfinity;
+ }
+
+ return stQueue;
+ }
+
+ /// <summary>
+ /// Holds a spelling correction for internal usage inside <seealso cref="DirectSpellChecker"/>.
+ /// </summary>
+ protected internal class ScoreTerm : IComparable<ScoreTerm>
+ {
+
+ /// <summary>
+ /// The actual spellcheck correction.
+ /// </summary>
+ public BytesRef term;
+
+ /// <summary>
+ /// The boost representing the similarity from the FuzzyTermsEnum (internal similarity score)
+ /// </summary>
+ public float boost;
+
+ /// <summary>
+ /// The df of the spellcheck correction.
+ /// </summary>
+ public int docfreq;
+
+ /// <summary>
+ /// The spellcheck correction represented as string, can be <code>null</code>.
+ /// </summary>
+ public string termAsString;
+
+ /// <summary>
+ /// The similarity score.
+ /// </summary>
+ public float score;
+
+ /// <summary>
+ /// Constructor.
+ /// </summary>
+ public ScoreTerm()
+ {
+ }
+
+ public virtual int CompareTo(ScoreTerm other)
+ {
+ if (term.BytesEquals(other.term))
+ {
+ return 0; // consistent with equals
+ }
+ if (this.boost == other.boost)
+ {
+ return other.term.CompareTo(this.term);
+ }
+ else
+ {
+ return this.boost.CompareTo(other.boost);
+ }
+ }
+
+ public override int GetHashCode()
+ {
+ const int prime = 31;
+ int result = 1;
+ result = prime * result + ((term == null) ? 0 : term.GetHashCode());
+ return result;
+ }
+
+ public override bool Equals(object obj)
+ {
+ if (this == obj)
+ {
+ return true;
+ }
+ if (obj == null)
+ {
+ return false;
+ }
+ if (this.GetType() != obj.GetType())
+ {
+ return false;
+ }
+ ScoreTerm other = (ScoreTerm)obj;
+ if (term == null)
+ {
+ if (other.term != null)
+ {
+ return false;
+ }
+ }
+ else if (!term.BytesEquals(other.term))
+ {
+ return false;
+ }
+ return true;
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/HighFrequencyDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/HighFrequencyDictionary.cs b/src/Lucene.Net.Suggest/Spell/HighFrequencyDictionary.cs
new file mode 100644
index 0000000..9a185cb
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/HighFrequencyDictionary.cs
@@ -0,0 +1,151 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+using System.Collections.Generic;
+using Lucene.Net.Index;
+using Lucene.Net.Search.Suggest;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Spell
+{
+ /// <summary>
+ /// HighFrequencyDictionary: terms taken from the given field
+ /// of a Lucene index, which appear in a number of documents
+ /// above a given threshold.
+ ///
+ /// Threshold is a value in [0..1] representing the minimum
+ /// number of documents (of the total) where a term should appear.
+ ///
+ /// Based on LuceneDictionary.
+ /// </summary>
+ public class HighFrequencyDictionary : Dictionary
+ {
+ private IndexReader reader;
+ private string field;
+ private float thresh;
+
+ /// <summary>
+ /// Creates a new Dictionary, pulling source terms from
+ /// the specified <code>field</code> in the provided <code>reader</code>.
+ /// <para>
+ /// Terms appearing in less than <code>thresh</code> percentage of documents
+ /// will be excluded.
+ /// </para>
+ /// </summary>
+ public HighFrequencyDictionary(IndexReader reader, string field, float thresh)
+ {
+ this.reader = reader;
+ this.field = field;
+ this.thresh = thresh;
+ }
+
+ public InputIterator EntryIterator
+ {
+ get
+ {
+ return new HighFrequencyIterator(this);
+ }
+ }
+
+ internal sealed class HighFrequencyIterator : InputIterator
+ {
+ private readonly HighFrequencyDictionary outerInstance;
+
+ internal readonly BytesRef spare = new BytesRef();
+ internal readonly TermsEnum termsEnum;
+ internal int minNumDocs;
+ internal long freq;
+
+ internal HighFrequencyIterator(HighFrequencyDictionary outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ Terms terms = MultiFields.GetTerms(outerInstance.reader, outerInstance.field);
+ if (terms != null)
+ {
+ termsEnum = terms.Iterator(null);
+ }
+ else
+ {
+ termsEnum = null;
+ }
+ minNumDocs = (int)(outerInstance.thresh * (float)outerInstance.reader.NumDocs());
+ }
+
+ internal bool IsFrequent(int freq)
+ {
+ return freq >= minNumDocs;
+ }
+
+ public long Weight
+ {
+ get { return freq; }
+ }
+
+ public BytesRef Next()
+ {
+ if (termsEnum != null)
+ {
+ BytesRef next;
+ while ((next = termsEnum.Next()) != null)
+ {
+ if (IsFrequent(termsEnum.DocFreq()))
+ {
+ freq = termsEnum.DocFreq();
+ spare.CopyBytes(next);
+ return spare;
+ }
+ }
+ }
+ return null;
+ }
+
+ public IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ if (termsEnum == null)
+ {
+ return null;
+ }
+ else
+ {
+ return termsEnum.Comparator;
+ }
+ }
+ }
+
+ public BytesRef Payload
+ {
+ get { return null; }
+ }
+
+ public bool HasPayloads
+ {
+ get { return false; }
+ }
+
+ public HashSet<BytesRef> Contexts
+ {
+ get { return null; }
+ }
+
+ public bool HasContexts
+ {
+ get { return false; }
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/JaroWinklerDistance.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/JaroWinklerDistance.cs b/src/Lucene.Net.Suggest/Spell/JaroWinklerDistance.cs
new file mode 100644
index 0000000..56e4f4a
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/JaroWinklerDistance.cs
@@ -0,0 +1,173 @@
+using System;
+using Lucene.Net.Support;
+
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Similarity measure for short strings such as person names.
+ /// <para>
+ /// </para>
+ /// </summary>
+ /// <a href="http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance">Jaro–Winkler distance (Wikipedia)</a>
+ public class JaroWinklerDistance : StringDistance
+ {
+
+ private float threshold = 0.7f;
+
+ /// <summary>
+ /// Creates a new distance metric with the default threshold
+ /// for the Jaro Winkler bonus (0.7) </summary>
+ /// <seealso cref="Threshold"/>
+ public JaroWinklerDistance()
+ {
+ }
+
+ private int[] Matches(string s1, string s2)
+ {
+ string max, min;
+ if (s1.Length > s2.Length)
+ {
+ max = s1;
+ min = s2;
+ }
+ else
+ {
+ max = s2;
+ min = s1;
+ }
+ int range = Math.Max(max.Length / 2 - 1, 0);
+ int[] matchIndexes = new int[min.Length];
+ Arrays.Fill(matchIndexes, -1);
+ bool[] matchFlags = new bool[max.Length];
+ int matches = 0;
+ for (int mi = 0; mi < min.Length; mi++)
+ {
+ char c1 = min[mi];
+ for (int xi = Math.Max(mi - range, 0), xn = Math.Min(mi + range + 1, max.Length); xi < xn; xi++)
+ {
+ if (!matchFlags[xi] && c1 == max[xi])
+ {
+ matchIndexes[mi] = xi;
+ matchFlags[xi] = true;
+ matches++;
+ break;
+ }
+ }
+ }
+ char[] ms1 = new char[matches];
+ char[] ms2 = new char[matches];
+ for (int i = 0, si = 0; i < min.Length; i++)
+ {
+ if (matchIndexes[i] != -1)
+ {
+ ms1[si] = min[i];
+ si++;
+ }
+ }
+ for (int i = 0, si = 0; i < max.Length; i++)
+ {
+ if (matchFlags[i])
+ {
+ ms2[si] = max[i];
+ si++;
+ }
+ }
+ int transpositions = 0;
+ for (int mi = 0; mi < ms1.Length; mi++)
+ {
+ if (ms1[mi] != ms2[mi])
+ {
+ transpositions++;
+ }
+ }
+ int prefix = 0;
+ for (int mi = 0; mi < min.Length; mi++)
+ {
+ if (s1[mi] == s2[mi])
+ {
+ prefix++;
+ }
+ else
+ {
+ break;
+ }
+ }
+ return new int[] { matches, transpositions / 2, prefix, max.Length };
+ }
+
+ public virtual float GetDistance(string s1, string s2)
+ {
+ int[] mtp = Matches(s1, s2);
+ float m = mtp[0];
+ if (m == 0)
+ {
+ return 0f;
+ }
+ float j = ((m / s1.Length + m / s2.Length + (m - mtp[1]) / m)) / 3;
+ float jw = j < Threshold ? j : j + Math.Min(0.1f, 1f / mtp[3]) * mtp[2] * (1 - j);
+ return jw;
+ }
+
+ /// <summary>
+ /// Sets the threshold used to determine when Winkler bonus should be used.
+ /// Set to a negative value to get the Jaro distance. </summary>
+ /// <param name="threshold"> the new value of the threshold </param>
+ public virtual float Threshold
+ {
+ set
+ {
+ this.threshold = value;
+ }
+ get
+ {
+ return threshold;
+ }
+ }
+
+
+ public override int GetHashCode()
+ {
+ return 113 * Number.FloatToIntBits(threshold) * this.GetType().GetHashCode();
+ }
+
+ public override bool Equals(object obj)
+ {
+ if (this == obj)
+ {
+ return true;
+ }
+ if (null == obj || this.GetType() != obj.GetType())
+ {
+ return false;
+ }
+
+ JaroWinklerDistance o = (JaroWinklerDistance)obj;
+ return (Number.FloatToIntBits(o.threshold) == Number.FloatToIntBits(this.threshold));
+ }
+
+ public override string ToString()
+ {
+ return "jarowinkler(" + threshold + ")";
+ }
+
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/LevensteinDistance.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/LevensteinDistance.cs b/src/Lucene.Net.Suggest/Spell/LevensteinDistance.cs
new file mode 100644
index 0000000..1ce93ba
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/LevensteinDistance.cs
@@ -0,0 +1,144 @@
+using System;
+
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Levenstein edit distance class.
+ /// </summary>
+ public sealed class LevensteinDistance : StringDistance
+ {
+
+ /// <summary>
+ /// Optimized to run a bit faster than the static getDistance().
+ /// In one benchmark, times were 5.3 sec using the ctor vs 8.5 sec w/ the static method, thus 37% faster.
+ /// </summary>
+ public LevensteinDistance()
+ {
+ }
+
+
+ //*****************************
+ // Compute Levenshtein distance: see org.apache.commons.lang.StringUtils#getLevenshteinDistance(String, String)
+ //*****************************
+ public float GetDistance(string target, string other)
+ {
+ char[] sa;
+ int n;
+ int[] p; //'previous' cost array, horizontally
+ int[] d; // cost array, horizontally
+ int[] _d; //placeholder to assist in swapping p and d
+
+ /*
+ The difference between this impl. and the previous is that, rather
+ than creating and retaining a matrix of size s.length()+1 by t.length()+1,
+ we maintain two single-dimensional arrays of length s.length()+1. The first, d,
+ is the 'current working' distance array that maintains the newest distance cost
+ counts as we iterate through the characters of String s. Each time we increment
+ the index of String t we are comparing, d is copied to p, the second int[]. Doing so
+ allows us to retain the previous cost counts as required by the algorithm (taking
+ the minimum of the cost count to the left, up one, and diagonally up and to the left
+ of the current cost count being calculated). (Note that the arrays aren't really
+ copied anymore, just switched...this is clearly much better than cloning an array
+ or doing a System.arraycopy() each time through the outer loop.)
+
+ Effectively, the difference between the two implementations is this one does not
+ cause an out of memory condition when calculating the LD over two very large strings.
+ */
+
+ sa = target.ToCharArray();
+ n = sa.Length;
+ p = new int[n + 1];
+ d = new int[n + 1];
+
+ int m = other.Length;
+ if (n == 0 || m == 0)
+ {
+ if (n == m)
+ {
+ return 1;
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+
+ // indexes into strings s and t
+ int i; // iterates through s
+ int j; // iterates through t
+
+ char t_j; // jth character of t
+
+ int cost; // cost
+
+ for (i = 0; i <= n; i++)
+ {
+ p[i] = i;
+ }
+
+ for (j = 1; j <= m; j++)
+ {
+ t_j = other[j - 1];
+ d[0] = j;
+
+ for (i = 1; i <= n; i++)
+ {
+ cost = sa[i - 1] == t_j ? 0 : 1;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + cost);
+ }
+
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+ return 1.0f - ((float)p[n] / Math.Max(other.Length, sa.Length));
+ }
+
+ public override int GetHashCode()
+ {
+ return 163 * this.GetType().GetHashCode();
+ }
+
+ public override bool Equals(object obj)
+ {
+ if (this == obj)
+ {
+ return true;
+ }
+ if (null == obj)
+ {
+ return false;
+ }
+ return (this.GetType() == obj.GetType());
+ }
+
+ public override string ToString()
+ {
+ return "levenstein";
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/LuceneDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/LuceneDictionary.cs b/src/Lucene.Net.Suggest/Spell/LuceneDictionary.cs
new file mode 100644
index 0000000..e781152
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/LuceneDictionary.cs
@@ -0,0 +1,58 @@
+using Lucene.Net.Index;
+using Lucene.Net.Search.Suggest;
+
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Lucene Dictionary: terms taken from the given field
+ /// of a Lucene index.
+ /// </summary>
+ public class LuceneDictionary : Dictionary
+ {
+ private IndexReader reader;
+ private string field;
+
+ /// <summary>
+ /// Creates a new Dictionary, pulling source terms from
+ /// the specified <code>field</code> in the provided <code>reader</code>
+ /// </summary>
+ public LuceneDictionary(IndexReader reader, string field)
+ {
+ this.reader = reader;
+ this.field = field;
+ }
+
+ public InputIterator EntryIterator
+ {
+ get
+ {
+ Terms terms = MultiFields.GetTerms(reader, field);
+ if (terms != null)
+ {
+ return new InputIteratorWrapper(terms.Iterator(null));
+ }
+ else
+ {
+ return EmptyInputIterator.Instance;
+ }
+ }
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/LuceneLevenshteinDistance.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/LuceneLevenshteinDistance.cs b/src/Lucene.Net.Suggest/Spell/LuceneLevenshteinDistance.cs
new file mode 100644
index 0000000..ebf0738
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/LuceneLevenshteinDistance.cs
@@ -0,0 +1,136 @@
+using System;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ /// <summary>
+ /// Damerau-Levenshtein (optimal string alignment) implemented in a consistent
+ /// way as Lucene's FuzzyTermsEnum with the transpositions option enabled.
+ ///
+ /// Notes:
+ /// <ul>
+ /// <li> This metric treats full unicode codepoints as characters
+ /// <li> This metric scales raw edit distances into a floating point score
+ /// based upon the shortest of the two terms
+ /// <li> Transpositions of two adjacent codepoints are treated as primitive
+ /// edits.
+ /// <li> Edits are applied in parallel: for example, "ab" and "bca" have
+ /// distance 3.
+ /// </ul>
+ ///
+ /// NOTE: this class is not particularly efficient. It is only intended
+ /// for merging results from multiple DirectSpellCheckers.
+ /// </summary>
+ public sealed class LuceneLevenshteinDistance : StringDistance
+ {
+
+ /// <summary>
+ /// Creates a new comparator, mimicing the behavior of Lucene's internal
+ /// edit distance.
+ /// </summary>
+ public LuceneLevenshteinDistance()
+ {
+ }
+
+ public float getDistance(string target, string other)
+ {
+ IntsRef targetPoints;
+ IntsRef otherPoints;
+ int n;
+ int[][] d; // cost array
+
+ // NOTE: if we cared, we could 3*m space instead of m*n space, similar to
+ // what LevenshteinDistance does, except cycling thru a ring of three
+ // horizontal cost arrays... but this comparator is never actually used by
+ // DirectSpellChecker, its only used for merging results from multiple shards
+ // in "distributed spellcheck", and its inefficient in other ways too...
+
+ // cheaper to do this up front once
+ targetPoints = toIntsRef(target);
+ otherPoints = toIntsRef(other);
+ n = targetPoints.Length;
+ int m = otherPoints.Length;
+
+ //TODO The following call to the 'RectangularArrays' helper class reproduces the rectangular array initialization that is automatic in Java: (ORIGINAL LINE: d = new int[n+1][m+1];)
+ d = RectangularArrays.ReturnRectangularIntArray(n + 1, m + 1);
+
+ if (n == 0 || m == 0)
+ {
+ if (n == m)
+ {
+ return 0;
+ }
+ else
+ {
+ return Math.Max(n, m);
+ }
+ }
+
+ // indexes into strings s and t
+ int i; // iterates through s
+ int j; // iterates through t
+
+ int t_j; // jth character of t
+
+ int cost; // cost
+
+ for (i = 0; i <= n; i++)
+ {
+ d[i][0] = i;
+ }
+
+ for (j = 0; j <= m; j++)
+ {
+ d[0][j] = j;
+ }
+
+ for (j = 1; j <= m; j++)
+ {
+ t_j = otherPoints.ints[j - 1];
+
+ for (i = 1; i <= n; i++)
+ {
+ cost = targetPoints.ints[i - 1] == t_j ? 0 : 1;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i][j] = Math.Min(Math.Min(d[i - 1][j] + 1, d[i][j - 1] + 1), d[i - 1][j - 1] + cost);
+ // transposition
+ if (i > 1 && j > 1 && targetPoints.ints[i - 1] == otherPoints.ints[j - 2] && targetPoints.ints[i - 2] == otherPoints.ints[j - 1])
+ {
+ d[i][j] = Math.Min(d[i][j], d[i - 2][j - 2] + cost);
+ }
+ }
+ }
+
+ return 1.0f - ((float)d[n][m] / Math.Min(m, n));
+ }
+
+ private static IntsRef toIntsRef(string s)
+ {
+ IntsRef @ref = new IntsRef(s.Length); // worst case
+ int utf16Len = s.Length;
+ for (int i = 0, cp = 0; i < utf16Len; i += char.charCount(cp))
+ {
+ cp = @ref.ints[@ref.length++] = char.codePointAt(s, i);
+ }
+ return @ref;
+ }
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/NGramDistance.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/NGramDistance.cs b/src/Lucene.Net.Suggest/Spell/NGramDistance.cs
new file mode 100644
index 0000000..461ea1f
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/NGramDistance.cs
@@ -0,0 +1,195 @@
+using System;
+
+namespace Lucene.Net.Search.Spell
+{
+
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */
+
+ /// <summary>
+ /// N-Gram version of edit distance based on paper by Grzegorz Kondrak,
+ /// "N-gram similarity and distance". Proceedings of the Twelfth International
+ /// Conference on String Processing and Information Retrieval (SPIRE 2005), pp. 115-126,
+ /// Buenos Aires, Argentina, November 2005.
+ /// http://www.cs.ualberta.ca/~kondrak/papers/spire05.pdf
+ ///
+ /// This implementation uses the position-based optimization to compute partial
+ /// matches of n-gram sub-strings and adds a null-character prefix of size n-1
+ /// so that the first character is contained in the same number of n-grams as
+ /// a middle character. Null-character prefix matches are discounted so that
+ /// strings with no matching characters will return a distance of 0.
+ ///
+ /// </summary>
+ public class NGramDistance : StringDistance
+ {
+
+ private int n;
+
+ /// <summary>
+ /// Creates an N-Gram distance measure using n-grams of the specified size. </summary>
+ /// <param name="size"> The size of the n-gram to be used to compute the string distance. </param>
+ public NGramDistance(int size)
+ {
+ this.n = size;
+ }
+
+ /// <summary>
+ /// Creates an N-Gram distance measure using n-grams of size 2.
+ /// </summary>
+ public NGramDistance()
+ : this(2)
+ {
+ }
+
+ public virtual float GetDistance(string source, string target)
+ {
+ int sl = source.Length;
+ int tl = target.Length;
+
+ if (sl == 0 || tl == 0)
+ {
+ if (sl == tl)
+ {
+ return 1;
+ }
+ else
+ {
+ return 0;
+ }
+ }
+
+ int cost = 0;
+ if (sl < n || tl < n)
+ {
+ for (int i = 0, ni = Math.Min(sl, tl); i < ni; i++)
+ {
+ if (source[i] == target[i])
+ {
+ cost++;
+ }
+ }
+ return (float)cost / Math.Max(sl, tl);
+ }
+
+ char[] sa = new char[sl + n - 1];
+ float[] p; //'previous' cost array, horizontally
+ float[] d; // cost array, horizontally
+ float[] _d; //placeholder to assist in swapping p and d
+
+ //construct sa with prefix
+ for (int i = 0; i < sa.Length; i++)
+ {
+ if (i < n - 1)
+ {
+ sa[i] = (char)0; //add prefix
+ }
+ else
+ {
+ sa[i] = source[i - n + 1];
+ }
+ }
+ p = new float[sl + 1];
+ d = new float[sl + 1];
+
+ // indexes into strings s and t
+ int i; // iterates through source
+ int j; // iterates through target
+
+ char[] t_j = new char[n]; // jth n-gram of t
+
+ for (i = 0; i <= sl; i++)
+ {
+ p[i] = i;
+ }
+
+ for (j = 1; j <= tl; j++)
+ {
+ //construct t_j n-gram
+ if (j < n)
+ {
+ for (int ti = 0; ti < n - j; ti++)
+ {
+ t_j[ti] = (char)0; //add prefix
+ }
+ for (int ti = n - j; ti < n; ti++)
+ {
+ t_j[ti] = target[ti - (n - j)];
+ }
+ }
+ else
+ {
+ t_j = StringHelperClass.SubstringSpecial(target, j - n, j).ToCharArray();
+ }
+ d[0] = j;
+ for (i = 1; i <= sl; i++)
+ {
+ cost = 0;
+ int tn = n;
+ //compare sa to t_j
+ for (int ni = 0; ni < n; ni++)
+ {
+ if (sa[i - 1 + ni] != t_j[ni])
+ {
+ cost++;
+ }
+ else if (sa[i - 1 + ni] == 0) //discount matches on prefix
+ {
+ tn--;
+ }
+ }
+ float ec = (float)cost / tn;
+ // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
+ d[i] = Math.Min(Math.Min(d[i - 1] + 1, p[i] + 1), p[i - 1] + ec);
+ }
+ // copy current distance counts to 'previous row' distance counts
+ _d = p;
+ p = d;
+ d = _d;
+ }
+
+ // our last action in the above loop was to switch d and p, so p now
+ // actually has the most recent cost counts
+ return 1.0f - (p[sl] / Math.Max(tl, sl));
+ }
+
+ public override int GetHashCode()
+ {
+ return 1427 * n * this.GetType().GetHashCode();
+ }
+
+ public override bool Equals(object obj)
+ {
+ if (this == obj)
+ {
+ return true;
+ }
+ if (null == obj || this.GetType() != obj.GetType())
+ {
+ return false;
+ }
+
+ var o = (NGramDistance)obj;
+ return o.n == this.n;
+ }
+
+ public override string ToString()
+ {
+ return "ngram(" + n + ")";
+ }
+ }
+
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0ebac726/src/Lucene.Net.Suggest/Spell/PlainTextDictionary.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Suggest/Spell/PlainTextDictionary.cs b/src/Lucene.Net.Suggest/Spell/PlainTextDictionary.cs
new file mode 100644
index 0000000..312c410
--- /dev/null
+++ b/src/Lucene.Net.Suggest/Spell/PlainTextDictionary.cs
@@ -0,0 +1,134 @@
+using System.Collections.Generic;
+using System.IO;
+using Lucene.Net.Search.Suggest;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Search.Spell
+{
+
+ /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ /// <summary>
+ /// Dictionary represented by a text file.
+ ///
+ /// <p/>Format allowed: 1 word per line:<br/>
+ /// word1<br/>
+ /// word2<br/>
+ /// word3<br/>
+ /// </summary>
+ public class PlainTextDictionary : Dictionary
+ {
+
+ private BufferedReader @in;
+
+ /// <summary>
+ /// Creates a dictionary based on a File.
+ /// <para>
+ /// NOTE: content is treated as UTF-8
+ /// </para>
+ /// </summary>
+ public PlainTextDictionary(File file)
+ {
+ @in = new BufferedReader(IOUtils.getDecodingReader(file, StandardCharsets.UTF_8));
+ }
+
+ /// <summary>
+ /// Creates a dictionary based on an inputstream.
+ /// <para>
+ /// NOTE: content is treated as UTF-8
+ /// </para>
+ /// </summary>
+ public PlainTextDictionary(InputStream dictFile)
+ {
+ @in = new BufferedReader(IOUtils.getDecodingReader(dictFile, StandardCharsets.UTF_8));
+ }
+
+ /// <summary>
+ /// Creates a dictionary based on a reader.
+ /// </summary>
+ public PlainTextDictionary(Reader reader)
+ {
+ @in = new BufferedReader(reader);
+ }
+
+ //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+ //ORIGINAL LINE: @Override public org.apache.lucene.search.suggest.InputIterator getEntryIterator() throws IOException
+ public virtual InputIterator EntryIterator
+ {
+ get
+ {
+ return new InputIteratorWrapper(new FileIterator(this));
+ }
+ }
+
+ internal sealed class FileIterator : BytesRefIterator
+ {
+ private readonly PlainTextDictionary outerInstance;
+
+ public FileIterator(PlainTextDictionary outerInstance)
+ {
+ this.outerInstance = outerInstance;
+ }
+
+ internal bool done = false;
+ internal readonly BytesRef spare = new BytesRef();
+ //JAVA TO C# CONVERTER WARNING: Method 'throws' clauses are not available in .NET:
+ //ORIGINAL LINE: @Override public org.apache.lucene.util.BytesRef next() throws IOException
+ public BytesRef Next()
+ {
+ if (done)
+ {
+ return null;
+ }
+ bool success = false;
+ BytesRef result;
+ try
+ {
+ string line;
+ if ((line = outerInstance.@in.ReadLine()) != null)
+ {
+ spare.CopyChars(line);
+ result = spare;
+ }
+ else
+ {
+ done = true;
+ IOUtils.Close(outerInstance.@in);
+ result = null;
+ }
+ success = true;
+ }
+ finally
+ {
+ if (!success)
+ {
+ IOUtils.CloseWhileHandlingException(outerInstance.@in);
+ }
+ }
+ return result;
+ }
+
+ public IComparer<BytesRef> Comparator
+ {
+ get
+ {
+ return null;
+ }
+ }
+ }
+ }
+}
\ No newline at end of file